In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully-qualified key is
# required: the short pattern 'max_columns' matches more than one option on
# recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report 

In [None]:
# Read the source CSV into the raw frame; `data` is kept untouched so later
# cells can work on a copy.
csv_path = "../input/course-study/appendix.csv"
data = pd.read_csv(csv_path)

In [None]:
# Bare expression: the notebook renders the raw frame as this cell's output.
data

In [None]:
# Print the raw frame's column names, non-null counts and dtypes.
data.info()

In [None]:
# Work on an independent (deep) copy so the raw `data` frame stays unchanged.
df = data.copy(deep=True)

In [None]:
# List the column labels of the working copy.
df.columns 

In [None]:
# Remove the columns that are not used as model features.

unused_columns = ["Course Number", "Instructors", "Course Title"]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first rows after the column drop.
df.head() 

In [None]:
# Re-check dtypes and non-null counts of the remaining columns.
df.info() 

In [None]:
# Conversion of "% Played Video" column to float
#
# The code treats the string "---" as a missing-value marker: it is replaced
# with NaN and the column is then cast to float.
# `np.nan` and the builtin `float` are used because the `np.NaN` and
# `np.float` aliases no longer exist in current NumPy releases.

df["% Played Video"] = df["% Played Video"].replace("---", np.nan).astype(float)

In [None]:
# Impute missing "% Played Video" values with the column mean.

played_video_mean = df["% Played Video"].mean()
df["% Played Video"] = df["% Played Video"].fillna(played_video_mean)

In [None]:
# Converting the "Launch Date" column into numeric year / month / day columns
#
# The date is parsed once and the components are extracted with the vectorised
# `.dt` accessor instead of a Python-level lambda per row. The original
# "Launch Date" column is then removed (non-inplace, so the cell rebinds `df`).
# NOTE(review): the date format is inferred by pandas; if the file uses an
# ambiguous day/month order, pass an explicit `format=` — confirm against data.

launch_date = pd.to_datetime(df["Launch Date"])
df["Launch Year"] = launch_date.dt.year
df["Launch Month"] = launch_date.dt.month
df["Launch Day"] = launch_date.dt.day
df = df.drop(columns="Launch Date")


In [None]:
# Preview the frame with the new Launch Year / Month / Day columns.
df.head() 

In [None]:
# One-hot encoding of Institution and Course Subject
#
# Each categorical column is replaced by indicator columns named
# "<prefix>_<value>", appended at the end of the frame.

for column, prefix in (("Institution", "school"), ("Course Subject", "subject")):
    indicators = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, indicators], axis=1).drop(columns=column)

In [None]:
# Preview the frame after one-hot encoding.
df.head() 

In [None]:
# Splitting and scaling the data

# Target is "Honor Code Certificates"; every remaining column is a feature.
X = df.drop(columns="Honor Code Certificates")
y = df["Honor Code Certificates"]

# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only. Fitting the scaler on the full dataset would leak test-set statistics
# into training. Same split parameters as before, so the same rows land in
# each partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, shuffle=True, random_state=42
)

# Fit on the training partition, then apply the same transform to both
# partitions, keeping them as DataFrames with their original index/columns.
# Note: `X` itself is left unscaled.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
# Display the scaled training features.
X_train 

In [None]:
# Display the training target values.
y_train 

In [None]:
# Modelling and training

# Seed the forest: it bootstraps rows and subsamples features at random, so an
# unseeded model yields different scores on every run. 42 matches the seed
# used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train) 

In [None]:
# Mean accuracy of the fitted model on the held-out test partition.
model.score(X_test, y_test) 

In [None]:
# Results
#
# `y_preds` is kept for the confusion-matrix and classification-report cells;
# `accuracy` is the test-set mean accuracy, printed as a percentage.

accuracy = model.score(X_test, y_test)
y_preds = model.predict(X_test)

accuracy_message = "Accuracy: {:.2f}%".format(accuracy * 100)
print(accuracy_message)

In [None]:
# Confusion matrix
#
# Rows are actual classes and columns are predicted classes, restricted to the
# labels 0 and 1 (assumes the target is binary 0/1 — TODO confirm).

cm = confusion_matrix(y_true=y_test, y_pred=y_preds, labels=[0, 1])

plt.figure(figsize=(8, 8))
sns.heatmap(data=cm, cmap="Blues", annot=True, cbar=False)
plt.title("Confusion matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

In [None]:
# Classification report
#
# Text summary of the per-class metrics for labels 0 and 1, displayed under
# the names "Negative" and "Positive".

clr = classification_report(
    y_true=y_test,
    y_pred=y_preds,
    labels=[0, 1],
    target_names=["Negative", "Positive"],
)
print("Classification report: \n \n", clr)