In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the bare 'max_columns' pattern can match more than one pandas option
# and then raises OptionError.
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report 

In [None]:
# Load the raw course table from the CSV file into a DataFrame.
data = pd.read_csv("../input/course-study/appendix.csv")

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Print the DataFrame.info() summary of the raw table.
data.info()

In [None]:
# Number of distinct values per column; missing values count as one distinct value.
{name: data[name].nunique(dropna=False) for name in data.columns}

In [None]:
def preprocess_inputs(df):
    """Clean the raw course table and split it into train and test sets.

    Steps, in order:
      1. Drop the "Course Title", "Course Number" and "Instructors" columns.
      2. Replace the "---" placeholder in "% Played Video" with NaN and cast
         the column to float.
      3. Expand "Launch Date" into "Launch Year", "Launch Month" and
         "Launch Day", then drop the original column.
      4. Separate the "Honor Code Certificates" target from the features.
      5. Split 75% / 25% with shuffling and a fixed seed (42).
      6. Fill missing "% Played Video" values in both splits with the mean of
         the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and target series as returned by ``train_test_split``,
        with "% Played Video" imputed.
    """
    df = df.copy()

    # Dropping unnecessary columns
    df = df.drop(["Course Title", "Course Number", "Instructors"], axis=1)

    # Mark missing values: "---" is the placeholder used in this column.
    # np.nan / float are used because the np.NaN and np.float aliases no longer
    # exist in current NumPy releases.
    df["% Played Video"] = df["% Played Video"].replace("---", np.nan).astype(float)

    # Extract date features with the vectorized .dt accessor
    launch_date = pd.to_datetime(df["Launch Date"])
    df["Launch Year"] = launch_date.dt.year
    df["Launch Month"] = launch_date.dt.month
    df["Launch Day"] = launch_date.dt.day
    df = df.drop("Launch Date", axis=1)

    # Split the data into target and features
    y = df["Honor Code Certificates"]
    X = df.drop("Honor Code Certificates", axis=1)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, shuffle=True, random_state=42
    )

    # Filling missing values: the mean comes from the training split only, so no
    # information from the test rows leaks into the imputed training values.
    played_mean = X_train["% Played Video"].mean()
    X_train = X_train.fillna({"% Played Video": played_mean})
    X_test = X_test.fillna({"% Played Video": played_mean})

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing function on the raw table to obtain the train/test splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Show the training features as the cell output.
X_train

In [None]:
# Count how many training rows carry each target value.
y_train.value_counts()

# Building Pipeline

In [None]:
# The OneHotEncoder flag that requests a dense array was renamed from `sparse`
# to `sparse_output` in newer scikit-learn releases. Pick whichever name the
# installed version exposes so the cell runs on both.
dense_flag = "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"

# One-hot encode the categorical columns; binary categories collapse to a single column.
onehot_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="if_binary", **{dense_flag: False}))
])

# Encode "Institution" and "Course Subject"; all other columns pass through unchanged.
preprocessor = ColumnTransformer(transformers=[
    ("onehot", onehot_transformer, ["Institution", "Course Subject"])
], remainder="passthrough")

# Full model: encode -> standardize every resulting column -> random forest.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(
        # Class 1 is weighted 10x class 0.
        # NOTE(review): assumes the target only takes the values 0 and 1 - confirm.
        class_weight={
            0: 1.0,
            1: 10.0
        },
        # Fixed seed (same value as the train/test split) so reruns give the same forest.
        random_state=42
    ))
])

In [None]:
# Training

# Fit every pipeline step (preprocessing, scaling, classifier) on the training split.
model.fit(X_train, y_train) 

In [None]:
# Results

# Predict on the held-out split and compute the summary metrics.
y_preds = model.predict(X_test)
acc = model.score(X_test, y_test)
cm = confusion_matrix(y_test, y_preds, labels=[0, 1])
clr = classification_report(y_test, y_preds, labels=[0, 1], target_names=["Negative", "Positive"])

print("Accuracy: {:.2f}%".format(acc * 100))

plt.figure(figsize=(10, 10))
# fmt="d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count has three or more digits.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# Tick positions 0.5 / 1.5 are the centres of the two heatmap cells.
plt.xticks(ticks=[0.5, 1.5], labels=["Negative", "Positive"])
plt.yticks(ticks=[0.5, 1.5], labels=["Negative", "Positive"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Classification report: \n---------------------\n", clr)