# Task for Today  

***

## Online Course Certificate Type Prediction  
  
Given *data about online courses from MIT and Harvard*, let's try to predict whether a given course offers **honor code certificates**.  
  
We will use a random forest classification model within a scikit-learn pipeline to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare 'max_columns' pattern is ambiguous in recent
# pandas (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the CSV at the given relative path into a DataFrame.
data = pd.read_csv('../input/course-study/appendix.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Clean the raw course table and split it into train/test sets.

    Steps performed, in order:
      1. Drop the 'Course Number', 'Course Title' and 'Instructors' columns.
      2. Convert '% Played Video' to float, treating the '---' placeholder
         as missing and filling missing entries with the column mean.
      3. Replace 'Launch Date' with 'Launch Year', 'Launch Month' and
         'Launch Day' columns.
      4. Separate the 'Honor Code Certificates' target from the features.
      5. Make a shuffled 70/30 train/test split with a fixed random seed.

    Args:
        df: DataFrame containing the columns named above. It is not
            modified; all work is done on a copy.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    df = df.copy()

    # Drop unused columns
    df = df.drop(['Course Number', 'Course Title', 'Instructors'], axis=1)

    # Fill missing values. `np.nan` and the builtin `float` are used because
    # the `np.NaN` and `np.float` aliases have been removed from NumPy.
    played = df['% Played Video'].replace('---', np.nan).astype(float)
    # NOTE: the mean is taken over all rows, i.e. before the train/test split.
    df['% Played Video'] = played.fillna(played.mean())

    # Extract date features with the vectorised `.dt` accessor
    launch_date = pd.to_datetime(df['Launch Date'])
    df['Launch Year'] = launch_date.dt.year
    df['Launch Month'] = launch_date.dt.month
    df['Launch Day'] = launch_date.dt.day
    df = df.drop('Launch Date', axis=1)

    # Split df into X and y
    y = df['Honor Code Certificates']
    X = df.drop('Honor Code Certificates', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    return X_train, X_test, y_train, y_test

In [None]:
# Run preprocessing and unpack the train/test feature and target splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the training features as the cell output.
X_train

In [None]:
# Count how often each target value occurs in the training set.
y_train.value_counts()

# Building Pipeline

In [None]:
# One-hot encode the categorical columns; a two-category column is reduced
# to a single indicator column by drop='if_binary'.
onehot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='if_binary'))
])

# Encode 'Institution' and 'Course Subject' and pass every other column
# through unchanged. sparse_threshold=0 makes the ColumnTransformer always
# return a dense array, which the StandardScaler step requires (it cannot
# centre sparse input). Dense output is requested here rather than on the
# encoder because the encoder keyword was renamed from `sparse` to
# `sparse_output` between scikit-learn releases, while `sparse_threshold`
# is accepted by both old and new versions.
preprocessor = ColumnTransformer(transformers=[
    ('onehot', onehot_transformer, ['Institution', 'Course Subject'])
], remainder='passthrough', sparse_threshold=0)

# Full pipeline: column preprocessing -> standardisation -> random forest.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        random_state=1,
        # Both classes are weighted equally; adjust here to rebalance.
        class_weight={
            0: 1.0,
            1: 1.0
        }
    ))
])

# Training

In [None]:
# Fit the whole pipeline (preprocessor, scaler, classifier) on the training data.
model.fit(X_train, y_train)

# Results

In [None]:
# Evaluate the fitted pipeline on the held-out test set.
y_pred = model.predict(X_test)
acc = model.score(X_test, y_test)

# Pin the label order so row/column 0 is class 0 and row/column 1 is class 1.
class_ids = [0, 1]
class_names = ["Negative", "Positive"]
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
clr = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_names,
)

print("Accuracy: {:.2f}%".format(acc * 100))

# Heatmap cells are one unit wide, so the cell centres sit at 0.5 and 1.5.
tick_positions = [0.5, 1.5]
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
plt.xticks(ticks=tick_positions, labels=class_names)
plt.yticks(ticks=tick_positions, labels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

print("Classification Report:\n---------------------\n", clr)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/PLRZW6Az4hw