In [49]:
"""
chance of student finishing the MOOC
Training data: Climate
Testing data: China
SKLearn Preprocessing Normalize: Avg_Dt
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pydotplus 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics


PERSON_COURSE_CLEANED = "../UBCx__Climate101x__3T2015_cleaned/person_course_cleaned.tsv"
PERSON_COURSE_DAY_CLEANED = "../UBCx__Climate101x__3T2015_cleaned/person_course_day_cleaned.tsv"
PERSON_COURSE_CLEANED_CHINA = "../UBCx__China300_1x__3T2015_cleaned/person_course_cleaned.tsv"
PERSON_COURSE_DAY_CLEANED_CHINA = "../UBCx__China300_1x__3T2015_cleaned/person_course_day_cleaned.tsv"

In [61]:
def get_data(data_path1, data_path2, filename):
    topdir = os.getcwd()
    person_course_cleaned_path = os.path.join(topdir, data_path1)
    person_course_day_cleaned_path = os.path.join(topdir, data_path2)

    person_course_cleaned_df = pd.read_table(person_course_cleaned_path)
    person_course_day_cleaned_df = pd.read_table(person_course_day_cleaned_path)

    result_1 = pd.merge(person_course_cleaned_df, person_course_day_cleaned_df, on='user_id', how = 'inner')
    def get_grade_label(grade):
        '''
        Return 1 if course was completed (non-NA value), 0 otherwise.
        '''
        if (pd.notnull(grade)):
            return 1
        else:
            return 0

    result_1['grade'] = result_1['grade'].map(get_grade_label)

    '''
    Dropping all the non-numerical values (for now)
    '''
    result = result_1.apply(pd.to_numeric, errors='coerce')
    '''
    Save all the categorical columns
    '''
    categorical_columns = result.columns[pd.isnull(result).all()].tolist()
    result = result.dropna(axis=1, how='all')

    '''
    For the numerical values, replace all the NA's with 0's
    (because if we only keep the rows that don't have NAs we only get 5 datapoints :'()
    '''
    result = result.fillna(0)
    # result.to_csv('left-join.tsv', sep="\t")

    '''
    Since we don't want to normalize the grades, and want to use it as the labels instead, we store them separately
    '''
    grades = result['grade']
    result = result.drop(['grade', 'user_id'], axis=1)

    '''
    Normalize the remaining numerical data: set it to have mean of 0 and standard deviation of 1.
    '''
    result_norm = result.apply(lambda x: (x - np.mean(x)) / np.std(x))
    # result_norm.to_csv('result-norm.tsv', sep="\t")

    '''
    Now convert the categorical features into binary features
    '''
    cat_data = result_1[categorical_columns]
    cat_data.applymap(str)
    cat_data.fillna("")


    cat_data = pd.get_dummies(cat_data.drop(['continent','start_time','last_event_x','last_event_y','date'], axis=1))
    final_frame = cat_data.join(result)
    return [final_frame, grades]
    
    

set0 = get_data(PERSON_COURSE_CLEANED, PERSON_COURSE_DAY_CLEANED, 'final-result.tsv')
set1 = get_data(PERSON_COURSE_CLEANED_CHINA, PERSON_COURSE_DAY_CLEANED_CHINA, 'final-result-china.tsv')

In [58]:
data_path1, data_path2, filename = PERSON_COURSE_CLEANED, PERSON_COURSE_DAY_CLEANED, 'final-result.tsv'
topdir = os.getcwd()
person_course_cleaned_path = os.path.join(topdir, data_path1)
person_course_day_cleaned_path = os.path.join(topdir, data_path2)

person_course_cleaned_df = pd.read_table(person_course_cleaned_path)
person_course_day_cleaned_df = pd.read_table(person_course_day_cleaned_path)

result_1 = pd.merge(person_course_cleaned_df, person_course_day_cleaned_df, on='user_id', how = 'inner')
def get_grade_label(grade):
    '''
    Return 1 if course was completed (non-NA value), 0 otherwise.
    '''
    if (pd.notnull(grade)):
        return 1
    else:
        return 0

result_1['grade'] = result_1['grade'].map(get_grade_label)

'''
Dropping all the non-numerical values (for now)
'''
result = result_1.apply(pd.to_numeric, errors='coerce')
'''
Save all the categorical columns
'''
categorical_columns = result.columns[pd.isnull(result).all()].tolist()
result = result.dropna(axis=1, how='all')

'''
For the numerical values, replace all the NA's with 0's
(because if we only keep the rows that don't have NAs we only get 5 datapoints :'()
'''
result = result.fillna(0)
# result.to_csv('left-join.tsv', sep="\t")

'''
Since we don't want to normalize the grades, and want to use it as the labels instead, we store them separately
'''
grades = result['grade']
result = result.drop(['grade', 'user_id'], axis=1)

'''
Normalize the remaining numerical data: set it to have mean of 0 and standard deviation of 1.
'''
result_norm = result.apply(lambda x: (x - np.mean(x)) / np.std(x))
# result_norm.to_csv('result-norm.tsv', sep="\t")

'''
Now convert the categorical features into binary features
'''
cat_data = result_1[categorical_columns]
cat_data.applymap(str)
cat_data.fillna("")


cat_data = pd.get_dummies(cat_data.drop(['continent','start_time','last_event_x','last_event_y','date'], axis=1))
final_frame = cat_data.join(result)

In [62]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(set0[0],set0[1])
dot_data = tree.export_graphviz(clf, out_file=None,feature_names=list(set0[0]),  
                         class_names=['Pass','Fail'],  
                         filled=True, rounded=True) 
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("gradetree.pdf")

True

In [67]:
def get_scores(clf, final_frame, grades):
    scores = []
    fps = []
    fns = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(final_frame):
        X_train, X_test = final_frame.ix[train_index, :], final_frame.ix[test_index,:]
        y_train, y_test = grades.ix[train_index], grades.ix[test_index]
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        scores.append(clf.score(X_test,y_test))
        num_ones = np.mean((pred == 1) & (y_test != 1))
        num_zeros = np.mean((pred == 0) & (y_test != 0))
        fps.append(num_ones)
        fns.append(num_zeros)
    print("Overall accuracy: " + str(np.mean(scores)))
    print("False positive rate: " + str(np.mean(fps)))
    #print(np.mean(fns))

print("For the climate course: ")
get_scores(clf, set0[0], set0[1])
print("For the China course: ")
get_scores(clf, set1[0], set1[1])

For the climate course: 
Overall accuracy: 0.839343459089
False positive rate: 0.104752572268
For the China course: 
Overall accuracy: 0.846011987091
False positive rate: 0.130843706777


In [37]:
clf = RandomForestClassifier(max_depth = 5, n_estimators = 10)
clf = clf.fit(final_frame, grades)
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]


# Plot the feature importances of the forest
figure, axes = plt.subplots(nrows = 1, ncols = 1)
plt.title("Feature importances")
axes = plt.bar(range(len(final_frame.columns)), importances[indices],
       color="r", yerr=std[indices], align="center")
axes.set_xticklabels(final_frame.columns)
plt.xlim([-1, len(final_frame.columns)])
plt.show()

AttributeError: 'BarContainer' object has no attribute 'set_xticklabels'