## Experiments on features generated by Stanford: http://cs229.stanford.edu/proj2015/235_report.pdf


In [1]:
import os
#from google.colab import drive
import pandas as pd
import numpy as np

# Additional Scikit-Learn imports
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn's ML models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

# Additional Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Fast.ai DNN Model
from fastai.tabular import *

# Keras DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2
from keras.utils import to_categorical, normalize
from keras import backend as K

# Fast.ai DNN Model
from fastai.tabular import *

# import utility
from utility import *

Using TensorFlow backend.


In [2]:
# Limit how much of a DataFrame pandas renders in cell output:
# at most 30 columns and 20 rows before the display is truncated.
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

## Import Data
- uses the normalized Stanford feature set (`normalized_all_features.csv`, written by the "Data cleanup" section below)

In [3]:
# Colab only: mount Google Drive before reading data (disabled here).
#drive.mount('/content/gdrive')
# Directory holding the Stanford feature files, relative to the working directory.
path = 'data/stanford'
# Full path of the normalized feature table loaded by the next cell.
db_path = os.path.join(path, 'normalized_all_features.csv')

In [6]:
# Load the normalized Stanford feature table.
# The cell that writes this file uses index=False, so the first CSV column is
# real data (enrollment_id) and must not be consumed as the row index: with
# index_col=0 it would disappear from df.columns and the drop-by-name cell
# that follows would raise KeyError.
df = pd.read_csv(db_path)
# A copy of the file saved *with* an index would surface that index as an
# 'Unnamed: 0' column; discard it so both layouts load to the same frame.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

In [7]:
df.head(10)

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,act_cnt_day_03,act_cnt_day_04,act_cnt_day_05,act_cnt_day_06,...,act_cnt_hour_21,act_cnt_hour_20,act_cnt_hour_23,server_access,server_outlink_percent,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.401837,0.0,0.317621,0.0,3.960922,0.0,0.0,-0.222873,-0.230229,0.765347,-0.200413,-0.19098,-0.186999,...,-0.053737,-0.050718,0.981355,1.689258,0.0,0.0,0.0,0.0,0.0,2.076406,-0.02911,1.685275,2.636167,2.141673,0.0
1,135300,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,-0.222789,-0.116767,-0.19098,-0.186999,...,-0.053737,-0.050718,-0.110213,-0.50093,0.0,0.0,0.0,0.0,0.0,-0.441684,-0.524652,-0.393056,-0.422764,-0.284897,0.0
2,131075,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,-0.222789,-0.200413,-0.19098,-0.186999,...,-0.053737,-0.050718,-0.110213,-0.449995,0.0,0.0,0.0,0.0,0.0,0.061934,-0.524652,-0.393056,-0.422764,-0.284897,0.0
3,4,0,0.866122,0.0,-0.094149,0.0,0.65278,0.0,0.0,-0.222873,-0.230229,-0.222789,1.472503,0.77955,-0.186999,...,-0.053737,-0.050718,1.163282,1.128978,0.0,0.0,0.0,0.0,0.0,1.572788,1.457518,-0.393056,2.126345,-0.284897,0.0
4,5,0,0.429975,0.0,1.964698,0.0,-0.449933,0.0,0.0,-0.109431,-0.230229,10.798728,-0.200413,-0.19098,-0.186999,...,13.175379,8.743085,11.351246,2.198604,0.0,0.0,0.0,0.0,0.0,7.616205,0.466433,6.361519,0.087058,4.568243,0.0
5,7,1,0.467901,0.0,1.376456,0.0,0.65278,0.0,0.0,0.911542,-0.230229,9.278519,-0.200413,-0.19098,-0.186999,...,17.455387,5.302032,12.988597,1.561922,0.0,0.0,0.0,0.0,0.0,1.572788,0.466433,1.165692,0.59688,-0.284897,0.0
6,131080,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,-0.222789,-0.200413,-0.19098,-0.186999,...,-0.053737,-0.050718,-0.110213,-0.449995,0.0,0.0,0.0,0.0,0.0,0.061934,-0.524652,-0.393056,-0.422764,-0.284897,0.0
7,131083,1,0.033478,0.0,0.464681,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,-0.222789,-0.200413,-0.19098,-0.186999,...,-0.053737,-0.050718,3.16449,2.402343,0.0,0.0,0.0,0.0,0.0,1.572788,-0.524652,-0.393056,0.59688,-0.284897,0.0
8,131084,0,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,-0.222789,-0.200413,-0.19098,-0.186999,...,-0.053737,-0.050718,-0.110213,-0.348126,0.0,0.0,0.0,0.0,0.0,0.565552,-0.524652,-0.393056,-0.422764,-0.284897,0.0
9,13,0,-0.006172,0.0,1.582341,0.0,0.65278,0.0,0.0,-0.222873,-0.230229,-0.222789,0.552399,-0.19098,2.808187,...,-0.053737,-0.050718,1.34521,1.892997,0.0,0.0,0.0,0.0,0.0,-0.441684,0.961976,7.400685,0.087058,-0.284897,0.0


In [8]:
# Features that could lead to overfitting the models (row identifiers).
bad_features = ['enrollment_id']
# Rebinding with errors='ignore' keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError, which the
# original in-place drop did.
df = df.drop(columns=bad_features, errors='ignore')

In [9]:
# Separate the target column from the predictor columns.
dep_var = 'label'
y = df[dep_var]
X = df.drop(columns=[dep_var])

## Experiments

In [None]:
# Full (non-quick) cross-validation run over the Stanford features; scores are
# accumulated in the MetricsManager and printed at the end.
mm = MetricsManager()
fold = 5
quick_test = False
# The target column of this frame is 'label' (see dep_var above). The previous
# value 'truth' is the target name used in the kddcup15 feature file and is not
# a column of df, unlike every other experiment cell in this notebook.
# NOTE(review): how train_and_eval consumes y_names lives in utility -- confirm.
y_names = ['label']
train_and_eval(df, X, y, X.columns, y_names, mm, fold, quick_test)
mm.printMeasures()

Running 5-fold cross validation evaluation
Training with 139 features
fold num 1
Training and evaulating model: rf
Training and evaulating model: xgb
Training and evaulating model: ab
Training and evaulating model: dt
Training and evaulating model: knn
Training and evaulating model: svm


In [None]:
mm.printMeasures()

## Data cleanup

In [40]:
# import data
# Colab only: mount Google Drive before reading data (disabled here).
#drive.mount('/content/gdrive')
# Directory holding the Stanford feature-vector CSV files.
path = 'data/stanford'

# Training split of the Stanford feature vectors (label column included).
df1 = pd.read_csv(os.path.join(path, 'trainFeatureVectorWithLabel.csv'))

In [41]:
df1.head()

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,...,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.16129,0,21,0,4,0,0,0,...,0,0,0,0,5,1,4,6,2,0
1,135300,1,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,131075,1,0.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,0,0.25,0,7,0,1,0,0,0,...,0,0,0,0,4,4,0,5,0,0
4,5,0,0.166667,0,77,0,0,0,0,2,...,0,0,0,0,16,2,13,1,4,0


In [42]:
df1.shape

(72395, 141)

In [43]:
# Test split of the Stanford feature vectors, read from the same directory as df1.
df2 = pd.read_csv(os.path.join(path, 'testFeatureVectorWithLabel.csv'))

In [44]:
df2.shape

(24013, 141)

In [45]:
# Validation split of the Stanford feature vectors, read from the same directory as df1.
df3 = pd.read_csv(os.path.join(path, 'validationFeatureVectorWithLabel.csv'))

In [46]:
# Stack the train, test and validation splits row-wise into a single frame.
# ignore_index=True discards the per-file row labels and renumbers rows 0..n-1.
df = pd.concat(objs=[df1, df2, df3], axis=0, ignore_index=True)

In [47]:
df.shape

(120542, 141)

In [48]:
df.head()

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,...,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.16129,0,21,0,4,0,0,0,...,0,0,0,0,5,1,4,6,2,0
1,135300,1,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,131075,1,0.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,0,0.25,0,7,0,1,0,0,0,...,0,0,0,0,4,4,0,5,0,0
4,5,0,0.166667,0,77,0,0,0,0,2,...,0,0,0,0,16,2,13,1,4,0


In [11]:
# Row identifiers are excluded from the training frame.
bad_features = ['enrollment_id']
# Rebinding with errors='ignore' keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError, which the
# original in-place drop did.
# NOTE(review): the join near the end of the notebook merges df on
# 'enrollment_id'; in a top-to-bottom run this drop removes that key first --
# confirm the intended cell order.
df = df.drop(columns=bad_features, errors='ignore')

In [12]:
# Separate the target column from the predictor columns.
dep_var = 'label'
y = df[dep_var]
X = df.drop(columns=[dep_var])

In [13]:
df.head()

Unnamed: 0,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,...,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,0,0.16129,0,21,0,4,0,0,0,0,...,0,0,0,0,5,1,4,6,2,0
1,1,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0.25,0,7,0,1,0,0,0,0,...,0,0,0,0,4,4,0,5,0,0
4,0,0.166667,0,77,0,0,0,0,2,0,...,0,0,0,0,16,2,13,1,4,0


In [14]:
X.columns

Index(['avg_chapter_delays', 'server_discussion_percent', 'act_cnt_weekDay_01',
       'browser_html_percent', 'parallel_enrollments', 'browser_dictation',
       'act_cnt_day_00', 'act_cnt_day_01', 'act_cnt_day_02', 'act_cnt_day_03',
       ...
       'server_course_percent', 'browser_course_info_percent',
       'browser_course', 'browser_vertical_percent', 'sessions_in_week_1',
       'sessions_in_week_0', 'sessions_in_week_3', 'sessions_in_week_2',
       'sessions_in_week_4', 'browser_about'],
      dtype='object', length=139)

In [15]:
# Quick evaluation on the concatenated, not-yet-normalized Stanford features.
# NOTE(review): the recorded run reports 2-fold CV although fold = 5 --
# presumably quick_test overrides it inside train_and_eval; verify in utility.
# NOTE(review): the recorded run ends in a KeyError after the Fast.ai stage
# starts; the missing key is the whole feature-name collection treated as one
# column. Possibly X.columns (a pandas Index) is mishandled downstream -- confirm.
y_names = ['label']
fold, quick_test = 5, True
mm = MetricsManager()
train_and_eval(df, X, y, X.columns, y_names, mm, fold, quick_test)
mm.printMeasures()

Running 2-fold cross validation evaluation
Training with 139 features
fold num 1
Training and evaulating model: rf
Training and evaulating model: xgb
Training and Evaluating Tensoflow...
Training and Evaluating Fast.ai...


KeyError: "None of [Index([('avg_chapter_delays', 'server_discussion_percent', 'act_cnt_weekDay_01', 'browser_html_percent', 'parallel_enrollments', 'browser_dictation', 'act_cnt_day_00', 'act_cnt_day_01', 'act_cnt_day_02', 'act_cnt_day_03', 'act_cnt_day_04', 'act_cnt_day_05', 'act_cnt_day_06', 'act_cnt_day_07', 'act_cnt_day_08', 'act_cnt_day_09', 'server_sequential', 'server_combinedopenended_percent', 'browser_discussion_percent', 'browser_wiki', 'browser_outlink_percent', 'server_peergrading', 'browser_about_percent', 'browser_outlink', 'act_cnt_day_19', 'act_cnt_day_18', 'act_cnt_day_17', 'act_cnt_day_16', 'act_cnt_day_15', 'act_cnt_day_14', 'act_cnt_day_13', 'act_cnt_day_12', 'act_cnt_day_11', 'act_cnt_day_10', 'act_cnt_day_29', 'wiki', 'browser_problem', 'server_peergrading_percent', 'server_video_percent', 'server_outlink', 'act_cnt_weekDay_04', 'act_cnt_weekDay_05', 'act_cnt_weekDay_06', 'server_wiki', 'act_cnt_weekDay_00', 'server_vertical', 'act_cnt_weekDay_02', 'act_cnt_weekDay_03', 'access', 'browser_combinedopenended', 'server_course', 'browser_access', 'browser_course_info', 'server_about_percent', 'browser_static_tab', 'server_chapter_percent', 'browser_video', 'browser_peergrading', 'browser_navigate', 'browser_vertical', 'browser_dictation_percent', 'server_vertical_percent', 'server_about', 'browser_sequential_percent', 'browser_combinedopenended_percent', 'server_course_info_percent', 'server_html', 'browser_html', 'server_problem_percent', 'browser_discussion', 'server_static_tab_percent', 'act_cnt_hour_22', 'browser_chapter_percent', 'class_size', 'server_combinedopenended', 'browser_course_percent', 'server_sequential_percent', 'act_cnt_hour_03', 'act_cnt_hour_02', 'act_cnt_hour_01', 'act_cnt_hour_00', 'act_cnt_hour_07', 'act_cnt_hour_06', 'act_cnt_hour_05', 'act_cnt_hour_04', 'act_cnt_hour_09', 'act_cnt_hour_08', 'browser_static_tab_percent', 'server_video', 'browser_video_percent', 'server_discussion', 'navigate', 'server_static_tab', 
'server_course_info', 'server_problem', 'act_cnt_hour_10', 'act_cnt_hour_11', 'act_cnt_hour_12', 'act_cnt_hour_13', 'act_cnt_hour_14', ...)], dtype='object')] are in the [columns]"

In [16]:
mm.printMeasures()

model     acc        
--------------
rf        87.50±0.00
xgb       87.34±0.00
keras     85.58±0.00


In [61]:
# Start from every column name of df; the following cells remove the
# non-feature entries before normalization.
numeric_features = df.columns.tolist()

In [62]:
numeric_features

['enrollment_id',
 'label',
 'avg_chapter_delays',
 'server_discussion_percent',
 'act_cnt_weekDay_01',
 'browser_html_percent',
 'parallel_enrollments',
 'browser_dictation',
 'act_cnt_day_00',
 'act_cnt_day_01',
 'act_cnt_day_02',
 'act_cnt_day_03',
 'act_cnt_day_04',
 'act_cnt_day_05',
 'act_cnt_day_06',
 'act_cnt_day_07',
 'act_cnt_day_08',
 'act_cnt_day_09',
 'server_sequential',
 'server_combinedopenended_percent',
 'browser_discussion_percent',
 'browser_wiki',
 'browser_outlink_percent',
 'server_peergrading',
 'browser_about_percent',
 'browser_outlink',
 'act_cnt_day_19',
 'act_cnt_day_18',
 'act_cnt_day_17',
 'act_cnt_day_16',
 'act_cnt_day_15',
 'act_cnt_day_14',
 'act_cnt_day_13',
 'act_cnt_day_12',
 'act_cnt_day_11',
 'act_cnt_day_10',
 'act_cnt_day_29',
 'wiki',
 'browser_problem',
 'server_peergrading_percent',
 'server_video_percent',
 'server_outlink',
 'act_cnt_weekDay_04',
 'act_cnt_weekDay_05',
 'act_cnt_weekDay_06',
 'server_wiki',
 'act_cnt_weekDay_00',
 'server_

In [63]:
# Exclude the identifier column from the list of columns to normalize.
# Removing by name instead of by position 0 stays correct when enrollment_id
# has already been dropped from df (position 0 would then be another column)
# and makes the cell safe to re-run.
if 'enrollment_id' in numeric_features:
    numeric_features.remove('enrollment_id')

In [64]:
# Exclude the target column from the list of columns to normalize.
# Removing by name instead of by position 0 cannot delete a real feature when
# the list is ordered differently or when this cell is run more than once.
if 'label' in numeric_features:
    numeric_features.remove('label')

In [65]:
numeric_features

['avg_chapter_delays',
 'server_discussion_percent',
 'act_cnt_weekDay_01',
 'browser_html_percent',
 'parallel_enrollments',
 'browser_dictation',
 'act_cnt_day_00',
 'act_cnt_day_01',
 'act_cnt_day_02',
 'act_cnt_day_03',
 'act_cnt_day_04',
 'act_cnt_day_05',
 'act_cnt_day_06',
 'act_cnt_day_07',
 'act_cnt_day_08',
 'act_cnt_day_09',
 'server_sequential',
 'server_combinedopenended_percent',
 'browser_discussion_percent',
 'browser_wiki',
 'browser_outlink_percent',
 'server_peergrading',
 'browser_about_percent',
 'browser_outlink',
 'act_cnt_day_19',
 'act_cnt_day_18',
 'act_cnt_day_17',
 'act_cnt_day_16',
 'act_cnt_day_15',
 'act_cnt_day_14',
 'act_cnt_day_13',
 'act_cnt_day_12',
 'act_cnt_day_11',
 'act_cnt_day_10',
 'act_cnt_day_29',
 'wiki',
 'browser_problem',
 'server_peergrading_percent',
 'server_video_percent',
 'server_outlink',
 'act_cnt_weekDay_04',
 'act_cnt_weekDay_05',
 'act_cnt_weekDay_06',
 'server_wiki',
 'act_cnt_weekDay_00',
 'server_vertical',
 'act_cnt_weekDay

In [66]:
# Normalize data: standardize every column listed in numeric_features.
# NOTE(review): the scaler is fit on all rows (train + test + validation) before
# any cross-validation split, so held-out folds influence the scaling
# statistics -- confirm this is acceptable for the reported scores.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(df[numeric_features])
# Despite its name, scaled_df is the array returned by the scaler; its columns
# follow the order of numeric_features.
scaled_df = scaler.transform(df[numeric_features])

In [67]:
# Copy each standardized column back into df under its original name,
# printing the position/name pair as it goes.
for i in range(len(numeric_features)):
    n_f = numeric_features[i]
    print(i, n_f)
    df[n_f] = scaled_df[:, i]

0 avg_chapter_delays
1 server_discussion_percent
2 act_cnt_weekDay_01
3 browser_html_percent
4 parallel_enrollments
5 browser_dictation
6 act_cnt_day_00
7 act_cnt_day_01
8 act_cnt_day_02
9 act_cnt_day_03
10 act_cnt_day_04
11 act_cnt_day_05
12 act_cnt_day_06
13 act_cnt_day_07
14 act_cnt_day_08
15 act_cnt_day_09
16 server_sequential
17 server_combinedopenended_percent
18 browser_discussion_percent
19 browser_wiki
20 browser_outlink_percent
21 server_peergrading
22 browser_about_percent
23 browser_outlink
24 act_cnt_day_19
25 act_cnt_day_18
26 act_cnt_day_17
27 act_cnt_day_16
28 act_cnt_day_15
29 act_cnt_day_14
30 act_cnt_day_13
31 act_cnt_day_12
32 act_cnt_day_11
33 act_cnt_day_10
34 act_cnt_day_29
35 wiki
36 browser_problem
37 server_peergrading_percent
38 server_video_percent
39 server_outlink
40 act_cnt_weekDay_04
41 act_cnt_weekDay_05
42 act_cnt_weekDay_06
43 server_wiki
44 act_cnt_weekDay_00
45 server_vertical
46 act_cnt_weekDay_02
47 act_cnt_weekDay_03
48 access
49 browser_combined

In [68]:
df.head()

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,...,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.401837,0.0,0.317621,0.0,3.960922,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,2.076406,-0.02911,1.685275,2.636167,2.141673,0.0
1,135300,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,-0.441684,-0.524652,-0.393056,-0.422764,-0.284897,0.0
2,131075,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,0.061934,-0.524652,-0.393056,-0.422764,-0.284897,0.0
3,4,0,0.866122,0.0,-0.094149,0.0,0.65278,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,1.572788,1.457518,-0.393056,2.126345,-0.284897,0.0
4,5,0,0.429975,0.0,1.964698,0.0,-0.449933,0.0,0.0,-0.109431,...,0.0,0.0,0.0,0.0,7.616205,0.466433,6.361519,0.087058,4.568243,0.0


In [77]:
# write features to a file
# index=False: the pandas row index is not written, so the first CSV column is
# the first column of df. NOTE(review): readers of this file should therefore
# not treat column 0 as an index.
df.to_csv(os.path.join(path, 'normalized_all_features.csv'), index=False)

In [30]:
# Separate the target column from the (now normalized) predictor columns.
dep_var = 'label'
y = df[dep_var]
X = df.drop(columns=[dep_var])

In [31]:
# Quick evaluation on the normalized Stanford features.
# NOTE(review): as recorded, this run ends in a KeyError once the Fast.ai stage
# starts, so only the models finished before it have scores -- the cause is
# inside train_and_eval (utility) and should be confirmed there.
y_names = ['label']
fold, quick_test = 5, True
mm = MetricsManager()
train_and_eval(df, X, y, X.columns, y_names, mm, fold, quick_test)
mm.printMeasures()

Running 2-fold cross validation evaluation
Training with 139 features
fold num 1
Training and evaulating model: rf
Training and evaulating model: xgb
Training and Evaluating Tensoflow...
Training and Evaluating Fast.ai...


KeyError: "None of [Index([('avg_chapter_delays', 'server_discussion_percent', 'act_cnt_weekDay_01', 'browser_html_percent', 'parallel_enrollments', 'browser_dictation', 'act_cnt_day_00', 'act_cnt_day_01', 'act_cnt_day_02', 'act_cnt_day_03', 'act_cnt_day_04', 'act_cnt_day_05', 'act_cnt_day_06', 'act_cnt_day_07', 'act_cnt_day_08', 'act_cnt_day_09', 'server_sequential', 'server_combinedopenended_percent', 'browser_discussion_percent', 'browser_wiki', 'browser_outlink_percent', 'server_peergrading', 'browser_about_percent', 'browser_outlink', 'act_cnt_day_19', 'act_cnt_day_18', 'act_cnt_day_17', 'act_cnt_day_16', 'act_cnt_day_15', 'act_cnt_day_14', 'act_cnt_day_13', 'act_cnt_day_12', 'act_cnt_day_11', 'act_cnt_day_10', 'act_cnt_day_29', 'wiki', 'browser_problem', 'server_peergrading_percent', 'server_video_percent', 'server_outlink', 'act_cnt_weekDay_04', 'act_cnt_weekDay_05', 'act_cnt_weekDay_06', 'server_wiki', 'act_cnt_weekDay_00', 'server_vertical', 'act_cnt_weekDay_02', 'act_cnt_weekDay_03', 'access', 'browser_combinedopenended', 'server_course', 'browser_access', 'browser_course_info', 'server_about_percent', 'browser_static_tab', 'server_chapter_percent', 'browser_video', 'browser_peergrading', 'browser_navigate', 'browser_vertical', 'browser_dictation_percent', 'server_vertical_percent', 'server_about', 'browser_sequential_percent', 'browser_combinedopenended_percent', 'server_course_info_percent', 'server_html', 'browser_html', 'server_problem_percent', 'browser_discussion', 'server_static_tab_percent', 'act_cnt_hour_22', 'browser_chapter_percent', 'class_size', 'server_combinedopenended', 'browser_course_percent', 'server_sequential_percent', 'act_cnt_hour_03', 'act_cnt_hour_02', 'act_cnt_hour_01', 'act_cnt_hour_00', 'act_cnt_hour_07', 'act_cnt_hour_06', 'act_cnt_hour_05', 'act_cnt_hour_04', 'act_cnt_hour_09', 'act_cnt_hour_08', 'browser_static_tab_percent', 'server_video', 'browser_video_percent', 'server_discussion', 'navigate', 'server_static_tab', 
'server_course_info', 'server_problem', 'act_cnt_hour_10', 'act_cnt_hour_11', 'act_cnt_hour_12', 'act_cnt_hour_13', 'act_cnt_hour_14', ...)], dtype='object')] are in the [columns]"

In [32]:
mm.printMeasures()

model     acc        
--------------
rf        87.49±0.00
xgb       87.34±0.00
keras     84.97±0.00


## Inner join the two datasets on enrollment_id

In [71]:
# Colab only: mount Google Drive before reading data (disabled here).
#drive.mount('/content/gdrive')
# NOTE(review): this rebinds the shared `path` and `db_path` variables, which
# earlier cells set to the data/stanford locations; any cell that builds a file
# name from `path` will point at data/kddcup15 if re-run after this one.
path = 'data/kddcup15'
db_path = os.path.join(path, 'kdd_all_normalized_features.csv')
# Second feature table; its displayed columns include enrollment_id and a
# 'truth' target column.
df_ours = pd.read_csv(db_path)

In [72]:
df_ours.head()

Unnamed: 0,enrollment_id,action_count,server_navigate_count,server_access_count,server_problem_count,server_page_close_count,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count,truth
0,1,1.76231,1.289479,1.727285,1.287421,0.0,0.0,-0.150003,-0.173181,0.0,0.685899,2.356712,2.664597,1.538584,0.0,0.0,0
1,2,5.771796,6.093934,5.292106,0.509028,0.0,0.0,8.921766,2.553722,0.0,0.455788,0.235113,6.345421,2.978788,0.0,0.0,0
2,3,1.576487,0.437075,0.660441,0.314429,0.0,0.0,0.577978,-0.173181,0.0,1.284187,4.242578,0.561268,0.166962,0.0,0.0,0
3,4,0.225697,0.514567,1.154832,-0.074767,0.0,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.135325,-0.012367,-0.175944,0.0,0.0,0
4,5,4.042214,1.676935,2.247697,5.957782,0.0,0.0,0.801973,-0.173181,0.0,5.242094,4.343606,3.668458,5.447708,0.0,0.0,0


In [73]:
df_ours.shape

(200904, 17)

In [78]:
df.shape

(120542, 141)

In [79]:
df.head()

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,...,server_course_percent,browser_course_info_percent,browser_course,browser_vertical_percent,sessions_in_week_1,sessions_in_week_0,sessions_in_week_3,sessions_in_week_2,sessions_in_week_4,browser_about
0,1,0,0.401837,0.0,0.317621,0.0,3.960922,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,2.076406,-0.02911,1.685275,2.636167,2.141673,0.0
1,135300,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,-0.441684,-0.524652,-0.393056,-0.422764,-0.284897,0.0
2,131075,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,0.061934,-0.524652,-0.393056,-0.422764,-0.284897,0.0
3,4,0,0.866122,0.0,-0.094149,0.0,0.65278,0.0,0.0,-0.222873,...,0.0,0.0,0.0,0.0,1.572788,1.457518,-0.393056,2.126345,-0.284897,0.0
4,5,0,0.429975,0.0,1.964698,0.0,-0.449933,0.0,0.0,-0.109431,...,0.0,0.0,0.0,0.0,7.616205,0.466433,6.361519,0.087058,4.568243,0.0


In [36]:
df.shape

(120542, 140)

In [80]:
# Inner-join the Stanford features (df) with the kddcup15 features (df_ours)
# on the shared enrollment_id key.
#  - on=        replaces the redundant identical left_on/right_on pair
#  - how=       spells out the default join type named in the section heading
#  - validate=  makes pandas raise MergeError if enrollment_id is duplicated on
#               either side, which would otherwise silently multiply rows
merged_df = pd.merge(
    left=df,
    right=df_ours,
    on='enrollment_id',
    how='inner',
    validate='one_to_one',
)

In [81]:
merged_df.shape

(120542, 157)

In [83]:
merged_df.head(10)

Unnamed: 0,enrollment_id,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,...,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count,truth
0,1,0,0.401837,0.0,0.317621,0.0,3.960922,0.0,0.0,-0.222873,...,-0.150003,-0.173181,0.0,0.685899,2.356712,2.664597,1.538584,0.0,0.0,0
1,135300,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,-0.150003,-0.173181,0.0,-0.280567,-0.303706,-0.490396,-0.450268,0.0,0.0,1
2,131075,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,-0.122004,-0.173181,0.0,-0.280567,-0.303706,-0.442593,-0.381687,0.0,0.0,1
3,4,0,0.866122,0.0,-0.094149,0.0,0.65278,0.0,0.0,-0.222873,...,-0.150003,-0.173181,0.0,-0.280567,-0.135325,-0.012367,-0.175944,0.0,0.0,0
4,5,0,0.429975,0.0,1.964698,0.0,-0.449933,0.0,0.0,-0.109431,...,0.801973,-0.173181,0.0,5.242094,4.343606,3.668458,5.447708,0.0,0.0,0
5,7,1,0.467901,0.0,1.376456,0.0,0.65278,0.0,0.0,0.911542,...,0.773973,-0.173181,0.0,5.334138,2.188331,2.377779,4.281829,0.0,0.0,1
6,131080,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,-0.094005,0.054061,0.0,-0.280567,-0.303706,-0.442593,-0.381687,0.0,0.0,1
7,131083,1,0.033478,0.0,0.464681,0.0,-0.449933,0.0,0.0,-0.222873,...,-0.122004,0.054061,0.0,1.422254,1.009665,1.326115,0.098381,0.0,0.0,1
8,131084,0,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,...,-0.010007,3.008206,0.0,-0.004434,-0.202678,-0.299184,-0.175944,0.0,0.0,0
9,13,0,-0.006172,0.0,1.582341,0.0,0.65278,0.0,0.0,-0.222873,...,-0.038006,-0.173181,0.0,4.597783,3.535378,0.704677,4.281829,0.0,0.0,0


In [85]:
# Remove the join key and 'truth'. In the rows displayed above 'truth' carries
# the same values as 'label', so leaving it in would presumably leak the target
# into the predictors -- NOTE(review): confirm the two columns are identical.
bad_features = ['enrollment_id', 'truth']
# Rebinding with errors='ignore' keeps this cell idempotent: re-running it
# after the columns are already gone no longer raises KeyError, which the
# original in-place drop did.
merged_df = merged_df.drop(columns=bad_features, errors='ignore')

In [87]:
merged_df.head()

Unnamed: 0,label,avg_chapter_delays,server_discussion_percent,act_cnt_weekDay_01,browser_html_percent,parallel_enrollments,browser_dictation,act_cnt_day_00,act_cnt_day_01,act_cnt_day_02,...,server_video_count,server_discussion_count,server_wiki_count,browser_navigate_count,browser_access_count,browser_problem_count,browser_page_close_count,browser_video_count,browser_discussion_count,browser_wiki_count
0,0,0.401837,0.0,0.317621,0.0,3.960922,0.0,0.0,-0.222873,-0.230229,...,0.0,-0.150003,-0.173181,0.0,0.685899,2.356712,2.664597,1.538584,0.0,0.0
1,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,...,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.303706,-0.490396,-0.450268,0.0,0.0
2,1,-0.442319,0.0,-0.300034,0.0,-0.449933,0.0,0.0,-0.222873,-0.230229,...,0.0,-0.122004,-0.173181,0.0,-0.280567,-0.303706,-0.442593,-0.381687,0.0,0.0
3,0,0.866122,0.0,-0.094149,0.0,0.65278,0.0,0.0,-0.222873,-0.230229,...,0.0,-0.150003,-0.173181,0.0,-0.280567,-0.135325,-0.012367,-0.175944,0.0,0.0
4,0,0.429975,0.0,1.964698,0.0,-0.449933,0.0,0.0,-0.109431,-0.230229,...,0.0,0.801973,-0.173181,0.0,5.242094,4.343606,3.668458,5.447708,0.0,0.0


In [86]:
# Persist the merged feature table; index=False leaves the pandas row index
# out of the file. The destination is a literal path, independent of `path`.
merged_df.to_csv('data/merged_normalized_all.csv', index=False)

In [92]:
# Separate the target column from the merged predictor columns.
dep_var = 'label'
y = merged_df[dep_var]
X = merged_df.drop(columns=[dep_var])

In [94]:
# Quick evaluation on the merged (Stanford + kddcup15) feature table.
# NOTE(review): as recorded, this run ends in a KeyError once the Fast.ai stage
# starts, so only the models finished before it have scores -- the cause is
# inside train_and_eval (utility) and should be confirmed there.
y_names = ['label']
fold, quick_test = 5, True
mm = MetricsManager()
train_and_eval(merged_df, X, y, X.columns, y_names, mm, fold, quick_test)
mm.printMeasures()

Running 2-fold cross validation evaluation
Training with 154 features
fold num 1
Training and evaulating model: rf
Training and evaulating model: xgb
Training and Evaluating Tensoflow...
Training and Evaluating Fast.ai...


KeyError: "None of [Index([('avg_chapter_delays', 'server_discussion_percent', 'act_cnt_weekDay_01', 'browser_html_percent', 'parallel_enrollments', 'browser_dictation', 'act_cnt_day_00', 'act_cnt_day_01', 'act_cnt_day_02', 'act_cnt_day_03', 'act_cnt_day_04', 'act_cnt_day_05', 'act_cnt_day_06', 'act_cnt_day_07', 'act_cnt_day_08', 'act_cnt_day_09', 'server_sequential', 'server_combinedopenended_percent', 'browser_discussion_percent', 'browser_wiki', 'browser_outlink_percent', 'server_peergrading', 'browser_about_percent', 'browser_outlink', 'act_cnt_day_19', 'act_cnt_day_18', 'act_cnt_day_17', 'act_cnt_day_16', 'act_cnt_day_15', 'act_cnt_day_14', 'act_cnt_day_13', 'act_cnt_day_12', 'act_cnt_day_11', 'act_cnt_day_10', 'act_cnt_day_29', 'wiki', 'browser_problem', 'server_peergrading_percent', 'server_video_percent', 'server_outlink', 'act_cnt_weekDay_04', 'act_cnt_weekDay_05', 'act_cnt_weekDay_06', 'server_wiki', 'act_cnt_weekDay_00', 'server_vertical', 'act_cnt_weekDay_02', 'act_cnt_weekDay_03', 'access', 'browser_combinedopenended', 'server_course', 'browser_access', 'browser_course_info', 'server_about_percent', 'browser_static_tab', 'server_chapter_percent', 'browser_video', 'browser_peergrading', 'browser_navigate', 'browser_vertical', 'browser_dictation_percent', 'server_vertical_percent', 'server_about', 'browser_sequential_percent', 'browser_combinedopenended_percent', 'server_course_info_percent', 'server_html', 'browser_html', 'server_problem_percent', 'browser_discussion', 'server_static_tab_percent', 'act_cnt_hour_22', 'browser_chapter_percent', 'class_size', 'server_combinedopenended', 'browser_course_percent', 'server_sequential_percent', 'act_cnt_hour_03', 'act_cnt_hour_02', 'act_cnt_hour_01', 'act_cnt_hour_00', 'act_cnt_hour_07', 'act_cnt_hour_06', 'act_cnt_hour_05', 'act_cnt_hour_04', 'act_cnt_hour_09', 'act_cnt_hour_08', 'browser_static_tab_percent', 'server_video', 'browser_video_percent', 'server_discussion', 'navigate', 'server_static_tab', 
'server_course_info', 'server_problem', 'act_cnt_hour_10', 'act_cnt_hour_11', 'act_cnt_hour_12', 'act_cnt_hour_13', 'act_cnt_hour_14', ...)], dtype='object')] are in the [columns]"

In [95]:
mm.printMeasures()

model     acc        
--------------
rf        87.63±0.00
xgb       87.39±0.00
keras     85.09±0.00
