# Data

## Download MOOC Data
- download data from server: http://moocdata.cn/data/user-activity
- Note: This may take a while!!
- Bash script can be found here [data_download.sh](data_download.sh)

In [3]:
%%bash

echo "Downloading data from server"
file="data/prediction_data.tar.gz"
if [[ -f "$file" ]]; then
    echo "$file exists."
else
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/prediction_data.tar.gz
    echo "Extracting files from $file..."
    tar -C data -xzvf data/prediction_data.tar.gz
    echo "Done extracting files."
fi

file="data/user_info.csv"
if [[ -f "$file" ]]; then
    echo "$file exists."
else
    echo "Downloading $file..."
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/user_info.csv
fi

file="data/course_info.csv"
if [[ -f "$file" ]]; then
    echo "$file exists"
else
    echo "Downloading $file..."
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/course_info.csv
fi

echo "All done..."

Downloading data from server
data/prediction_data.tar.gz exists.
data/user_info.csv exists.
data/course_info.csv exists
All done...


## Feature Extraction
- preprocessing step: https://github.com/wzfhaha/dropout_prediction/blob/master/feat_extract.py

In [2]:
import os
import pandas as pd

In [3]:
pd.__version__

'1.1.1'

In [4]:
data_path = 'data'

# load training log
train = pd.read_csv(os.path.join(data_path, 'prediction_log/train_log.csv'))

In [5]:
# let's look at the first 5 records
train.head()
# description of feature can be found here: http://moocdata.cn/data/user-activity

Unnamed: 0,enroll_id,username,course_id,session_id,action,object,time
0,772,5981,course-v1:TsinghuaX+70800232X+2015_T2,d8a9b787fa69063c34c73b9c29190b1c,click_about,,2015-09-27T15:42:59
1,772,5981,course-v1:TsinghuaX+70800232X+2015_T2,d8a9b787fa69063c34c73b9c29190b1c,click_info,,2015-09-27T15:43:12
2,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,pause_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:37:42
3,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,load_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:33:27
4,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,play_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:33:30


In [6]:
# let's look at the last 5 records
train.tail()

Unnamed: 0,enroll_id,username,course_id,session_id,action,object,time
29165535,466786,2659552,course-v1:TsinghuaX+AP000001X+2016_T1,dff9476c7f05f26a69a06506fe471668,click_courseware,,2016-04-01T21:04:38
29165536,466786,2659552,course-v1:TsinghuaX+AP000001X+2016_T1,dff9476c7f05f26a69a06506fe471668,problem_get,556186425e7448e189918bafcb2ff30b,2016-04-01T21:04:38
29165537,466786,2659552,course-v1:TsinghuaX+AP000001X+2016_T1,dff9476c7f05f26a69a06506fe471668,problem_get,1757e4fd85994ba9a2350780f96da89e,2016-04-01T21:04:38
29165538,466786,2659552,course-v1:TsinghuaX+AP000001X+2016_T1,dff9476c7f05f26a69a06506fe471668,problem_get,686d5efcd27845379b4b90e41ece181d,2016-04-01T21:04:38
29165539,466786,2659552,course-v1:TsinghuaX+AP000001X+2016_T1,d94bf93252b8936bef09f6e46c24d50d,click_about,,2016-03-02T20:21:55


In [8]:
# read the ground truch for training data
train_truth = pd.read_csv(os.path.join(data_path, 'prediction_log/train_truth.csv'), index_col='enroll_id')

In [9]:
# 1 -> drop-out; and 0 -> not-drop-out
train_truth.head()

Unnamed: 0_level_0,truth
enroll_id,Unnamed: 1_level_1
772,1
773,1
774,1
776,0
777,1


In [10]:
train_truth.tail()

Unnamed: 0_level_0,truth
enroll_id,Unnamed: 1_level_1
466774,1
466776,1
466781,1
466782,1
466786,0


In [12]:
# user action categories
video_actions = ['seek_video','play_video','pause_video','stop_video','load_video']
problem_actions = ['problem_get','problem_check','problem_save','reset_problem','problem_check_correct', 'problem_check_incorrect']
forum_actions = ['create_thread','create_comment','delete_thread','delete_comment']
click_actions = ['click_info','click_courseware','click_about','click_forum','click_progress']
close_actions = ['close_courseware']

In [14]:
# load test logs
test = pd.read_csv(os.path.join(data_path, 'prediction_log/test_log.csv'))
test_truth = pd.read_csv(os.path.join(data_path, 'prediction_log/test_truth.csv'), index_col='enroll_id')

# cobmine train and test truth
all_truth = pd.concat([train_truth, test_truth])

# combine train and test logs
all_log = pd.concat([train, test])

In [23]:
all_log.head()

Unnamed: 0,enroll_id,username,course_id,session_id,action,object,time
0,772,5981,course-v1:TsinghuaX+70800232X+2015_T2,d8a9b787fa69063c34c73b9c29190b1c,click_about,,2015-09-27T15:42:59
1,772,5981,course-v1:TsinghuaX+70800232X+2015_T2,d8a9b787fa69063c34c73b9c29190b1c,click_info,,2015-09-27T15:43:12
2,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,pause_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:37:42
3,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,load_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:33:27
4,773,1544995,course-v1:TsinghuaX+70800232X+2015_T2,2f02b86eb3ea2cbf0be11385a8dc62e5,play_video,3dac5590435e43b3a65a9ae7426c16db,2015-10-19T19:33:30


In [24]:
all_log.tail()

Unnamed: 0,enroll_id,username,course_id,session_id,action,object,time
12944857,466785,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,b12bc83e13cd943cf61bf78f09c72158,problem_check_incorrect,31aa97345c46473badab334379f995d8,2016-03-19T21:37:15
12944858,466785,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,b12bc83e13cd943cf61bf78f09c72158,problem_check_incorrect,31aa97345c46473badab334379f995d8,2016-03-19T21:37:22
12944859,466785,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,b12bc83e13cd943cf61bf78f09c72158,problem_check_correct,31aa97345c46473badab334379f995d8,2016-03-19T21:37:37
12944860,466785,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,7eca0904ae14dc8af809c0362632dd8e,click_courseware,,2016-03-19T19:24:44
12944861,466785,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,b12bc83e13cd943cf61bf78f09c72158,problem_get,a028651adbdc4bc0b53191117f874fbc,2016-03-19T21:38:34


In [15]:
train_enroll = list(set(list(train['enroll_id'])))
test_enroll = list(set(list(test['enroll_id'])))

In [16]:
# let's check train and test list
print(len(train_enroll))
print(len(test_enroll))

157943
67699


In [19]:
# count all the actions for each user
user_action_count = all_log.groupby('enroll_id').count()[['action']]

In [20]:
user_action_count

Unnamed: 0_level_0,action
enroll_id,Unnamed: 1_level_1
772,2
773,21
774,74
775,39
776,19
...,...
466781,29
466782,24
466783,2
466785,50


In [21]:
# give columns names
user_action_count.columns = ['action_count']

In [22]:
user_action_count

Unnamed: 0_level_0,action_count
enroll_id,Unnamed: 1_level_1
772,2
773,21
774,74
775,39
776,19
...,...
466781,29
466782,24
466783,2
466785,50


In [25]:
# create online session_enroll df dropping all duplicate session_ids
session_enroll = all_log[['session_id']].drop_duplicates()

In [26]:
session_enroll.head()

Unnamed: 0,session_id
0,d8a9b787fa69063c34c73b9c29190b1c
2,2f02b86eb3ea2cbf0be11385a8dc62e5
23,26449d5b9c0dfd76bdf981410ef33d94
24,fd2ad174b4aa8f6dd3e8900e1ba62735
56,138155092c1a9ce258b0da27a169b14d


In [27]:
session_count = all_log.groupby('enroll_id').count()

In [28]:
session_count.head()

Unnamed: 0_level_0,username,course_id,session_id,action,object,time
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
772,2,2,2,2,0,2
773,21,21,21,21,14,21
774,74,74,74,74,42,74
775,39,39,39,39,30,39
776,19,19,19,19,9,19


In [29]:
user_action_count['session_count'] = session_count['session_id']

In [30]:
user_action_count.head()

Unnamed: 0_level_0,action_count,session_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1
772,2,2
773,21,21
774,74,74
775,39,39
776,19,19


In [31]:
user_action_count.tail()

Unnamed: 0_level_0,action_count,session_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1
466781,29,29
466782,24,24
466783,2,2
466785,50,50
466786,93,93


In [32]:
for action in video_actions + problem_actions + forum_actions + click_actions + close_actions:
    action_label = action+'_count'
    action_ = (all_log['action'] == action).astype(int)
    all_log[action_label] = action_
    action_num = all_log.groupby('enroll_id').sum()[[action_label]]
    user_action_count = pd.merge(user_action_count, action_num, left_index=True, right_index=True)

In [33]:
user_action_count.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,create_thread_count,create_comment_count,delete_thread_count,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
773,21,21,6,4,3,0,1,0,0,0,...,0,0,0,0,2,2,2,0,0,1
774,74,74,9,14,10,1,8,0,0,0,...,0,0,0,0,6,17,0,0,0,9
775,39,39,0,12,6,2,10,0,0,0,...,0,0,0,0,0,0,0,0,0,9
776,19,19,0,3,3,1,2,0,0,0,...,0,0,0,0,2,4,2,0,0,2


In [34]:
user_action_count.tail()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,create_thread_count,create_comment_count,delete_thread_count,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
466781,29,29,0,0,0,0,0,22,0,0,...,0,0,0,0,0,0,0,0,0,0
466782,24,24,0,1,1,0,1,3,0,0,...,0,0,0,0,4,5,4,2,0,3
466783,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
466785,50,50,1,2,4,0,2,14,0,0,...,0,0,0,0,2,4,4,0,0,2
466786,93,93,0,0,2,0,2,25,0,0,...,0,0,0,0,16,16,12,4,8,8


In [35]:
 user_action_count = pd.merge(user_action_count, all_truth, left_index=True, right_index=True)

In [36]:
user_action_count.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,create_comment_count,delete_thread_count,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
773,21,21,6,4,3,0,1,0,0,0,...,0,0,0,2,2,2,0,0,1,1
774,74,74,9,14,10,1,8,0,0,0,...,0,0,0,6,17,0,0,0,9,1
775,39,39,0,12,6,2,10,0,0,0,...,0,0,0,0,0,0,0,0,9,1
776,19,19,0,3,3,1,2,0,0,0,...,0,0,0,2,4,2,0,0,2,0


In [37]:
user_action_count.tail()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,create_comment_count,delete_thread_count,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
466781,29,29,0,0,0,0,0,22,0,0,...,0,0,0,0,0,0,0,0,0,1
466782,24,24,0,1,1,0,1,3,0,0,...,0,0,0,4,5,4,2,0,3,1
466783,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,1
466785,50,50,1,2,4,0,2,14,0,0,...,0,0,0,2,4,4,0,0,2,1
466786,93,93,0,0,2,0,2,25,0,0,...,0,0,0,16,16,12,4,8,8,0


In [38]:
# remove duplicates based on username, course_id and enroll_id
enroll_info = all_log[['username','course_id','enroll_id']].drop_duplicates()

In [39]:
enroll_info.head()

Unnamed: 0,username,course_id,enroll_id
0,5981,course-v1:TsinghuaX+70800232X+2015_T2,772
2,1544995,course-v1:TsinghuaX+70800232X+2015_T2,773
23,1072798,course-v1:TsinghuaX+70800232X+2015_T2,774
97,561867,course-v1:TsinghuaX+70800232X+2015_T2,776
116,1368125,course-v1:TsinghuaX+70800232X+2015_T2,777


In [40]:
enroll_info.tail()

Unnamed: 0,username,course_id,enroll_id
12944714,2670958,course-v1:TsinghuaX+AP000001X+2016_T1,466770
12944762,2582621,course-v1:TsinghuaX+AP000001X+2016_T1,466775
12944781,133215,course-v1:TsinghuaX+AP000001X+2016_T1,466777
12944810,2665176,course-v1:TsinghuaX+AP000001X+2016_T1,466783
12944812,2513464,course-v1:TsinghuaX+AP000001X+2016_T1,466785


In [41]:
enroll_info.index = enroll_info['enroll_id']
del enroll_info['enroll_id']

In [42]:
user_action_count = pd.merge(user_action_count, enroll_info, left_index=True, right_index=True)

In [43]:
user_action_count.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2
773,21,21,6,4,3,0,1,0,0,0,...,0,2,2,2,0,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2
774,74,74,9,14,10,1,8,0,0,0,...,0,6,17,0,0,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2
775,39,39,0,12,6,2,10,0,0,0,...,0,0,0,0,0,0,9,1,1520977,course-v1:TsinghuaX+70800232X+2015_T2
776,19,19,0,3,3,1,2,0,0,0,...,0,2,4,2,0,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2


In [53]:
# save into corresponding train and test data set
user_action_count.loc[test_enroll].to_csv(os.path.join(data_path, 'test_features.csv'))
user_action_count.loc[train_enroll].to_csv(os.path.join(data_path, 'train_features.csv'))

## Feature Preprocessing

In [45]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
import math
from sklearn.preprocessing import StandardScaler

In [46]:
def age_convert(year):
    if year == None or math.isnan(year):
        return 0
    age = 2018 - int(year)
    if age > 70 or age < 10:
        age = 0
    return age

In [47]:
def gender_convert(gender):
    if gender == 'm':
        return 1
    elif gender == 'f':
        return 2
    else:
        return 0

In [48]:
def edu_convert(edu):
    edus = ["Bachelor's","High", "Master's", "Primary", "Middle","Associate","Doctorate"]
    #if x == None or or math.isnan(x):
    #    return 0
    if not isinstance(edu, str):
        return 0
    eduIndex = edus.index(edu)
    return eduIndex+1

In [90]:
def category_convert(category):
    if not isinstance(category, str):
        return 0
    categories = ['math','physics','electrical', 'computer','foreign language', 
                'business', 'economics','biology','medicine','literature','philosophy',
                'history','social science', 'art','engineering','education','environment','chemistry']
                
    catIndex = categories.index(category)
    return catIndex+1

In [54]:
data_path = 'data'

train_features = pd.read_csv(os.path.join(data_path, 'train_features.csv'), index_col=0)
test_features = pd.read_csv(os.path.join(data_path, 'test_features.csv'), index_col=0)

In [55]:
train_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2
773,21,21,6,4,3,0,1,0,0,0,...,0,2,2,2,0,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2
774,74,74,9,14,10,1,8,0,0,0,...,0,6,17,0,0,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2
776,19,19,0,3,3,1,2,0,0,0,...,0,2,4,2,0,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2
777,4,4,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2


In [56]:
train_features.tail()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,delete_comment_count,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
466774,42,42,0,0,2,0,2,6,4,0,...,0,3,4,11,0,4,2,1,2588048,course-v1:TsinghuaX+AP000001X+2016_T1
466776,19,19,3,2,2,1,2,0,0,0,...,0,2,3,2,0,0,2,1,2736225,course-v1:TsinghuaX+AP000001X+2016_T1
466781,29,29,0,0,0,0,0,22,0,0,...,0,0,0,0,0,0,0,1,2830711,course-v1:TsinghuaX+AP000001X+2016_T1
466782,24,24,0,1,1,0,1,3,0,0,...,0,4,5,4,2,0,3,1,2680742,course-v1:TsinghuaX+AP000001X+2016_T1
466786,93,93,0,0,2,0,2,25,0,0,...,0,16,16,12,4,8,8,0,2659552,course-v1:TsinghuaX+AP000001X+2016_T1


In [57]:
all_features = pd.concat([train_features, test_features])

In [59]:
all_features.shape

(225642, 26)

In [60]:
user_profile_df = pd.read_csv(os.path.join(data_path, 'user_info.csv'), index_col='user_id')

  mask |= (ar1 == a)


In [61]:
user_profile_df.head()

Unnamed: 0_level_0,gender,education,birth
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
631,male,High,1997.0
2631,male,Bachelor's,1990.0
4231,male,Associate,1991.0
6031,male,Bachelor's,1988.0
7831,,,


In [62]:
user_profile_df.tail()

Unnamed: 0_level_0,gender,education,birth
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10435606,,,
10437206,,,
10438806,,,
10440406,,,
10442006,,,


In [65]:
# extract user age
birth_year = user_profile_df['birth'].to_dict()

In [66]:
birth_year

{631: 1997.0,
 2631: 1990.0,
 4231: 1991.0,
 6031: 1988.0,
 7831: nan,
 9631: 1992.0,
 11231: 1985.0,
 13631: nan,
 16031: 1986.0,
 18231: 1987.0,
 22431: 1995.0,
 26231: 1968.0,
 28431: 1991.0,
 31231: 1987.0,
 33231: 1981.0,
 35631: 1990.0,
 37831: 1987.0,
 40031: 1990.0,
 42031: 1995.0,
 44031: 1990.0,
 46431: nan,
 48231: nan,
 50631: 1983.0,
 52431: 1963.0,
 55631: 1998.0,
 59231: 1989.0,
 61431: 1989.0,
 64031: nan,
 66231: nan,
 68431: 1986.0,
 70231: 1983.0,
 71831: 1990.0,
 74031: 1995.0,
 75631: 1993.0,
 77631: 1975.0,
 79431: 1993.0,
 81031: 1982.0,
 83031: nan,
 84831: 1989.0,
 86631: 1978.0,
 88631: 1975.0,
 90231: 1981.0,
 92431: 1984.0,
 94031: nan,
 95631: nan,
 97631: nan,
 100831: 1992.0,
 102631: 1986.0,
 104831: 1987.0,
 107031: nan,
 109231: 1995.0,
 111431: 1989.0,
 113031: 1996.0,
 115231: 1994.0,
 116831: 1988.0,
 118431: 1990.0,
 120031: 1986.0,
 121831: 1991.0,
 123631: 1989.0,
 125631: 1989.0,
 127631: 1990.0,
 129831: 1986.0,
 131431: 1992.0,
 133431: 1993.0

In [67]:
# add user's age as feature
all_features['age'] = [age_convert(birth_year.get(int(u), None)) for u in all_features['username']]

In [68]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,click_info_count,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id,age
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29
773,21,21,6,4,3,0,1,0,0,0,...,2,2,2,0,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2,0
774,74,74,9,14,10,1,8,0,0,0,...,6,17,0,0,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2,0
776,19,19,0,3,3,1,2,0,0,0,...,2,4,2,0,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2,37
777,4,4,0,0,0,0,0,0,0,0,...,1,1,1,0,0,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2,0


In [70]:
# extract user gender
user_gender = user_profile_df['gender'].to_dict()
# add gender feature to all_features
all_features['gender'] = [gender_convert(user_gender.get(int(u), None)) for u in all_features['username']]

In [71]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id,age,gender
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29,0
773,21,21,6,4,3,0,1,0,0,0,...,2,2,0,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2,0,0
774,74,74,9,14,10,1,8,0,0,0,...,17,0,0,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2,0,0
776,19,19,0,3,3,1,2,0,0,0,...,4,2,0,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2,37,0
777,4,4,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2,0,0


In [72]:
# extract education and add it as a feature
user_edu = user_profile_df['education'].to_dict()
all_features['education'] = [edu_convert(user_edu.get(int(u), None)) for u in all_features['username']]

In [73]:
user_enroll_count = all_features.groupby('username').count()[['course_id']]

In [74]:
user_enroll_count

Unnamed: 0_level_0,course_id
username,Unnamed: 1_level_1
5,12
7,2
26,7
32,3
35,3
...,...
7011178,1
7019483,1
7023562,1
7036067,1


In [75]:
course_enroll_count = all_features.groupby('course_id').count()[['username']]

In [76]:
course_enroll_count.head()

Unnamed: 0_level_0,username
course_id,Unnamed: 1_level_1
CAU/08112500x/2015_T2,394
MITx/15_390x_2015_T1/2015_T1,742
MITx/6_041x/2014_T2,582
NCTU/nctucmpsd/2015_T2,510
TsinghuaX/00680082X/2016_T1,621


In [77]:
user_enroll_count.columns = ['user_enroll_count']
course_enroll_count.columns = ['course_enroll_count']

In [78]:
all_features = pd.merge(all_features, user_enroll_count, left_on='username', right_index=True)

In [79]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,click_forum_count,click_progress_count,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29,0,3,6
73833,3,3,0,0,0,0,0,0,0,0,...,0,1,0,1,5981,course-v1:HIT+13SC20301820+2015_T2,29,0,3,6
198737,4,4,0,0,0,0,0,0,0,0,...,0,2,0,1,5981,course-v1:TsinghuaX+30240184_1X+2016_T1,29,0,3,6
209393,2,2,0,0,0,0,0,0,0,0,...,0,0,0,1,5981,course-v1:NTHU+MOOC_00_005+2015_T2,29,0,3,6
418643,12,12,0,1,2,1,1,0,0,0,...,0,2,1,1,5981,course-v1:TsinghuaX+80511503X+2016_T1,29,0,3,6


In [80]:
all_features = pd.merge(all_features, course_enroll_count, left_on='course_id', right_index=True)

In [83]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,click_progress_count,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count,course_enroll_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29,0,3,6,1333
773,21,21,6,4,3,0,1,0,0,0,...,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,2,1333
774,74,74,9,14,10,1,8,0,0,0,...,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,5,1333
776,19,19,0,3,3,1,2,0,0,0,...,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2,37,0,6,6,1333
777,4,4,0,0,0,0,0,0,0,0,...,0,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,11,1333


In [82]:
all_features.tail()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,click_progress_count,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count,course_enroll_count
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
424714,274,274,65,56,62,5,14,0,0,0,...,0,17,1,6776637,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,2,426
424720,10,10,2,1,2,0,0,0,0,0,...,0,0,1,6776128,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426
424243,371,371,4,71,89,15,29,3,6,0,...,0,36,1,6776321,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426
424555,636,636,190,64,97,21,44,10,13,4,...,4,52,1,6775492,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426
424585,151,151,15,24,38,7,14,1,2,1,...,0,2,0,6776600,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426


In [85]:
#extract course category
course_info_df = pd.read_csv(os.path.join(data_path, 'course_info.csv'), index_col='id')
category_dict = course_info_df['category'].to_dict()

In [87]:
category_dict

{'6561': nan,
 '5557': nan,
 '9433': nan,
 '8320': nan,
 '231': nan,
 '7645': nan,
 '9953': nan,
 '7625': nan,
 '8657': 'philosophy',
 '8833': nan,
 '3814': nan,
 '9343': nan,
 '8006': nan,
 '2159': nan,
 '10988': nan,
 '9645': nan,
 '11935': nan,
 '577': 'engineering',
 '11105': nan,
 '9248': 'history',
 '10488': nan,
 '5745': nan,
 '11899': nan,
 '2296': nan,
 '5873': nan,
 '680': nan,
 '10171': nan,
 '1623': nan,
 '8957': nan,
 '5763': nan,
 '1682': nan,
 '10450': nan,
 '7171': 'engineering',
 '830s0': nan,
 '10164': nan,
 '8974': nan,
 '9304': nan,
 '10627': nan,
 '5735': 'social science',
 '8375': nan,
 '10459': nan,
 '11612': 'economics',
 '8507': nan,
 '8202': nan,
 '7444': nan,
 '9290': 'literature',
 '9378': nan,
 '8697': nan,
 '11495': nan,
 '7225': nan,
 '7256': nan,
 '9287': nan,
 '3997': nan,
 '5714': nan,
 '614': nan,
 '9083': 'history',
 '7401': nan,
 '10324': nan,
 '1973': nan,
 '2070': 'business',
 '557': 'biology',
 '12210': nan,
 '7691': nan,
 '1815': 'engineering',


In [91]:
# add course_category feature
all_features['course_category'] = [category_convert(category_dict.get(str(x), None)) for x in all_features['course_id']]

In [92]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count,course_enroll_count,course_category
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29,0,3,6,1333,0
773,21,21,6,4,3,0,1,0,0,0,...,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,2,1333,0
774,74,74,9,14,10,1,8,0,0,0,...,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,5,1333,0
776,19,19,0,3,3,1,2,0,0,0,...,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2,37,0,6,6,1333,0
777,4,4,0,0,0,0,0,0,0,0,...,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,11,1333,0


In [97]:
numeric_features = [c for c in train_features.columns if 'count' in c or 'time' in c or 'num' in c]

In [98]:
numeric_features

['action_count',
 'session_count',
 'seek_video_count',
 'play_video_count',
 'pause_video_count',
 'stop_video_count',
 'load_video_count',
 'problem_get_count',
 'problem_check_count',
 'problem_save_count',
 'reset_problem_count',
 'problem_check_correct_count',
 'problem_check_incorrect_count',
 'create_thread_count',
 'create_comment_count',
 'delete_thread_count',
 'delete_comment_count',
 'click_info_count',
 'click_courseware_count',
 'click_about_count',
 'click_forum_count',
 'click_progress_count',
 'close_courseware_count']

In [99]:
# dump actual features as a pickle file
pkl.dump(numeric_features, open(os.path.join(data_path, 'act_features.pkl'),'wb'))

In [100]:
all_feature_names = numeric_features + ['age','course_enroll_num','user_enroll_num']

In [103]:
# perform Standard Scalar Transformation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
scaler = StandardScaler()
transformed_df = scaler.fit_transform(all_features[numeric_features])

In [104]:
transformed_df.shape

(225642, 23)

In [105]:
for i, n_f in enumerate(numeric_features):
    print(i, n_f)
    all_features[n_f] = transformed_df[:,i]  

0 action_count
1 session_count
2 seek_video_count
3 play_video_count
4 pause_video_count
5 stop_video_count
6 load_video_count
7 problem_get_count
8 problem_check_count
9 problem_save_count
10 reset_problem_count
11 problem_check_correct_count
12 problem_check_incorrect_count
13 create_thread_count
14 create_comment_count
15 delete_thread_count
16 delete_comment_count
17 click_info_count
18 click_courseware_count
19 click_about_count
20 click_forum_count
21 click_progress_count
22 close_courseware_count


In [106]:
all_features.head()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count,course_enroll_count,course_category
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,-0.228387,-0.228387,-0.243043,-0.296739,-0.333535,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,...,-0.464309,1,5981,course-v1:TsinghuaX+70800232X+2015_T2,29,0,3,6,1333,0
773,-0.204883,-0.204883,-0.115253,-0.242027,-0.296012,-0.059298,-0.445267,-0.042412,-0.305844,-0.202161,...,-0.428065,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,2,1333,0
774,-0.13932,-0.13932,-0.051358,-0.105248,-0.20846,-0.057709,-0.195358,-0.042412,-0.305844,-0.202161,...,-0.138111,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,5,1333,0
776,-0.207357,-0.207357,-0.243043,-0.255705,-0.296012,-0.057709,-0.409566,-0.042412,-0.305844,-0.202161,...,-0.391821,0,561867,course-v1:TsinghuaX+70800232X+2015_T2,37,0,6,6,1333,0
777,-0.225912,-0.225912,-0.243043,-0.296739,-0.333535,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,...,-0.428065,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2,0,0,0,11,1333,0


In [108]:
all_features.tail()

Unnamed: 0_level_0,action_count,session_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,...,close_courseware_count,truth,username,course_id,age,gender,education,user_enroll_count,course_enroll_count,course_category
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
424714,0.108086,0.108086,1.141348,0.469226,0.441931,-0.051356,0.018849,-0.042412,-0.305844,-0.202161,...,0.151842,1,6776637,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,2,426,0
424720,-0.21849,-0.21849,-0.200446,-0.283061,-0.30852,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,...,-0.464309,1,6776128,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426,0
424243,0.228078,0.228078,-0.15785,0.674395,0.779634,-0.035474,0.554367,-0.03369,0.033785,-0.202161,...,0.840483,1,6776321,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426,0
424555,0.555891,0.555891,3.803637,0.578649,0.879694,-0.025945,1.089886,-0.01334,0.430019,0.554097,...,1.42039,1,6775492,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426,0
424585,-0.044069,-0.044069,0.076432,0.031532,0.141751,-0.04818,0.018849,-0.039505,-0.192634,-0.013096,...,-0.391821,0,6776600,course-v1:TsinghuaX+00670122X+2017_T1,0,0,0,1,426,0


In [107]:
#extract user cluster
#cluster_label = np.load('cluster/label_5_10time.npy', allow_pickle=True)
#user_cluster_id = pkl.load(open('cluster/user_dict','r'))

#all_feat['cluster_label'] = [cluster_label[user_cluster_id[u]] for u in all_feat['username']]

# save training and testing as CSV files
all_features.loc[train_features.index].to_csv(os.path.join(data_path, 'train_normalized_features.csv'))
all_features.loc[test_features.index].to_csv(os.path.join(data_path, 'test_normalized_features.csv'))

In [109]:
# let's clean drop some features just to see...
all_features.drop(columns=['session_count', 'username', 'course_id', 'user_enroll_count', 'course_enroll_count'])

Unnamed: 0_level_0,action_count,seek_video_count,play_video_count,pause_video_count,stop_video_count,load_video_count,problem_get_count,problem_check_count,problem_save_count,reset_problem_count,...,click_courseware_count,click_about_count,click_forum_count,click_progress_count,close_courseware_count,truth,age,gender,education,course_category
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,-0.228387,-0.243043,-0.296739,-0.333535,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.428333,-0.293695,-0.177350,-0.284796,-0.464309,1,29,0,3,0
773,-0.204883,-0.115253,-0.242027,-0.296012,-0.059298,-0.445267,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.394335,-0.159877,-0.177350,-0.284796,-0.428065,1,0,0,0,0
774,-0.139320,-0.051358,-0.105248,-0.208460,-0.057709,-0.195358,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.139346,-0.427513,-0.177350,-0.284796,-0.138111,1,0,0,0,0
776,-0.207357,-0.243043,-0.255705,-0.296012,-0.057709,-0.409566,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.360336,-0.159877,-0.177350,-0.284796,-0.391821,0,37,0,6,0
777,-0.225912,-0.243043,-0.296739,-0.333535,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.411334,-0.293695,-0.177350,-0.284796,-0.428065,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424714,0.108086,1.141348,0.469226,0.441931,-0.051356,0.018849,-0.042412,-0.305844,-0.202161,-0.103969,...,0.234639,0.643029,-0.177350,-0.284796,0.151842,1,0,0,0,0
424720,-0.218490,-0.200446,-0.283061,-0.308520,-0.059298,-0.480968,-0.042412,-0.305844,-0.202161,-0.103969,...,-0.411334,-0.026059,-0.177350,-0.284796,-0.464309,1,0,0,0,0
424243,0.228078,-0.157850,0.674395,0.779634,-0.035474,0.554367,-0.033690,0.033785,-0.202161,-0.103969,...,0.948608,1.847389,-0.177350,-0.284796,0.840483,1,0,0,0,0
424555,0.555891,3.803637,0.578649,0.879694,-0.025945,1.089886,-0.013340,0.430019,0.554097,-0.103969,...,1.509585,0.107758,0.120029,0.624071,1.420390,1,0,0,0,0


In [110]:
all_features.loc[train_features.index].to_csv(os.path.join(data_path, 'train_normalized_trimmed_features.csv'))
all_features.loc[test_features.index].to_csv(os.path.join(data_path, 'test_normalized_trimmed_features.csv'))