# Data

## Download MOOC Data
- download data from server: http://moocdata.cn/data/user-activity
- Note: This may take a while!!
- Bash script can be found here [data_download.sh](data_download.sh)

In [3]:
%%bash

echo "Downloading data from server"
file="data/prediction_data.tar.gz"
if [[ -f "$file" ]]; then
    echo "$file exists."
else
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/prediction_data.tar.gz
    echo "Extracting files from $file..."
    tar -C data -xzvf data/prediction_data.tar.gz
    echo "Done extracting files."
fi

file="data/user_info.csv"
if [[ -f "$file" ]]; then
    echo "$file exists."
else
    echo "Downloading $file..."
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/user_info.csv
fi

file="data/course_info.csv"
if [[ -f "$file" ]]; then
    echo "$file exists"
else
    echo "Downloading $file..."
    curl -s -o $file http://lfs.aminer.cn/misc/moocdata/data/course_info.csv
fi

echo "All done..."

Downloading data from server
data/prediction_data.tar.gz exists.
data/user_info.csv exists.
data/course_info.csv exists
All done...


## Feature Extraction
- preprocessing step: https://github.com/wzfhaha/dropout_prediction/blob/master/feat_extract.py

In [5]:
import os
import pandas as pd

In [8]:
pd.__version__

'1.1.1'

In [7]:
path = 'data'

train = pd.read_csv(os.path.join(path, 'prediction_log/train_log.csv'))
test = pd.read_csv(os.path.join(path, 'prediction_log/test_log.csv'))
train_truth = pd.read_csv(os.path.join(path, 'prediction_log/train_truth.csv'), index_col='enroll_id')
test_truth = pd.read_csv(os.path.join(path, 'prediction_log/test_truth.csv'), index_col='enroll_id')
all_truth = pd.concat([train_truth, test_truth])
all_log = pd.concat([train, test])

train_enroll = list(set(list(train['enroll_id'])))
test_enroll = list(set(list(test['enroll_id'])))

video_action = ['seek_video','play_video','pause_video','stop_video','load_video']
problem_action = ['problem_get','problem_check','problem_save','reset_problem','problem_check_correct', 'problem_check_incorrect']
forum_action = ['create_thread','create_comment','delete_thread','delete_comment']
click_action = ['click_info','click_courseware','click_about','click_forum','click_progress']
close_action = ['close_courseware']

all_num = all_log.groupby('enroll_id').count()[['action']]
all_num.columns = ['all#count']
session_enroll = all_log[['session_id']].drop_duplicates()
session_num = all_log.groupby('enroll_id').count()
all_num['session#count'] = session_num['session_id']
for a in video_action + problem_action + forum_action + click_action + close_action:
    action_ = (all_log['action'] == a).astype(int)
    all_log[a+'#num'] = action_
    action_num = all_log.groupby('enroll_id').sum()[[a+'#num']]
    all_num = pd.merge(all_num, action_num, left_index=True, right_index=True)
all_num = pd.merge(all_num, all_truth, left_index=True, right_index=True)
enroll_info = all_log[['username','course_id','enroll_id']].drop_duplicates()
enroll_info.index = enroll_info['enroll_id']

del enroll_info['enroll_id']

all_num = pd.merge(all_num, enroll_info, left_index=True, right_index=True)
all_num.loc[test_enroll].to_csv(os.path.join(path, 'test_features.csv'))
all_num.loc[train_enroll].to_csv(os.path.join(path, 'train_features.csv'))

## Feature Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
import math
from sklearn.preprocessing import StandardScaler

In [2]:
def age_convert(year):
    if year == None or math.isnan(year):
        return 0
    age = 2018 - int(year)
    if age > 70 or age < 10:
        age = 0
    return age

In [3]:
def gender_convert(gender):
    if gender == 'm':
        return 1
    elif gender == 'f':
        return 2
    else:
        return 0

In [4]:
def edu_convert(edu):
    edus = ["Bachelor's","High", "Master's", "Primary", "Middle","Associate","Doctorate"]
    #if x == None or or math.isnan(x):
    #    return 0
    if not isinstance(edu, str):
        return 0
    eduIndex = edus.index(edu)
    return eduIndex+1

In [6]:
def category_convert(cc, en_categorys):
    if isinstance(cc, str):
        for i, c in zip(range(len(en_categorys)), en_categorys):
            if cc == c:
                return i+1
    else:
        return 0

In [14]:
path = 'data'

train_feat= pd.read_csv(os.path.join(path, 'train_features.csv'), index_col=0)
test_feat= pd.read_csv(os.path.join(path, 'test_features.csv'), index_col=0)
all_feat = pd.concat([train_feat, test_feat])

user_profile = pd.read_csv(os.path.join(path, 'user_info.csv'), index_col='user_id')

# extract user age
birth_year = user_profile['birth'].to_dict()

all_feat['age'] = [age_convert(birth_year.get(int(u), None)) for u in all_feat['username']]

# extract user gender
user_gender = user_profile['gender'].to_dict()

all_feat['gender'] = [gender_convert(user_gender.get(int(u),None)) for u in all_feat['username']]

user_edu = user_profile['education'].to_dict()

all_feat['education'] = [edu_convert(user_edu.get(int(u), None)) for u in all_feat['username']]

user_enroll_num = all_feat.groupby('username').count()[['course_id']]
course_enroll_num = all_feat.groupby('course_id').count()[['username']]

user_enroll_num.columns = ['user_enroll_num']
course_enroll_num.columns = ['course_enroll_num']

all_feat = pd.merge(all_feat, user_enroll_num, left_on='username', right_index=True)
all_feat = pd.merge(all_feat, course_enroll_num, left_on='course_id', right_index=True)


#extract user cluster
cluster_label = np.load('cluster/label_5_10time.npy', allow_pickle=True)
user_cluster_id = pkl.load(open('cluster/user_dict','r'))

all_feat['cluster_label'] = [cluster_label[user_cluster_id[u]] for u in all_feat['username']]


#extract course category
courseinfo = pd.read_csv(os.path.join(path, 'course_info.csv'), index_col='id')
en_categorys = ['math','physics','electrical', 'computer','foreign language', 
                'business', 'economics','biology','medicine','literature','philosophy',
                'history','social science', 'art','engineering','education','environment','chemistry']

category_dict = courseinfo['category'].to_dict()

all_feat['course_category'] = [category_convert(category_dict.get(str(x), None), en_categorys) for x in all_feat['course_id']]

act_feats = [c for c in train_feat.columns if 'count' in c or 'time' in c or 'num' in c]

pkl.dump(act_feats, open(os.path.join(path, 'act_feats.pkl'),'wb'))
num_feats = act_feats + ['age','course_enroll_num','user_enroll_num']
scaler= StandardScaler()
newX = scaler.fit_transform(all_feat[num_feats])

print(newX.shape)

for i, n_f in enumerate(num_feats):
    all_feat[n_f] = newX[:,i]   

all_feat.loc[train_feat.index].to_csv(os.path.join(path, 'train_feat.csv'))
all_feat.loc[test_feat.index].to_csv(os.path.join(path, 'test_feat.csv'))

TypeError: a bytes-like object is required, not 'str'