In [13]:
"""
Chance of a student finishing the MOOC
Training data: Climate
Testing data: China
SKLearn Preprocessing Normalize: Avg_Dt
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
import os
import pydotplus 
from sklearn import tree

# Input TSV locations, given relative to the notebook's working directory
# (get_data joins them onto os.getcwd()).
# Climate101x course: person-course table and person-course-day table.
PERSON_COURSE_CLEANED = "../UBCx__Climate101x__3T2015_cleaned/person_course_cleaned.tsv"
PERSON_COURSE_DAY_CLEANED = "../UBCx__Climate101x__3T2015_cleaned/person_course_day_cleaned.tsv"
# China300_1x course: the same two tables.
PERSON_COURSE_CLEANED_CHINA = "../UBCx__China300_1x__3T2015_cleaned/person_course_cleaned.tsv"
PERSON_COURSE_DAY_CLEANED_CHINA = "../UBCx__China300_1x__3T2015_cleaned/person_course_day_cleaned.tsv"

In [21]:
def get_data(data_path1, data_path2, filename):
    """
    Build a standardized numeric feature table with a completion label and
    write it to a tab-separated file.

    The two tab-separated inputs are inner-joined on ``user_id``. The
    ``grade`` column becomes a binary label (1 when a grade is present, 0 when
    it is missing). Columns that contain no numeric values at all are dropped,
    remaining missing values are replaced with 0, and every numeric feature is
    standardized to zero mean and unit (population) standard deviation. The
    label is appended unscaled as the last column.

    Parameters
    ----------
    data_path1 : str
        Path to the person-course TSV, resolved against the current working
        directory. Must contain ``user_id`` and ``grade`` columns.
    data_path2 : str
        Path to the person-course-day TSV, resolved against the current
        working directory. Must contain a ``user_id`` column.
    filename : str
        Destination path of the resulting TSV (written with the row index).

    Returns
    -------
    None
        The result is only written to ``filename``.
    """
    topdir = os.getcwd()
    person_course_cleaned_path = os.path.join(topdir, data_path1)
    person_course_day_cleaned_path = os.path.join(topdir, data_path2)

    person_course_cleaned_df = pd.read_csv(person_course_cleaned_path, sep="\t")
    person_course_day_cleaned_df = pd.read_csv(person_course_day_cleaned_path, sep="\t")

    merged = pd.merge(
        person_course_cleaned_df,
        person_course_day_cleaned_df,
        on='user_id',
        how='inner',
    )

    # Completion label: 1 if a grade was recorded (non-NA), 0 otherwise.
    merged['grade'] = merged['grade'].notnull().astype(int)

    # Coerce everything to numbers; columns that are entirely non-numeric
    # become all-NaN and are dropped here.
    numeric = merged.apply(pd.to_numeric, errors='coerce')
    numeric = numeric.dropna(axis=1, how='all')

    # Keep every row: missing numeric values are replaced with 0 instead of
    # discarding the rows that contain them.
    numeric = numeric.fillna(0)

    # The label must not be standardized, and user_id is an identifier rather
    # than a feature, so both are removed from the feature matrix.
    grades = numeric['grade']
    features = numeric.drop(['grade', 'user_id'], axis=1)

    # Standardize to zero mean / unit population std (ddof=0, matching np.std).
    # A constant column has std 0; dividing by 1 instead maps it to all zeros
    # rather than filling the output with NaN from 0/0.
    column_std = features.std(ddof=0).replace(0, 1)
    result_norm = (features - features.mean()) / column_std

    final_result = pd.concat([result_norm, grades], axis=1)
    final_result.to_csv(filename, sep="\t")
    

# Write the processed table for the Climate course inputs.
get_data(
    data_path1=PERSON_COURSE_CLEANED,
    data_path2=PERSON_COURSE_DAY_CLEANED,
    filename='final-result.tsv',
)
# Write the processed table for the China course inputs.
get_data(
    data_path1=PERSON_COURSE_CLEANED_CHINA,
    data_path2=PERSON_COURSE_DAY_CLEANED_CHINA,
    filename='final-result-china.tsv',
)

In [9]:
# Inputs for this exploratory run: the Climate course tables.
data_path1 = PERSON_COURSE_CLEANED
data_path2 = PERSON_COURSE_DAY_CLEANED
filename = 'final-result.tsv'

# Resolve both relative paths against the current working directory.
topdir = os.getcwd()
person_course_cleaned_path = os.path.join(topdir, data_path1)
person_course_day_cleaned_path = os.path.join(topdir, data_path2)

# Load the two tab-separated tables.
person_course_cleaned_df = pd.read_table(person_course_cleaned_path)
person_course_day_cleaned_df = pd.read_table(person_course_day_cleaned_path)

# Keep only the users that appear in both tables.
result_1 = person_course_cleaned_df.merge(
    person_course_day_cleaned_df,
    how='inner',
    on='user_id',
)
def get_grade_label(grade):
    """
    Turn a grade into a completion label.

    Returns 1 when ``grade`` holds a value (is not NA) and 0 when it is
    missing.
    """
    return 0 if pd.isnull(grade) else 1

# Replace the raw grade with a binary completion label (1 = grade present).
result_1['grade'] = result_1['grade'].map(get_grade_label)

# Coerce every column to numbers; values that cannot be parsed become NaN.
result = result_1.apply(pd.to_numeric, errors='coerce')

# Columns that are entirely NaN after coercion are treated as categorical:
# remember their names, then drop them from the numeric frame.
categorical_columns = result.columns[pd.isnull(result).all()].tolist()
result = result.dropna(axis=1, how='all')

# Replace the remaining missing numeric values with 0 rather than dropping
# rows (keeping only complete rows would leave just a handful of datapoints).
result = result.fillna(0)

# The grade is the label, so it is stored separately and not normalized;
# user_id is dropped from the features as well.
grades = result['grade']
result = result.drop(['grade', 'user_id'], axis=1)

# Standardize each feature to mean 0 and (population) standard deviation 1.
# Constant columns have std 0; dividing by 1 instead turns them into zeros
# rather than NaN from 0/0.
result_norm = (result - result.mean()) / result.std(ddof=0).replace(0, 1)

# Convert the categorical features into binary (one-hot) features.
cat_data = result_1[categorical_columns]

# No explicit string conversion or NA filling is needed: get_dummies encodes
# the object columns directly and, with its default dummy_na=False, leaves a
# missing value as all zeros across that feature's indicator columns.
# 'continent' is excluded from the encoding (the captured dtypes output shows
# it as float64 - presumably empty for this dataset, TODO confirm);
# errors='ignore' keeps this cell working for inputs where the column is absent.
cat_data = pd.get_dummies(cat_data.drop(['continent'], axis=1, errors='ignore'))

# Row-aligned join of the one-hot features with the standardized numeric ones.
final_frame = cat_data.join(result_norm)

Unnamed: 0,continent,education_associate_degree,education_bachelors,education_elementary,education_high_school,education_jr_high_school,education_masters,education_professional_degree,countryLabel_Albania,countryLabel_Algeria,...,last_event_y_2016-05-28T03:38:36Z,last_event_y_2016-05-28T11:05:48Z,last_event_y_2016-05-28T16:50:00Z,last_event_y_2016-05-29T04:43:19Z,last_event_y_2016-05-29T17:10:56Z,last_event_y_2016-05-29T20:40:14Z,last_event_y_2016-05-30T02:34:16Z,last_event_y_2016-05-30T04:54:46Z,last_event_y_2016-05-30T11:05:36Z,last_event_y_2016-05-30T22:23:57Z
0,,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


education        object
countryLabel     object
continent       float64
city             object
gender           object
start_time       object
last_event_x     object
date             object
last_event_y     object
dtype: object