In [5]:
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
import re



In [38]:
data_path_list = [
    'DR0008_activity_accumulator_2016_09.csv',
    'DR0008_activity_accumulator_2016-10.csv',
    'DR0008_activity_accumulator_2016-11.csv',
    'DR0008_activity_accumulator_2016-12.csv'
]

def create_day_seq(days, length):

    tmp_dict = {}
    for day in days:
        try:
            tmp_dict[day] += 1
        except:
            tmp_dict[day] = 1
    res = [0]*(length+1)
    for k,v in tmp_dict.items():
        res[k] = v
    return res

def extract_function_seq(data_path, function, month='9', within_day=False):
    df                   = pd.read_csv(data_path, sep='\t')
    df_temp              = df[df['event_type'] == function][['De-id', 'timestamp']]
    df_temp['timestamp'] = df_temp['timestamp'].apply(pd.to_datetime)
    df_temp['day']       = df_temp['timestamp'].apply(lambda x: x.day)
    df_day_list          = df_temp[['De-id', 'day']].groupby('De-id').agg(create_day_seq, length=df_temp['day'].nunique()).reset_index()
    df_day_list.columns  = ['De-id', month + '_day_list']
    return df_day_list


def ApEn(U, m, r):

    def _maxdist(x_i, x_j):
        return max([abs(ua - va) for ua, va in zip(x_i, x_j)])

    def _phi(m):
        x = [[U[j] for j in range(i, i + m - 1 + 1)] for i in range(N - m + 1)]
        C = [len([1 for x_j in x if _maxdist(x_i, x_j) <= r]) / (N - m + 1.0) for x_i in x]
        return (N - m + 1.0)**(-1) * sum(np.log(C))

    N = len(U)

    return abs(_phi(m+1) - _phi(m))

def get_all_seq(data_path_list, function):
    first_flag = 1
    for data_path in data_path_list:
        df_day_list = extract_function_seq(data_path, function, data_path.split('.')[0][-2:])
        if first_flag:
            df_all = df_day_list.copy()
            first_flag = 0
        else:
            df_all = pd.merge(df_all, df_day_list, on='De-id', how='left')

    df_all['09_day_list'] = df_all['09_day_list'].fillna(0)
    df_all['10_day_list'] = df_all['10_day_list'].fillna(0)
    df_all['11_day_list'] = df_all['11_day_list'].fillna(0)
    df_all['12_day_list'] = df_all['12_day_list'].fillna(0)

    df_all['09_day_list'] = df_all['09_day_list'].apply(lambda x: [0]*31 if x == 0 else x)
    df_all['10_day_list'] = df_all['10_day_list'].apply(lambda x: [0]*32 if x == 0 else x)
    df_all['11_day_list'] = df_all['11_day_list'].apply(lambda x: [0]*31 if x == 0 else x)
    df_all['12_day_list'] = df_all['12_day_list'].apply(lambda x: [0]*32 if x == 0 else x)

    df_all['total_list']  = df_all.apply(lambda row: row['09_day_list'][1:] +  row['10_day_list'][1:]
                                           + row['11_day_list'][1:] +  row['12_day_list'][1:], axis=1)
    return df_all

def get_seq_entropy(df_all, m):
    
    df_all['09_entropy' + '_' + str(m)]    = df_all['09_day_list'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['10_entropy' + '_' + str(m)]    = df_all['10_day_list'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['11_entropy' + '_' + str(m)]    = df_all['11_day_list'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['12_entropy' + '_' + str(m)]    = df_all['12_day_list'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    
    df_all['total_entropy' + '_' + str(m)] = df_all['total_list'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    return df_all



def get_weekday_seq_entropy(df_all, m):

    
    df_all['09_weekday_entropy' + '_' + str(m)]    = df_all['09_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['10_weekday_entropy' + '_' + str(m)]    = df_all['10_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['11_weekday_entropy' + '_' + str(m)]    = df_all['11_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['12_weekday_entropy' + '_' + str(m)]    = df_all['12_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    
    df_all['total_weekday_entropy' + '_' + str(m)] = df_all['total_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    return df_all


def get_weekend_seq_entropy(df_all, m):
    
    df_all['09_weekend_entropy' + '_' + str(m)]    = df_all['09_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['10_weekend_entropy' + '_' + str(m)]    = df_all['10_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['11_weekend_entropy' + '_' + str(m)]    = df_all['11_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['12_weekend_entropy' + '_' + str(m)]    = df_all['12_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    
    df_all['total_weekend_entropy' + '_' + str(m)] = df_all['total_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    return df_all

def add_at_risk_label(df_all):
    at_rsk_label            = pd.read_csv('Std_list_atRist_2016_se1.csv')
    at_rsk_label['at_risk'] = at_rsk_label['CUM_GPA'].apply(lambda x: '1' if x <= 2.0 else '0')
    at_rsk_label.columns    = ['De-id', 'CUM_GPA', 'at_risk']
    df_all                  = pd.merge(df_all, at_rsk_label, on='De-id', how='left')
    df_all['at_risk']       = df_all['at_risk'].fillna('0')
    return df_all


def get_weekday(day_list):
    weekday_list = []
    for i in range(len(day_list)):
        if (i + 4) % 7 > 0 and (i + 4) % 7 <= 5: # 4 indicates 2016.9.1 is Thursday
            weekday_list.append(day_list[i])
    return weekday_list

def get_weekend(day_list):
    weekend_list = []
    for i in range(len(day_list)):
        if (i + 4) % 7 > 0 and (i + 4) % 7 <= 5: # 4 indicates 2016.9.1 is Thursday
            pass
        else:
            weekend_list.append(day_list[i])
    return weekend_list

def int_handle_cnt(internel_handle_list, df_int_handle, name):
    df_temp = df_int_handle[df_int_handle['internal_handle'].isin(internel_handle_list)]
    df_temp = df_temp.groupby(['De-id']).count().reset_index('De-id')
    df_temp.columns = ['De-id', PRE_FIX + name]
    return df_temp

def extract_one_month(df, PRE_FIX):
    df_t = df[(df['event_type']=='PAGE_ACCESS') |
              (df['event_type']=='COURSE_ACCESS') |
              (df['event_type']=='LOGIN_ATTEMPT') |
              (df['event_type']=='SESSION_TIMEOUT') |
              (df['event_type']=='LOGOUT')]
    df_t = df_t[['De-id', 'event_type', 'course_id', 'internal_handle', 'timestamp']]

    df_evt = df_t[['De-id', 'event_type']]
    df_login = df_evt[df_evt['event_type'] == 'LOGIN_ATTEMPT'].groupby(['De-id']).count().reset_index('De-id')
    df_login.columns = ['De-id', PRE_FIX + 'LOGIN_ATTEMPT']

    df_se_out = df_evt[df_evt['event_type'] == 'SESSION_TIMEOUT'].groupby(['De-id']).count().reset_index('De-id')
    df_se_out.columns = ['De-id', PRE_FIX + 'SESSION_TIMEOUT']

    df_logout = df_evt[df_evt['event_type'] == 'LOGOUT'].groupby(['De-id']).count().reset_index('De-id')
    df_logout.columns = ['De-id', PRE_FIX + 'LOGOUT']

    df_all = df_login
    df_all = pd.merge(df_all, df_se_out, on='De-id', how='left')
    df_all = pd.merge(df_all, df_logout, on='De-id', how='left')

    df_int_handle = df_t[['De-id', 'internal_handle']]

    group_list        = ['groups', 'cp_group_create_self_groupmem', 'group_file', 'group_file', 'group_forum', 'groups_sign_up', 'agroup', 'group_blogs','group_task_create', 'group_task_view','cp_group_edit_self_groupmem','group_file_add', 'group_email', 'cp_groups', 'cp_groups_settings','edit_group_blog_entry', 'db_forum_collection_group', 'group_tasks', 'group_journal','group_virtual_classroom', 'add_group_journal_entry','email_all_groups', 'edit_group_journal_entry', 'email_select_groups', 'add_group_blog_entry']
    db_list           = ['discussion_board_entry', 'db_thread_list_entry', 'discussion_board', 'db_thread_list','db_collection', 'db_collection_group', 'db_collection_entry', 'db_thread_list_group']
    myinfo_list       = ['my_inst_personal_info', 'my_inst_personal_settings','my_inst_personal_edit', 'my_inst_myplaces_settings','my_tasks', 'my_task_create', 'my_email_courses','my_task_view', 'my_announcements']
    course_list       = ['course_tools_area', 'course_task_view', 'enroll_course', 'classic_course_catalog']
    journal_list      = ['journal', 'journal_view', 'view_draft_journal_entry',  'add_journal_entry', 'edit_journal_entry']
    email_list        = ['send_email', 'email_all_instructors', 'email_all_students', 'email_select_students','email_all_users',  'email_select_groups','email_all_groups']
    staff_list        = ['staff_information', 'cp_staff_information']
    annoucements_list = ['my_announcements', 'announcements_entry', 'announcements', 'cp_announcements']
    content_list      = ['content', 'cp_content']
    grade_list        = ['check_grade']

    df_group        = int_handle_cnt(group_list, df_int_handle, 'group')
    df_db           = int_handle_cnt(db_list, df_int_handle, 'db')
    df_myinfo       = int_handle_cnt(myinfo_list, df_int_handle, 'myinfo')
    df_course       = int_handle_cnt(course_list, df_int_handle, 'course')
    df_journal      = int_handle_cnt(journal_list, df_int_handle, 'journal')
    df_email        = int_handle_cnt(email_list, df_int_handle, 'email')
    df_staff        = int_handle_cnt(staff_list, df_int_handle, 'staff')
    df_annoucements = int_handle_cnt(annoucements_list, df_int_handle, 'annoucements')
    df_content      = int_handle_cnt(content_list, df_int_handle, 'content')
    df_grade        = int_handle_cnt(grade_list, df_int_handle, 'grade')

    dfs = [df_group, df_db, df_myinfo, df_course, df_journal, df_email, df_staff, df_annoucements, df_content, df_grade]

    for df in dfs:
        df_all = pd.merge(df_all, df, on='De-id', how='left')   

    df_all = df_all.rename(columns={'De-id':'MASKED_STUDENT_ID'})
    return df_all


def get_weekday_entropy(df_all, m, function):
    df_all['09_weekday_entropy' + '_' + str(function)]    = df_all['09_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['10_weekday_entropy' + '_' + str(function)]    = df_all['10_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['11_weekday_entropy' + '_' + str(function)]    = df_all['11_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['12_weekday_entropy' + '_' + str(function)]    = df_all['12_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    # df_all['lib_total_weekday_entropy' + '_' + str(m)] = df_all['lib_total_weekday_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    return df_all

def get_weekend_entropy(df_all, m, function):
    df_all['09_weekend_entropy' + '_' + str(function)]    = df_all['09_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['10_weekend_entropy' + '_' + str(function)]    = df_all['10_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['11_weekend_entropy' + '_' + str(function)]    = df_all['11_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    df_all['12_weekend_entropy' + '_' + str(function)]    = df_all['12_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    # df_all['lib_total_weekend_entropy' + '_' + str(m)] = df_all['lib_total_weekend_seq'].apply(lambda x: ApEn(x, m=m, r=np.mean(x)*0.2))
    return df_all

def get_funtions_seq(df_temp=None, function=None):
    if df_temp is None:
        df_temp = get_all_seq(data_path_list, function)
    
    df_temp['09_weekday_seq']    = df_temp['09_day_list'].apply(get_weekday)
    df_temp['09_weekend_seq']    = df_temp['09_day_list'].apply(get_weekend)
    df_temp['10_weekday_seq']    = df_temp['10_day_list'].apply(get_weekday)
    df_temp['10_weekend_seq']    = df_temp['10_day_list'].apply(get_weekend)
    df_temp['11_weekday_seq']    = df_temp['11_day_list'].apply(get_weekday)
    df_temp['11_weekend_seq']    = df_temp['11_day_list'].apply(get_weekend)
    df_temp['12_weekday_seq']    = df_temp['12_day_list'].apply(get_weekday)
    df_temp['12_weekend_seq']    = df_temp['12_day_list'].apply(get_weekend)

    df_temp = get_weekday_entropy(df_temp, 5, function)
    df_temp = get_weekend_entropy(df_temp, 2, function)
    df_temp = df_temp.rename(columns={'De-id':'MASKED_STUDENT_ID'})
    return df_temp

In [15]:
df_se1 = pd.read_csv('2016_se1_lib_lms.csv')
df_se1.head()
df_se1_features = df_se1[[i for i in df_se1.columns if i != 'label_atRist']]
df_se1_labels = df_se1['label_atRist']

In [11]:
df_se1_features.columns

Index(['workday', 'weekend', 'morning', 'afternoon', 'evening', 'overnight',
       'workday_ExamMonth', 'weekend_ExamMonth', 'morning_ExamMonth',
       'afternoon_ExamMonth', 'evening_ExamMonth', 'overnight_ExamMonth',
       'workday_notExamMonth', 'weekend_notExamMonth', 'morning_notExamMonth',
       'afternoon_notExamMonth', 'evening_notExamMonth',
       'overnight_notExamMonth', 'workday_firstMonth', 'weekend_firstMonth',
       'morning_firstMonth', 'afternoon_firstMonth', 'evening_firstMonth',
       'overnight_firstMonth', 'examMonth', 'notExamMonth', 'firstMonth',
       'total_checkin', '09LOGIN_ATTEMPT', '09SESSION_TIMEOUT', '09LOGOUT',
       '09group', '09db', '09myinfo', '09course', '09journal', '09email',
       '09staff', '09annoucements', '09content', '09grade', '10LOGIN_ATTEMPT',
       '10SESSION_TIMEOUT', '10LOGOUT', '10group', '10db', '10myinfo',
       '10course', '10journal', '10email', '10staff', '10annoucements',
       '10content', '10grade', '11LOGIN_ATTEM

In [24]:
lms_functions = ['COURSE_ACCESS', 'PAGE_ACCESS', 'LOGIN_ATTEMPT', 'SESSION_TIMEOUT', 'LOGOUT']
index = 0

for fun in lms_functions[1:]:
    print('getting seq of', fun, '...')
    df = get_funtions_seq(function=fun)
    features = [i for i in list(df.columns) if i.endswith(fun)]
    features.append('MASKED_STUDENT_ID')
    df_se1_features = pd.merge(df_se1_features, df[features], on='MASKED_STUDENT_ID', how='left').fillna(0)

getting seq of PAGE_ACCESS ...
getting seq of LOGIN_ATTEMPT ...
getting seq of SESSION_TIMEOUT ...
getting seq of LOGOUT ...


In [44]:
df_lib_seq = pd.read_csv('Std_Lib_sequence_day_2016_se1.csv')
df_lib_seq = df_lib_seq[['0','1']]
df_lib_seq = df_lib_seq.rename(columns= {'0': 'De-id', '1':'lib_total_list'})


semester_days = 122
def clean_list(day_list):
    try: 
        len(day_list)
        ans = day_list[2:-2].split('.')[:-1]
        return [int(i) for i in ans]
    except:
        return [0] * semester_days 
df_lib_seq['lib_total_list'] = df_lib_seq['lib_total_list'].apply(clean_list)

df_lib_seq['09_day_list'] = df_lib_seq['lib_total_list'].apply(lambda x: x[:30])
df_lib_seq['10_day_list'] = df_lib_seq['lib_total_list'].apply(lambda x: x[30:61])
df_lib_seq['11_day_list'] = df_lib_seq['lib_total_list'].apply(lambda x: x[61:91])
df_lib_seq['12_day_list'] = df_lib_seq['lib_total_list'].apply(lambda x: x[91:122])

df_lib_seq_ = get_funtions_seq(df_lib_seq, 'LIB')
features = [i for i in list(df_lib_seq_.columns) if i.endswith('LIB')]
features.append('MASKED_STUDENT_ID')
df_se1_features = pd.merge(df_se1_features, df_lib_seq_[features], on='MASKED_STUDENT_ID', how='left').fillna(0)

Unnamed: 0,De-id,lib_total_list,09_day_list,10_day_list,11_day_list,12_day_list
0,8TMIKVZ5,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,OZ6FIGHH,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 7, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 5, 1, 4, 2, 2, 2, 1, 0, 2, 1, 2, ..."
2,QSGBC7CZ,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,EIC4AO9Q,"[0, 2, 1, 0, 2, 2, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...","[0, 2, 1, 0, 2, 2, 1, 1, 0, 0, 0, 1, 0, 1, 0, ...","[0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, ...","[0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 1, 2, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."
4,S9BIH11O,"[0, 3, 1, 0, 6, 2, 1, 1, 3, 0, 0, 4, 2, 3, 3, ...","[0, 3, 1, 0, 6, 2, 1, 1, 3, 0, 0, 4, 2, 3, 3, ...","[7, 3, 5, 3, 5, 2, 2, 3, 0, 4, 6, 1, 2, 6, 4, ...","[2, 3, 2, 4, 3, 0, 4, 2, 1, 2, 3, 3, 2, 5, 2, ...","[3, 0, 3, 2, 10, 3, 3, 4, 5, 1, 4, 3, 2, 2, 3,..."


In [49]:
df_se1_features.to_csv('all_seq_features.csv', index=False)

In [51]:
df_se1_features = df_se1_features[[i for i in df_se1_features.columns if i != 'label_atRist' and i != 'MASKED_STUDENT_ID']]

In [93]:
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE

from sklearn import tree, svm, naive_bayes,neighbors
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

def try_different_method(clf):
    clf.fit(X_resampled_smote,y_resampled_smote.ravel())
    # predictions = clf.predict(X_test)
    y_predprob = clf.predict_proba(X_test)[:,1]
    predictions = np.array([1 if i > 0.001 else 0 for i in y_predprob ])
    print(len(predictions))
    print('Sum of predictions:', predictions.sum())
    print('Sum of Y test:', y_test.sum())
    neg_recall = recall_score(y_test, predictions, pos_label=1)
    print('the neg recall is :', neg_recall)
    print(neg_recall*y_test.sum())
    pos_recall = recall_score(y_test, predictions, pos_label=0)
    print('the pos recall is :', pos_recall)
    score = f1_score(y_test, predictions, average='micro')
    print('the score is :', score)
    return y_predprob


X_train, X_test, y_train, y_test = train_test_split(df_se1_features, df_se1_labels, test_size = 0.1, stratify=df_se1_labels)
X_resampled_smote, y_resampled_smote = SVMSMOTE(random_state=42, sampling_strategy='auto',k_neighbors=10).fit_sample(X_train, y_train)
clfs = {
    #     'svm': svm.SVC(),\
    #         'decision_tree':tree.DecisionTreeClassifier(),
    #         'naive_gaussian': naive_bayes.GaussianNB(), \
    #         'naive_mul':naive_bayes.MultinomialNB(),\
    #         'K_neighbor' : neighbors.KNeighborsClassifier(),\
    #         'bagging_knn' : BaggingClassifier(neighbors.KNeighborsClassifier(), max_samples=0.5,max_features=0.5), \
    #         'bagging_tree': BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5,max_features=0.5),
            #'random_forest' : RandomForestClassifier(n_estimators=50),\
    #         'adaboost':AdaBoostClassifier(n_estimators=50),\
        'gradient_boost' : GradientBoostingClassifier(n_estimators=50, learning_rate=0.5,max_depth=10, random_state=0, min_samples_leaf=2),
            }
y_predprob = np.array([0]*len(y_test))
for clf_key in clfs.keys():
    print('the classifier is :',clf_key)
    clf = clfs[clf_key]
    for i in range(0, 10):
        predprob = try_different_method(clf)
        y_predprob = y_predprob + predprob


the classifier is : gradient_boost
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 0.9168278529980658
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 0.9168278529980658
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 0.9168278529980658
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 0.9168278529980658
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 0.9168278529980658
1551
Sum of predictions: 111
Sum of Y test: 38
the neg recall is : 0.2631578947368421
10.0
the pos recall is : 0.9332452081956378
the score is : 

In [91]:
    predictions = np.array([1 if i > 0.001 else 0 for i in y_predprob ])
    print(len(predictions))
    print('Sum of predictions:', predictions.sum())
    print('Sum of Y test:', y_test.sum())
    neg_recall = recall_score(y_test, predictions, pos_label=1)
    print('the neg recall is :', neg_recall)
    print(neg_recall*y_test.sum())
    pos_recall = recall_score(y_test, predictions, pos_label=0)
    print('the pos recall is :', pos_recall)
    score = f1_score(y_test, predictions, average='micro')
    print('the score is :', score)

4651
Sum of predictions: 413
Sum of Y test: 114
the neg recall is : 0.24561403508771928
28.0
the pos recall is : 0.9151421644258321
the score is : 0.898731455600946


In [65]:
pd.DataFrame(y_predprob).describe()

Unnamed: 0,0
count,3101.0
mean,0.002954869
std,0.03938999
min,2.952236e-07
25%,9.198245e-06
50%,3.049353e-05
75%,0.0001348784
max,0.9994582


In [13]:
fs = [
    'workday',
 'weekend',
 'morning',
 'afternoon',
 'evening',
 'overnight',
 'workday_ExamMonth',
 'weekend_ExamMonth',
 'morning_ExamMonth',
 'afternoon_ExamMonth',
 'evening_ExamMonth',
 'overnight_ExamMonth',
 'workday_notExamMonth',
 'weekend_notExamMonth',
 'morning_notExamMonth',
 'afternoon_notExamMonth',
 'evening_notExamMonth',
 'overnight_notExamMonth',
 'workday_firstMonth',
 'weekend_firstMonth',
 'morning_firstMonth',
 'afternoon_firstMonth',
 'evening_firstMonth',
 'overnight_firstMonth',
 'examMonth',
 'notExamMonth',
 'firstMonth'
]

In [86]:
fs = list(df_se1_features.columns)

In [92]:
common = ['workday',
 'weekend',
 'morning',
 'afternoon',
 'evening',
 'overnight',
 'workday_ExamMonth',
 'weekend_ExamMonth',
 'morning_ExamMonth',
 'afternoon_ExamMonth',
 'evening_ExamMonth',
 'overnight_ExamMonth',
 'workday_notExamMonth',
 'weekend_notExamMonth',
 'morning_notExamMonth',
 'afternoon_notExamMonth',
 'evening_notExamMonth',
 'overnight_notExamMonth',
 'workday_firstMonth',
 'weekend_firstMonth',
 'morning_firstMonth',
 'afternoon_firstMonth',
 'evening_firstMonth',
 'overnight_firstMonth',
 'examMonth',
 'notExamMonth',
 'firstMonth']
fs_09 = [i for i in fs if i.startswith('09')]
fs_09 = common + fs_09
fs_10 = [i for i in fs if i.startswith('10')]
fs_10 = fs_09 + fs_10
fs_11 = [i for i in fs if i.startswith('11')]
fs_11 = fs_10 + fs_11
fs_12 = [i for i in fs if i.startswith('12')]
fs_12 = fs_11 + fs_12

In [98]:
fs_12

['workday',
 'weekend',
 'morning',
 'afternoon',
 'evening',
 'overnight',
 'workday_ExamMonth',
 'weekend_ExamMonth',
 'morning_ExamMonth',
 'afternoon_ExamMonth',
 'evening_ExamMonth',
 'overnight_ExamMonth',
 'workday_notExamMonth',
 'weekend_notExamMonth',
 'morning_notExamMonth',
 'afternoon_notExamMonth',
 'evening_notExamMonth',
 'overnight_notExamMonth',
 'workday_firstMonth',
 'weekend_firstMonth',
 'morning_firstMonth',
 'afternoon_firstMonth',
 'evening_firstMonth',
 'overnight_firstMonth',
 'examMonth',
 'notExamMonth',
 'firstMonth',
 '09LOGIN_ATTEMPT',
 '09SESSION_TIMEOUT',
 '09LOGOUT',
 '09group',
 '09db',
 '09myinfo',
 '09course',
 '09journal',
 '09email',
 '09staff',
 '09annoucements',
 '09content',
 '09grade',
 '09_weekday_entropy_LOGIN_ATTEMPT',
 '09_weekend_entropy_LOGIN_ATTEMPT',
 '09_weekday_entropy_PAGE_ACCESS',
 '09_weekend_entropy_PAGE_ACCESS',
 '09_weekday_entropy_COURSE_ACCESS',
 '09_weekend_entropy_COURSE_ACCESS',
 '10LOGIN_ATTEMPT',
 '10SESSION_TIMEOUT',
 