In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data parts 0-10 and append
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'
}

train_X = pd.read_csv("Parts/part_0.csv", dtype=dtypes)
for i in range(1,263):
    df = pd.read_csv("Parts/part_"+str(i)+".csv", dtype=dtypes)
    train_X = pd.concat([train_X, df])


print("Train data shape is {}".format(train_X.shape))
print(train_X.head)
print(train_X.columns)
unique_sessions = len(train_X['session_id'].unique())
print(unique_sessions)

Train data shape is (26296946, 20)
<bound method NDFrame.head of               session_id  index  elapsed_time      event_name       name  \
0      20090312431273200      0             0  cutscene_click      basic   
1      20090312431273200      1          1323    person_click      basic   
2      20090312431273200      2           831    person_click      basic   
3      20090312431273200      3          1147    person_click      basic   
4      20090312431273200      4          1863    person_click      basic   
...                  ...    ...           ...             ...        ...   
96941  22100221145014656   1600       5483231  navigate_click  undefined   
96942  22100221145014656   1601       5485166  navigate_click  undefined   
96943  22100221145014656   1602       5485917  navigate_click  undefined   
96944  22100221145014656   1603       5486753  navigate_click  undefined   
96945  22100221145014656   1604       5487952      checkpoint      basic   

       level  page  ro

In [2]:
labels = pd.read_csv('train_labels.csv')

labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

print(labels.shape)


(424116, 4)


In [3]:
CATS = ['event_name', 'fqid', 'room_fqid', 'text']
NUMS = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

EVENTS = ['navigate_click','person_click','cutscene_click','object_click',
          'map_hover','notification_click','map_click','observation_click',
          'checkpoint']

In [4]:
def feature_engineer(train):
    
    dfs = []
    for c in CATS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in EVENTS: 
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id','level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS,axis=1)
        
    df = pd.concat(dfs,axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [5]:
def feature_engineer2(train):
    dfs = []

    # Keep the most frequent categorical features
    for c in CATS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
        tmp.name = tmp.name + '_mode'
        dfs.append(tmp)

    # Aggregate numerical variables
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)

    # Aggregate event counts and elapsed_time
    for c in EVENTS:
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS, axis=1)

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df


In [5]:
dataset_df = feature_engineer(train_X)
print("Full prepared dataset shape is {}".format(dataset_df.shape))

Full prepared dataset shape is (70686, 31)


In [6]:
dataset_df2 = feature_engineer2(train_X)
print("Full prepared dataset shape is {}".format(dataset_df2.shape))

Full prepared dataset shape is (70686, 31)


In [7]:
dataset_df2.head()

Unnamed: 0_level_0,level_group,event_name_mode,fqid_mode,room_fqid_mode,text_mode,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,map_hover_sum,notification_click_sum,map_click_sum,observation_click_sum,checkpoint_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,navigate_click,groupconvo,tunic.historicalsociety.closet,undefined,85793.56,1.945455,-1.0,7.701275,-71.41375,...,81.0,22.0,28.0,11.0,4.0,8,2.0,4,1,14155940.0
20090312431273200,13-22,navigate_click,crane_ranger,tunic.wildlife.center,undefined,1040601.0,17.402381,-1.0,-130.347168,-162.004303,...,170.0,123.0,60.0,20.0,14.0,10,6.0,3,1,437052300.0
20090312431273200,5-12,person_click,worker,tunic.historicalsociety.frontdesk,Okay. Thanks anyway.,357205.2,8.054054,-1.0,14.306062,-57.269321,...,103.0,104.0,12.0,28.0,9.0,9,8.0,1,1,105732700.0
20090312433251036,0-4,navigate_click,groupconvo,tunic.historicalsociety.entry,undefined,97633.42,1.870504,0.0,-84.045959,-53.671082,...,49.0,18.0,36.0,15.0,3.0,5,3.0,2,1,13571040.0
20090312433251036,13-22,navigate_click,tunic.capitol_2,tunic.historicalsociety.entry,undefined,2498852.0,17.762529,5.1,-30.762283,-142.861893,...,637.0,145.0,65.0,83.0,186.0,14,45.0,5,1,3241011000.0


In [9]:
import os


# Replace the path with the actual path to your GitHub repository
repository_folder = "/Users/brodybarton/Documents/GitHub/student-performance-prediction"
csv_file_path = repository_folder + "/CatBoostData.csv"
dataset_df2.to_csv(csv_file_path, index=True)