In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load every CSV part (part_0.csv .. part_262.csv) and stack them into `train`.
# Explicit dtypes are passed to read_csv: compact numeric types for the
# measurements and pandas categoricals for the string-like columns.
dtypes = {
    'elapsed_time': np.int32,
    'event_name': 'category',
    'name': 'category',
    'level': np.uint8,
    'room_coor_x': np.float32,
    'room_coor_y': np.float32,
    'screen_coor_x': np.float32,
    'screen_coor_y': np.float32,
    'hover_duration': np.float32,
    'text': 'category',
    'fqid': 'category',
    'room_fqid': 'category',
    'text_fqid': 'category',
    'fullscreen': 'category',
    'hq': 'category',
    'music': 'category',
    'level_group': 'category'
}

# Number of part files on disk, numbered from 0.
N_PARTS = 263

# Read each part once and concatenate in a single call. One concat over a list
# avoids re-copying all previously loaded rows on every loop iteration.
train = pd.concat(
    [pd.read_csv("Parts/part_" + str(i) + ".csv", dtype=dtypes) for i in range(N_PARTS)]
)

print("Train data shape is {}".format(train.shape))
# head is a method: it has to be called, otherwise print shows the bound-method repr
print(train.head())
print(train.columns)
# Number of distinct sessions present in the loaded data
unique_sessions = len(train['session_id'].unique())
print(unique_sessions)

KeyboardInterrupt: 

In [1]:
# Quick look at the loaded data: overall shape, then the first rows.
# Requires `train` to have been created by the loading cell in this kernel.
print("Train data shape is {}".format(train.shape))
# head is a method: it has to be called, otherwise print shows the bound-method repr
print(train.head())

NameError: name 'train' is not defined

In [9]:
# Column groups consumed by feature_engineer / feature_engineer2.

# Categorical columns: summarised per group by distinct-value count
# (feature_engineer) or by most frequent value (feature_engineer2).
CATS = [
    'event_name',
    'fqid',
    'room_fqid',
    'text',
]

# Numeric columns: summarised per group by mean and standard deviation.
NUMS = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

# event_name values that are turned into 0/1 indicators and summed per group.
EVENTS = [
    'navigate_click',
    'person_click',
    'cutscene_click',
    'object_click',
    'map_hover',
    'notification_click',
    'map_click',
    'observation_click',
    'checkpoint',
]

In [7]:
def feature_engineer(train, cats=None, nums=None, events=None):
    """Aggregate raw event rows into one feature row per (session_id, level_group).

    Produced columns, in order:
      * ``<col>_nunique`` for every categorical column,
      * ``<col>_mean`` then ``<col>_std`` for every numeric column,
      * ``<event>_sum``: number of rows whose ``event_name`` equals the event,
      * ``elapsed_time_sum``.

    The input frame is left untouched: the 0/1 event indicators are built on a
    scratch frame instead of being written onto ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``session_id``, ``level_group``, ``event_name``,
        ``elapsed_time`` and every column listed in ``cats`` / ``nums``.
    cats, nums, events : list of str, optional
        Column / event names to use. Each defaults to the module-level
        ``CATS``, ``NUMS`` and ``EVENTS`` respectively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with ``level_group`` as the first column
        followed by the features. Missing values are replaced with -1.
    """
    cats = CATS if cats is None else cats
    nums = NUMS if nums is None else nums
    events = EVENTS if events is None else events
    keys = ['session_id', 'level_group']

    # Group once and reuse the groupby for every per-column aggregation.
    grouped = train.groupby(keys)
    dfs = []
    for how, columns in (('nunique', cats), ('mean', nums), ('std', nums)):
        for c in columns:
            dfs.append(grouped[c].agg(how).rename(c + '_' + how))

    # Event indicators live on a scratch frame holding only the group keys, so
    # the caller's DataFrame does not gain extra columns as a side effect.
    flags = train[keys].copy()
    for c in events:
        flags[c] = (train['event_name'] == c).astype('int8')
    flag_groups = flags.groupby(keys)
    for c in events:
        dfs.append(flag_groups[c].agg('sum').rename(c + '_sum'))
    dfs.append(grouped['elapsed_time'].agg('sum').rename('elapsed_time_sum'))

    df = pd.concat(dfs, axis=1)
    # e.g. std is NaN for single-row groups; those become -1
    df = df.fillna(-1)
    # Move level_group out of the index so only session_id remains as index
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [4]:
def _most_frequent(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    # Compute the mode once per group instead of once for the emptiness check
    # and again for the lookup.
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


def feature_engineer2(train, cats=None, nums=None, events=None):
    """Aggregate raw event rows into one feature row per (session_id, level_group),
    keeping the most frequent value of each categorical column.

    Produced columns, in order:
      * ``<col>_mode`` for every categorical column,
      * ``<col>_mean`` then ``<col>_std`` for every numeric column,
      * ``<event>_sum``: number of rows whose ``event_name`` equals the event,
      * ``elapsed_time_sum``.

    The input frame is left untouched: the 0/1 event indicators are built on a
    scratch frame instead of being written onto ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``session_id``, ``level_group``, ``event_name``,
        ``elapsed_time`` and every column listed in ``cats`` / ``nums``.
    cats, nums, events : list of str, optional
        Column / event names to use. Each defaults to the module-level
        ``CATS``, ``NUMS`` and ``EVENTS`` respectively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with ``level_group`` as the first column
        followed by the features. Missing values are replaced with -1.
    """
    cats = CATS if cats is None else cats
    nums = NUMS if nums is None else nums
    events = EVENTS if events is None else events
    keys = ['session_id', 'level_group']

    # Group once and reuse the groupby for every per-column aggregation.
    grouped = train.groupby(keys)
    dfs = []

    # Keep the most frequent categorical features
    for c in cats:
        dfs.append(grouped[c].agg(_most_frequent).rename(c + '_mode'))

    # Aggregate numerical variables
    for how in ('mean', 'std'):
        for c in nums:
            dfs.append(grouped[c].agg(how).rename(c + '_' + how))

    # Aggregate event counts and elapsed_time. The indicators live on a scratch
    # frame holding only the group keys, so the caller's DataFrame does not
    # gain extra columns as a side effect.
    flags = train[keys].copy()
    for c in events:
        flags[c] = (train['event_name'] == c).astype('int8')
    flag_groups = flags.groupby(keys)
    for c in events:
        dfs.append(flag_groups[c].agg('sum').rename(c + '_sum'))
    dfs.append(grouped['elapsed_time'].agg('sum').rename('elapsed_time_sum'))

    df = pd.concat(dfs, axis=1)
    # Groups without a mode (None) and NaN statistics become -1
    df = df.fillna(-1)
    # Move level_group out of the index so only session_id remains as index
    df = df.reset_index()
    df = df.set_index('session_id')
    return df


In [5]:
# Build the per-(session_id, level_group) feature table with distinct-value
# counts for the categorical columns, then report its dimensions.
dataset_df = feature_engineer(train)
prepared_shape = dataset_df.shape
print("Full prepared dataset shape is {}".format(prepared_shape))

Full prepared dataset shape is (70686, 31)


In [6]:
# Build the per-(session_id, level_group) feature table that keeps the most
# frequent value of each categorical column, then report its dimensions.
dataset_df2 = feature_engineer2(train)
prepared_shape = dataset_df2.shape
print("Full prepared dataset shape is {}".format(prepared_shape))

Full prepared dataset shape is (70686, 31)


In [10]:
## Preprocess the test data with both feature builders and write each result to csv
test = pd.read_csv('test.csv')

# Features with distinct-value counts for the categorical columns
test_preprocessed = feature_engineer(test)
test_preprocessed.to_csv('test_preprocessed.csv')

# Features with the most frequent value of each categorical column
cat_test_preprocessed = feature_engineer2(test)
cat_test_preprocessed.to_csv('cat_test_preprocessed.csv')

In [9]:
import os

# Destination folder for the exported feature table. The checkout location is
# machine-specific, so it can be overridden with the REPOSITORY_FOLDER
# environment variable; the path below is used when the variable is not set.
repository_folder = os.environ.get(
    "REPOSITORY_FOLDER",
    "/Users/brodybarton/Documents/GitHub/student-performance-prediction",
)
csv_file_path = os.path.join(repository_folder, "CatBoostData.csv")
# index=True writes the frame's index (session_id) as the first CSV column
dataset_df2.to_csv(csv_file_path, index=True)