# Libraries

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings and
# LightGBM parameter-override warnings that are relevant to the cells below —
# consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [2]:
import numpy as np # linear algebra
import pandas as pd
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import lightgbm as lgb

# Import Data

In [3]:
# Raw competition files: the two event logs plus the per-assessment labels.
train, test, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv', './data/train_labels.csv')
)

In [4]:
# Preview the first five label rows (one row per assessment game_session).
train_labels.head(n=5)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


# 1. Feature Engineering

In [5]:
def extract_time_features(df):
    """Parse ``timestamp`` and add calendar columns derived from it.

    The frame is modified in place: ``timestamp`` is converted with
    ``pd.to_datetime`` and the columns ``date``, ``month``, ``hour`` and
    ``dayofweek`` are added from its ``.dt`` accessor.

    Returns:
        The same ``df`` object that was passed in.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamp = df['timestamp'].dt
    for part in ('date', 'month', 'hour', 'dayofweek'):
        df[part] = getattr(stamp, part)
    return df
    
def get_object_columns(df, columns):
    """Count events per installation for each value of a categorical column.

    Args:
        df: event log with ``installation_id``, ``event_id`` and the column
            named by ``columns``.
        columns: name of the categorical column to spread into features.

    Returns:
        A frame indexed by ``installation_id`` with one column per category
        value holding the number of events; combinations that never occur
        are filled with 0.
    """
    event_counts = (
        df.groupby(['installation_id', columns])['event_id']
        .count()
        .reset_index()
    )
    wide = event_counts.pivot_table(index='installation_id', columns=[columns], values='event_id')
    # Re-assigning a plain list drops the column-axis name left by the pivot.
    wide.columns = list(wide.columns)
    return wide.fillna(0)

def get_numeric_columns(df, column):
    """Summarise a numeric column per installation.

    Args:
        df: event log containing ``installation_id`` and ``column``.
        column: name of the numeric column to aggregate.

    Returns:
        A frame indexed by ``installation_id`` with the columns
        ``<column>_mean``, ``_sum``, ``_min``, ``_max`` and ``_std``.
        Missing statistics (e.g. the std of a single row) are filled with 0.
    """
    stat_names = ['mean', 'sum', 'min', 'max', 'std']
    summary = df.groupby('installation_id').agg({f'{column}': stat_names})
    summary.columns = [f'{column}_{stat}' for stat in stat_names]
    return summary.fillna(0)

def get_numeric_columns_add(df, agg_column, column):
    """Summarise ``column`` per installation, split by ``agg_column`` category.

    Args:
        df: event log containing ``installation_id``, ``agg_column`` and
            ``column``.
        agg_column: categorical column whose values become separate feature
            groups.
        column: numeric column to aggregate.

    Returns:
        A float frame indexed by ``installation_id``. Each column label is a
        tuple ``(column, statistic, category)`` with statistic in
        mean/sum/min/max/std; labels are sorted lexicographically. Cells for
        (installation, category) pairs that never occur, and undefined
        statistics such as the std of a single row, are 0.
    """
    stat_names = ['mean', 'sum', 'min', 'max', 'std']
    # Select the value column *before* aggregating so that only the numeric
    # statistics are spread; the grouping keys stay in the index and can never
    # leak into the feature values.
    per_category = df.groupby(['installation_id', agg_column])[[column]].agg(stat_names)
    # Move the category level from the row index to the columns:
    # (column, statistic) -> (column, statistic, category).
    wide = per_category.unstack(level=-1).sort_index(axis=1)
    # Flatten to plain tuple labels so the frame merges cleanly with frames
    # that have ordinary single-level columns.
    wide.columns = wide.columns.to_flat_index()
    # Cast after filling so dtypes do not depend on whether any cell was missing.
    return wide.fillna(0).astype('float64')

def _aggregate_installation_features(events_df, numerical_columns, categorical_columns):
    """Return one row of aggregate features per installation_id in ``events_df``."""
    features = pd.DataFrame({'installation_id': events_df['installation_id'].unique()})
    features = features.set_index('installation_id')

    # Overall statistics of every numeric column.
    for num_col in numerical_columns:
        features = features.merge(get_numeric_columns(events_df, num_col), left_index = True, right_index = True)

    # Event counts per category value.
    for cat_col in categorical_columns:
        features = features.merge(get_object_columns(events_df, cat_col), left_index = True, right_index = True)

    # Numeric statistics broken down by category value.
    for cat_col in categorical_columns:
        for num_col in numerical_columns:
            features = features.merge(get_numeric_columns_add(events_df, cat_col, num_col), left_index = True, right_index = True)

    return features.reset_index()


def perform_features_engineering(train_df, test_df, train_labels_df):
    """Build the modelling frames from the raw event logs and the labels.

    Args:
        train_df: training event log (needs ``installation_id``, ``timestamp``,
            ``event_id``, ``game_time``, ``type``, ``world``).
        test_df: test event log with the same columns plus ``title``.
        train_labels_df: labels with ``installation_id``, ``title`` and
            ``accuracy_group``.

    Returns:
        ``(comp_train_df, comp_test_df)``:
        * ``comp_train_df`` has one row per row of ``train_labels_df`` with
          ``installation_id``, the encoded ``title``, ``accuracy_group`` and
          the per-installation aggregates (left-joined, so unmatched
          installations get NaN features).
        * ``comp_test_df`` has one row per test installation with the same
          aggregates and an encoded ``title``.

    Notes:
        * ``title`` is encoded as the most frequent ``accuracy_group`` observed
          for that title in ``train_labels_df``.
        * Aggregates are computed over each installation's whole event log, so
          every labelled session of one installation receives identical
          feature values.
        * ``train_df`` and ``test_df`` are modified in place by
          ``extract_time_features``.
    """
    print('Perform features engineering')
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    # Adds date/month/hour/dayofweek to the caller's frames; the aggregations
    # below do not consume those columns.
    test_df = extract_time_features(test_df)
    train_df = extract_time_features(train_df)

    comp_train_df = _aggregate_installation_features(train_df, numerical_columns, categorical_columns)
    comp_test_df = _aggregate_installation_features(test_df, numerical_columns, categorical_columns)

    print('Our training set have {} rows and {} columns'.format(comp_train_df.shape[0], comp_train_df.shape[1]))

    # get the mode of the title
    labels_map = dict(train_labels_df.groupby('title')['accuracy_group'].agg(lambda x:x.value_counts().index[0]))
    # merge target; work on an explicit copy so the caller's frame is never
    # written through a column selection
    labels = train_labels_df[['installation_id', 'title', 'accuracy_group']].copy()
    # replace title with the mode
    labels['title'] = labels['title'].map(labels_map)
    # get title from the test set: look the last title up by installation_id
    # instead of relying on row position, so the value always lands on the
    # matching installation regardless of row order or row count
    last_title = test_df.groupby('installation_id')['title'].last()
    comp_test_df['title'] = comp_test_df['installation_id'].map(last_title).map(labels_map)
    # join train with labels
    comp_train_df = labels.merge(comp_train_df, on = 'installation_id', how = 'left')
    print('We have {} training rows'.format(comp_train_df.shape[0]))

    return comp_train_df, comp_test_df

In [6]:
# Build the modelling frames, then release the raw event logs to free memory.
train_df, test_df = perform_features_engineering(train, test, train_labels)
del train, test, train_labels
gc.collect()

Perform features engineering
Our training set have 17000 rows and 54 columns
We have 17690 training rows


0

# 2. Data Pre-processing

In [7]:
# Every column except the identifier and the target is a model input.
x_cols = [col for col in train_df.columns if col not in ['installation_id', 'accuracy_group']]
X = train_df.loc[:, x_cols].values
y = train_df['accuracy_group'].values
# The test matrix uses the training column list so both share one layout.
X_test = test_df.loc[:, x_cols].values
# Independent copy: a prediction column is attached to this frame further down,
# which must not be a write into a selection of test_df.
test_sub = test_df[['installation_id']].copy()

In [8]:
# Fixed seed so the hold-out split, early stopping and the reported score are
# reproducible across kernel restarts.
# NOTE(review): rows are split individually, yet rows of the same
# installation_id share identical aggregate features — a group-aware split
# would give a more honest validation score.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# 3. Training & Evaluation

In [9]:
# LightGBM training configuration (regression on the ordinal target, RMSE metric).
params = dict(
    # NOTE(review): appears to act as the boosting-round count when passed to
    # lgb.train — confirm how it interacts with the explicit round argument.
    n_estimators=2000,
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    subsample=0.75,
    subsample_freq=1,
    learning_rate=0.04,
    feature_fraction=0.9,
    max_depth=15,
    lambda_l1=1,
    lambda_l2=1,
    # NOTE(review): presumably meant as a logging interval; as a LightGBM
    # parameter this looks like a verbosity level — confirm the intent.
    verbose=100,
    early_stopping_rounds=100,
)

In [10]:
# LightGBM treats 'n_estimators' inside params as the boosting-round count and
# lets it override the explicit argument (emitting a warning that the global
# warnings filter hides). Resolve the value here so exactly one visible number
# controls training; 1000 is only the fallback when params does not set it.
train_params = dict(params)
num_round = train_params.pop('n_estimators', 1000)

clf = lgb.train(train_params,
                train_data,
                num_round,
                valid_sets=[train_data, val_data],
                )

[1]	training's rmse: 1.24717	valid_1's rmse: 1.22921
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.23349	valid_1's rmse: 1.21633
[3]	training's rmse: 1.22063	valid_1's rmse: 1.20397
[4]	training's rmse: 1.20858	valid_1's rmse: 1.19239
[5]	training's rmse: 1.19723	valid_1's rmse: 1.18179
[6]	training's rmse: 1.18668	valid_1's rmse: 1.17179
[7]	training's rmse: 1.177	valid_1's rmse: 1.16272
[8]	training's rmse: 1.16768	valid_1's rmse: 1.15398
[9]	training's rmse: 1.15879	valid_1's rmse: 1.14575
[10]	training's rmse: 1.15045	valid_1's rmse: 1.13844
[11]	training's rmse: 1.14273	valid_1's rmse: 1.13157
[12]	training's rmse: 1.13548	valid_1's rmse: 1.12504
[13]	training's rmse: 1.12833	valid_1's rmse: 1.11849
[14]	training's rmse: 1.12178	valid_1's rmse: 1.11276
[15]	training's rmse: 1.11565	valid_1's rmse: 1.1074
[16]	training's rmse: 1.1126	valid_1's rmse: 1.10538
[17]	training's rmse: 1.10706	valid_1's rmse: 1.10066
[18]	training's rmse: 1.10182	val

[172]	training's rmse: 0.918829	valid_1's rmse: 1.00238
[173]	training's rmse: 0.918388	valid_1's rmse: 1.00218
[174]	training's rmse: 0.917962	valid_1's rmse: 1.00198
[175]	training's rmse: 0.917406	valid_1's rmse: 1.00198
[176]	training's rmse: 0.916976	valid_1's rmse: 1.00191
[177]	training's rmse: 0.916515	valid_1's rmse: 1.00186
[178]	training's rmse: 0.916105	valid_1's rmse: 1.00191
[179]	training's rmse: 0.915645	valid_1's rmse: 1.00178
[180]	training's rmse: 0.91523	valid_1's rmse: 1.00158
[181]	training's rmse: 0.914807	valid_1's rmse: 1.00149
[182]	training's rmse: 0.914389	valid_1's rmse: 1.00149
[183]	training's rmse: 0.913944	valid_1's rmse: 1.0013
[184]	training's rmse: 0.913497	valid_1's rmse: 1.00134
[185]	training's rmse: 0.913038	valid_1's rmse: 1.00128
[186]	training's rmse: 0.912658	valid_1's rmse: 1.0012
[187]	training's rmse: 0.912285	valid_1's rmse: 1.00124
[188]	training's rmse: 0.911823	valid_1's rmse: 1.001
[189]	training's rmse: 0.911411	valid_1's rmse: 1.001

[329]	training's rmse: 0.867464	valid_1's rmse: 0.99817
[330]	training's rmse: 0.867171	valid_1's rmse: 0.998234
[331]	training's rmse: 0.866947	valid_1's rmse: 0.998149
[332]	training's rmse: 0.866776	valid_1's rmse: 0.998038
[333]	training's rmse: 0.866515	valid_1's rmse: 0.998123
[334]	training's rmse: 0.866273	valid_1's rmse: 0.99813
[335]	training's rmse: 0.86604	valid_1's rmse: 0.998062
[336]	training's rmse: 0.865815	valid_1's rmse: 0.998063
[337]	training's rmse: 0.865521	valid_1's rmse: 0.998049
[338]	training's rmse: 0.865318	valid_1's rmse: 0.998169
[339]	training's rmse: 0.865108	valid_1's rmse: 0.998237
[340]	training's rmse: 0.864794	valid_1's rmse: 0.998259
[341]	training's rmse: 0.864556	valid_1's rmse: 0.99828
[342]	training's rmse: 0.864344	valid_1's rmse: 0.99829
[343]	training's rmse: 0.864166	valid_1's rmse: 0.998367
[344]	training's rmse: 0.863987	valid_1's rmse: 0.998442
[345]	training's rmse: 0.863771	valid_1's rmse: 0.998423
[346]	training's rmse: 0.863582	vali

In [11]:
y_pred = clf.predict(X_val)
# The regressor returns continuous scores: round to the nearest class and clip
# to the label range seen in training so an overshoot cannot create an extra
# class in the kappa confusion matrix.
y_val_pred = np.clip(np.round(y_pred), y.min(), y.max()).astype(int)
cohen_kappa_score(y_val, y_val_pred, weights='quadratic')

0.5246363303586942

In [12]:
# Score the *test* matrix (not the validation split) so every submission row
# gets the prediction for its own installation.
test_pred = clf.predict(X_test)
# Convert the continuous scores to class labels the same way the validation
# metric does: round, then clip to the label range seen in training.
test_labels = np.clip(np.round(test_pred), y.min(), y.max()).astype(int)
# Assigning the array directly (rather than an index-aligned Series) raises on
# any length mismatch instead of silently producing NaN or dropping rows.
test_sub = test_sub.assign(accuracy_group=test_labels)

# 4. Save predictions

In [13]:
# Write the submission file without the positional index column.
test_sub.to_csv(path_or_buf='submission.csv', index=False)