In [1]:
import numpy as np
import pandas as pd

In [20]:
cols_to_load = ['row_id', 'user_id', 'answered_correctly', 'content_id', 'prior_question_had_explanation', 'prior_question_elapsed_time']
train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")[cols_to_load]
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('bool')

In [21]:
features_df = train.iloc[:int(9 /10 * len(train))]
train_df = train.iloc[int(9 /10 * len(train)):]

In [22]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count', 'std']}).copy()
user_answers_df.columns = [
    'mean_user_accuracy', 
    'questions_answered', 
    'std_user_accuracy', 
    #'median_user_accuracy', 
    #'skew_user_accuracy'
]

user_answers_df

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115,0.695652,46,0.465215
124,0.233333,30,0.430183
2746,0.578947,19,0.507257
5382,0.672000,125,0.471374
8623,0.642202,109,0.481566
...,...,...,...
1933700710,0.624829,1466,0.484332
1933703805,0.650000,40,0.483046
1933711038,0.684211,38,0.471069
1933715576,0.375000,16,0.500000


In [23]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count', 'std']}).copy()
content_answers_df.columns = [
    'mean_accuracy', 
    'question_asked', 
    'std_accuracy', 
    #'median_accuracy', 
    #'skew_accuracy'
]

content_answers_df

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.908595,6236,0.288207
1,0.891682,6684,0.310805
2,0.554656,40499,0.497010
3,0.779348,20734,0.414696
4,0.613226,28549,0.487020
...,...,...,...
13518,0.789203,778,0.408137
13519,0.567797,826,0.495682
13520,0.678524,759,0.467351
13521,0.822560,789,0.382283


In [24]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [25]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    #'median_user_accuracy',
    #'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    #'median_accuracy',
    #'prior_question_elapsed_time', 
    #'prior_question_had_explanation',
    #'skew_accuracy'
    'prior_question_had_explanation', 
    'prior_question_elapsed_time'
]

target = 'answered_correctly'

In [26]:
train_df = train_df[train_df[target] != -1]

In [27]:
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')

In [28]:
train_df.head()

Unnamed: 0,row_id,user_id,answered_correctly,content_id,prior_question_had_explanation,prior_question_elapsed_time,mean_user_accuracy,questions_answered,std_user_accuracy,mean_accuracy,question_asked,std_accuracy
0,91107298,1933715875,0,11259,True,13000.0,0.779843,5219.0,0.414392,0.532146,1291.0,0.499159
1,91107299,1933715875,1,4957,True,44000.0,0.779843,5219.0,0.414392,0.584772,2548.0,0.492858
2,91107300,1933715875,1,5113,True,22000.0,0.779843,5219.0,0.414392,0.603571,1960.0,0.48928
3,91107301,1933715875,1,4699,True,74000.0,0.779843,5219.0,0.414392,0.694888,2504.0,0.460547
4,91107302,1933715875,1,11430,True,9000.0,0.779843,5219.0,0.414392,0.765869,1922.0,0.423565


In [29]:
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
mean_prior = train_df.prior_question_elapsed_time.astype("float64").mean()
train_df['prior_question_elapsed_time'].fillna(mean_prior, inplace = True)
train_df['prior_question_had_explanation'].fillna(False, inplace = True)
train_df['prior_question_had_explanation'] = label_enc.fit_transform(train_df['prior_question_had_explanation'])

In [30]:
train_df = train_df.fillna(value=0.5)

In [31]:
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0.5)

train_df

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,mean_accuracy,question_asked,std_accuracy,prior_question_had_explanation,prior_question_elapsed_time,answered_correctly
0,0.779843,5219.0,0.414392,0.532146,1291.0,0.499159,1,13000.0,0
1,0.779843,5219.0,0.414392,0.584772,2548.0,0.492858,1,44000.0,1
2,0.779843,5219.0,0.414392,0.603571,1960.0,0.489280,1,22000.0,1
3,0.779843,5219.0,0.414392,0.694888,2504.0,0.460547,1,74000.0,1
4,0.779843,5219.0,0.414392,0.765869,1922.0,0.423565,1,9000.0,1
...,...,...,...,...,...,...,...,...,...
9926946,0.500000,0.5,0.500000,0.738732,3927.0,0.439382,1,18000.0,1
9926947,0.500000,0.5,0.500000,0.524581,9194.0,0.499423,1,14000.0,1
9926948,0.500000,0.5,0.500000,0.616455,28174.0,0.486258,1,14000.0,1
9926949,0.500000,0.5,0.500000,0.660559,5185.0,0.473565,1,22000.0,0


In [32]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_df, random_state=666, test_size=0.2)

In [33]:
import lightgbm as lgb
params = {'objective': 'binary',
          'metric': 'auc',
          'seed': 2020,
          'learning_rate': 0.1, #default
          "boosting_type": "gbdt" #default
         }
lgb_train = lgb.Dataset(train_df[features], train_df[target])
lgb_eval = lgb.Dataset(valid_df[features], valid_df[target])

In [34]:
model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)

Training until validation scores don't improve for 8 rounds
[50]	training's auc: 0.721769	valid_1's auc: 0.721523
[100]	training's auc: 0.722055	valid_1's auc: 0.721705
[150]	training's auc: 0.722197	valid_1's auc: 0.72175
[200]	training's auc: 0.722336	valid_1's auc: 0.721792
Early stopping, best iteration is:
[229]	training's auc: 0.722413	valid_1's auc: 0.721812
