In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

import optuna
from optuna.samplers import TPESampler

import riiideducation

In [2]:
# Columns to read from train.csv, mapped to the compact dtype each one is
# parsed as. Other columns of the file (e.g. timestamp,
# prior_question_elapsed_time, prior_question_had_explanation) are not loaded.
used_data_types_dict = dict(
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
)

# NOTE(review): with usecols given, index_col=0 appears to make user_id the
# index (it is not among the columns shown after the later merges) - confirm.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=used_data_types_dict.keys(),
    dtype=used_data_types_dict,
    index_col=0,
)

  mask |= (ar1 == a)


In [3]:
# Positional 90/10 split: the leading 90% of rows are used only to compute the
# aggregate features, the trailing 10% become the rows the model is fitted on.
split_at = int(9 / 10 * len(train_df))
features_df, train_df = train_df.iloc[:split_at], train_df.iloc[split_at:]

In [4]:
# Rows whose answered_correctly is -1 are excluded before aggregating.
is_question_row = features_df['answered_correctly'] != -1
train_questions_only_df = features_df[is_question_row]

# Per-user mean / count / standard deviation of answered_correctly.
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = (
    grouped_by_user_df['answered_correctly']
    .agg(['mean', 'count', 'std'])
    .rename(columns={
        'mean': 'mean_user_accuracy',
        'count': 'questions_answered',
        'std': 'std_user_accuracy',
    })
)

user_answers_df

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
115,0.695652,46,0.465215
124,0.233333,30,0.430183
2746,0.578947,19,0.507257
5382,0.672000,125,0.471374
8623,0.642202,109,0.481566
...,...,...,...
1933700710,0.624829,1466,0.484332
1933703805,0.650000,40,0.483046
1933711038,0.684211,38,0.471069
1933715576,0.375000,16,0.500000


In [5]:
# Per-question (content_id) mean / count / standard deviation of
# answered_correctly, computed on the same filtered rows as the user aggregates.
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = (
    grouped_by_content_df['answered_correctly']
    .agg(['mean', 'count', 'std'])
    .rename(columns={
        'mean': 'mean_accuracy',
        'count': 'question_asked',
        'std': 'std_accuracy',
    })
)

content_answers_df

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.908595,6236,0.288207
1,0.891682,6684,0.310805
2,0.554656,40499,0.497010
3,0.779348,20734,0.414696
4,0.613226,28549,0.487020
...,...,...,...
13518,0.789203,778,0.408137
13519,0.567797,826,0.495682
13520,0.678524,759,0.467351
13521,0.822560,789,0.382283


In [6]:
# Drop the names of the feature-building intermediates; only the two aggregate
# tables are needed from here on.
del features_df, grouped_by_user_df, grouped_by_content_df

In [7]:
# Name of the label column.
target = 'answered_correctly'

# Model inputs, in column order: three per-user aggregates followed by three
# per-question aggregates.
features = [
    # per-user statistics
    'mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
    # per-question statistics
    'mean_accuracy', 'question_asked', 'std_accuracy',
]

In [8]:
# Keep only rows that carry a real label (the target is -1 on the others).
has_label = train_df[target].ne(-1)
train_df = train_df.loc[has_label]

In [9]:
# Left-join both aggregate tables so every training row is kept; ids missing
# from an aggregate table leave NaN in that table's columns.
train_df = (
    train_df
    .merge(user_answers_df, how='left', on='user_id')
    .merge(content_answers_df, how='left', on='content_id')
)

In [10]:
# Preview the first rows of the merged frame (aggregates attached, NaNs not yet filled).
train_df.head()

Unnamed: 0,content_id,answered_correctly,mean_user_accuracy,questions_answered,std_user_accuracy,mean_accuracy,question_asked,std_accuracy
0,11259,0,0.779843,5219.0,0.414392,0.532146,1291.0,0.499159
1,4957,1,0.779843,5219.0,0.414392,0.584772,2548.0,0.492858
2,5113,1,0.779843,5219.0,0.414392,0.603571,1960.0,0.48928
3,4699,1,0.779843,5219.0,0.414392,0.694888,2504.0,0.460547
4,11430,1,0.779843,5219.0,0.414392,0.765869,1922.0,0.423565


In [11]:
# Every missing value - in any column - is replaced by the constant 0.25.
# (prior_question_had_explanation is not loaded, so it needs no handling here.)
train_df = train_df.fillna(0.25)

In [None]:
# Preview the first rows again, now after the NaN fill.
train_df.head()

In [12]:
# Reduce the frame to the model inputs plus the label, turn +/-inf into NaN,
# and fill whatever is missing with the same 0.25 constant used earlier.
train_df = (
    train_df[features + [target]]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.25)
)

train_df

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,mean_accuracy,question_asked,std_accuracy,answered_correctly
0,0.779843,5219.00,0.414392,0.532146,1291.0,0.499159,0
1,0.779843,5219.00,0.414392,0.584772,2548.0,0.492858,1
2,0.779843,5219.00,0.414392,0.603571,1960.0,0.489280,1
3,0.779843,5219.00,0.414392,0.694888,2504.0,0.460547,1
4,0.779843,5219.00,0.414392,0.765869,1922.0,0.423565,1
...,...,...,...,...,...,...,...
9926946,0.250000,0.25,0.250000,0.738732,3927.0,0.439382,1
9926947,0.250000,0.25,0.250000,0.524581,9194.0,0.499423,1
9926948,0.250000,0.25,0.250000,0.616455,28174.0,0.486258,1
9926949,0.250000,0.25,0.250000,0.660559,5185.0,0.473565,0


In [13]:
# Seeded 80/20 random split; `test_df` is the hold-out part used for scoring.
train_df, test_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=666,
)

In [14]:
# Seeded TPE sampler; only referenced by the commented-out Optuna study below.
sampler = TPESampler(seed=666)

def create_model(trial):
    """Build an unfitted LGBMClassifier from values sampled by an Optuna trial.

    Every sampled hyper-parameter is forwarded to the classifier, so the model
    evaluated during a trial is built from the same keyword set as
    ``LGBMClassifier(**study.best_params)`` plus the fixed seed. Previously
    ``bagging_fraction`` was sampled but never passed on.

    Args:
        trial: Optuna trial exposing ``suggest_int`` and ``suggest_uniform``.

    Returns:
        An unfitted ``LGBMClassifier`` configured with the sampled values and
        ``random_state=666``.
    """
    # The suggest_* calls stay in their original order so the sequence of
    # parameters requested from the sampler is unchanged.
    hyper_params = {
        'num_leaves': trial.suggest_int("num_leaves", 2, 31),
        'n_estimators': trial.suggest_int("n_estimators", 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        # NOTE(review): the LightGBM parameter docs list min_child_samples as
        # an alias of min_data_in_leaf, so only one of the two can take
        # effect. Both are kept so the keys in study.best_params stay the same.
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 1200),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.0001, 0.99),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 90),
        # NOTE(review): per the LightGBM parameter docs, bagging is only
        # enabled when bagging_freq is non-zero - confirm whether bagging_freq
        # should be set (here and in the final fit) for this value to matter.
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.0001, 1.0),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.0001, 1.0),
    }

    return LGBMClassifier(**hyper_params, random_state=666)

def objective(trial):
    """Optuna objective: fit a candidate model and return its hold-out ROC AUC.

    The candidate comes from ``create_model(trial)``, is fitted on the
    module-level ``train_df`` and scored on the module-level ``test_df``.

    Args:
        trial: Optuna trial forwarded to ``create_model``.

    Returns:
        The value of ``roc_auc_score`` for the positive-class probabilities on
        ``test_df``.
    """
    candidate = create_model(trial)
    candidate.fit(train_df[features], train_df[target])
    # Column 1 of predict_proba holds the positive-class probability.
    positive_proba = candidate.predict_proba(test_df[features])[:, 1]
    return roc_auc_score(test_df[target].values, positive_proba)

# The search itself is disabled; uncomment these lines to run Optuna and
# overwrite the hard-coded values below.
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=70)
# params = study.best_params
# params['random_state'] = 666

# Fixed hyper-parameters (presumably the best_params of an earlier search -
# the keys match the names suggested in create_model).
params = dict(
    bagging_fraction=0.5817242323514327,
    feature_fraction=0.6884588361650144,
    learning_rate=0.42887924851375825,
    max_depth=6,
    min_child_samples=946,
    min_data_in_leaf=47,
    n_estimators=169,
    num_leaves=29,
    random_state=666,
)

# Fit the final model on the training part and report ROC AUC on the hold-out.
model = LGBMClassifier(**params)
model.fit(train_df[features], train_df[target])
holdout_proba = model.predict_proba(test_df[features])[:, 1]
print('LGB score: ', roc_auc_score(test_df[target].values, holdout_proba))

LGB score:  0.7158951771534834


In [15]:
# `l` collects the prediction Series of every served batch for later inspection.
l = []

# Competition environment and its iterator over test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [16]:
# Serve predictions batch by batch.
# The loop variable is named `batch_df` so it does not overwrite the hold-out
# `test_df` produced by train_test_split (re-running the scoring cell after
# this loop would otherwise score against a competition batch).
for (batch_df, sample_prediction_df) in iter_test:
    # Attach the per-user and per-question aggregates; ids that are absent
    # from the aggregate tables get NaN in the corresponding columns.
    batch_df = batch_df.merge(user_answers_df, how='left', on='user_id')
    batch_df = batch_df.merge(content_answers_df, how='left', on='content_id')

    # Apply the same imputation the model was trained with (+/-inf -> NaN,
    # then NaN -> 0.25) so unseen users/questions look the same at inference
    # time as they did during training. Only the model inputs are touched.
    batch_df[features] = (
        batch_df[features]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.25)
    )

    # Column 1 of predict_proba holds the positive-class probability.
    batch_df['answered_correctly'] = model.predict_proba(batch_df[features])[:, 1]
    l.append(batch_df['answered_correctly'])

    # Only rows with content_type_id == 0 are submitted.
    submission_df = batch_df.loc[
        batch_df['content_type_id'] == 0,
        ['row_id', 'answered_correctly'],
    ]
    print('---------------------------------')
    print(submission_df)
    print('---------------------------------')
    env.predict(submission_df)

---------------------------------
    row_id  answered_correctly
0        0            0.556430
1        1            0.995089
2        2            0.972939
3        3            0.984550
4        4            0.454086
5        5            0.732049
6        6            0.688414
7        7            0.995061
8        8            0.624908
9        9            0.719708
10      10            0.849008
11      11            0.783116
12      12            0.717380
13      13            0.880003
14      14            0.510812
15      15            0.940191
16      16            0.744993
17      17            0.833357
---------------------------------
---------------------------------
    row_id  answered_correctly
0       18            0.864748
1       19            0.340103
2       20            0.769156
3       21            0.577186
4       22            0.623845
5       23            0.436702
6       24            0.520314
7       25            0.935919
8       26            0.890017

In [21]:
# Predicted probabilities stored for the first batch served by the loop above.
l[0].values

array([0.55643009, 0.99508908, 0.97293864, 0.9845498 , 0.45408611,
       0.73204914, 0.68841405, 0.99506117, 0.62490756, 0.71970773,
       0.84900754, 0.78311556, 0.71737995, 0.88000286, 0.51081186,
       0.94019081, 0.74499266, 0.83335723])

In [22]:
# Hand-entered 0/1 label vectors, one per batch (18, 27 and 26 entries);
# t3 is compared against the third batch's predictions further down.
t1 = np.array([
    0, 1, 1, 0, 0, 1, 0, 1, 0,
    1, 1, 1, 1, 1, 0, 1, 0, 1,
])
t2 = np.array([
    1, 0, 1, 1, 0, 0, 0, 1, 1,
    0, 0, 0, 0, 1, 1, 1, 1, 1,
    0, 1, 1, 0, 1, 1, 1, 0, 1,
])
t3 = np.array([
    1, 1, 1, 1, 0, 1, 1, 0, 0,
    1, 0, 0, 1, 1, 0, 1, 1, 1,
    1, 0, 1, 1, 1, 1, 1, 0,
])

In [25]:
# ROC AUC of the third stored batch of predictions against the hand-entered
# labels in t3. roc_auc_score comes from the notebook's import cell, so it is
# not re-imported here.
roc_auc_score(t3, l[2].values)

0.7638888888888888