# Setup

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import lightgbm as lgb
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import roc_auc_score
import gc    # For garbage collector

In [2]:
%%time
train = pd.read_parquet('../input/riiid-train-features/train_features_subset30.parquet')

CPU times: user 2.9 s, sys: 4.09 s, total: 6.99 s
Wall time: 3.82 s


In [3]:
train = train.drop(['tags'], axis=1) # dropping tags for now until i can figure out something better to do with them

In [4]:
train.head()

Unnamed: 0,user_id,content_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,avg_questions,avg_questions_seen,explanation_mean_user,bundle_id,correct_answer,part,quest_pct,count,user_mean_accuracy,content_mean_accuracy,user_hmean_accuracy,prior_question_had_explanation_enc
0,124,7876,1,0,0,26000.0,1.006161,2.01281,0.033333,7876,3,1,0.418436,190170,1.0,2.0,1.333333,0
1,124,175,2,2,1,29000.0,1.005353,3.018163,0.033333,175,2,1,0.35997,195861,0.5,1.0,0.666667,0
2,124,1278,3,1,0,26000.0,1.004791,4.022954,0.033333,1278,3,2,0.649213,171098,0.666667,2.0,1.0,0
3,124,2064,4,2,0,18000.0,1.759428,5.782382,0.033333,2063,1,3,0.636907,176043,0.5,1.0,0.666667,0
4,124,2063,4,3,0,18000.0,1.759428,5.782382,0.033333,2063,0,3,0.163119,176043,0.4,1.0,0.571429,0


In [5]:
train.isna().sum()  # sanity check no NA values

user_id                               0
content_id                            0
task_container_id                     0
user_answer                           0
answered_correctly                    0
prior_question_elapsed_time           0
avg_questions                         0
avg_questions_seen                    0
explanation_mean_user                 0
bundle_id                             0
correct_answer                        0
part                                  0
quest_pct                             0
count                                 0
user_mean_accuracy                    0
content_mean_accuracy                 0
user_hmean_accuracy                   0
prior_question_had_explanation_enc    0
dtype: int64

In [6]:
valid_df = train.groupby('user_id').tail(10).reset_index(drop=True)
train_df = train[~train.index.isin(valid_df.index)].reset_index(drop=True)
target = 'answered_correctly'
print(f'train_size: {len(train)}, valid size: {len(valid_df)}')

train_size: 10587300, valid size: 3925402


The data frame contains a lot of features but here is a list of ones we actually want to use

In [7]:
features = [
    'quest_pct',
    'count',
    'content_mean_accuracy',
    'content_id',
    'user_mean_accuracy',
    'task_container_id',
    'part',
    'user_hmean_accuracy',
    'prior_question_had_explanation_enc',
    'prior_question_elapsed_time',
    'avg_questions',
    'avg_questions_seen',
    'explanation_mean_user',
]

# Linear model

In [8]:
import statsmodels.api as sm

In [9]:
%%time
model = sm.OLS(train_df['answered_correctly'], train_df[features])
lin_model = model.fit()
roc_auc_score(valid_df['answered_correctly'], lin_model.predict(valid_df[features]))

CPU times: user 9.09 s, sys: 1.99 s, total: 11.1 s
Wall time: 7.24 s


0.7286155138310852

In [10]:
del lin_model

# Neural models

In [11]:
import fastai
from fastai.tabular.all import *

In [12]:
EPOCHS=5
BATCH_SIZE=4096

In [13]:
train_df['answered_correctly']=train_df['answered_correctly'].astype('float32')

In [14]:
# %%time
# cat_features = ['prior_question_had_explanation_enc', 'part']
# tabular_ds = TabularDataLoaders.from_df(train_df, 
#     procs=[Categorify, FillMissing, Normalize],
#     cat_names=cat_features, 
#     cont_names=[x for x in features if x not in cat_features],
#     y_names='answered_correctly', valid_idx=valid_df.index, bs=BATCH_SIZE)


In [15]:
# def my_auc(inp, targ):
#     "Simple wrapper around scikit's roc_auc_score function for regression problems"
#     inp,targ = flatten_check(inp,targ)
#     return roc_auc_score(targ.cpu().numpy(), inp.cpu().numpy())

In [16]:
# def bce(inp,targ):
#     "Binary cross entropy"
#     inp,targ = flatten_check(inp,targ)
#     loss = F.binary_cross_entropy(inp, targ)
#     return loss

In [17]:
# learn = tabular_learner(tabular_ds, layers=[200,100], metrics=my_auc)

In [18]:
# learn.model.layers.add_module('sigmoid', nn.Sigmoid())
# learn.loss_func = bce

In [19]:
# learn.lr_find()

In [20]:
# %%time

# learn.fit_one_cycle(3) 

In [21]:
# def predict_batch(self, df):
#     dl = self.dls.test_dl(df)
#     dl.dataset.conts = dl.dataset.conts.astype(np.float32)
#     inp,preds,_,dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True)
#     return preds.numpy()

# setattr(learn, 'predict_batch', predict_batch)

# LightGBM

In [22]:
# params = {
#     'objective': 'binary',
#     'seed': 42,
#     'metric': 'auc',
#     'learning_rate': 0.05,
#     'max_bin': 800,
#     'num_leaves': 80
# }

In [23]:
# tr_data = lgb.Dataset(train_df[features], label=train_df['answered_correctly'])
# va_data = lgb.Dataset(valid_df[features], label=valid_df['answered_correctly'])

In [24]:
# model = lgb.train(
#     params, 
#     tr_data, 
#     num_boost_round=10000,
#     valid_sets=[tr_data, va_data], 
#     early_stopping_rounds=50,
#     verbose_eval=50
# )

# model.save_model(f'model.txt')
# lgb.plot_importance(model, importance_type='gain')
# plt.show()

In [25]:
model = lgb.Booster(model_file='../input/riiid-train-features/model.txt')

# Inference

here we read back the train data frame so we can generate features on the test set batches

In [26]:
%%time
del train_df
del valid_df
train = pd.read_parquet('../input/riiid-train-features/train_features_subset30.parquet')

CPU times: user 3.02 s, sys: 1.12 s, total: 4.13 s
Wall time: 1.59 s


In [27]:
user_agg = train.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = train.groupby('content_id')[target].agg(['sum', 'count'])

isolating the question features we generated

In [28]:
question_features_df = train[['content_id', 'quest_pct', 'count', 'part']].groupby('content_id').agg(['unique'])
question_features_df.columns = ['quest_pct', 'count', 'part']
question_features_df = question_features_df.astype({'quest_pct': np.float64, 'count': np.int32, 'part': np.int8})

isolating the user features we generated

In [29]:
user_features_df = train[['user_id', 'avg_questions', 'avg_questions_seen', 'explanation_mean_user']].groupby('user_id').agg(['max'])
user_features_df.columns = ['avg_questions', 'avg_questions_seen', 'explanation_mean_user']

In [30]:
# memory optimization
del train
gc.collect()

40

In [31]:
features

['quest_pct',
 'count',
 'content_mean_accuracy',
 'content_id',
 'user_mean_accuracy',
 'task_container_id',
 'part',
 'user_hmean_accuracy',
 'prior_question_had_explanation_enc',
 'prior_question_elapsed_time',
 'avg_questions',
 'avg_questions_seen',
 'explanation_mean_user']

Here we create dictionaries for each user and content to serve as a state.
We later update the state during inference so we can update user and content performance.

In [32]:
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

In [33]:
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

inference loop

In [34]:
%%time

for (test_df, sample_prediction_df) in iter_test:  # loop through the environment test generator
    if prior_test_df is not None:  # here we update the state
        prior_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df['answered_correctly'] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df['answered_correctly'].values

        # update state's values for user and content sum and counts
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)  # ignore lecture content
    test_df = pd.merge(test_df, question_features_df, on='content_id', how='left')  # merge question features
    test_df = pd.merge(test_df, user_features_df, on='user_id', how='left')  # merge user features
    test_df['prior_question_had_explanation_enc'] = test_df['prior_question_had_explanation'].fillna(False).astype('int8')    
    
    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    # adding features that rely on the updated state counts
    test_df['user_mean_accuracy'] = user_sum / user_count
#     test_df['content_count'] = content_count
    test_df['content_mean_accuracy'] = content_sum / content_count
    test_df['user_hmean_accuracy'] =  2 * (
        (test_df['user_mean_accuracy'] * test_df['content_mean_accuracy']) /
        (test_df['user_mean_accuracy'] + test_df['content_mean_accuracy'])
    )
       
    # inference
    test_df['answered_correctly'] = model.predict(test_df[features])
    env.predict(test_df[['row_id', 'answered_correctly']])



CPU times: user 968 ms, sys: 25 ms, total: 993 ms
Wall time: 608 ms
