This is a simple baseline using the LightGBM (LGBM) algorithm. It is a small modification of https://www.kaggle.com/jsylas/riiid-lgbm-starter

In [None]:
# Most of this code is adapted from the kernel https://www.kaggle.com/lgreig/simple-lgbm-baseline

import riiideducation
#import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Handle to the competition environment; its iter_test()/predict() methods
# are used by the inference loop at the end of the notebook.
env = riiideducation.make_env()

# Read only the seven columns used downstream, with compact explicit dtypes.
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 7, 8, 9],
    dtype={
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean',
    },
)

# Keep the rows whose content_type_id is 0, order them by timestamp, then
# discard the two columns that were only needed for filtering and ordering.
# `train` is rebound one step at a time so each intermediate frame can be
# released before the next one is built.
train = train.loc[train['content_type_id'] == 0]
train = train.sort_values(by='timestamp')
train = train.drop(columns=['timestamp', 'content_type_id'])

# Mean / sum / count of answered_correctly per content_id over the first
# 90M rows of the timestamp-ordered frame.
results_c = (
    train.iloc[:90_000_000]
    .groupby('content_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
)
results_c.columns = ['answered_correctly_content_mean', 'answered_correctly_content_sum', 'answered_correctly_content_count']

# The same three statistics per user_id over the same 90M-row window.
results_u = (
    train.iloc[:90_000_000]
    .groupby('user_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
)
results_u.columns = ['answered_correctly_user_mean', 'answered_correctly_user_sum', 'answered_correctly_user_count']



In [None]:
# Build the modelling frame from every row after the first 90M: attach the
# per-user and per-content statistics, drop rows labelled -1, and order the
# result by user_id.
X = (
    train.iloc[90_000_000:]
    .merge(results_u, how="left", on="user_id")
    .merge(results_c, how="left", on="content_id")
    .loc[lambda frame: frame["answered_correctly"] != -1]
    .sort_values("user_id")
)

# Labels are kept as a one-column DataFrame; the features are everything else.
Y = X.loc[:, ["answered_correctly"]]
X = X.drop(columns=["answered_correctly"])


In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the prior_question_had_explanation flag into integer codes; the
# encoder object is kept because later cells call it again.
lb_make = LabelEncoder()
X = X.assign(
    prior_question_had_explanation_enc=lb_make.fit_transform(X["prior_question_had_explanation"])
)

# Restrict X to the eight model inputs, in the order the model is trained on.
X = X.loc[:, [
    'answered_correctly_user_mean',
    'answered_correctly_user_sum',
    'answered_correctly_user_count',
    'answered_correctly_content_mean',
    'answered_correctly_content_sum',
    'answered_correctly_content_count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]]

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Unshuffled 80/20 split: the first 80% of rows train the model and the
# last 20% are held out for validation.
Xt, Xv, Yt, Yv = train_test_split(X, Y, test_size=0.2, shuffle=False)

# LightGBM settings for a binary classifier.
params = dict(
    objective='binary',
    max_bin=600,
    learning_rate=0.01,
    num_leaves=80,
)

# The validation Dataset references the training Dataset.
lgb_train = lgb.Dataset(data=Xt, label=Yt)
lgb_eval = lgb.Dataset(data=Xv, label=Yv, reference=lgb_train)

# Train for up to 10000 rounds, logging every 10 rounds, with an
# early-stopping patience of 10 rounds.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
# ROC AUC of the trained model on the held-out validation split; the score
# is the cell's displayed output.
y_true = Yv.to_numpy()
y_pred = model.predict(Xv)
roc_auc_score(y_true, y_pred)

In [None]:
# Smoke-test the trained model on the bundled example test file.
test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')

# Encode the flag with a fixed mapping (missing/False -> 0, True -> 1)
# instead of re-fitting the LabelEncoder on this file. A re-fit derives the
# codes from whichever values happen to be present here (NaN would get a
# class of its own) and throws away the fit made on the training data.
# LabelEncoder numbers classes in sorted order, so on a column holding both
# False and True it also yields False -> 0, True -> 1; the explicit mapping
# reproduces that without depending on the column's contents.
test["prior_question_had_explanation_enc"] = (
    test["prior_question_had_explanation"].fillna(False).astype(bool).astype(int)
)

# Attach the per-user and per-content history statistics.
test = pd.merge(test, results_u, on=['user_id'], how="left")
test = pd.merge(test, results_c, on=['content_id'], how="left")

# Fill only the aggregate columns, with the same defaults the inference loop
# uses for unseen users/contents: 0.5 for means, 0 for sums and counts.
# A blanket fillna(0.5) would also overwrite unrelated columns and would give
# sums and counts the value 0.5.
test = test.fillna({
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_user_sum': 0,
    'answered_correctly_user_count': 0,
    'answered_correctly_content_mean': 0.5,
    'answered_correctly_content_sum': 0,
    'answered_correctly_content_count': 0,
})

y_pred = model.predict(test[['answered_correctly_user_mean', 'answered_correctly_user_sum', 'answered_correctly_user_count',
                             'answered_correctly_content_mean', 'answered_correctly_content_sum', 'answered_correctly_content_count',
                             'prior_question_elapsed_time','prior_question_had_explanation_enc']])

test['answered_correctly'] = y_pred

# Recompute the per-content statistics of answered_correctly, this time over
# every row of `train` rather than a leading slice.
results_c = (
    train.groupby('content_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
    .add_prefix('answered_correctly_content_')
)

# Same recomputation for the per-user statistics.
results_u = (
    train.groupby('user_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
    .add_prefix('answered_correctly_user_')
)

In [None]:
# Inference loop: for each batch served by the competition API, build the
# same eight features the model was trained on, predict, and submit.
iter_test = env.iter_test()

# Defaults for users/contents that have no history row after the left merges
# (0.5 for means, 0 for sums and counts) and for a missing explanation flag.
fill_values = {
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_user_sum': 0,
    'answered_correctly_user_count': 0,
    'answered_correctly_content_mean': 0.5,
    'answered_correctly_content_sum': 0,
    'answered_correctly_content_count': 0,
    'prior_question_had_explanation': False,
}

# Model inputs, in training order. Built once instead of on every batch.
feature_columns = [
    'answered_correctly_user_mean', 'answered_correctly_user_sum', 'answered_correctly_user_count',
    'answered_correctly_content_mean', 'answered_correctly_content_sum', 'answered_correctly_content_count',
    'prior_question_elapsed_time', 'prior_question_had_explanation_enc',
]

for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.merge(test_df, results_u, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'],  how="left")

    # One frame-level fillna with per-column values. The chained
    # `test_df[col].fillna(..., inplace=True)` form acts on a temporary
    # Series: it is deprecated in pandas 2.x and has no effect on the frame
    # under Copy-on-Write.
    test_df = test_df.fillna(fill_values)

    # Fixed mapping False -> 0, True -> 1. Re-fitting a LabelEncoder on each
    # batch derives the codes from the values present in that batch, so a
    # batch containing only True would encode True as 0 and the feature
    # would change meaning from batch to batch.
    test_df["prior_question_had_explanation_enc"] = (
        test_df["prior_question_had_explanation"].astype(bool).astype(int)
    )

    test_df['answered_correctly'] = model.predict(test_df[feature_columns])

    # Submit predictions only for rows whose content_type_id is 0.
    env.predict(test_df.loc[test_df['content_type_id']==0, ['row_id', 'answered_correctly']])