In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [2]:
train = pd.read_csv('../input/riiid-test-answer-prediction/train.csv',
                    usecols=[1, 2, 3, 4, 7, 8, 9], dtype={'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16', 'content_type_id': 'int8', 'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32', 'prior_question_had_explanation': 'boolean'}
                    )

train = train[train.content_type_id == False]
train = train.sort_values(['timestamp'], ascending=True)
train.drop(['timestamp', 'content_type_id'], axis=1,   inplace=True)

In [3]:
results_c = train[['content_id', 'answered_correctly']
                  ].groupby(['content_id']).agg(['mean'])
results_c.columns = ["answered_correctly_content"]

results_u = train[['user_id', 'answered_correctly']].groupby(['user_id']).agg([
    'mean', 'sum'])
results_u.columns = ["answered_correctly_user", 'sum']

In [4]:
X = train.iloc[90000000:, :]
X = X[X.answered_correctly != -1]
Y = X[["answered_correctly"]]
X = X.drop(["answered_correctly"], axis=1)

In [5]:
lb_make = LabelEncoder()
X["prior_question_had_explanation_enc"] = lb_make.fit_transform(
    X["prior_question_had_explanation"])

X = pd.merge(X, results_u, on=['user_id'], how="left")
X = pd.merge(X, results_c, on=['content_id'], how="left")
X = X.sort_values(['user_id'])

X = X[['answered_correctly_user', 'answered_correctly_content', 'sum',
       'prior_question_elapsed_time', 'prior_question_had_explanation_enc']]
X.fillna(0.5,  inplace=True)

X.head()

Unnamed: 0,answered_correctly_user,answered_correctly_content,sum,prior_question_elapsed_time,prior_question_had_explanation_enc
1119023,0.713628,0.42679,775.0,16000.0,1
1119056,0.713628,0.430757,775.0,22000.0,1
1119070,0.713628,0.421442,775.0,27000.0,1
1119081,0.713628,0.469838,775.0,14000.0,1
1119089,0.713628,0.752486,775.0,8000.0,1


In [6]:
Xt, Xv, Yt, Yv = train_test_split(X, Y, test_size=0.01, shuffle=False)
lgb_train = lgb.Dataset(Xt, Yt)
lgb_eval = lgb.Dataset(Xv, Yv, reference=lgb_train)

In [7]:
params = {
    'objective': 'binary',
    'max_bin': 600,
    'learning_rate': 0.01,
    'num_leaves': 80
}

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=10000,
    early_stopping_rounds=10
)

[LightGBM] [Info] Number of positive: 6232547, number of negative: 2946040
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 9178587, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.679031 -> initscore=0.749323
[LightGBM] [Info] Start training from score 0.749323
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.62756	valid_1's binary_logloss: 0.624695
[20]	training's binary_logloss: 0.62753	valid_1's binary_logloss: 0.62469
[30]	training's binary_logloss: 0.627503	valid_1's binary_logloss: 0.624688
[40]	training's binary_logloss: 0.627478	valid_1's binary_logloss: 0.624686
[50]	training's binary_logloss: 0.627458	valid_1's binary_logloss: 0.624684
[60]	training's binary_logloss: 0.627439	valid_1's binary_logloss: 0.624678
[70]	training's binary_loglos

In [8]:
y_pred = model.predict(Xv)
y_true = np.array(Yv)

roc_auc_score(y_true, y_pred)

0.5051506518468748

In [9]:
test = pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')

test["prior_question_had_explanation_enc"] = lb_make.fit_transform(
    test["prior_question_had_explanation"])

test = pd.merge(test, results_u, on=['user_id'],  how="left")
test = pd.merge(test, results_c, on=['content_id'],  how="left")

test = test[['answered_correctly_user', 'answered_correctly_content', 'sum',
             'prior_question_elapsed_time', 'prior_question_had_explanation_enc']]
test.fillna(0.5, inplace=True)

y_pred = model.predict(test)
test['answered_correctly'] = y_pred

In [10]:
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.merge(test_df, results_u, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'],  how="left")
    test_df['answered_correctly_user'].fillna(0.5, inplace=True)
    test_df['answered_correctly_content'].fillna(0.5, inplace=True)
    test_df['sum'].fillna(0, inplace=True)
    test_df['prior_question_had_explanation'].fillna(False, inplace=True)
    test_df["prior_question_had_explanation_enc"] = lb_make.fit_transform(
        test_df["prior_question_had_explanation"])
    test_df['answered_correctly'] = model.predict(
        test_df[['answered_correctly_user', 'answered_correctly_content', 'sum', 'prior_question_elapsed_time', 'prior_question_had_explanation_enc']])
    env.predict(test_df.loc[test_df['content_type_id']
                            == 0, ['row_id', 'answered_correctly']])

ModuleNotFoundError: No module named 'riiideducation.competition'