In [None]:
# Install datatable 0.11.0 from a local wheel file under ../input.
# stdout and stderr are both redirected to /dev/null, so an install failure is silent
# and would only surface later at `import datatable`.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation

# Silence NumPy floating-point warnings for divide-by-zero and invalid operations.
# The previous settings returned by seterr are bound to `_` so the cell displays nothing.
_ = np.seterr(divide='ignore', invalid='ignore')

# Preprocess

In [None]:
# Training columns to load, paired with the compact dtype each one is cast to.
data_types_dict = dict([
    ('user_id', 'int32'),
    ('content_id', 'int16'),
    ('task_container_id', 'int16'),
    ('answered_correctly', 'int8'),
    ('prior_question_elapsed_time', 'float32'),
    ('prior_question_had_explanation', 'bool'),
])

# Name of the label column.
target = 'answered_correctly'

In [None]:
# Load the example test CSV into a DataFrame.
# NOTE(review): `example_test` is not referenced by any other cell visible here — confirm it is still needed.
example_test = pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')

In [None]:
# Read only the columns named in data_types_dict with datatable, then convert to pandas.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict.keys())
).to_pandas()

# Drop rows whose target is -1 so that only rows with a real label remain.
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Fill missing explanation flags by assigning the result back to the column.
# A chained `train_df[col].fillna(..., inplace=True)` only works while the column
# selection aliases the parent frame; under pandas Copy-on-Write it updates a
# temporary and leaves the NaNs in place. The prediction loop uses the same
# assignment form.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False)
)

# Downcast every column to its compact dtype (must run after the fill: the flag
# column is cast to bool here).
train_df = train_df.astype(data_types_dict)

In [None]:
# Preview the first rows of the filtered, dtype-cast training frame.
train_df.head()

### target

In [None]:
# Per-user sum / count / std of the target, indexed by user_id.
# feature_engineering divides `sum` by `count` to get `user_target_mean`; `std` is not
# used by any cell visible here.
agg_target_by_user = train_df.groupby('user_id')[target].agg(['sum', 'count', "std"])

In [None]:
# Per-content sum / count / std of the target, indexed by content_id.
# feature_engineering uses `count` for `content_target_count` and `sum / count` for
# `content_target_mean`; `std` is not used by any cell visible here.
agg_target_by_content = train_df.groupby('content_id')[target].agg(['sum', 'count', "std"])

In [None]:
# Sum / count / std of the target for every (user_id, content_id) pair.
# NOTE(review): this table is not referenced by any other cell visible here, and a
# two-key groupby over the full training frame is likely costly — confirm it is needed.
agg_target_by_user_content = train_df.groupby(["user_id",'content_id'])[target].agg(['sum', 'count', "std"])

### prior_question_elapsed_time

In [None]:
# Per-user min / max / mean / std of prior_question_elapsed_time, indexed by user_id.
# NOTE(review): this table is not referenced by any other cell visible here — confirm it is needed.
agg_prior_question_elapsed_time_by_user = train_df.groupby('user_id')["prior_question_elapsed_time"].agg(['min', "max","mean", "std"])

In [None]:
# Keep only the last 24 rows of each user for model fitting. The aggregate tables
# were built before this line, so they still reflect every loaded row.
# Re-running this cell is harmless, but the full frame cannot be recovered afterwards
# without reloading it.
train_df = train_df.groupby('user_id').tail(24).reset_index(drop=True)

## questions_df

In [None]:
# Load question metadata, keeping only the columns at positions 0 and 3 of questions.csv.
# presumably these are `question_id` and `part` (the dtype mapping below names them) —
# verify against the file header.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'}
)

In [None]:
def feature_engineering(_df):
    """Attach question metadata and target-encoding features to an interaction frame.

    Relies on the module-level tables ``questions_df``, ``agg_target_by_user`` and
    ``agg_target_by_content``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain ``user_id`` and ``content_id`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus, in this order:

        * the non-key columns of ``questions_df`` (left join on ``content_id``);
        * ``user_target_mean`` - the user's sum / count of the target, NaN for a
          user missing from ``agg_target_by_user``;
        * ``content_target_count`` - int32 number of aggregated rows for the
          content, 0 for a content missing from ``agg_target_by_content``;
        * ``content_target_mean`` - the content's sum / count of the target, NaN
          for a content missing from ``agg_target_by_content``.
    """
    # Compute each lookup table once per call.
    user_mean = agg_target_by_user['sum'] / agg_target_by_user['count']
    content_mean = agg_target_by_content['sum'] / agg_target_by_content['count']

    # Left join keeps every input row; the join key coming from questions_df is
    # redundant with content_id and is dropped again.
    enriched = pd.merge(
        _df, questions_df, left_on='content_id', right_on='question_id', how='left'
    )
    enriched = enriched.drop(columns=['question_id'])

    enriched['user_target_mean'] = enriched['user_id'].map(user_mean)

    # map() yields NaN for a content_id absent from the aggregate table, and NaN
    # cannot be cast to an integer dtype. Such contents contributed zero rows,
    # so 0 is the exact count for them.
    content_count = enriched['content_id'].map(agg_target_by_content['count'])
    enriched['content_target_count'] = content_count.fillna(0).astype('int32')

    enriched['content_target_mean'] = enriched['content_id'].map(content_mean)

    return enriched

In [None]:
# Add the question metadata and target-encoding features to the training frame.
# Not idempotent: running this cell a second time merges questions_df again, so the
# duplicated columns get pandas' _x / _y suffixes.
train_df = feature_engineering(train_df)

In [None]:
# Hold out each user's last 6 rows for validation and remove them from the training frame.
# NOTE(review): the per-user / per-content target aggregates were computed before this
# split, so the features of the hold-out rows already include their own labels — the
# validation score is likely optimistic.
valid_df = train_df.groupby('user_id').tail(6)
train_df.drop(valid_df.index, inplace=True)

# Train

In [None]:
# LightGBM training parameters: binary objective evaluated with AUC, fixed seed.
params = dict([
    ('objective', 'binary'),
    ('seed', 42),
    ('metric', 'auc'),
    ('learning_rate', 0.05),
    ('max_bin', 800),
    ('num_leaves', 80),
])

In [None]:
# Model inputs: every column of train_df except the label and the raw identifier columns.
# The list is built by walking train_df.columns so the feature order is identical on
# every kernel run. Going through a set of strings gives an order that depends on
# Python's per-process hash randomization, which can change the column order fed to
# the model from one run to the next.
features = [
    column
    for column in train_df.columns
    if column not in {"answered_correctly", "user_id", "content_id", "task_container_id"}
]

In [None]:
# Wrap the training and hold-out frames as LightGBM datasets over the same feature columns.
tr_data = lgb.Dataset(train_df[features], label=train_df[target])
va_data = lgb.Dataset(valid_df[features], label=valid_df[target])

# Train for at most 10000 boosting rounds, evaluating on both datasets, logging every
# 50 rounds and using early stopping with a patience of 50 rounds.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM releases; newer releases expect callbacks instead — confirm against the
# installed version before upgrading.
model = lgb.train(
    params, 
    tr_data, 
    num_boost_round=10000,
    valid_sets=[tr_data, va_data], 
    early_stopping_rounds=50,
    verbose_eval=50
)

# model.save_model(f'model.txt')
# Plot feature importance measured by total gain.
lgb.plot_importance(model, importance_type='gain')
plt.show()

## predict

In [None]:
# Create the competition environment and the iterator that yields the test batches.
# NOTE(review): presumably make_env() can only be called once per kernel session —
# confirm before re-running this cell.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Score every test batch and hand the predictions back to the environment.
# `sample_prediction_df` is yielded by the iterator but not used here.
for (test_df, sample_prediction_df) in iter_test:
    # Same handling as for the training frame: missing explanation flags become False.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    # Keep only rows with content_type_id == 0.
    # presumably these are the question rows that need a prediction — verify against the data description.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    # Adds the same feature columns that the model was trained on.
    test_df = feature_engineering(test_df)
    # Store the model's predictions in the target column and submit them keyed by row_id.
    test_df[target] = model.predict(test_df[features])
    env.predict(test_df[['row_id', target]])
