# Use the `datatable` package for fast CSV loading

In [None]:
# Install datatable 0.11.0 from the wheel file under ../input; both stdout and
# stderr of pip are redirected to /dev/null, so a failed install is silent here
# and would only surface at the later `import datatable`.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

# Necessary packages

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
import torch

# Silence only NumPy's divide-by-zero and invalid-operation (e.g. 0/0) warnings;
# other floating-point error categories keep their current settings.
# The inference loop divides running sums by running counts that are 0 for
# unseen users/questions, so those divisions yield NaN/inf without warning.
np.seterr(divide = 'ignore', invalid = 'ignore')

# Preprocessing

* Data config

In [None]:
# Name of the label column
target = 'answered_correctly'

# Column name -> dtype. The keys also select which columns are read from
# train.csv; the values are applied with DataFrame.astype after cleaning.
data_types_dict = dict(
    user_id = 'int32',
    content_id = 'int16',
    answered_correctly = 'int8',
    prior_question_elapsed_time = 'float32',
    prior_question_had_explanation = 'bool',
)

* Import data

In [None]:
# Read only the columns declared in data_types_dict with datatable, then
# convert the result to a pandas DataFrame.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns = set(data_types_dict),
).to_pandas()

* Summary of the training dataset

In [None]:
# Print a short summary of the raw training frame: column names, shape and
# missing-value counts.
separator = '*' * 50

print('Training dataset detailed information')
print(separator)
print('Columns:', train_df.columns)
print(separator)
print('Shape:', train_df.shape)
print(separator)
# Show the count per column, as the label promises. The previous version
# summed the per-column counts into a single grand total, which hid which
# columns actually contain missing values.
print('NA values in each column:')
print(train_df.isna().sum())
print(separator)

In [None]:
# Exclude lectures: keep only rows whose target is not -1, and renumber the index
train_df = train_df[train_df[target] != -1].reset_index(drop = True)
# Fill NaN values in the 'prior_question_had_explanation' column.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace = True) on the column selection: that chained in-place
# form operates on a temporary object and does not update train_df when pandas
# Copy-on-Write is active, which would leave the NaNs in place.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
# Apply the compact dtypes declared in data_types_dict
train_df = train_df.astype(data_types_dict)

* Construct new features

In [None]:
# Previous answer of the same user (NaN on each user's first row)
train_df['lag'] = train_df.groupby('user_id')[target].shift()

# Per user, running total of previous correct answers ('cumsum') and the
# number of rows seen before the current one ('cumcount')
lag_by_user = train_df.groupby('user_id')['lag']
cum = pd.DataFrame({
    'cumsum': lag_by_user.cumsum(),
    'cumcount': lag_by_user.cumcount(),
})

# User correctness so far: cumulative correct answers / cumulative answers,
# built only from rows that precede the current one
train_df['user_correctness'] = cum['cumsum'].div(cum['cumcount'])

# The helper column is no longer needed
del train_df['lag']

In [None]:
# Sum and count of the target, grouped first by user (overall user
# correctness) and then by question (overall question difficulty).
# Both tables have the columns 'sum' and 'count'.
user_agg, content_agg = (
    train_df.groupby(group_key)[target].agg(['sum', 'count'])
    for group_key in ('user_id', 'content_id')
)

In [None]:
# Keep only the last 24 rows of every user (users with fewer rows keep all of
# them) and renumber the index from 0.
train_df = (
    train_df
    .groupby('user_id')
    .tail(24)
    .reset_index(drop = True)
)

In [None]:
# Display the training frame as it stands after keeping the last 24 rows per user
train_df

* Merge in the question metadata

In [None]:
# Load two columns of questions.csv by position (0 and 3); the dtype mapping
# below names them 'question_id' and 'part'.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    dtype = {'question_id': 'int16', 'part': 'int8'},
    usecols = [0, 3],
)

# Left join keeps every training row; the join key from the right-hand side
# duplicates 'content_id', so it is removed afterwards.
train_df = train_df.merge(
    questions_df,
    how = 'left',
    left_on = 'content_id',
    right_on = 'question_id',
)
del train_df['question_id']

In [None]:
# Share of correct answers per question, taken from the full-history aggregates
content_accuracy = content_agg['sum'].div(content_agg['count'])

# Number of recorded answers for each row's question
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
# Replace the raw question id by that question's share of correct answers
# (must come after content_count, which still needs the raw id)
train_df['content_id'] = train_df['content_id'].map(content_accuracy)

# Extract the validation set

In [None]:
# Hold out the last 6 rows of every user for validation. For a user with the
# full 24 rows this is 6/24 = 25%; users with 6 rows or fewer land entirely in
# the validation set.
valid_df = train_df.groupby('user_id').tail(6)
# Everything that was not held out stays in the training set
train_df = train_df.drop(index = valid_df.index)

# Training

* Construct data

In [None]:
# Model input columns, in the order they are handed to CatBoost
features = [
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'part',
    'content_count',
]

# Keyword arguments for CatBoostClassifier. Training runs on the GPU when
# torch reports a CUDA device and falls back to the CPU otherwise.
params = dict(
    loss_function = 'Logloss',
    eval_metric = 'AUC',
    task_type = 'CPU' if not torch.cuda.is_available() else 'GPU',
    grow_policy = 'Lossguide',
    iterations = 2500,
    learning_rate = 4e-2,
    random_seed = 0,
    l2_leaf_reg = 1e-1,
    depth = 15,
    max_leaves = 10,
    border_count = 128,
    verbose = 50,
)

In [None]:
from catboost import CatBoostClassifier, Pool

# Wrap the training and validation frames in CatBoost Pools, using the same
# feature columns and label column for both.
train_set, val_set = (
    Pool(frame[features], label = frame[target])
    for frame in (train_df, valid_df)
)

In [None]:
# Build the classifier from the parameter dictionary defined above
model = CatBoostClassifier(**params)

# Train on the training pool and evaluate on the validation pool;
# use_best_model = True is passed so the validation metric decides which
# iteration the final model keeps.
model.fit(
    X = train_set,
    use_best_model = True,
    eval_set = val_set,
)

# Inference

In [None]:
# Running counters used at inference time, seeded from the full training
# history. Each is a defaultdict(int), so an unseen user or question starts at 0.
# All four are cast to int32: the previous int16 cast for the per-user sum and
# count wraps around silently for any user with more than 32767 answers.
user_sum_dict = user_agg['sum'].astype('int32').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int32').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

In [None]:
# Create the competition environment and its test iterator.
try:
    env = riiideducation.make_env()
except Exception:
    # NOTE(review): presumably make_env() raises when it is called a second
    # time in the same kernel, and the original bare `except: pass` was meant
    # to reuse the `env` from the earlier run -- confirm.
    # Keep that best-effort behaviour, but only when an earlier `env` really
    # exists; otherwise surface the real error instead of a later NameError.
    # Catching Exception (not a bare except) lets KeyboardInterrupt through.
    if 'env' not in globals():
        raise
iter_test = env.iter_test()
# Holds the previous test batch so its answers can be folded into the running
# counters once they are revealed with the next batch.
prior_test_df = None

In [None]:
%%time

for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    
    user_sum = np.zeros(len(test_df), dtype = np.int16)
    user_count = np.zeros(len(test_df), dtype = np.int16)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
       
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df[['row_id', target]])