### Version history
- V10: data sorted by timestamp, accuracy metric, StratifiedKFold (score: 0.744)
- V11: tried TimeSeriesSplit with 500 iterations and predict_proba (score: error)
- V12: removed sort by timestamp, reducing iterations to 350, keeping TimeSeriesSplit (running for more than 5 hours)
- V13: increased data batch size, trying BlockingTimeSeriesSplit and AUC metrics (score: 0.672)
- V14: reverting back to timestamp and Accuracy metrics, BlockingTimeSeriesSplit retained (score: 0.742)
- V15: adding features from the questions table. Forgot to update the prediction part. D***!!!
- V16: trying [this](https://www.kaggle.com/shoheiazuma/riiid-lgbm-starter) out.
- V17: persisting with [this](https://www.kaggle.com/shoheiazuma/riiid-lgbm-starter) and TimeSeriesSplit (score: 0.747)
- V18: removed batch processing, turned on GPU (error in predict part)
- V19: corrected key error in predict cycle (score: 0.717)
- V20: trying LGB on the same data to compare results (no submission file)
- V21: LGB retry (score: 0.729)
- V22: retrying with catboost (score: 0.758)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Silence every Python warning for the rest of the notebook. The original
# motivation was skopt's verbosity; note that skopt is not imported in this cell.
import sys, warnings
warnings.filterwarnings("ignore")
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score,  precision_score, recall_score,f1_score
from sklearn.preprocessing import LabelEncoder

# Also silence NumPy's floating-point warnings for division by zero and invalid
# operations (e.g. 0 / 0 when a count is still zero). seterr returns the
# previous settings, which are discarded here.
_ = np.seterr(divide='ignore', invalid='ignore')

In [None]:
# Relative paths to the competition's CSV inputs.
# NOTE(review): in the code visible here, data_path is only referenced by
# commented-out cells and questions_path is not referenced at all (questions.csv
# is read through a hard-coded absolute path) — confirm before removing either.
data_path = "../input/riiid-test-answer-prediction/train.csv"
questions_path = "../input/riiid-test-answer-prediction/questions.csv"

In [None]:
%%time

# Column -> dtype mapping for the training data. It also defines which columns
# are read from the parquet file.
dtype = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean",
}

# Read only the columns that are needed, then apply the compact dtypes.
train = pd.read_parquet("../input/riiid-parquet-files/train.parquet",
                        columns=list(dtype))
train = train.astype(dtype)

# Fill missing flags and convert to a plain bool column. The result is assigned
# back rather than using `fillna(..., inplace=True)` on a column selection: that
# chained in-place form is not guaranteed to modify `train` (it is a no-op under
# pandas copy-on-write), and the following astype(bool) would then fail on NA.
train['prior_question_had_explanation'] = (
    train['prior_question_had_explanation'].fillna(False).astype(bool)
)

# Keep only rows with a real target (-1 marks rows without an answer) and only
# the columns used for feature engineering. The index is intentionally not reset.
train = train.loc[train['answered_correctly'] != -1,
                  ['user_id', 'content_id', 'answered_correctly',
                   'prior_question_elapsed_time', 'prior_question_had_explanation']]
train.info()

In [None]:
# with open(data_path) as f:
#     first_line = f.readline()
# first_line

In [None]:
# cols = len(first_line.split(','))
# cols

In [None]:
# %%time
# with open(data_path) as fp:
#     for (rows, _) in enumerate(fp, 1):
#        pass
# rows

In [None]:
# OTHER CONSTANTS
TARGET = "answered_correctly"
TIME_MEAN = 21000.0
TIME_MIN = 0.0
TIME_MAX = 300000.0
#map_prior = {True:1, False:0}

In [None]:
# Column -> dtype mapping for questions.csv (also the list of columns to read).
# NOTE(review): bundle_id is read as int8 — confirm its values fit that range.
# The column is dropped below, so it does not affect the features either way.
dtype = {'question_id': 'int16', 'part': 'int8', 'bundle_id': 'int8', 'tags': 'str'}
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv',
                        usecols=list(dtype),
                        dtype=dtype)

# Count tags BEFORE filling missing values, so a question without tags gets 0
# instead of 1 (the length of the placeholder string 'None' once split).
questions['num_tags'] = questions['tags'].apply(
    lambda x: len(x.split()) if pd.notna(x) else 0)
#questions = questions.rename(columns={'part':'qpart'})

# Replace missing tag strings with a placeholder so they can be label-encoded.
# Assigned back (once) instead of a chained `fillna(..., inplace=True)`, which
# is not guaranteed to modify the frame.
questions['tags'] = questions['tags'].fillna('None')

# Encode each distinct tag string (the whole combination) as one integer label.
le = LabelEncoder()
questions['tags_label'] = le.fit_transform(questions['tags'].values)

# Keep only the join key and the two question-level features used by the model.
questions = questions[['question_id', 'part', 'tags_label']]
questions.isnull().sum()

In [None]:

def preprocess(df):
    """Build the model features for a frame of labelled interactions.

    Relies on the module-level ``questions`` table and the ``TARGET`` and
    ``TIME_MEAN`` constants.

    Args:
        df: DataFrame containing at least ``TARGET``, ``timestamp``,
            ``user_id``, ``content_id``, ``prior_question_had_explanation``
            and ``prior_question_elapsed_time``. It is not modified.

    Returns:
        A new DataFrame, sorted by ``timestamp``, without the rows whose
        target is -1, with the ``questions`` columns joined in and with
        ``user_correctness``, ``content_count`` and ``content_mean`` added.

    Notes:
        * ``content_count`` / ``content_mean`` are computed over all rows of
          ``df``, i.e. each row's own target contributes to its content stats.
        * NOTE(review): the per-user feature is named ``user_correctness``
          here, whereas the ``FE`` feature list expects ``user_mean``.
    """
    df = df[df[TARGET] != -1]
    # Stable sort: rows sharing a timestamp keep their incoming order, so the
    # order-dependent shift() below is deterministic.
    df = df.sort_values('timestamp', ascending=True, kind='stable').reset_index(drop=True)
    df = df.merge(questions, left_on='content_id', right_on='question_id', how='left')
    df = df.drop(columns=['question_id'])

    # Assign back instead of a chained `fillna(..., inplace=True)`, which is
    # not guaranteed to modify the frame; cast to bool like the other pipelines.
    df['prior_question_had_explanation'] = (
        df['prior_question_had_explanation'].fillna(False).astype(bool)
    )
    df['prior_question_elapsed_time'] = df['prior_question_elapsed_time'].fillna(TIME_MEAN)
    #df["duration"] = (df["prior_question_elapsed_time"] - TIME_MIN) / (TIME_MAX - TIME_MIN)

    # Running accuracy of each user over their PREVIOUS answers only: shift()
    # hides the current row's target; a user's first row is NaN (0 / 0).
    previous_answer = df.groupby('user_id')[TARGET].shift()
    by_user = previous_answer.groupby(df['user_id'])
    df['user_correctness'] = by_user.cumsum() / by_user.cumcount()

    # Per-content answer count and mean correctness over the whole frame.
    content_agg = df.groupby('content_id')[TARGET].agg(['sum', 'count'])
    df['content_count'] = df['content_id'].map(content_agg['count']).astype('int32')
    df['content_mean'] = df['content_id'].map(content_agg['sum'] / content_agg['count'])

    return df

In [None]:
%%time

# Running per-user accuracy over PREVIOUS answers only: the target is shifted
# by one row within each user, then accumulated. A user's first row is NaN.
previous_answer = train.groupby('user_id')[TARGET].shift()
by_user = previous_answer.groupby(train['user_id'])
train['user_mean'] = by_user.cumsum() / by_user.cumcount()
del previous_answer, by_user

# Totals over the full history; reused later to seed the inference-time state.
user_agg = train.groupby('user_id')[TARGET].agg(['sum', 'count'])
content_agg = train.groupby('content_id')[TARGET].agg(['sum', 'count'])

# Keep the last 24 rows of every user and attach the question-level features.
train_df = (
    train.groupby('user_id').tail(24).reset_index(drop=True)
         .merge(questions, left_on='content_id', right_on='question_id', how='left')
)
train_df = train_df.drop(columns=['question_id'])

# Per-content popularity and mean correctness, looked up from the full-history totals.
content_mean = content_agg['sum'] / content_agg['count']
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_mean'] = train_df['content_id'].map(content_mean)
del content_mean

# Hold out each user's last 6 kept rows for validation; the rest is for training.
valid_df = train_df.groupby('user_id').tail(6)
train_df = train_df.drop(index=valid_df.index)
del train

In [None]:
# Feature columns fed to the model, in the order used to build the Pools.
FE = [
    'content_mean',
    'content_count',
    'user_mean',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'part',
    'tags_label',
]
# Candidate categorical features ('bundle_id' and 'num_tags' are disabled).
CF = [
    'prior_question_had_explanation',
    'part',
]

In [None]:
# Feature matrix / label vector for the training rows ...
X_train = train_df[FE]
y_train = train_df[TARGET]
# ... and for the held-out validation rows.
X_test = valid_df[FE]
y_test = valid_df[TARGET]

In [None]:
# Initializing a CatBoostClassifier with best parameters
# Hyper-parameters forwarded to CatBoostClassifier. Only the iteration cap is
# active. Values from earlier tuning, currently disabled and kept for reference:
#   bagging_temperature=0.6, border_count=128, depth=8, l2_leaf_reg=30,
#   learning_rate=0.5, random_strength=0.01, scale_pos_weight=0.48
best_params = {'iterations': 20000}

# Alternatives tried and disabled: loss_function='Logloss', nan_mode='Min'.
catb = CatBoostClassifier(
    **best_params,
    loss_function='CrossEntropy',
    eval_metric='AUC',
    thread_count=2,
    use_best_model=True,
    task_type="GPU",
    verbose=False,
)

# Metric accumulators (a single train/validation split is evaluated below).
roc_auc = []
average_precision = []
best_iteration = []

X_train = train_df[FE]
y_train = train_df[TARGET]
X_test = valid_df[FE]
y_test = valid_df[TARGET]

# No feature is declared categorical: cat_features is an empty list.
train = Pool(data=X_train, label=y_train, feature_names=FE, cat_features=[])
test = Pool(data=X_test, label=y_test, feature_names=FE, cat_features=[])

# Fit with the validation pool as eval set, logging every 50 iterations and
# using early_stopping_rounds=100; the best iteration on the eval set is kept.
catb.fit(
    train,
    eval_set=test,
    use_best_model=True,
    early_stopping_rounds=100,
    verbose_eval=50,
    plot=False,
)

best_iteration.append(catb.best_iteration_)
preds = catb.predict_proba(X_test)
positive_proba = preds[:, 1]
roc_auc.append(roc_auc_score(y_true=y_test, y_score=positive_proba))
average_precision.append(average_precision_score(y_true=y_test, y_score=positive_proba))
# With a single entry per list, the reported standard deviation is 0.
print("Average cv roc auc score %0.3f ± %0.3f" % (np.mean(roc_auc), np.std(roc_auc)))
print("Average cv roc average precision %0.3f ± %0.3f" % (np.mean(average_precision), np.std(average_precision)))

catb.save_model('catb_model.cbm')

In [None]:
#catb = catb.load_model('../input/riiid-catboost-attempt/catb_model.cbm')

In [None]:
from collections import defaultdict

def _as_default_dict(series, dtype):
    """Cast `series` to `dtype` and return it as a fresh defaultdict(int) keyed by its index."""
    return series.astype(dtype).to_dict(defaultdict(int))


# Running totals used at inference time; unseen keys default to 0.
user_sum_dict = _as_default_dict(user_agg['sum'], 'int16')
user_count_dict = _as_default_dict(user_agg['count'], 'int16')
content_sum_dict = _as_default_dict(content_agg['sum'], 'int32')
content_count_dict = _as_default_dict(content_agg['count'], 'int32')

In [None]:
import riiideducation

# Create the competition environment and the iterator the prediction loop
# consumes (each item is unpacked there as a test frame plus a sample
# prediction frame).
env = riiideducation.make_env()
iter_test = env.iter_test()
# Copy of the previous batch, kept so its answers can be attached once the next
# batch arrives; stays None until the first batch has been processed.
prior_test_df = None

In [None]:
%%time

import ast  # stdlib; parses the answers string below without resorting to eval()

for test_df, sample_prediction_df in iter_test:
    # ---- 1. Fold the previous batch's revealed answers into the running stats ----
    if prior_test_df is not None:
        # The first row of the current batch carries the previous batch's
        # answers as the string form of a list. ast.literal_eval parses that
        # literal exactly like eval() did, but cannot execute arbitrary code.
        prior_test_df[TARGET] = ast.literal_eval(
            test_df['prior_group_answers_correct'].iloc[0])
        # Rows answered with -1 carry no label (presumably lectures) and are skipped.
        prior_test_df = prior_test_df[prior_test_df[TARGET] != -1].reset_index(drop=True)

        for user_id, content_id, target in zip(prior_test_df['user_id'].values,
                                               prior_test_df['content_id'].values,
                                               prior_test_df[TARGET].values):
            user_sum_dict[user_id] += target
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += target
            content_count_dict[content_id] += 1
    # Keep the unfiltered batch: the next iteration's answer list is attached to it.
    prior_test_df = test_df.copy()

    # ---- 2. Build the features for the current batch ----
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = test_df.merge(questions, left_on='content_id', right_on='question_id', how='left')
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(False).astype(bool)
    )

    user_ids = test_df['user_id'].values
    content_ids = test_df['content_id'].values
    # int32 buffers throughout: the per-user totals keep growing during
    # inference, so an int16 buffer could overflow for very active users.
    # Looking up an unseen key in these defaultdicts yields 0.
    user_sum = np.array([user_sum_dict[u] for u in user_ids], dtype=np.int32)
    user_count = np.array([user_count_dict[u] for u in user_ids], dtype=np.int32)
    content_sum = np.array([content_sum_dict[c] for c in content_ids], dtype=np.int32)
    content_count = np.array([content_count_dict[c] for c in content_ids], dtype=np.int32)

    # 0 / 0 for unseen users or contents yields NaN, which matches the NaN
    # that a user's first row gets for user_mean in training.
    test_df['user_mean'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_mean'] = content_sum / content_count

    # ---- 3. Predict and submit ----
    if len(test_df):
        batch_pool = Pool(data=test_df[FE].values,
                          feature_names=FE,
                          cat_features=[])
        test_df[TARGET] = catb.predict_proba(batch_pool)[:, 1]
    else:
        # A batch without question rows: nothing to score, but predict() is
        # still called below, with an empty frame.
        test_df[TARGET] = np.empty(0, dtype=np.float64)
    # test_df already holds only content_type_id == 0 rows, so no second filter is needed.
    env.predict(test_df[['row_id', TARGET]])