## Read Data

In [None]:
import logging

# Environment switch: True selects the Kaggle input folder and a hand-built
# console logger, False selects the local ../data folder and basicConfig.
RUN_ON_KAGGLE = True
# Seed handed to the stdlib and NumPy random generators.
RANDOM_SEED = 3

import gc
import os
import pickle
import random
import numpy as np
import pandas as pd
from collections import defaultdict
# Seed both global RNGs (stdlib `random` and NumPy) with the same value so
# random draws are repeatable between runs.
random.seed(RANDOM_SEED)
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Select the raw-data folder and configure the notebook logger ('I') for the
# current environment. Both branches emit INFO-level records with the same
# bold/yellow ANSI prefix.
if RUN_ON_KAGGLE:
    data_folder = "/kaggle/input/riiid-test-answer-prediction"

    formatter = logging.Formatter('\x1b[1m\x1b[33m[%(levelname)s %(asctime)s.%(msecs)03d %(name)s]\x1b[0m: %(message)s', '%Y-%m-%d %H:%M:%S')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(formatter)
    logger = logging.getLogger('I')
    # A handler level alone is not enough: without an explicit logger level
    # the logger inherits the root default (WARNING) and every
    # logger.info(...) call is discarded before it reaches the handler.
    logger.setLevel(logging.INFO)
    # getLogger returns the same object on every call, so re-running this
    # cell would otherwise stack handlers and print each message repeatedly.
    if not logger.handlers:
        logger.addHandler(console)
else:
    data_folder = "../data"

    # basicConfig sets the root level to INFO, which logger 'I' inherits.
    logging.basicConfig(level=logging.INFO,
                    format='\x1b[1m\x1b[33m[%(levelname)s %(asctime)s.%(msecs)03d %(name)s]\x1b[0m: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger('I')

In [None]:
# Folder with the preprocessed artifacts read later in this notebook: the
# pickled train/eval frames, the per-user counters and the offline model.
# NOTE(review): this Kaggle path is used even when RUN_ON_KAGGLE is False —
# confirm whether a local equivalent is needed.
processed_data_dir = "/kaggle/input/riid-acp-data-joined-user-feat"

In [None]:
# Load the pickled training frame from the preprocessed dataset folder
# (presumably already carrying the per-user features, judging by its name).
logger.info("reading training data...")
train = pd.read_pickle(os.path.join(processed_data_dir, "train-with-user-fea.pkl"))

In [None]:
# Load the pickled validation frame from the preprocessed dataset folder
# (presumably already carrying the per-user features, judging by its name).
logger.info("reading validation data...")
valid = pd.read_pickle(os.path.join(processed_data_dir, "eval-with-user-fea.pkl"))

In [None]:
# row_id is not among the model features, so remove it from both splits
# in place.
train.drop(columns="row_id", inplace=True)
valid.drop(columns="row_id", inplace=True)

In [None]:
def pickle_load(file_path):
    """Read one pickled object from *file_path* and return it.

    The file is opened in binary mode and closed before returning.
    Unpickling executes code embedded in the file, so this should only be
    pointed at trusted files.
    """
    with open(file_path, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload

In [None]:
# Per-user running counters restored from the preprocessed dataset. They are
# wrapped in defaultdict(int) so that users missing from the pickles read as 0.
answer_cnt = defaultdict(
    int, pickle_load(os.path.join(processed_data_dir, "answer_cnt.pkl"))
)
correct_sum = defaultdict(
    int, pickle_load(os.path.join(processed_data_dir, "correct_sum.pkl"))
)

In [None]:
# Load the question metadata from the raw data folder, pinning explicit
# narrow integer dtypes for the id/answer/part columns.
logger.info("reading questions...")
questions = pd.read_csv(
    os.path.join(data_folder, "questions.csv"),
    dtype={
        "question_id": "int16",
        "bundle_id": "int32",
        "correct_answer": "int8",
        "part": "int8"
    }
)

In [None]:
logger.info("reading lectures...")
lectures = pd.read_csv(
    os.path.join(data_folder, "lectures.csv"),
    dtype={
        "lecture_id": "int16",
        "tag": "int16",
        "part": "int8"
    }
)

In [None]:
def add_user_feats(
    df,
    user_correct_sum: dict,
    user_answer_cnt: dict,
    update=True
):
    """Append per-user history features to *df*.

    For every row, in order, the counters accumulated so far for that row's
    ``user_id`` are recorded as ``correct_sum`` and ``answer_cnt``; ``acc`` is
    their ratio (NaN when the user has no recorded answers yet).

    Args:
        df: Frame with a ``user_id`` column and, when *update* is true, an
            ``answered_correctly`` column. Any index is accepted.
        user_correct_sum: Mapping user_id -> number of correct answers so far.
            Users missing from the mapping count as 0.
        user_answer_cnt: Mapping user_id -> number of answers so far. Users
            missing from the mapping count as 0.
        update: When true, both mappings are updated in place with each row
            after its features are recorded, so later rows of the same user
            see earlier ones. When false, the mappings are left untouched.

    Returns:
        A new frame: *df* plus the ``correct_sum`` (int32), ``answer_cnt``
        (int32) and ``acc`` (float32) columns, on *df*'s own index.
    """
    n_rows = len(df)
    correct_sum = np.zeros(n_rows, dtype=np.int32)
    answer_cnt = np.zeros(n_rows, dtype=np.int32)

    # Plain Python scalars: iteration is cheaper than iterrows() and counter
    # arithmetic cannot overflow a narrow column dtype such as int8.
    user_ids = df["user_id"].tolist()
    answers = df["answered_correctly"].tolist() if update else None

    # Index the output arrays by position, never by the frame's index labels,
    # so frames with a non-default index are handled correctly.
    for pos, user_id in enumerate(user_ids):
        # .get() works for plain dicts and does not insert keys into a
        # defaultdict on a read-only pass.
        seen_correct = user_correct_sum.get(user_id, 0)
        seen_total = user_answer_cnt.get(user_id, 0)
        correct_sum[pos] = seen_correct
        answer_cnt[pos] = seen_total
        if update:
            user_correct_sum[user_id] = seen_correct + answers[pos]
            user_answer_cnt[user_id] = seen_total + 1

    # 0/0 for first-time users is intentional and yields NaN; only the
    # runtime warning is silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        acc = (correct_sum / answer_cnt).astype(np.float32)

    user_feats = pd.DataFrame(
        {
            "correct_sum": correct_sum,
            "answer_cnt": answer_cnt,
            "acc": acc,
        },
        # Share df's index so the column-wise concat lines rows up one-to-one.
        index=df.index,
    )
    return pd.concat([df, user_feats], axis=1)

In [None]:
def update_user_feats(df, user_correct_sum: dict, user_answer_cnt: dict):
    """Fold the answers in *df* into the per-user counters, in place.

    Only rows with ``content_type_id == 0`` are considered. For each such row
    the user's entry in *user_correct_sum* grows by ``answered_correctly`` and
    the entry in *user_answer_cnt* grows by one. Both mappings must tolerate
    ``+=`` on unseen users (e.g. ``defaultdict(int)``). Returns ``None``.
    """
    question_rows = df.loc[
        df["content_type_id"] == 0, ["user_id", "answered_correctly"]
    ]
    pairs = zip(
        question_rows["user_id"].tolist(),
        question_rows["answered_correctly"].tolist(),
    )
    for user_id, answer_res in pairs:
        user_correct_sum[user_id] += answer_res
        user_answer_cnt[user_id] += 1

In [None]:
def fast_left_join(left, right, on=None, left_on=None, right_on=None):
    """Left-join *right* onto *left* via an index lookup instead of a merge.

    Args:
        left: Frame holding the key column.
        right: Frame whose index holds the keys; its index name must equal
            the right-hand key name. The lookup uses ``reindex``, so the
            index is expected to contain unique labels.
        on: Key name used for both sides unless overridden.
        left_on: Key column in *left*; defaults to *on*.
        right_on: Index name of *right*; defaults to *on*.

    Returns:
        A new frame with a fresh 0..n-1 index: the columns of *left* followed
        by the columns of *right*, one row per row of *left* in the original
        order. Keys absent from *right* produce NaN in its columns.

    Raises:
        AssertionError: If the key column is missing from *left* or the index
            name of *right* does not match.
    """
    # Test against None explicitly so falsy-but-valid labels (e.g. a column
    # named 0) are not replaced by `on`.
    left_on = on if left_on is None else left_on
    right_on = on if right_on is None else right_on
    # Report the key that was actually looked up, not the possibly-None `on`.
    assert left_on in left.columns, f"{left_on} not found in left DF"
    assert right_on == right.index.name, f"{right_on} not equals to right's index_name"
    # Fetch right's rows in left's key order, then glue both sides
    # positionally after discarding their indexes.
    matched = right.reindex(left[left_on].values)
    return pd.concat(
        [left.reset_index(drop=True), matched.reset_index(drop=True)],
        axis=1,
    )

In [None]:
# Per-content feature: the mean of answered_correctly for each content_id
# over the training frame, stored as float32 and indexed by content_id.
logger.info("generating content features...")
content_df = (
    train.groupby("content_id")["answered_correctly"]
    .mean()
    .astype("float32")
    .to_frame("content_acc")
)

In [None]:
# Attach content_acc to every training row by looking content_id up in
# content_df's index. The result has a fresh 0..n-1 index.
logger.info("joining content features to train...")
train = fast_left_join(
    train,
    content_df,
    on="content_id"
)

In [None]:
# Force a garbage-collection pass now that `train` has been rebound to the
# joined frame; the returned object count is discarded.
_ = gc.collect()

In [None]:
# Attach content_acc to the validation rows. The statistics come from the
# training frame only; content ids missing from content_df yield NaN.
logger.info("join content features to validation...")
valid = fast_left_join(
    valid,
    content_df,
    on="content_id"
)

In [None]:
# Mean of the non-missing prior_question_elapsed_time values in the training
# frame; reused as the fill value for train, validation and test batches.
prior_question_elapsed_time_mean = train.prior_question_elapsed_time.dropna().values.mean()

In [None]:
# Replace missing elapsed times in both splits with the training-set mean.
train['prior_question_elapsed_time'] = train.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
valid['prior_question_elapsed_time'] = valid.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)

In [None]:
# Lookup table for the question-level feature: a single "part" column
# indexed by question_id, ready for fast_left_join.
logger.info("generating questions features...")
question_df = questions.set_index("question_id")[["part"]]

In [None]:
# Attach "part" to the validation rows: content_id on the left is matched
# against question_df's question_id index.
logger.info("joining questions features to validation...")
valid = fast_left_join(
    valid,
    question_df,
    left_on = "content_id",
    right_on = "question_id"
)

In [None]:
# Attach "part" to the training rows: content_id on the left is matched
# against question_df's question_id index.
logger.info("joining questions features to train...")
train = fast_left_join(
    train,
    question_df,
    left_on = "content_id",
    right_on = "question_id"
)

In [None]:
# Missing prior_question_had_explanation values are treated as False, then
# the column is stored as int8 in both splits.
logger.info("filling missing values...")
train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

In [None]:
import lightgbm as lgb

In [None]:
# Label column predicted by the model.
TARGET = "answered_correctly"
# Model input columns, in the order they are passed to LightGBM for training
# and prediction.
FEATS = [
    "correct_sum",
    "answer_cnt",
    "acc",
    "part",
    "content_acc",
    "prior_question_elapsed_time",
    "prior_question_had_explanation"
]

In [None]:
# Keep the labels aside, then shrink both frames in place to the FEATS
# columns only (the label column itself is among the dropped ones).
y_tr, y_va = train[TARGET], valid[TARGET]
dro_cols = [col for col in train.columns if col not in FEATS]
train.drop(columns=dro_cols, inplace=True)
valid.drop(columns=dro_cols, inplace=True)
_ = gc.collect()

In [None]:
# Wrap both splits as LightGBM datasets, declaring "part" as categorical;
# the validation set is built with the training set as its reference.
lgb_train = lgb.Dataset(train[FEATS], y_tr, categorical_feature=["part"])
lgb_valid = lgb.Dataset(valid[FEATS], y_va, reference=lgb_train, categorical_feature=["part"])
# The raw training frame and its labels are not used again in the visible
# code, so drop the names and collect. `valid` and `y_va` stay for the AUC.
del train, y_tr
_=gc.collect()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Either load a previously trained booster from the preprocessed dataset or
# train one here, then report its AUC on the validation split.
USE_OFFLINE_MODEL = True
model_path = os.path.join(processed_data_dir, "model_7fea.pkl")

if USE_OFFLINE_MODEL:
    logger.info("loading offline trained model...")
    # NOTE: unpickling runs code stored in the file; only load trusted models.
    model = pickle_load(model_path)
else:
    logger.info("start training model...")
    # The verbose_eval / early_stopping_rounds keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callbacks below are the supported
    # equivalent. log_evaluation was called print_evaluation before 3.3.
    _log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    model = lgb.train(
        {
            "objective": "binary",
            "metric": "auc",
            "boosting_type": "gbdt",
            "learning_rate": 0.3,
            # Reuse the notebook-wide seed instead of repeating the literal.
            "seed": RANDOM_SEED
        },
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            _log_evaluation(period=20),
        ]
    )
auc = roc_auc_score(y_va, model.predict(valid[FEATS]))
logger.info(f"auc on validation: {auc}")

In [None]:
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline
# Plot the feature importances of the model.
lgb.plot_importance(model)

In [None]:
# The LightGBM datasets are not used again in the visible code; drop them
# and collect before starting inference.
del lgb_train, lgb_valid
_ = gc.collect()

In [None]:
# Competition-provided module (not part of this notebook).
import riiideducation
# Create the environment and keep handles to its two entry points:
# iter_test is iterated by the inference loop and yields
# (test_df, sample_pred_df) pairs; set_predict is called once per yielded
# batch with a frame of row_id and predicted answered_correctly.
env = riiideducation.make_env()
iter_test = env.iter_test()
set_predict = env.predict

In [None]:
import time

In [None]:
# Online inference loop. Each batch is featurised with the per-user counters
# known so far and scored. The labels of a batch only arrive with the NEXT
# batch (as a stringified list in its first row), at which point they are
# folded into the counters and stored for the offline AUC check.
import ast  # only needed by this cell, for safe parsing of the label string

pre_test_df = None
infer_start, iter_cnt = time.time(), 0
actuals = []
preds = []

for (test_df, sample_pred_df) in iter_test:
    if pre_test_df is not None:
        # The field is external text: parse it as a literal rather than
        # executing it with eval().
        targets = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
        pre_test_df[TARGET] = targets
        # Predictions are only made for content_type_id == 0 rows, so keep
        # only those labels; otherwise actuals and preds drift out of
        # alignment as soon as a batch contains any other row type.
        is_question = pre_test_df["content_type_id"] == 0
        actuals.extend(pre_test_df.loc[is_question, TARGET].tolist())
        update_user_feats(pre_test_df, correct_sum, answer_cnt)
    # Keep the unfiltered batch so its labels can be attached next iteration.
    pre_test_df = test_df.copy()
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    # update=False: labels are unknown here, counters are only read.
    test_df = add_user_feats(
        test_df, correct_sum, answer_cnt, update=False
    )
    test_df = fast_left_join(
        test_df, content_df, on="content_id"
    )
    test_df = fast_left_join(
        test_df, question_df, left_on="content_id", right_on="question_id"
    )
    # Same missing-value handling as applied to the train/validation frames.
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
    test_df['prior_question_elapsed_time'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    preds_ = model.predict(test_df[FEATS])
    test_df[TARGET] = preds_
    preds.extend(preds_.tolist())
    set_predict(test_df[['row_id', TARGET]])
    iter_cnt += 1

infer_duration = time.time() - infer_start
# Guard the average against an iterator that yielded nothing (e.g. when the
# cell is re-run after the environment has been exhausted).
secs_per_iter = infer_duration / iter_cnt if iter_cnt else 0.0
logger.info(f"total iterations: {iter_cnt}, cost {infer_duration:.4f} secs, {secs_per_iter:.4f} secs/iteration")

In [None]:
# Offline AUC over the batches whose labels were received. Labels for a batch
# arrive with the following batch, so `preds` is longer than `actuals` and is
# truncated to match.
# NOTE(review): this pairing assumes `actuals` and `preds` cover the same rows
# in the same order (preds only covers content_type_id == 0 rows) — verify
# against the loop that fills them.
roc_auc_score(actuals, preds[:len(actuals)])