In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import datatable as dt
import lightgbm as lgb
import gc
import psutil
import os
import sys
import math
import random
import shap
import json
import riiideducation

from collections import defaultdict
from time import time
from datetime import timedelta
from contextlib import contextmanager
from bitarray import bitarray
from typing import List
from pandas import DataFrame, Series

In [None]:

######## Global define ########
# Reproducibility / diagnostics switches.
DEFAULT_SEED = 42
PLOT_SHAP = False

# FE
# df_train['user_id'].nunique() == 393656
MAX_QUESTIONS = 14_000  # 13523 question_id in questions.csv
VAL_SIZE = 2_500_000  # number of rows flagged as validation by the train/val split
USE_DATA_RATIO = 1  # fraction of the (latest) rows kept for training + validation

# Divisor applied to raw 'timestamp' values before the lag features are computed
# (the notebook treats the scaled value as hours).
TS_SCALING = 3_600 * 1_000

# LGBM
LEARNING_RATE = 0.1  # default = 0.1
MAX_BIN = 364  # default 255
NUM_LEAVES = 445  # default = 31
FEATURE_FRACTION = 0.639
BAGGING_FRACTION = 0.842
BAGGING_FREQ = 19

NUM_BOOST_ROUNDS = 10_000
EARLY_STOP_ROUNDS = 20
VERBOSE_EVAL = 50

# Input files.
TRAIN_FILE_PATH = "../input/riiid-test-answer-prediction/train.csv"
QUESTIONS_FILE_PATH = "../input/riiid-test-answer-prediction/questions.csv"
LECTURES_FILE_PATH = "../input/riiid-test-answer-prediction/lectures.csv"

@contextmanager
def trace_mem(title):
    """Report the memory change and wall time of a ``with`` body on stderr.

    The first field of ``psutil``'s ``memory_info()`` for the current process is
    sampled before and after the body. On normal exit one line is printed with
    the current value in GiB, the signed change, the elapsed seconds and
    ``title``. Nothing is printed when the body raises, because the code after
    ``yield`` is not wrapped in ``try``/``finally``.

    Args:
        title: Free-form label appended to the report line.
    """
    bytes_per_gib = 2. ** 30
    started_at = time()
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info()[0] / bytes_per_gib
    yield
    mem_after = process.memory_info()[0] / bytes_per_gib
    change = mem_after - mem_before
    if change >= 0:
        sign = '+'
    else:
        sign = '-'
    magnitude = abs(change)
    print(f"[{mem_after:.1f}GB({sign}{magnitude:.2f}GB): {time() - started_at:.1f}s] {title} ", file=sys.stderr)

def seed_everything(seed=DEFAULT_SEED):
    """Seed Python's ``random`` and NumPy's global RNG and export ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed; defaults to ``DEFAULT_SEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for _seed_fn in (random.seed, np.random.seed):
        _seed_fn(seed)


seed_everything()

# Training features. When the feature set changes, update TRA_FEATURES, cat_features,
# UserFeats, QuesFeats, name_to_extract_fn and get_feat_values together!!!
# Features taken directly from the input row:
_ROW_FEATURES = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
# Features from QuesFeats:
_QUESTION_FEATURES = [
    'bundle_id',
    'part',
    'tags_encoded',
    'tags_first1',
    'tags_last2',
    'content_count',
    'content_correctness',
]
# Features from UserFeats:
_USER_FEATURES = [
    'user_correctness',
    'user_correct_cumsum',
    'residual_user_mean',
    'pq_elapsed_time_user_mean',
    'explanation_user_cumsum',
    'lag_time',
    'lag_time2',
]
# Features from the global dict:
_GLOBAL_DICT_FEATURES = [
    'user_content_attempted',
]
TRA_FEATURES = [*_ROW_FEATURES, *_QUESTION_FEATURES, *_USER_FEATURES, *_GLOBAL_DICT_FEATURES]
cat_features = ['bundle_id', 'part', 'tags_encoded', 'tags_first1', 'tags_last2']


In [None]:

######## Data preparation and FE ########
target_col = 'answered_correctly'
# Columns read from the train file and the dtype each is cast to once NaNs are filled.
data_types_dict = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    # user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='int8',
)

with trace_mem("Loading train file, fillna, astype"):
    # Read only the columns named in data_types_dict and convert the datatable frame to pandas.
    df_train = dt.fread(TRAIN_FILE_PATH, columns=set(data_types_dict.keys())).to_pandas()

    # df_train.isnull().sum().sum() = 2744044 == 2351538 + 392506
    # df_train['prior_question_elapsed_time'].isnull().sum() = 2351538
    # df_train['prior_question_had_explanation'].isnull().sum() = 392506
    # Assign the filled columns back rather than calling fillna(inplace=True) on
    # df_train[col]: that chained in-place call mutates a temporary and leaves df_train
    # untouched when pandas Copy-on-Write is active, which would trip the assert below.
    df_train['prior_question_had_explanation'] = df_train['prior_question_had_explanation'].fillna(False)
    mean_prior_question_elapsed_time = df_train['prior_question_elapsed_time'].mean()
    df_train['prior_question_elapsed_time'] = \
        df_train['prior_question_elapsed_time'].fillna(mean_prior_question_elapsed_time)
    assert (0 == df_train.isnull().sum().sum())

    # Cast every column to its compact dtype now that no NaN is left.
    df_train = df_train.astype(data_types_dict)
    del data_types_dict
    gc.collect()

with trace_mem("df_train drop lecture rows"):
    # Keep only rows whose content_type_id is 0 and renumber the index.
    _keep_rows = df_train['content_type_id'].eq(0)
    df_train = df_train.loc[_keep_rows].reset_index(drop=True)
    del _keep_rows

with trace_mem("df_train drop useless columns"):
    # content_type_id is constant after the filter above.
    df_train = df_train.drop(columns=['content_type_id'])

with trace_mem("Generate feature 'lag_time2'"):
    # 'lag_time2' = scaled timestamp of the first row of the current (user, task_container)
    # group minus the scaled timestamp of the first row of the same user's group two
    # positions earlier (0 when there is no such group).
    # Also produced here for the per-user feature store:
    #   user_max_timestamp_2_agg - per-user max of the 1-shifted group timestamps
    #   user_last_lagtime_2_agg  - per-user last 'lag_time2' value
    df_train['user_id_copy'] = df_train['user_id']
    # Select the GroupBy columns with a list. Indexing a GroupBy with a bare tuple of
    # column names (gb['a', 'b']) was deprecated in pandas 1.0 and removed in 2.0.
    timestamp_groupby_user_task = df_train[['user_id', 'task_container_id', 'row_id', 'user_id_copy', 'timestamp']]. \
        groupby(['user_id', 'task_container_id'], sort=False)[['row_id', 'user_id_copy', 'timestamp']]
    # One row per (user, task_container): the values of the group's first row.
    ts_user_task_agg = timestamp_groupby_user_task.first()
    del timestamp_groupby_user_task
    gc.collect()
    # Index both frames by row_id so the per-group values can be written back by label below.
    ts_user_task_agg.set_index('row_id', inplace=True)
    df_train.set_index('row_id', inplace=True)

    # convert timestamp to hours(int64 to float)
    ts_user_task_agg['timestamp'] = ts_user_task_agg['timestamp']/TS_SCALING

    # ts_groupby captures the scaled timestamps; it is reused for both shifts below.
    ts_groupby = ts_user_task_agg.groupby('user_id_copy')['timestamp']
    # Assign the filled result back instead of a chained fillna(inplace=True), which is
    # a silent no-op under pandas Copy-on-Write.
    ts_user_task_agg['timestamp_1'] = ts_groupby.shift(1).fillna(0).astype('float32')
    user_max_timestamp_2_agg = ts_user_task_agg.groupby(['user_id_copy'])['timestamp_1'].max()
    ts_user_task_agg.drop(columns=['timestamp_1'], inplace=True)

    # Timestamp of the group two positions earlier for the same user (0 when absent).
    ts_user_task_agg['timestamp'] = ts_groupby.shift(2).fillna(0)
    df_train.drop(columns=['user_id_copy'], inplace=True)
    del ts_groupby
    gc.collect()

    # Start from a float column: float values are written into it on the next line, and
    # writing floats into an integer column relies on a deprecated implicit upcast.
    df_train['lag_time2'] = 0.0
    df_train.loc[ts_user_task_agg.index, 'lag_time2'] = ts_user_task_agg['timestamp']
    ts_user_task_agg.drop(columns=['user_id_copy', 'timestamp'], inplace=True)
    del ts_user_task_agg
    gc.collect()

    # Only the first row of each group holds a meaningful value at this point; the other
    # rows are overwritten by the transform('first') below.
    df_train['lag_time2'] = df_train['timestamp']/TS_SCALING - df_train['lag_time2']
    df_train['lag_time2'] = df_train['lag_time2'].astype('float32')

    # Broadcast the first row's value to every row of its (user, task_container) group.
    lag2_groupby_user_task = df_train[['user_id', 'task_container_id', 'lag_time2']]. \
        groupby(['user_id', 'task_container_id'])['lag_time2']
    df_train['lag_time2'] = lag2_groupby_user_task.transform('first')
    del lag2_groupby_user_task
    gc.collect()

    lag2_groupby_user = df_train[['user_id', 'lag_time2']].groupby('user_id')['lag_time2']
    user_last_lagtime_2_agg = lag2_groupby_user.last()

    del lag2_groupby_user
    gc.collect()

with trace_mem("Calculate Train/Val splitting via 'timestamp'"):
    # Each user's timeline is moved by a random offset so that it still ends no later than
    # the global maximum timestamp. Rows are then ranked by this "virtual" timestamp:
    # the use_data_size latest rows get val = 1 and, on top of that, the VAL_SIZE latest
    # rows get val = 2. Rows left at 0 are dropped in a later step.
    user_max_timestamp = df_train[['user_id', 'timestamp']].groupby(['user_id']).agg(['max']).reset_index()
    user_max_timestamp.columns = ['user_id', 'max_time_stamp']
    MAX_TIME_STAMP = user_max_timestamp['max_time_stamp'].max()

    def rand_time(max_time_stamp):
        """Draw a random integer offset in [0, MAX_TIME_STAMP - max_time_stamp]."""
        interval = MAX_TIME_STAMP - max_time_stamp
        rand_time_stamp = random.randint(0, interval)
        return rand_time_stamp

    user_max_timestamp['rand_time_stamp'] = user_max_timestamp['max_time_stamp'].apply(rand_time)
    user_max_timestamp.drop(columns=['max_time_stamp'], inplace=True)
    # Merging on a column ignores the existing index, so df_train has a fresh
    # 0..n-1 index from here on.
    df_train = df_train.merge(user_max_timestamp, on='user_id', how='left')
    del user_max_timestamp

    df_train['virtual_time_stamp'] = df_train['timestamp'] + df_train['rand_time_stamp']
    # Keep the scaled timestamps aside; they are put back for the 'lag_time' feature.
    ts_arr = (df_train['timestamp']/TS_SCALING).to_numpy()  # .to_numpy(dtype='float32')
    df_train.drop(columns=['timestamp', 'rand_time_stamp'], inplace=True)

    # Allocate the flag column straight from NumPy: the previous [0]*len(df_train) built
    # a Python list with one element per row only to convert it to int8 afterwards.
    df_train['val'] = np.zeros(len(df_train), dtype=np.int8)

    use_data_size = int(len(df_train)*USE_DATA_RATIO)
    print(f"Use {USE_DATA_RATIO:.0%} of train data: {use_data_size} rows")
    # The .iloc calls rely on 'val' being the last column and on the index labels being
    # equal to row positions (true after the merge above).
    df_train.iloc[df_train['virtual_time_stamp'].nlargest(use_data_size).index, -1] = 1
    df_train.iloc[df_train['virtual_time_stamp'].nlargest(VAL_SIZE).index, -1] = 2

    df_train.drop(columns=['virtual_time_stamp'], inplace=True)
    gc.collect()

# The bare string below is a disabled sanity check of the split; it is never executed.
"""
with trace_mem("Verify train/val split"):
    valid_tmp = df_train[df_train['val'] == 2]
    train_tmp = df_train[df_train['val'] == 1]

    # check new users and new contents
    new_user_count = len(valid_tmp[~valid_tmp['user_id'].isin(train_tmp['user_id'])].user_id.unique())
    new_content_count = len(valid_tmp[~valid_tmp.content_id.isin(train_tmp.content_id)].content_id.unique())

    print(f"Train rows: {train_tmp.shape[0]}, Valid rows: {valid_tmp.shape[0]}; "
          f"Train target mean: {train_tmp.answered_correctly.mean():.3f}; "
          f"Valid target mean: {valid_tmp.answered_correctly.mean():.3f}; "
          f"new_user_count: {new_user_count}; new_content_count: {new_content_count}")
    # Train rows: 47135650, Valid rows: 2500000; Train target mean: 0.657; Valid target mean: 0.645;
    # new_user_count: 15057(/393656 == 0.038); new_content_count: 3

    # np.random.choice:
    # Train rows: 46635650, Valid rows: 3000000; Train target mean: 0.658; Valid target mean: 0.646;
    # new_user_count: 17201; new_content_count: 3
    del train_tmp, valid_tmp
    gc.collect()
"""

with trace_mem("Generate feature 'lag_time'"):
    ## Generate feature 'lag_time': each user's current 'timestamp' minus the 'timestamp'
    ## of that user's previous row, taken from the first row of each (user, task_container).
    # Also produced here for the per-user feature store:
    #   user_max_timestamp_1_agg - per-user max scaled timestamp
    #   user_last_lagtime_1_agg  - per-user last 'lag_time'
    #   user_last_task_agg       - per-user last task_container_id
    df_train['timestamp'] = ts_arr  # scaled timestamps saved by the train/val split step
    del ts_arr
    gc.collect()

    timestamp_groupby_user = df_train[['user_id', 'timestamp']].groupby('user_id')
    df_train['lag_time'] = timestamp_groupby_user['timestamp'].shift(1)
    user_max_timestamp_1_agg = timestamp_groupby_user['timestamp'].max().astype('float32')
    del timestamp_groupby_user
    gc.collect()
    # Assign the filled column back instead of a chained fillna(inplace=True), which is a
    # silent no-op under pandas Copy-on-Write.
    df_train['lag_time'] = df_train['lag_time'].fillna(0)  # CV+0.000161 if fillna(0) first

    df_train['lag_time'] = df_train['timestamp'] - df_train['lag_time']
    df_train['lag_time'] = df_train['lag_time'].astype('float32')

    # Every row of a (user, task_container) group takes the value of the group's first row.
    lagtime_groupby = df_train[['user_id', 'task_container_id', 'lag_time']]. \
        groupby(['user_id', 'task_container_id'])
    df_train['lag_time'] = lagtime_groupby['lag_time'].transform('first')
    df_train.drop(columns=['timestamp'], inplace=True)
    del lagtime_groupby
    gc.collect()

    lag_task_groupby_user = df_train[['user_id', 'lag_time', 'task_container_id']].groupby('user_id')
    user_last_lagtime_1_agg = lag_task_groupby_user['lag_time'].last()
    user_last_task_agg = lag_task_groupby_user['task_container_id'].last()
    df_train.drop(columns=['task_container_id'], inplace=True)
    del lag_task_groupby_user
    gc.collect()

with trace_mem("Generate user_content_attempted_dict"):
    # Below approach would occupy 1GB RAM
    # user_content_attempted_dict = df_train.groupby(['user_id'])['content_id'].unique().to_dict(defaultdict(list))

    def _new_attempt_bitmap():
        """Return an all-False bitarray of MAX_QUESTIONS bits (indexed by content id)."""
        _bits = bitarray(MAX_QUESTIONS, endian='little')
        _bits.setall(False)
        return _bits

    # [2.7GB(+0.7GB): 40.0s] Generate user_content_attempted_dict
    # {user_id: bitarray}; bit i is True once the user has a row with content_id == i.
    user_content_attempted_dict = dict()
    for _user, _content in zip(df_train['user_id'].to_numpy(), df_train['content_id'].to_numpy()):
        _user_bits = user_content_attempted_dict.get(_user)
        if _user_bits is None:
            _user_bits = user_content_attempted_dict[_user] = _new_attempt_bitmap()
        _user_bits[_content] = True

    # No temporary is left to delete here; the earlier `del a` raised NameError whenever
    # the loop body never ran.
    gc.collect()

    def get_user_content_attempted(_user_id, _content_id):
        """Return whether the user had already attempted the content, then record the attempt.

        Args:
            _user_id: Key into user_content_attempted_dict; unseen users get a new bitmap.
            _content_id: Bit position to test and set.

        Returns:
            The bit stored for (_user_id, _content_id) before this call, or False for a
            user seen for the first time.
        """
        _user_bits = user_content_attempted_dict.get(_user_id)
        if _user_bits is None:
            _user_bits = user_content_attempted_dict[_user_id] = _new_attempt_bitmap()
            _user_bits[_content_id] = True
            return False

        _attempted = _user_bits[_content_id]
        if not _attempted:
            _user_bits[_content_id] = True
        return _attempted

## Generate feature 'user_content_attempted': 0 for never attempted, 1 for have attempted before
with trace_mem("Generate feature 'user_content_attempted'"):
    # [5.3GB(+2.11GB): 47.0s] Generate feature 'user_content_attempted'
    # df_train['user_content_attempted'] = df_train.groupby(['user_id', 'content_id'])['content_id'].\
    #     agg(['cumcount']).astype('int8').clip(0, 1)

    # [3.3GB(+0.09GB): 54.7s] Generate feature 'user_content_attempted'
    # Allocate the column of ones straight from NumPy: the previous [1]*len(df_train)
    # built a Python list with one element per row only to convert it to int8.
    df_train["user_content_attempted"] = np.ones(len(df_train), dtype=np.int8)
    # The running count per (user, content) is 1 on the first attempt and grows afterwards;
    # clipping to [0, 2] and subtracting 1 maps that to 0 (first attempt) / 1 (repeat).
    df_train["user_content_attempted"] = df_train.loc[:, ["user_id", "content_id", 'user_content_attempted']].\
        groupby(["user_id", "content_id"])["user_content_attempted"].cumsum().clip(0, 2) - 1

with trace_mem("Calculate aggregating values"):
    ## Per-user and per-content totals, later used for the features and the inference dicts.
    _user_columns = ['user_id', 'prior_question_elapsed_time', target_col]
    _content_columns = ['content_id', target_col, 'prior_question_elapsed_time',
                        'prior_question_had_explanation']
    groupby_user = df_train[_user_columns].groupby('user_id')
    groupby_content = df_train[_content_columns].groupby('content_id')

    _user_target = groupby_user[target_col]
    _content_target = groupby_content[target_col]

    user_ques_elapsed_time_sum_agg = groupby_user['prior_question_elapsed_time'].sum()
    user_target_sum_agg = _user_target.sum().astype('int16')  # number of correct answers per user
    user_count_agg = _user_target.count().astype('int16')  # number of rows per user

    content_target_sum_agg = _content_target.sum().astype('int32')  # number of correct answers per content
    content_count_agg = _content_target.count().astype('int32')  # number of rows per content

    del groupby_user, groupby_content, _user_target, _content_target, _user_columns, _content_columns
    gc.collect()

with trace_mem("Generate feature 'residual_user_mean'"):
    df_train.reset_index(drop=True, inplace=True)

    ## Generate feature 'residual' = the row's 'answered_correctly' minus the overall
    ## correctness rate of the row's content_id
    df_train['residual'] = df_train[target_col] - \
                           df_train['content_id'].map(content_target_sum_agg/content_count_agg)

    ## Generate residual_user_mean as cum_mean of the user's *previous* residuals
    residual_groupby_user = df_train[['user_id', 'residual']].groupby('user_id', sort=False)['residual']
    # Assign the filled columns back instead of chained fillna(inplace=True) calls, which
    # are silent no-ops under pandas Copy-on-Write.
    df_train['lag'] = residual_groupby_user.shift().fillna(0)

    lag_groupby_user = df_train[['user_id', 'lag']].groupby('user_id', sort=False)['lag']
    lag_cumsum, lag_cumcount = lag_groupby_user.cumsum(), lag_groupby_user.cumcount()

    # cumcount is 0 on each user's first row, so that row is 0/0 = NaN and is filled with 0.
    df_train['residual_user_mean'] = (lag_cumsum / lag_cumcount).fillna(0).astype('float32')

    user_residual_sum_agg = residual_groupby_user.sum().astype('float32')  # sum of 'residual' per user

    df_train.drop(columns=['residual', 'lag'], inplace=True)
    del residual_groupby_user, lag_groupby_user, lag_cumsum, lag_cumcount
    gc.collect()

with trace_mem("Generate feature 'user_correctness' and 'user_correct_cumsum'"):
    ## Generate feature 'user_correctness': current user`s answer accuracy before current row
    # After shift() the first row of each user is missing and the dtype is float64, so
    # fill with 0 and cast back to int8. The filled column is assigned back instead of a
    # chained fillna(inplace=True), which is a silent no-op under pandas Copy-on-Write.
    df_train['lag'] = df_train[['user_id', target_col]].groupby('user_id')[target_col].shift()
    df_train['lag'] = df_train['lag'].fillna(0).astype('int8')

    lag_groupby_user_id = df_train.loc[:, ['user_id', 'lag']].groupby(['user_id'])['lag']
    lag_cumsum, lag_cumcount = lag_groupby_user_id.cumsum().astype('int16'), \
                               lag_groupby_user_id.cumcount().astype('int16')

    # Number of correct answers the user gave before the current row.
    df_train['user_correct_cumsum'] = lag_cumsum

    # cumcount is 0 on each user's first row, so that row is 0/0 = NaN and is filled with 0.
    df_train['user_correctness'] = (lag_cumsum / lag_cumcount).fillna(0).astype('float32')

    del lag_cumsum, lag_cumcount, lag_groupby_user_id
    df_train.drop(columns=['lag'], inplace=True)
    gc.collect()

with trace_mem("Generate feature 'pq_elapsed_time_user_mean'"):
    # Running mean, per user, of 'prior_question_elapsed_time' over the rows that precede
    # the current one. Filled columns are assigned back instead of chained
    # fillna(inplace=True) calls, which are silent no-ops under pandas Copy-on-Write.
    df_train['lag'] = df_train[['user_id', 'prior_question_elapsed_time']]. \
        groupby('user_id', sort=False)['prior_question_elapsed_time'].shift().fillna(0)

    lag_groupby_user = df_train[['user_id', 'lag']].groupby('user_id', sort=False)['lag']
    lag_cumsum, lag_cumcount = lag_groupby_user.cumsum(), lag_groupby_user.cumcount()

    # cumcount is 0 on each user's first row, so that row is 0/0 = NaN and is filled with 0.
    df_train['pq_elapsed_time_user_mean'] = (lag_cumsum / lag_cumcount).fillna(0).astype('float32')

    df_train.drop(columns=['lag'], inplace=True)
    del lag_groupby_user, lag_cumsum, lag_cumcount
    gc.collect()

with trace_mem("Generate feature 'explanation_user_cumsum'"):
    # Per user, the running sum of 'prior_question_had_explanation' over the rows that
    # precede the current one.
    explanation_groupby_userid = df_train[['user_id', 'prior_question_had_explanation']]. \
        groupby('user_id')['prior_question_had_explanation']

    df_train['lag'] = explanation_groupby_userid.shift()
    lag_cumsum = df_train[['user_id', 'lag']].groupby('user_id')['lag'].cumsum()
    df_train['lag'] = lag_cumsum

    df_train.rename(columns={'lag': 'explanation_user_cumsum'}, inplace=True)
    # Each user's first row has no predecessor (NaN): fill with 0. The result is assigned
    # back instead of a chained fillna(inplace=True), which is a silent no-op under
    # pandas Copy-on-Write.
    df_train['explanation_user_cumsum'] = df_train['explanation_user_cumsum'].fillna(0).astype('int16')

    # Per-user total, kept for the inference-time feature store.
    explanation_sum_agg = explanation_groupby_userid.sum().astype('int16')

    del explanation_groupby_userid, lag_cumsum
    gc.collect()

with trace_mem("Dropping rows with 'val' == 0"):
    # Rows still flagged 0 were selected for neither training nor validation.
    _selected_rows = df_train['val'].ne(0)
    df_train = df_train.loc[_selected_rows].reset_index(drop=True)
    del _selected_rows

with trace_mem("Generate feature 'content_count', 'content_correctness'"):
    ## Look up the per-content row count and correctness ratio for every row.
    _content_ids = df_train['content_id']
    _content_ratio = content_target_sum_agg/content_count_agg
    df_train['content_count'] = _content_ids.map(content_count_agg).astype('int32')
    df_train['content_correctness'] = _content_ids.map(_content_ratio).astype('float32')
    del _content_ids, _content_ratio

## Process questions.csv
with trace_mem("Process questions.csv"):
    NULL_TAG = '255'  # placeholder used for questions whose 'tags' value is missing
    data_types_dict = {'question_id': 'int16',
                       'part': 'int8',
                       'bundle_id': 'int16',
                       'tags': 'string'}
    df_questions = pd.read_csv(QUESTIONS_FILE_PATH, usecols=data_types_dict.keys(), dtype=data_types_dict)
    # Assign the filled column back instead of a chained fillna(inplace=True), which is a
    # silent no-op under pandas Copy-on-Write.
    df_questions['tags'] = df_questions['tags'].fillna(NULL_TAG)
    del data_types_dict
    assert(0 == df_questions.isnull().sum().sum())
    df_questions.rename(columns={'question_id': 'content_id'}, inplace=True)

    """
    # Generate feature 'bundle_correctness', CV+0.000167, training time longer
    df_questions['content_correctness'] = df_questions['content_id'].\
        map(content_target_sum_agg/ content_count_agg).astype('float32')
    bundle_agg = df_questions.groupby('bundle_id')['content_correctness'].agg(['mean'])
    df_questions['bundle_correctness'] = df_questions['bundle_id'].map(bundle_agg['mean']).astype('float32')
    del bundle_agg
    df_questions.drop(columns=['content_correctness'], inplace=True)
    """

    def _encode_by_first_seen(_values):
        """Replace each distinct value by the position of its first appearance (int16)."""
        _codes = {value: idx for idx, value in enumerate(_values.unique())}
        return _values.apply(lambda x: _codes[x]).astype('int16')

    # Generate feature 'tags_encoded': one code per distinct full tag string
    df_questions['tags_encoded'] = _encode_by_first_seen(df_questions['tags'])

    # Generate feature 'tags_first1' and 'tags_last2'
    # tag_list.apply(len).value_counts():
    # (1: 6561)
    # (2, 171)
    # (3: 3976)
    # (4: 2021)
    # (5: 686)
    # (6: 108)
    # 'tags_first1': code of the first whitespace-separated tag
    df_questions['tags_first1'] = _encode_by_first_seen(df_questions['tags'].apply(lambda x: x.split()[0]))

    # 'tags_last2': code of the last two tags (or of the single tag when there is only one).
    # The tags are joined with a space: gluing them together without a separator made
    # different pairs indistinguishable (e.g. "1 23" and "12 3" both became "123", which
    # is also the key of the single tag "123").
    df_questions['tags_last2'] = _encode_by_first_seen(
        df_questions['tags'].apply(lambda x: ' '.join(x.split()[-2:])))

    # clean up
    df_questions.drop(columns=['tags'], inplace=True)
    gc.collect()

## Attach the per-question columns of df_questions to every row of df_train
with trace_mem("Merge df_questions to df_train"):
    df_train = df_train.merge(df_questions, how='left', on='content_id')
    # Every content_id is expected to have a match in df_questions.
    _missing_values = df_train.isnull().sum().sum()
    assert(0 == _missing_values)
    del _missing_values

## Build the final train / validation frames from the 'val' flag
with trace_mem("Split to train/val dataset"):
    # The id columns are not needed any more at this point.
    df_train = df_train.drop(columns=['user_id', 'content_id'])
    _is_val = df_train['val'].eq(2)
    df_val = df_train.loc[_is_val].drop(columns=['val']).reset_index(drop=True)
    df_train = df_train.loc[~_is_val].drop(columns=['val']).reset_index(drop=True)
    del _is_val
    print(f"df_train`s shape {df_train.shape}, df_val`s shape: {df_val.shape}")

## Save User_feats_dict and Ques_feats_dict for online inference speed up
with trace_mem("Save User_feats_dict and Ques_feats_dict for online inference"):
    def clip(count):
        """Return ``count`` bounded below by 1 (no upper bound), e.g. for use as a divisor."""
        lower_bound, upper_bound = 1, np.inf
        return np.clip(count, lower_bound, upper_bound)

    # User_feats_dict: {user_id: UserFeats}
    class UserFeats(object):
        """Running per-user statistics from which the user features are read and updated.

        Attributes:
            ans_corr_cnt: Number of correct answers recorded so far.
            user_cnt: Number of answers recorded so far.
            user_acc: ans_corr_cnt / max(user_cnt, 1).
            user_explanation_sum: Running sum of the explanation flag.
            elapsed_time_sum: Running sum of prior-question elapsed time.
            residual_sum / residual_mean: Running sum and mean of the residuals.
            user_max_timestamp_1 / user_max_timestamp_2: Scaled timestamps that the next
                lag_time / lag_time2 are measured from.
            user_last_lagtime_1 / user_last_lagtime_2: Lag values last returned, reused
                while the task container id does not change.
            last_task_id: Task container id seen most recently.
        """

        def __init__(self, user_target_sum=0, user_count=0, explanation_sum=0, prior_question_elapsed_time_sum=0,
                     user_residual_sum=0, user_max_ts_1=0, user_max_ts_2=0, last_lagtime_1=0,
                     last_lagtime_2=0, last_task_container=0):
            self.ans_corr_cnt = user_target_sum
            self.user_cnt = user_count
            self.user_acc = user_target_sum/clip(user_count)

            self.user_explanation_sum = explanation_sum
            self.elapsed_time_sum = prior_question_elapsed_time_sum
            self.residual_sum = user_residual_sum
            self.residual_mean = user_residual_sum/clip(user_count)

            self.user_max_timestamp_1 = user_max_ts_1
            self.user_max_timestamp_2 = user_max_ts_2
            self.user_last_lagtime_1 = last_lagtime_1
            self.user_last_lagtime_2 = last_lagtime_2
            self.last_task_id = last_task_container

        def update_user_acc(self, _ans_corr):
            """Record one more answer (``_ans_corr`` is added to the correct count) and refresh the accuracy."""
            self.ans_corr_cnt += _ans_corr
            self.user_cnt += 1
            self.user_acc = self.ans_corr_cnt/clip(self.user_cnt)

        def get_user_acc(self):
            """Return the stored accuracy."""
            return self.user_acc

        def get_explanation_cumsum(self, current_explanation):
            """Return the explanation sum *before* adding ``current_explanation`` to it."""
            current_explanation_sum = self.user_explanation_sum
            self.user_explanation_sum += current_explanation
            return current_explanation_sum

        def get_elapsed_time_mean(self, _elapsed_time):
            """Return the mean elapsed time *before* adding ``_elapsed_time`` to the running sum."""
            current_elapsed_mean = self.elapsed_time_sum/clip(self.user_cnt)
            self.elapsed_time_sum += _elapsed_time
            return current_elapsed_mean

        def update_user_residual(self, new_residual):
            """Add ``new_residual`` to the running sum and refresh the residual mean.

            update_user_acc() is expected to have been called first, so user_cnt already
            counts the current answer. The divisor is nevertheless bounded below by 1,
            like every other ratio in this class, so a call on a user with no recorded
            answers cannot divide by zero.
            """
            self.residual_sum += new_residual
            self.residual_mean = self.residual_sum/max(self.user_cnt, 1)

        def get_residual_mean(self):
            """Return the stored residual mean."""
            return self.residual_mean

        def get_lagtime(self, new_timestamp, new_task_id):
            """Return ``(lag_time, lag_time2)`` for a row and update the stored timestamps.

            When ``new_task_id`` equals the last task id seen, the previously returned pair
            is returned again and nothing is updated. Otherwise both lags are computed from
            ``new_timestamp/TS_SCALING`` and the two stored timestamps, which are then rotated.
            """
            if new_task_id == self.last_task_id:
                return self.user_last_lagtime_1, self.user_last_lagtime_2

            self.last_task_id = new_task_id
            lag_time1, lag_time2 = new_timestamp/TS_SCALING - self.user_max_timestamp_1, \
                                   new_timestamp/TS_SCALING - self.user_max_timestamp_2
            # update lag2 first: it takes over the value that timestamp_1 is about to lose
            self.user_max_timestamp_2, self.user_last_lagtime_2 = self.user_max_timestamp_1, lag_time2
            self.user_max_timestamp_1, self.user_last_lagtime_1 = new_timestamp/TS_SCALING, lag_time1

            return lag_time1, lag_time2

    # Generate User_feats_dict: one UserFeats per user_id found in the training aggregates.
    # Maps each UserFeats constructor argument to the per-user Series that supplies it.
    _user_feat_sources = {
        'user_target_sum': user_target_sum_agg,
        'user_count': user_count_agg,
        'explanation_sum': explanation_sum_agg,
        'prior_question_elapsed_time_sum': user_ques_elapsed_time_sum_agg,
        'user_residual_sum': user_residual_sum_agg,
        'user_max_ts_1': user_max_timestamp_1_agg,
        'user_max_ts_2': user_max_timestamp_2_agg,
        'last_lagtime_1': user_last_lagtime_1_agg,
        'last_lagtime_2': user_last_lagtime_2_agg,
        'last_task_container': user_last_task_agg,
    }
    User_feats_dict = defaultdict(UserFeats)
    for _user_id in user_target_sum_agg.index:
        _user_feats = UserFeats(**{_arg: _agg[_user_id] for _arg, _agg in _user_feat_sources.items()})
        User_feats_dict[_user_id] = _user_feats
    # Drop the extra references so the aggregates can be freed by the cleanup that follows.
    del _user_feat_sources

    # Ques_feats_dict: {question_id: QuesFeats(ques_feats_list)}
    class QuesFeats(object):
        """Per-question metadata plus running answer counters.

        ``bundle``, ``part``, ``tags_encoded``, ``tags_first1`` and ``tags_last2`` are stored
        as given; ``corr_cnt`` / ``ques_cnt`` / ``ques_acc`` change through update_ques_acc().
        """

        def __init__(self, content_target_sum=0, content_count=0, ques_bundle=1, ques_part=1, ques_tags_encoded=0,
                     ques_tags_first1=0, ques_tags_last2=0):
            # Static question metadata.
            self.bundle, self.part = ques_bundle, ques_part
            self.tags_encoded = ques_tags_encoded
            self.tags_first1, self.tags_last2 = ques_tags_first1, ques_tags_last2

            # Running counters and the ratio derived from them.
            self.corr_cnt, self.ques_cnt = content_target_sum, content_count
            self.ques_acc = content_target_sum/clip(content_count)

        def update_ques_acc(self, _ans_corr):
            """Record one more answer (``_ans_corr`` is added to the correct count) and refresh the ratio."""
            self.ques_cnt += 1
            self.corr_cnt += _ans_corr
            self.ques_acc = self.corr_cnt/clip(self.ques_cnt)

        def get_ques_acc(self):
            """Return the stored correct-answer ratio."""
            return self.ques_acc

    # Generate Ques_feats_dict: one QuesFeats per content_id found in the training aggregates.
    # Maps each metadata argument of QuesFeats to the df_questions column that supplies it.
    _ques_arg_columns = {
        'ques_bundle': 'bundle_id',
        'ques_part': 'part',
        'ques_tags_encoded': 'tags_encoded',
        'ques_tags_first1': 'tags_first1',
        'ques_tags_last2': 'tags_last2',
    }
    Ques_feats_dict = defaultdict(QuesFeats)
    df_ques_tmp = df_questions.set_index('content_id')
    for _content_id in content_target_sum_agg.index:
        _ques_feats = QuesFeats(
            content_target_sum=content_target_sum_agg[_content_id],
            content_count=content_count_agg[_content_id],
            **{_arg: df_ques_tmp.loc[_content_id, _col] for _arg, _col in _ques_arg_columns.items()}
        )
        Ques_feats_dict[_content_id] = _ques_feats

    # Release the temporaries and every aggregate that now lives inside the two dicts.
    del df_ques_tmp, _user_feats, _ques_feats, _ques_arg_columns
    del user_target_sum_agg, user_count_agg, explanation_sum_agg
    del content_target_sum_agg, content_count_agg
    del user_residual_sum_agg, user_ques_elapsed_time_sum_agg
    del user_max_timestamp_1_agg, user_max_timestamp_2_agg
    del user_last_lagtime_1_agg, user_last_lagtime_2_agg, user_last_task_agg
    gc.collect()

class FeatureEngineer(object):
    """Turns raw test-API batches into feature matrices during online inference.

    The instance holds references to the per-user and per-question feature objects.
    For every batch it first applies the answers of the previous batch (revealed through
    ``prior_group_answers_correct``) to those objects and then extracts one feature
    vector per question row, in the column order given by ``TRA_FEATURES``.
    """

    def __init__(self, user_feats_dict, ques_feats_dict, all_elapsed_time_mean):
        """
        :param user_feats_dict: mapping user_id -> UserFeats; it is indexed with ``[]`` for
            every row, so unseen users must be handled by the mapping itself
        :param ques_feats_dict: mapping content_id -> QuesFeats, indexed the same way
        :param all_elapsed_time_mean: value substituted when ``prior_question_elapsed_time`` is NaN
        """
        self._user_feats_dict = user_feats_dict
        self._ques_feats_dict = ques_feats_dict
        self._elapsed_time_mean = all_elapsed_time_mean
        self._init_feat_extract_fns()
        # Rows of the current batch, remembered until their answers arrive with the next batch.
        self._prior_user_ids = []
        self._prior_content_ids = []
        self._prior_content_type_ids = []

    def _init_feat_extract_fns(self):
        """Build ``self._feat_extract_fns``: one extractor per name in TRA_FEATURES, same order.

        Each extractor is called as ``fn(u_feats, q_feats, row, cache)`` and returns one value.

        :raises KeyError: if TRA_FEATURES names a feature that has no extractor here
        """
        name_to_extract_fn = {  # Must match with TRA_FEATURES!
            ## u_feats: all feats of this user_id; q_feats: all feats of this question_id;
            ## row: the current row of test_df; cache: values shared by several features;
            # From row['']:
            'prior_question_elapsed_time': lambda u_feats, q_feats, row, cache: self._elapsed_time_mean \
                if cache['elapsed_time_isna'] else row['prior_question_elapsed_time'],
            'prior_question_had_explanation': lambda u_feats, q_feats, row, cache: cache['current_explanation'],
            # From QuesFeats:
            'bundle_id': lambda u_feats, q_feats, row, cache: q_feats.bundle,
            'part': lambda u_feats, q_feats, row, cache: q_feats.part,
            'tags_encoded': lambda u_feats, q_feats, row, cache: q_feats.tags_encoded,
            'tags_first1': lambda u_feats, q_feats, row, cache: q_feats.tags_first1,
            'tags_last2': lambda u_feats, q_feats, row, cache: q_feats.tags_last2,
            'content_count': lambda u_feats, q_feats, row, cache: q_feats.ques_cnt,
            'content_correctness': lambda u_feats, q_feats, row, cache: q_feats.get_ques_acc(),
            # From UserFeats:
            'user_correctness': lambda u_feats, q_feats, row, cache: u_feats.get_user_acc(),
            'user_correct_cumsum': lambda u_feats, q_feats, row, cache: u_feats.ans_corr_cnt,
            'residual_user_mean': lambda u_feats, q_feats, row, cache: u_feats.get_residual_mean(),
            'pq_elapsed_time_user_mean': lambda u_feats, q_feats, row, cache: \
                u_feats.get_elapsed_time_mean(0 if cache['elapsed_time_isna']
                                              else row['prior_question_elapsed_time']),
            'explanation_user_cumsum': lambda u_feats, q_feats, row, cache: \
                u_feats.get_explanation_cumsum(cache['current_explanation']),
            'lag_time': lambda u_feats, q_feats, row, cache: cache['lag_times'][0],
            'lag_time2': lambda u_feats, q_feats, row, cache: cache['lag_times'][1],
            # From global dict:
            'user_content_attempted': lambda u_feats, q_feats, row, cache: \
                get_user_content_attempted(row['user_id'], row['content_id']),
        }
        # Report every missing extractor at once instead of failing on the first bare key.
        missing = [name for name in TRA_FEATURES if name not in name_to_extract_fn]
        if missing:
            raise KeyError(f"No feature extractor defined for: {missing}")
        self._feat_extract_fns = [name_to_extract_fn[name] for name in TRA_FEATURES]

    def _get_row_fvs(self, row: Series, u_feats: UserFeats, q_feats: QuesFeats) -> List[float]:
        """Return the feature vector of one question row, ordered like TRA_FEATURES.

        :param row: one row of the test batch
        :param u_feats: feature object of the row's user
        :param q_feats: feature object of the row's question
        """
        had_explanation = row['prior_question_had_explanation']
        # Values needed by more than one extractor are computed once per row.
        cache = {'current_explanation': 0 if pd.isna(had_explanation) else had_explanation,
                 'elapsed_time_isna': bool(pd.isna(row['prior_question_elapsed_time'])),
                 # get_lagtime() is defined on UserFeats elsewhere in the file; its first two
                 # items feed the 'lag_time' and 'lag_time2' features.
                 'lag_times': u_feats.get_lagtime(row['timestamp'], row['task_container_id']),
                 }
        return [fn(u_feats, q_feats, row, cache) for fn in self._feat_extract_fns]

    def _reset_prior_lists(self):
        """Forget the rows remembered from the previous batch."""
        self._prior_user_ids, self._prior_content_ids, self._prior_content_type_ids = [], [], []

    def _process_prior_df(self, _prior_targets):
        """Apply the revealed answers of the previous batch to the user/question statistics.

        :param _prior_targets: one target per row of the previous batch, in row order.
            An empty list, or a length that does not match the remembered rows, only
            clears the remembered rows without updating anything.
        """
        len_targets = len(_prior_targets)
        if len_targets == 0:
            self._reset_prior_lists()
            return

        len_prior_user_ids, len_prior_content_ids = len(self._prior_user_ids), len(self._prior_content_ids)
        if (len_targets != len_prior_user_ids) or (len_prior_user_ids != len_prior_content_ids):
            print(f"length not match, len_targets: {len_targets}, len_prior_user_ids: {len_prior_user_ids}, "
                  f"len_prior_content_ids: {len_prior_content_ids}")
            self._reset_prior_lists()
            return

        for _uid, _cid, _target, _content_type in zip(self._prior_user_ids, self._prior_content_ids, _prior_targets,
                                                      self._prior_content_type_ids):
            if _content_type == 0:  # only process question rows
                self._user_feats_dict[_uid].update_user_acc(_target)
                self._ques_feats_dict[_cid].update_ques_acc(_target)
                # The residual is taken against the question accuracy that already includes
                # this answer, because update_ques_acc() ran on the line above.
                self._user_feats_dict[_uid].update_user_residual(_target - self._ques_feats_dict[_cid].get_ques_acc())

        self._reset_prior_lists()

    # Get feature values based on test_df
    def get_feat_values(self, df: DataFrame):
        """Update the running statistics and extract the features of one test batch.

        :param df: batch from the test iterator; the first row's
            ``prior_group_answers_correct`` holds a JSON list with the previous batch's targets
        :return: ``(row_ids, features)`` for the question rows only (``content_type_id == 0``).
            ``features`` always has shape ``(n_question_rows, len(TRA_FEATURES))``, also when
            the batch contains no question rows.
        """
        # update prior fields
        prior_targets_list = json.loads(df['prior_group_answers_correct'].iloc[0])
        self._process_prior_df(prior_targets_list)

        # generate features
        fvs, _ques_row_ids = [], []

        for _, row in df.iterrows():
            # Every row (lectures included) is remembered so the next batch's targets line up.
            self._prior_user_ids.append(row['user_id'])
            self._prior_content_ids.append(row['content_id'])
            self._prior_content_type_ids.append(row['content_type_id'])

            if row['content_type_id'] == 0:  # question rows
                _ques_row_ids.append(row['row_id'])
                u_feats = self._user_feats_dict[row['user_id']]  # new user will get correct feats(mostly 0)
                q_feats = self._ques_feats_dict[row['content_id']]
                fvs.append(self._get_row_fvs(row, u_feats, q_feats))

        if not fvs:
            # np.array([]) would be 1-D with shape (0,); keep the matrix 2-D for lecture-only batches.
            return np.array(_ques_row_ids), np.empty((0, len(self._feat_extract_fns)))

        return np.array(_ques_row_ids), np.array(fvs)  # np.array(fvs, dtype=np.float32)

######## Define model and train ########
def make_x_y_np(df):
    """Copy the target and the TRA_FEATURES columns of ``df`` into float32 numpy arrays.

    Unless PLOT_SHAP is set, each column is dropped from ``df`` in place right after it has
    been copied, so the caller's frame is consumed column by column.

    :param df: frame holding ``target_col`` and every column named in TRA_FEATURES
    :return: ``(x, y)`` with ``x`` of shape ``(len(df), len(TRA_FEATURES))`` and ``y`` of
        shape ``(len(df),)``, both float32
    """
    consume_columns = not PLOT_SHAP

    y = df[target_col].values.astype(np.float32)
    if consume_columns:
        df.drop(columns=[target_col], inplace=True)

    # Uninitialised buffer; every column is overwritten in the loop below.
    x = np.empty((df.shape[0], len(TRA_FEATURES)), dtype=np.float32)
    for col_pos, col_name in enumerate(TRA_FEATURES):
        x[:, col_pos] = df[col_name].values.astype(np.float32)
        if consume_columns:
            df.drop(columns=[col_name], inplace=True)

    return x, y


#df_train.to_csv('./input/df_train.csv', index=False)
#df_val.to_csv('./input/df_val.csv', index=False)

with trace_mem("Prepare float32 ndarray for LGBM"):
    # make_x_y_np() copies the needed columns out of each frame (and, unless PLOT_SHAP,
    # drops them from the frame as it goes).
    X_tra, y_tra = make_x_y_np(df_train)  # Before 4GB, After 0.77GB
    X_val, y_val = make_x_y_np(df_val)
    # df_train is never needed again; df_val is kept only when PLOT_SHAP is set.
    del df_train
    if not PLOT_SHAP:
        del df_val

    gc.collect()

# Measured: X_tra+y_tra+X_val+y_val = 5.1GB, while lgb_train+lgb_val add almost nothing here.
# NOTE(review): per the LightGBM docs a Dataset is constructed lazily, so at this point the
# objects presumably only hold references to the numpy arrays -- confirm.
with trace_mem("Prepare datasets for LGBM"):
    lgb_train = lgb.Dataset(data=X_tra, label=y_tra, feature_name=TRA_FEATURES)
    lgb_val = lgb.Dataset(data=X_val, label=y_val, feature_name=TRA_FEATURES)

# Measured: [5.8GB(+0.00GB): 0.0s] for this step, yet RAM use during training drops by ~5GB.
# NOTE(review): this fits the lazy construction above -- the Datasets still reference the
# arrays, so dropping the notebook names frees nothing now; the memory can only be released
# once LightGBM has built its own binned data and let go of the raw arrays.
with trace_mem("del X_tra, y_tra, X_val, y_val"):
    del X_tra, y_tra, X_val, y_val
    gc.collect()

## Set hyper parameters and start train
params = dict(
    objective='binary',
    seed=DEFAULT_SEED,
    learning_rate=LEARNING_RATE,
    metric='auc',
    max_bin=MAX_BIN,
    num_leaves=NUM_LEAVES,  # max number of leaves in one tree, default 31
    # share of the features LightGBM samples before training each tree (iteration)
    feature_fraction=FEATURE_FRACTION,
    # every BAGGING_FREQ-th iteration LightGBM re-samples BAGGING_FRACTION of the rows and
    # uses that sample for the following BAGGING_FREQ iterations
    bagging_fraction=BAGGING_FRACTION,
    bagging_freq=BAGGING_FREQ,
)

# NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword arguments were removed
# from lgb.train() in LightGBM 4.0 (replaced by callbacks); as written this cell needs the
# older LightGBM shipped with the competition image.
time_begin = time()
lgb_model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=NUM_BOOST_ROUNDS,
    valid_sets=[lgb_val],
    early_stopping_rounds=EARLY_STOP_ROUNDS,
    verbose_eval=VERBOSE_EVAL,
    categorical_feature=cat_features,
)
print(f"Training lgb_model elapsed {str(timedelta(seconds=time()-time_begin))}")

# The trained booster is all that inference needs; release both Datasets.
with trace_mem("del lgb_train, lgb_val"):
    del lgb_train, lgb_val
    gc.collect()

######## Online Inference ########
# The feature engineer keeps updating User_feats_dict / Ques_feats_dict from the answers
# revealed with each new batch, so batches must be processed strictly in iterator order.
feat_eng = FeatureEngineer(User_feats_dict, Ques_feats_dict, mean_prior_question_elapsed_time)

env = riiideducation.make_env()
iter_test = env.iter_test()

for (test_df, _) in iter_test:
    # Only question rows (content_type_id == 0) get a row id and a feature vector.
    ques_row_ids, X_test = feat_eng.get_feat_values(test_df)
    if len(ques_row_ids) > 0:
        target_preds = lgb_model.predict(X_test)
    else:
        # Lecture-only batch: there is nothing to score and the booster rejects an empty,
        # non-2-D input, so submit an empty prediction frame instead of calling the model.
        target_preds = np.array([], dtype=np.float64)
    submit_df = pd.DataFrame({'row_id': ques_row_ids, target_col: target_preds})
    # env.predict() is still called once per batch, also when the frame is empty.
    env.predict(submit_df)