In [None]:
!ls ../input/

In [None]:
# https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering
import os
import gc
import time
import gzip
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import roc_auc_score

import warnings

# Silence library warnings and widen the pandas console output for inspection.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Locations of the competition data, the pre-split train/valid pickles and
# the cache of pre-computed features / state dictionaries.
DATA_PATH = '../input/riiid-test-answer-prediction/'
MY_DATA_PATH = '../input/my_data/'
CACHE_PATH = '../input/lgb1215weights/'
# makedirs(exist_ok=True) creates missing parent directories too and does not
# race between the existence check and the creation, unlike exists()+mkdir().
os.makedirs(CACHE_PATH, exist_ok=True)

# DEBUG redirects data/cache to their "debug" sub-directories.
# OFFLINE=True rebuilds every cached artefact from the train/valid pickles;
# OFFLINE=False loads the artefacts previously written to CACHE_PATH.
DEBUG = False
OFFLINE = False
if DEBUG:
    MY_DATA_PATH = f'{MY_DATA_PATH}/debug/'
    CACHE_PATH = f'{CACHE_PATH}/debug/'
    os.makedirs(CACHE_PATH, exist_ok=True)
config_file = f'{CACHE_PATH}/config.pkl'

########################################################################################################################
##### Util Fnc
def save_pickle(dic, save_path):
    """Pickle ``dic`` and write it gzip-compressed to ``save_path``.

    Args:
        dic: Any picklable object (the file also stores scalars and
            DataFrames through this helper, not only dicts).
        save_path: Destination file path; it is opened in ``'wb'`` mode.
    """
    # A gzip stream is used instead of a plain binary file handle.
    with gzip.open(save_path, mode='wb') as gz_out:
        pickle.dump(dic, gz_out)

def load_pickle(load_path):
    """Return the object stored in the gzip-compressed pickle at ``load_path``.

    NOTE: unpickling can execute arbitrary code, so only trusted cache files
    should be passed to this helper.
    """
    # Counterpart of save_pickle: the file is a gzip stream, not a raw pickle.
    with gzip.open(load_path, mode='rb') as gz_in:
        return pickle.load(gz_in)

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered; every other column is left untouched.  Integer
    columns are cast to the first of int8/int16/int32/int64 whose limits
    strictly contain the column's min and max (no cast if none does).  Float
    columns are cast to float16 or float32 under the same strict test and to
    float64 otherwise.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / 1024 ** 2
    for name in df.columns:
        current_dtype = df[name].dtypes
        if current_dtype not in numerics:
            continue
        lowest = df[name].min()
        highest = df[name].max()
        if str(current_dtype)[:3] == 'int':
            # Narrowest integer type wins; limits are compared strictly.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # Floats fall back to float64 when neither smaller type fits.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    target = candidate
                    break
            df[name] = df[name].astype(target)
    mem_after = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, saved_pct))
    return df


########################################################################################################################
##### Load Data
## Get Question_Data
questions_df = pd.read_csv(f'{DATA_PATH}/questions.csv')
# 1 when the question id equals its bundle id, 0 otherwise.
questions_df['content_bundle_same'] = questions_df['question_id'].eq(questions_df['bundle_id']).astype(int)
# Turn the space-separated tag string into a list of tag strings (missing -> empty list)
# and record how many tags each question carries.
questions_df['tags'] = questions_df['tags'].apply(
    lambda raw: str(raw).split(' ') if str(raw) != 'nan' else [])
questions_df['tags_len'] = questions_df['tags'].apply(len)

# Number of questions that share the same part.
questions_df['part_content_num'] = questions_df.groupby('part')['question_id'].transform('count')

# question_id -> bundle_id / part / tag list lookups used inside the row loops.
question_bundle_dict = dict(zip(questions_df['question_id'].values, questions_df['bundle_id'].values))
question_part_dict = dict(zip(questions_df['question_id'].values, questions_df['part'].values))
question_tags_dict = dict(zip(questions_df['question_id'].values, questions_df['tags'].values))

# Bundles holding more than one question:
# question_id -> (first question id of the bundle, bundle size).
bundle_df = questions_df.groupby('bundle_id')['question_id'].unique()
bundle_df = bundle_df.loc[bundle_df.apply(len).gt(1)]
bundle_mapping = {}
for id_list in bundle_df.values:
    bid = id_list[0]
    bundle_size = len(id_list)
    for qid in id_list:
        bundle_mapping[qid] = (bid, bundle_size)

## Get Lecture_Data
lectures_df = pd.read_csv(f'{DATA_PATH}/lectures.csv')
# lecture_id -> tag / part / type_of lookups.
lecture_tag_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['tag'].values))
lecture_part_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['part'].values))
lecture_type_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['type_of'].values))

## Get Samples
if OFFLINE:
    feld_needed = ['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id',
                   'user_answer', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
    train = pd.read_pickle(f'{MY_DATA_PATH}/train.pickle')[feld_needed]
    valid = pd.read_pickle(f'{MY_DATA_PATH}/valid.pickle')[feld_needed]

    # Scale both time columns by 1/1000 (presumably milliseconds -> seconds; confirm units).
    for _frame in (train, valid):
        _frame['prior_question_elapsed_time'] //= 1000
        _frame['timestamp'] /= 1000
    print(f'Train|Valid: {len(train)}|{len(valid)}')

    # Feature sources:
    #   * features for valid are built from the train rows only;
    #   * features for test are built from train + valid rows.
    _ques_cols = ['row_id', 'content_id', 'answered_correctly']
    ques_train = reduce_mem_usage(
        train.loc[train.content_type_id == False, _ques_cols].reset_index(drop=True), verbose=True)
    ques_valid = reduce_mem_usage(
        valid.loc[valid.content_type_id == False, _ques_cols].reset_index(drop=True), verbose=True)

    # Missing "had explanation" flags count as False and are stored as 0/1.
    for _frame in (train, valid):
        _frame['prior_question_had_explanation'] = \
            _frame['prior_question_had_explanation'].fillna(False).astype('int8')

    # Missing elapsed times are replaced by the mean over the train question rows.
    prior_question_elapsed_time_mean = \
        train.loc[train.content_type_id == False, 'prior_question_elapsed_time'].dropna().values.mean()
    for _frame in (train, valid):
        _frame['prior_question_elapsed_time'] = \
            _frame['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean).astype(np.int32)
    train = reduce_mem_usage(train, verbose=True)
    valid = reduce_mem_usage(valid, verbose=True)

    # Persist the fill value so the non-OFFLINE run can reuse it.
    save_pickle(prior_question_elapsed_time_mean, f'{CACHE_PATH}/prior_question_elapsed_time_mean.pkl')

    config_dict = {}
else:
    # Cached run: reuse the config and the fill value written by an OFFLINE run.
    config_dict = load_pickle(config_file)
    prior_question_elapsed_time_mean = load_pickle(f'{CACHE_PATH}/prior_question_elapsed_time_mean.pkl')

########################################################################################################################
##### Content Static Feat
def make_content_feat(df, type):
    """Build per-question statistics and cache them as ``content_feat_{type}.pkl``.

    Args:
        df: Interaction rows; only rows with ``content_type_id == False`` are used.
        type: Suffix for the cache file name (the callers pass 'train' / 'test').

    Returns:
        DataFrame with ``content_id``, ``content_target_mean`` (mean of
        ``answered_correctly``) and ``content_cnt`` (number of rows), downcast
        by ``reduce_mem_usage``.  The pickle is written before the downcast.
    """
    questions = df.loc[df.content_type_id == False].reset_index(drop=True)

    # Mean correctness per question.
    target_mean = questions.groupby('content_id', as_index=False)['answered_correctly'].mean()
    feat_df = target_mean.rename(columns={'answered_correctly': 'content_target_mean'})

    # Number of answers per question, aligned to the row order of feat_df.
    answers_per_content = questions.groupby('content_id')['user_id'].count()
    feat_df['content_cnt'] = answers_per_content.reindex(feat_df['content_id'].values).values

    save_pickle(feat_df, save_path=f'{CACHE_PATH}/content_feat_{type}.pkl')
    return reduce_mem_usage(feat_df, verbose=True)


if not OFFLINE:
    # Cached run: the pickle holds the un-downcast frame, so shrink it after loading.
    content_feat_test = load_pickle(f'{CACHE_PATH}/content_feat_test.pkl')
    content_feat_test = reduce_mem_usage(content_feat_test, verbose=True)
else:
    # Train-only statistics serve validation; train + valid statistics serve test.
    content_feat = make_content_feat(df=train.copy(deep=True), type='train')
    content_feat_test = make_content_feat(df=pd.concat([train, valid]), type='test')
    print('content_feat:\n', content_feat.head())
# content_id -> mean correctness, consumed by the per-row feature loop.
content_target_mean_dict = dict(zip(content_feat_test['content_id'].values,
                                    content_feat_test['content_target_mean'].values))
content_feat_cols = [name for name in content_feat_test.columns if name != 'content_id']

########################################################################################################################
##### Part Feat
def make_part_mean_dict(df):
    """Return ``{part: mean answered_correctly}`` over the question rows of ``df``.

    ``df`` needs ``content_id``, ``content_type_id`` and ``answered_correctly``;
    the part of each question is looked up in the global ``questions_df``.
    """
    question_rows = df.loc[df.content_type_id == False].reset_index(drop=True)
    with_part = question_rows.merge(questions_df[['question_id', 'part']],
                                    how='left', left_on='content_id', right_on='question_id')

    part_means = with_part.groupby('part', as_index=False)['answered_correctly'].mean()
    part_means = part_means.rename(columns={'answered_correctly': 'part_target_mean'})

    return dict(zip(part_means['part'].values, part_means['part_target_mean'].values))
if not OFFLINE:
    part_target_mean_dict = load_pickle(f'{CACHE_PATH}/part_target_mean_dict.pkl')
else:
    # Part means are computed over train + valid and cached for the non-OFFLINE run.
    _part_cols = ['content_id', 'content_type_id', 'answered_correctly']
    part_target_mean_dict = make_part_mean_dict(df=pd.concat([train[_part_cols], valid[_part_cols]]))
    save_pickle(part_target_mean_dict, f'{CACHE_PATH}/part_target_mean_dict.pkl')
# Attach the part-level mean to every question (raises KeyError for a part missing from the dict).
questions_df['part_target_mean'] = questions_df['part'].apply(lambda part_id: part_target_mean_dict[part_id])

########################################################################################################################
##### Id Static Feat
# Columns that get_stat_feat selects for every question row.
# Disabled candidates: [f'tags_w2v{i}' for i in range(w2v_dim)], 'tag1', 'tag2', 'tag3', 'tag4'
static_feat_cols = ['part', 'prior_question_elapsed_time', 'content_id']


def get_stat_feat(df, feat_cols):
    """Return the ``feat_cols`` columns for the question rows of ``df``.

    Lecture rows (``content_type_id`` not False) are dropped and each remaining
    row is left-joined with its ``part`` from the global ``questions_df`` before
    the requested columns are selected.
    """
    question_rows = df[df.content_type_id == False].reset_index(drop=True)
    part_lookup = questions_df[['question_id', 'part']]
    merged = question_rows.merge(part_lookup, how='left', left_on='content_id', right_on='question_id')
    return merged[feat_cols]


if OFFLINE:
    # Static per-row columns for train and valid, downcast to save memory.
    # (A per-part fill of prior_question_elapsed_time was tried here and is disabled.)
    state_feat_train = reduce_mem_usage(
        get_stat_feat(df=train.copy(deep=True), feat_cols=static_feat_cols), verbose=True)
    state_feat_valid = reduce_mem_usage(
        get_stat_feat(df=valid.copy(deep=True), feat_cols=static_feat_cols), verbose=True)

########################################################################################################################
##### User Loop Feat
window_size = 25
if OFFLINE:
    # Fresh state.  Scalar counters / sums / "last seen" values default to 0.
    (user_cnt_dict, user_pos_cnt_dict,
     user_part_cnt_dict, user_part_pos_cnt_dict,
     user_content_cnt_dict, user_content_pos_cnt_dict,
     user_content_redo_cnt_dict, user_content_mean_sum_dict,
     user_consecutive_pos_cnt_dict,
     user_explanation_cnt_dict, user_explanation_pos_cnt_dict,
     user_elapse_time_sum_dict,
     user_last_timestamp_dict, user_last_task_dict,
     user_cum_time_dict,
     user_tags_cnt_dict, user_tags_pos_cnt_dict,
     user_continue_quest_cnt_dict) = [defaultdict(int) for _ in range(18)]

    # Sliding windows and the per-user bundle state ([bundle_id, time_diff]) default to [].
    (user_target_win25_dict, user_content_mean_win10_dict,
     user_elapse_time_win10_dict,
     user_content_win5_dict, user_part_win10_dict,
     bundle_state_dict,
     user_timespan_win10_dict) = [defaultdict(list) for _ in range(7)]
else:
    # Historic user x content counts come from a DataFrame; the two dicts below
    # only hold what is accumulated on top of it during this run.
    user_content_feat_df = reduce_mem_usage(pd.read_pickle(f'{CACHE_PATH}/user_content_feat.pkl'))
    user_content_cnt_dict = defaultdict(int)
    user_content_pos_cnt_dict = defaultdict(int)

    # Counters and sums.
    user_cnt_dict = load_pickle(f'{CACHE_PATH}/user_cnt_dict.pkl')
    user_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_pos_cnt_dict.pkl')
    user_part_cnt_dict = load_pickle(f'{CACHE_PATH}/user_part_cnt_dict.pkl')
    user_part_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_part_pos_cnt_dict.pkl')
    user_content_redo_cnt_dict = load_pickle(f'{CACHE_PATH}/user_content_redo_cnt_dict.pkl')
    user_content_mean_sum_dict = load_pickle(f'{CACHE_PATH}/user_content_mean_sum_dict.pkl')
    user_consecutive_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_consecutive_pos_cnt_dict.pkl')
    user_explanation_cnt_dict = load_pickle(f'{CACHE_PATH}/user_explanation_cnt_dict.pkl')
    user_explanation_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_explanation_pos_cnt_dict.pkl')
    user_elapse_time_sum_dict = load_pickle(f'{CACHE_PATH}/user_elapse_time_sum_dict.pkl')
    user_cum_time_dict = load_pickle(f'{CACHE_PATH}/user_cum_time_dict.pkl')
    user_tags_cnt_dict = load_pickle(f'{CACHE_PATH}/user_tags_cnt_dict.pkl')
    user_tags_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_tags_pos_cnt_dict.pkl')
    user_continue_quest_cnt_dict = load_pickle(f'{CACHE_PATH}/user_continue_quest_cnt_dict.pkl')

    # "Last seen" values.
    user_last_timestamp_dict = load_pickle(f'{CACHE_PATH}/user_last_timestamp_dict.pkl')
    user_last_task_dict = load_pickle(f'{CACHE_PATH}/user_last_task_dict.pkl')

    # Sliding windows and bundle state.
    user_target_win25_dict = load_pickle(f'{CACHE_PATH}/user_target_win25_dict.pkl')
    user_content_mean_win10_dict = load_pickle(f'{CACHE_PATH}/user_content_mean_win10_dict.pkl')
    user_elapse_time_win10_dict = load_pickle(f'{CACHE_PATH}/user_elapse_time_win10_dict.pkl')
    user_content_win5_dict = load_pickle(f'{CACHE_PATH}/user_content_win5_dict.pkl')
    user_part_win10_dict = load_pickle(f'{CACHE_PATH}/user_part_win10_dict.pkl')
    user_timespan_win10_dict = load_pickle(f'{CACHE_PATH}/user_timespan_win10_dict.pkl')
    bundle_state_dict = load_pickle(f'{CACHE_PATH}/bundle_state_dict.pkl')

# Column order matters: the row loops unpack df[used_cols].values positionally.
used_cols = ['user_id', 'content_id', 'task_container_id', 'answered_correctly', 'prior_question_elapsed_time',
             'prior_question_had_explanation', 'content_type_id', 'timestamp']


def make_user_loop_features(df, content_target_mean_dict,
                            user_cnt_dict, user_pos_cnt_dict,
                            user_part_cnt_dict, user_part_pos_cnt_dict,
                            user_content_cnt_dict, user_content_pos_cnt_dict, user_content_redo_cnt_dict, user_content_mean_sum_dict,
                            user_consecutive_pos_cnt_dict, user_target_win25_dict, user_content_mean_win10_dict,
                            user_explanation_cnt_dict, user_explanation_pos_cnt_dict, user_elapse_time_sum_dict, user_elapse_time_win10_dict,
                            user_last_timestamp_dict, user_last_task_dict,
                            user_content_win5_dict, user_part_win10_dict,
                            bundle_state_dict, user_cum_time_dict, user_timespan_win10_dict,# user_order_in_session_dict,
                            user_tags_cnt_dict, user_tags_pos_cnt_dict,
                            user_continue_quest_cnt_dict,
                            update=True, isTrain=True):
    """Build per-row user history features by walking ``df`` row by row.

    Rows of ``df[used_cols]`` are processed in the order given.  For every
    question row (``content_type_id == False``) the current value of each
    state dictionary is first written to the output arrays and the state is
    then advanced with the row, so each feature only reflects earlier rows.
    Lecture rows produce no output row; they only reset the per-user
    "consecutive questions" counter.

    Args:
        df: Interaction rows containing every column listed in ``used_cols``.
        content_target_mean_dict: content_id -> mean correctness; content ids
            missing from it contribute 0.
        user_*_dict / bundle_state_dict: mutable state keyed by user id (or by
            ``"<user>_<part>"``, ``"<user>_<tag>"`` strings, or the integer
            ``user_id * 100000 + content_id``).  They are indexed without
            membership checks and updated in place.
        update: When true, the loop is wrapped in ``tqdm`` and the state that
            depends on ``answered_correctly`` is updated as well.  When false,
            that label-dependent state is left untouched (presumably it is
            updated afterwards by ``update_user_feats`` -- confirm with caller).
        isTrain: When false, historic user x content counts are read from the
            module-level ``user_content_feat_df`` and added to the dict counts.

    Returns:
        DataFrame with one row per question row of ``df``, holding the raw
        counters/window statistics plus derived ratios; the two
        ``user_explanation_*cnt`` helper columns are dropped before returning.
        Ratios are plain column divisions, so a zero denominator is not
        special-cased here.
    """
    sample_num = len(df.loc[df.content_type_id == False])

    # One output slot per question row; slots that are never written stay 0.
    user_cnt_npy = np.zeros(sample_num)
    user_pos_cnt_npy = np.zeros(sample_num)
    user_part_cnt_npy = np.zeros(sample_num)
    user_part_pos_cnt_npy = np.zeros(sample_num)
    user_content_cnt_npy = np.zeros(sample_num)
    user_content_pos_cnt_npy = np.zeros(sample_num)
    user_content_redo_cnt_npy = np.zeros(sample_num)
    user_content_mean_mean_npy = np.zeros(sample_num)
    user_consecutive_pos_cnt_npy = np.zeros(sample_num)
    user_pos_cnt_win25_npy = np.zeros(sample_num)
    user_content_mean_win10_npy = np.zeros(sample_num)

    user_explanation_cnt_npy = np.zeros(sample_num)
    user_explanation_pos_cnt_npy = np.zeros(sample_num)
    user_elapse_time_mean_npy = np.zeros(sample_num)
    user_elapse_time_mean_win10_npy = np.zeros(sample_num)
    user_last_timespan_npy = np.zeros(sample_num)
    user_last_task_diff_npy = np.zeros(sample_num)
    user_content_appear_in_win5_npy = np.zeros(sample_num)
    user_part_cnt_in_win10_npy = np.zeros(sample_num)

    # user_order_in_session_npy = np.zeros(sample_num)
    user_cum_time_npy = np.zeros(sample_num)
    user_timespan_win10_mean_npy = np.zeros(sample_num)

    user_tags_cnt_mean_npy = np.zeros(sample_num)
    user_tags_pos_rate_npy = np.zeros(sample_num)

    user_continue_quest_cnt_npy = np.zeros(sample_num)

    # Progress bar only in update mode; otherwise iterate the raw array.
    if update:
        tk0 = tqdm(df[used_cols].values)
    else:
        tk0 = df[used_cols].values
    # idx counts question rows only (lecture rows do not advance it).
    idx = 0
    for (_user_id, _content_id, _task_container_id, _answered_correctly, _prior_question_elapsed_time,
         _prior_question_had_explanation, _content_type_id, _timestamp) in tk0:

        # Unknown content falls back to a mean of 0.
        if _content_id in content_target_mean_dict:
            _content_target_mean = content_target_mean_dict[_content_id]
        else:
            _content_target_mean = 0

        if _content_type_id == False:
            # NOTE(review): _bundle_id is looked up but not used below.
            _bundle_id = question_bundle_dict[_content_id]
            _part = question_part_dict[_content_id]
            _tags = question_tags_dict[_content_id]
            # print('_content_id: ', _content_id)
            # print('_part: ', _part)

            # user_cnt
            user_cnt_npy[idx] = user_cnt_dict[_user_id]
            user_cnt_dict[_user_id] += 1
            # user_pos_cnt
            user_pos_cnt_npy[idx] = user_pos_cnt_dict[_user_id]
            if update:
                user_pos_cnt_dict[_user_id] += _answered_correctly
            _user_part = str(_user_id) + '_' + str(_part)
            # user_part_cnt
            user_part_cnt_npy[idx] = user_part_cnt_dict[_user_part]
            user_part_cnt_dict[_user_part] += 1
            # user_part_pos_cnt
            user_part_pos_cnt_npy[idx] = user_part_pos_cnt_dict[_user_part]
            if update:
                user_part_pos_cnt_dict[_user_part] += _answered_correctly
            # user_content_cnt
            # _user_content = str(_user_id) + '_' + str(_content_id)
            # Integer key combining user and content (user_id * 100000 + content_id).
            _user_content = np.int64(_user_id) * 10_0000 + _content_id
            if isTrain:
                user_content_cnt_npy[idx] = user_content_cnt_dict[_user_content]
                user_content_cnt_dict[_user_content] += 1
            else:
                # Historic count from the cached frame plus what this run has added.
                if _user_content in user_content_feat_df.index:
                    user_content_cnt_npy[idx] = user_content_feat_df.loc[_user_content]['user_content_cnt'] + user_content_cnt_dict[_user_content]
                else:
                    user_content_cnt_npy[idx] = user_content_cnt_dict[_user_content]
                user_content_cnt_dict[_user_content] += 1
            # user_content_pos_cnt
            if isTrain:
                user_content_pos_cnt_npy[idx] = user_content_pos_cnt_dict[_user_content]
                if update:
                    user_content_pos_cnt_dict[_user_content] += _answered_correctly
            else:
                # In this branch the positive count is only read, never updated,
                # regardless of `update`.
                if _user_content in user_content_feat_df.index:
                    user_content_pos_cnt_npy[idx] = user_content_feat_df.loc[_user_content]['user_content_pos_cnt'] + user_content_pos_cnt_dict[_user_content]
                else:
                    user_content_pos_cnt_npy[idx] = user_content_pos_cnt_dict[_user_content]
            # user_content_redo_cnt
            user_content_redo_cnt_npy[idx] = user_content_redo_cnt_dict[_user_id]
            # NOTE(review): user_content_cnt_dict[_user_content] was incremented
            # just above, so the `> 0` checks on it below appear to be always
            # true -- confirm whether the check was meant to run before the
            # increment (changing it would alter the cached feature values).
            if isTrain:
                if user_content_cnt_dict[_user_content] > 0:
                    user_content_redo_cnt_dict[_user_id] += 1
            else:
                if _user_content in user_content_feat_df.index:
                    if user_content_feat_df.loc[_user_content]['user_content_cnt'] > 0:
                        user_content_redo_cnt_dict[_user_id] += 1
                else:
                    if user_content_cnt_dict[_user_content] > 0:
                        user_content_redo_cnt_dict[_user_id] += 1
            # user_content_mean_mean (stored as a running sum; divided by user_cnt after the loop)
            user_content_mean_mean_npy[idx] = user_content_mean_sum_dict[_user_id]
            user_content_mean_sum_dict[_user_id] += _content_target_mean
            # user_consecutive_pos_cnt
            user_consecutive_pos_cnt_npy[idx] = user_consecutive_pos_cnt_dict[_user_id]
            if update:
                if _answered_correctly:
                    user_consecutive_pos_cnt_dict[_user_id] += 1
                else:
                    user_consecutive_pos_cnt_dict[_user_id] = 0
            # user_pos_cnt_win25 (sum of the last 25 labels; the 25 is hard-coded here)
            user_pos_cnt_win25_npy[idx] = sum(user_target_win25_dict[_user_id])
            if update:
                user_target_win25_dict[_user_id].append(_answered_correctly)
                if len(user_target_win25_dict[_user_id]) > 25:
                    tmp = user_target_win25_dict[_user_id]
                    user_target_win25_dict[_user_id] = tmp[-25:]
            # user_content_mean_win10
            div_num = len(user_content_mean_win10_dict[_user_id])
            if div_num > 0:
                user_content_mean_win10_npy[idx] = sum(user_content_mean_win10_dict[_user_id]) / div_num
            user_content_mean_win10_dict[_user_id].append(_content_target_mean)
            if len(user_content_mean_win10_dict[_user_id]) > 10:
                tmp = user_content_mean_win10_dict[_user_id]
                user_content_mean_win10_dict[_user_id] = tmp[-10:]
            # user_explanation_cnt
            user_explanation_cnt_npy[idx] = user_explanation_cnt_dict[_user_id]
            user_explanation_cnt_dict[_user_id] += _prior_question_had_explanation
            # user_explanation_pos_cnt
            user_explanation_pos_cnt_npy[idx] =user_explanation_pos_cnt_dict[_user_id]
            if update:
                if _answered_correctly:
                    user_explanation_pos_cnt_dict[_user_id] += _prior_question_had_explanation
            # user_elapse_time_mean (stored as a running sum; divided by user_cnt after the loop)
            user_elapse_time_mean_npy[idx] = user_elapse_time_sum_dict[_user_id]
            user_elapse_time_sum_dict[_user_id] += _prior_question_elapsed_time
            # user_elapse_time_mean_win10
            div_num = len(user_elapse_time_win10_dict[_user_id])
            if div_num > 0:
                user_elapse_time_mean_win10_npy[idx] = sum(user_elapse_time_win10_dict[_user_id]) / div_num
            user_elapse_time_win10_dict[_user_id].append(_prior_question_elapsed_time)
            if len(user_elapse_time_win10_dict[_user_id]) > 10:
                tmp = user_elapse_time_win10_dict[_user_id]
                user_elapse_time_win10_dict[_user_id] = tmp[-10:]
            # user_last_timespan
            # For a user not seen before, a defaultdict(int) state yields 0 here,
            # so time_diff equals the row's timestamp.
            time_diff = _timestamp - user_last_timestamp_dict[_user_id]
            if _content_id in bundle_mapping:
                # is bundle
                if len(bundle_state_dict[_user_id]) > 1:
                    # previous is also bundle
                    previous_bundle_id, previous_diff = bundle_state_dict[_user_id]
                    if bundle_mapping[_content_id][0] == previous_bundle_id:
                        # current = previous: reuse the per-question share computed for the bundle
                        time_diff = previous_diff
                    else:
                        # current != previous: split the gap evenly over the bundle's questions
                        time_diff = time_diff // bundle_mapping[_content_id][1]
                        bundle_state_dict[_user_id] = [bundle_mapping[_content_id][0], time_diff]
                else:
                    # previous is not bundle
                    time_diff = time_diff // bundle_mapping[_content_id][1]
                    bundle_state_dict[_user_id] = [bundle_mapping[_content_id][0], time_diff]
            else:
                # not bundle, clear state
                bundle_state_dict[_user_id] = []
            user_last_timespan_npy[idx] = time_diff
            user_last_timestamp_dict[_user_id] = _timestamp
            # user_last_task_diff
            user_last_task_diff_npy[idx] = _task_container_id - user_last_task_dict[_user_id]
            user_last_task_dict[_user_id] = _task_container_id
            # user_content_appear_in_win5
            user_content_appear_in_win5_npy[idx] = _content_id in user_content_win5_dict[_user_id]
            user_content_win5_dict[_user_id].append(_content_id)
            if len(user_content_win5_dict[_user_id]) > 5:
                del user_content_win5_dict[_user_id][0]
            # user_part_cnt_in_win10
            user_part_cnt_in_win10_npy[idx] = user_part_win10_dict[_user_id].count(_part)
            user_part_win10_dict[_user_id].append(_part)
            if len(user_part_win10_dict[_user_id]) > 10:
                del user_part_win10_dict[_user_id][0]

            # # user_order_in_session
            # if time_diff > 5 * 60:
            #     user_order_in_session_dict[_user_id] = 1
            # else:
            #     user_order_in_session_dict[_user_id] += 1
            # user_order_in_session_npy[idx] = user_order_in_session_dict[_user_id]

            # user_tags_feat
            # div_num counts the tags of this question the user has met before.
            div_num = 0
            for _tag in _tags:
                _user_tag = str(_user_id) + '_' + str(_tag)
                user_tags_cnt_mean_npy[idx] += user_tags_cnt_dict[_user_tag]
                if user_tags_cnt_dict[_user_tag] > 0:
                    div_num += 1

                # NOTE(review): the rate is only added for counts > 1 while
                # div_num counts tags with count > 0 -- confirm the thresholds
                # are meant to differ.
                if user_tags_cnt_dict[_user_tag] > 1:
                    _pos_rate = user_tags_pos_cnt_dict[_user_tag] / user_tags_cnt_dict[_user_tag]
                    user_tags_pos_rate_npy[idx] += _pos_rate

                user_tags_cnt_dict[_user_tag] += 1
                if update:
                    user_tags_pos_cnt_dict[_user_tag] += _answered_correctly
            if div_num > 0:
                # The count mean uses floor division, the rate mean true division.
                user_tags_cnt_mean_npy[idx] //= div_num
                user_tags_pos_rate_npy[idx] /= div_num
            # user_cum_time
            # Only gaps of at most 5 * 60 are accumulated (in the units of
            # `timestamp`; presumably seconds -- confirm).
            if time_diff <= 5 * 60:
                user_cum_time_dict[_user_id] += time_diff
            user_cum_time_npy[idx] = user_cum_time_dict[_user_id]
            # user_timespan_win10
            # Unlike most features above, the window is updated before it is read,
            # so the current gap is included in the mean.
            if time_diff <= 5 * 60:
                user_timespan_win10_dict[_user_id].append(time_diff)
                if len(user_timespan_win10_dict[_user_id]) > 10:
                    del user_timespan_win10_dict[_user_id][0]
            div_num = len(user_timespan_win10_dict[_user_id])
            if div_num > 0:
                user_timespan_win10_mean_npy[idx] = sum(user_timespan_win10_dict[_user_id]) / div_num

            # user_continue_quest_cnt
            user_continue_quest_cnt_npy[idx] = user_continue_quest_cnt_dict[_user_id]
            user_continue_quest_cnt_dict[_user_id] += 1

            idx += 1
        else:
            # Lecture row.  NOTE(review): _tag, _part, _type_of and _user_part
            # are computed but not used; the dict lookups raise KeyError for a
            # content id missing from the lecture tables.
            _tag = lecture_tag_dict[_content_id]
            _part = lecture_part_dict[_content_id]
            _type_of = lecture_type_dict[_content_id]

            _user_part = str(_user_id) + '_' + str(_part)

            # user_continue_quest_cnt: a lecture breaks the run of consecutive questions
            user_continue_quest_cnt_dict[_user_id] = 0


    feats_df = pd.DataFrame({
        'user_cnt': user_cnt_npy,
        'user_pos_cnt': user_pos_cnt_npy,
        'user_part_cnt': user_part_cnt_npy,
        'user_part_pos_cnt': user_part_pos_cnt_npy,
        'user_content_cnt': user_content_cnt_npy,
        'user_content_pos_cnt': user_content_pos_cnt_npy,
        'user_content_redo_cnt': user_content_redo_cnt_npy,
        'user_content_mean_mean': user_content_mean_mean_npy,
        'user_consecutive_pos_cnt': user_consecutive_pos_cnt_npy,
        'user_pos_cnt_win25': user_pos_cnt_win25_npy,
        'user_content_mean_win10': user_content_mean_win10_npy,
        'user_explanation_cnt': user_explanation_cnt_npy,
        'user_explanation_pos_cnt': user_explanation_pos_cnt_npy,
        'user_elapse_time_mean': user_elapse_time_mean_npy,
        'user_elapse_time_mean_win10': user_elapse_time_mean_win10_npy,
        'user_last_timespan': user_last_timespan_npy,
        'user_last_task_diff': user_last_task_diff_npy,
        'user_content_appear_in_win5': user_content_appear_in_win5_npy,
        'user_part_cnt_in_win10': user_part_cnt_in_win10_npy,
        # 'user_order_in_session': user_order_in_session_npy,
        'user_cum_time': user_cum_time_npy,
        'user_timespan_win10_mean': user_timespan_win10_mean_npy,

        'user_tags_cnt_mean': user_tags_cnt_mean_npy,
        'user_tags_pos_rate': user_tags_pos_rate_npy,

        'user_continue_quest_cnt': user_continue_quest_cnt_npy,
    })
    # Derived ratios; the running sums collected in the loop become means here.
    feats_df['user_target_mean'] = feats_df['user_pos_cnt'] / feats_df['user_cnt']
    feats_df['user_part_target_mean'] = feats_df['user_part_pos_cnt'] / feats_df['user_part_cnt']
    feats_df['user_content_target_mean'] = feats_df['user_content_pos_cnt'] / feats_df['user_content_cnt']
    feats_df['user_content_mean_mean'] /= feats_df['user_cnt']

    feats_df['user_explanation_mean'] = feats_df['user_explanation_cnt'] / feats_df['user_cnt']
    feats_df['user_explanation_rate'] = feats_df['user_explanation_pos_cnt'] / feats_df['user_explanation_cnt']

    feats_df['user_elapse_time_mean'] /= feats_df['user_cnt']

    # The raw explanation counters were only needed for the two ratios above.
    feats_df = feats_df.drop(['user_explanation_cnt', 'user_explanation_pos_cnt'], axis=1)
    return feats_df


def update_user_feats(df, user_content_pos_cnt_dict,
                      user_pos_cnt_dict, user_part_pos_cnt_dict, user_consecutive_pos_cnt_dict, user_target_win25_dict,
                      user_explanation_pos_cnt_dict, user_tags_pos_cnt_dict
                      ):
    """Fold the answers of an already-served batch into the "positive" state dictionaries.

    Rows are read from ``df[used_cols]``; ``used_cols`` (module level) must yield the
    eight fields unpacked below, in that order. Part and tags of a question are looked
    up in the module-level ``question_part_dict`` / ``question_tags_dict``.

    Args:
        df: batch whose ``answered_correctly`` values are now known.
        user_content_pos_cnt_dict: correct answers per (user, content) key.
        user_pos_cnt_dict: correct answers per user.
        user_part_pos_cnt_dict: correct answers per ``"<user>_<part>"`` key.
        user_consecutive_pos_cnt_dict: current streak of correct answers per user.
        user_target_win25_dict: per user list of the most recent answers (max 25).
        user_explanation_pos_cnt_dict: per user sum of ``prior_question_had_explanation``
            over correctly answered rows.
        user_tags_pos_cnt_dict: correct answers per ``"<user>_<tag>"`` key.

    Returns:
        None. All dictionaries are updated in place.
    """
    for (_user_id, _content_id, _task_container_id, _answered_correctly, _prior_question_elapsed_time,
         _prior_question_had_explanation, _content_type_id, _timestamp) in df[used_cols].values:

        # Lecture rows have no answer and update nothing. The lecture/bundle lookups
        # that used to run here were never used and could only raise KeyError.
        if _content_type_id:
            continue

        _part = question_part_dict[_content_id]
        _tags = question_tags_dict[_content_id]

        # user_pos_cnt
        user_pos_cnt_dict[_user_id] += _answered_correctly

        # user_part_pos_cnt
        _user_part = str(_user_id) + '_' + str(_part)
        user_part_pos_cnt_dict[_user_part] += _answered_correctly

        # user_content_pos_cnt: key packs user id and content id into one number
        _user_content = np.int64(_user_id) * 10_0000 + _content_id
        user_content_pos_cnt_dict[_user_content] += _answered_correctly

        if _answered_correctly:
            # user_consecutive_pos_cnt: streak grows on a correct answer
            user_consecutive_pos_cnt_dict[_user_id] += 1
            # user_explanation_pos_cnt: only counted for correct answers
            user_explanation_pos_cnt_dict[_user_id] += _prior_question_had_explanation
        else:
            # user_consecutive_pos_cnt: streak resets on a wrong answer
            user_consecutive_pos_cnt_dict[_user_id] = 0

        # user_target_win25_dict: keep only the 25 most recent answers
        _window = user_target_win25_dict[_user_id]
        _window.append(_answered_correctly)
        if len(_window) > 25:
            user_target_win25_dict[_user_id] = _window[-25:]

        # user_tags_pos_cnt_dict
        for _tag in _tags:
            _user_tag = str(_user_id) + '_' + str(_tag)
            user_tags_pos_cnt_dict[_user_tag] += _answered_correctly

if OFFLINE:
    # Both passes receive the very same state objects, so they are listed once and
    # splatted into each call. Train runs first, valid second.
    # (user_order_in_session_dict is intentionally not passed.)
    _loop_state_kwargs = dict(
        content_target_mean_dict=content_target_mean_dict,
        user_cnt_dict=user_cnt_dict,
        user_pos_cnt_dict=user_pos_cnt_dict,
        user_part_cnt_dict=user_part_cnt_dict,
        user_part_pos_cnt_dict=user_part_pos_cnt_dict,
        user_content_cnt_dict=user_content_cnt_dict,
        user_content_pos_cnt_dict=user_content_pos_cnt_dict,
        user_content_redo_cnt_dict=user_content_redo_cnt_dict,
        user_content_mean_sum_dict=user_content_mean_sum_dict,
        user_consecutive_pos_cnt_dict=user_consecutive_pos_cnt_dict,
        user_target_win25_dict=user_target_win25_dict,
        user_content_mean_win10_dict=user_content_mean_win10_dict,
        user_explanation_cnt_dict=user_explanation_cnt_dict,
        user_explanation_pos_cnt_dict=user_explanation_pos_cnt_dict,
        user_elapse_time_sum_dict=user_elapse_time_sum_dict,
        user_elapse_time_win10_dict=user_elapse_time_win10_dict,
        user_last_timestamp_dict=user_last_timestamp_dict,
        user_last_task_dict=user_last_task_dict,
        user_content_win5_dict=user_content_win5_dict,
        user_part_win10_dict=user_part_win10_dict,
        bundle_state_dict=bundle_state_dict,
        user_cum_time_dict=user_cum_time_dict,
        user_timespan_win10_dict=user_timespan_win10_dict,
        user_tags_cnt_dict=user_tags_cnt_dict,
        user_tags_pos_cnt_dict=user_tags_pos_cnt_dict,
        user_continue_quest_cnt_dict=user_continue_quest_cnt_dict,
    )

    user_feat_train = make_user_loop_features(df=train, update=True, isTrain=True, **_loop_state_kwargs)
    user_feat_train = reduce_mem_usage(user_feat_train, verbose=True)

    user_feat_valid = make_user_loop_features(df=valid, update=True, isTrain=True, **_loop_state_kwargs)
    user_feat_valid = reduce_mem_usage(user_feat_valid, verbose=True)

    # The raw frames are no longer needed once the features exist.
    del train, valid, _loop_state_kwargs
    gc.collect()

    # Record the user feature names in the config so a later run can read them back.
    user_feat_cols = user_feat_train.columns.values.tolist()
    config_dict['user_feat_cols'] = user_feat_cols
user_feat_cols = config_dict['user_feat_cols']

########################################################################################################################
##### Content Static Feat
def make_content_feat2(id_df, df, type):
    """Compute the per-content median of ``user_last_timespan`` and cache it to disk.

    Args:
        id_df: frame whose ``content_id`` column is aligned row-by-row (by position)
            with ``df``.
        df: frame holding a ``user_last_timespan`` column. It is not modified.
        type: tag inserted into the cache file name ``content_feat2_{type}.pkl``.
            (The name shadows the builtin but is kept because callers pass it by keyword.)

    Returns:
        DataFrame with columns ``content_id`` and ``content_timespan_median``, passed
        through ``reduce_mem_usage``. The pickled copy is written before that step.
    """
    file_name = f'content_feat2_{type}.pkl'

    # Build a two-column frame instead of adding ``content_id`` to the caller's ``df``.
    timespan_df = pd.DataFrame({
        'content_id': id_df['content_id'].values,
        'user_last_timespan': df['user_last_timespan'].values,
    })

    # The aggregated column must be renamed from its real name. Renaming
    # 'answered_correctly' was a silent no-op, which left the result named
    # 'user_last_timespan' and colliding with the user feature of the same name.
    feat_df = timespan_df.groupby('content_id', as_index=False)['user_last_timespan'].median(). \
        rename(columns={'user_last_timespan': 'content_timespan_median'})

    save_pickle(feat_df, save_path=f'{CACHE_PATH}/{file_name}')
    feat_df = reduce_mem_usage(feat_df, verbose=True)
    return feat_df


if not OFFLINE:
    # Online run: reuse the table cached by an earlier offline run.
    content_feat2_test = reduce_mem_usage(
        load_pickle(f'{CACHE_PATH}/content_feat2_test.pkl'), verbose=True
    )
else:
    # Train-only table (used for both train and valid features) ...
    content_feat2 = make_content_feat2(
        id_df=ques_train,
        df=user_feat_train.copy(deep=True),
        type='train',
    )
    # ... and the train+valid table that is cached for inference.
    content_feat2_test = make_content_feat2(
        id_df=pd.concat([ques_train, ques_valid]),
        df=pd.concat([user_feat_train, user_feat_valid]),
        type='test',
    )
    print('content_feat2:\n', content_feat2.head())
# Every column except the join key is a feature.
content_feat2_cols = [name for name in content_feat2_test.columns if name != 'content_id']
########################################################################################################################
##### Merge Feat
def merge_features(df, static_feat, content_feat, content_feat2, user_feat):
    """Assemble one feature frame from static, per-content and per-user features.

    Starts from a deep copy of ``static_feat``. Each content table is indexed by
    ``content_id`` and re-ordered to follow ``df['content_id']``; its values are
    written under ``content_feat_cols`` / ``content_feat2_cols``. User features
    are then copied column by column (by position) for every name in
    ``user_feat_cols``.

    Returns:
        The assembled DataFrame; none of the inputs is modified.
    """
    merged = static_feat.copy(deep=True)
    row_content_ids = df['content_id'].values

    content_tables = (
        (content_feat_cols, content_feat),
        (content_feat2_cols, content_feat2),
    )
    for target_cols, table in content_tables:
        merged[target_cols] = table.set_index('content_id').reindex(row_content_ids).values

    for name in user_feat_cols:
        merged[name] = user_feat[name].values

    return merged
# cross_feat_cols = ['timestamp_pos_cnt_mean']

if OFFLINE:
    # Build the training matrix, then drop its inputs before shrinking dtypes.
    train_feat = merge_features(df=ques_train,
                                static_feat=state_feat_train,
                                content_feat=content_feat,
                                content_feat2=content_feat2,
                                # part_feat=part_feat,
                                user_feat=user_feat_train)
    del state_feat_train, user_feat_train
    gc.collect()
    train_feat = reduce_mem_usage(train_feat, verbose=True)

    # The validation matrix uses the same content tables as training
    # (content_feat2 here is the table built with type='train').
    valid_feat = merge_features(df=ques_valid,
                                static_feat=state_feat_valid,
                                content_feat=content_feat,
                                content_feat2=content_feat2,
                                # part_feat=part_feat,
                                user_feat=user_feat_valid)
    # content_feat is not needed after this point in the offline path.
    del state_feat_valid, user_feat_valid, content_feat
    gc.collect()
    valid_feat = reduce_mem_usage(valid_feat, verbose=True)
    # print('train_feat: ', len(train_feat))
    # train_feat.to_csv('train_feat2.csv', index=False)

# Final model input: the four feature groups concatenated in this order.
# NOTE(review): this is a plain list concatenation, so a name that occurs in more
# than one group would appear more than once in feat_cols - confirm the groups are disjoint.
feat_cols = static_feat_cols + content_feat_cols + content_feat2_cols + user_feat_cols# + cross_feat_cols# + part_feat_cols
print(f'Feat Nums: {len(feat_cols)}')
# ########################################################################################################################
# ##### Cross Feat
# if OFFLINE:
#     train['answered_correctly_avg_u_c'] = train['answered_correctly_avg_u'] * train['answered_correctly_avg_c'] / \
#                                           (train['answered_correctly_avg_u'] + train['answered_correctly_avg_c'])
#     valid['answered_correctly_avg_u_c'] = valid['answered_correctly_avg_u'] * valid['answered_correctly_avg_c'] / \
#                                           (valid['answered_correctly_avg_u'] + valid['answered_correctly_avg_c'])
#
########################################################################################################################
##### Train Model
TARGET = 'answered_correctly'
if not OFFLINE:
    # Online run: only the trained booster is needed.
    model = lgb.Booster(model_file=f'{CACHE_PATH}/model.txt')
    print('load model finished')
else:
    # Keep just the label columns; the question frames themselves can go.
    train_labels = ques_train[TARGET]
    valid_labels = ques_valid[TARGET]
    del ques_train, ques_valid
    gc.collect()

    lgb_train = lgb.Dataset(train_feat[feat_cols], train_labels, categorical_feature=['content_id'])
    lgb_valid = lgb.Dataset(valid_feat[feat_cols], valid_labels, categorical_feature=['content_id'])
    del train_feat, train_labels
    gc.collect()

    params = dict(
        objective='binary',
        seed=28,
        num_leaves=256,
        max_bin=1024,
        feature_fraction=0.6,
        max_depth=8,
        verbose=-1,
        cat_l2=30,
        cat_smooth=20,
        num_threads=20,
    )

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_valid,
        verbose_eval=50,
        num_boost_round=10000,
        early_stopping_rounds=50,
    )
    model.save_model(f'{CACHE_PATH}/model.txt')

    # Feature importance report, most important first.
    feature_importance_df = pd.DataFrame({
        'feature': feat_cols,
        'importance': model.feature_importance(),
    })
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
    feature_importance_df.to_csv(f'{CACHE_PATH}/feature_importance.csv', index=False)

    valid_score = roc_auc_score(valid_labels, model.predict(valid_feat[feat_cols]))
    print(f'auc: {valid_score:.4f}')

    ### save user_content feat
    # The two (user, content) counters are stored together as one indexed frame.
    user_content_feat_df = pd.DataFrame()
    user_content_feat_df['user_content_id'] = user_content_cnt_dict.keys()
    user_content_feat_df['user_content_cnt'] = user_content_cnt_dict.values()

    user_content_pos_cnt_feat_buff = pd.DataFrame()
    user_content_pos_cnt_feat_buff['user_content_id'] = user_content_pos_cnt_dict.keys()
    user_content_pos_cnt_feat_buff['user_content_pos_cnt'] = user_content_pos_cnt_dict.values()

    user_content_feat_df = user_content_feat_df.merge(
        user_content_pos_cnt_feat_buff, on='user_content_id', how='left'
    ).set_index('user_content_id')
    del user_content_pos_cnt_feat_buff
    user_content_feat_df.to_pickle(f'{CACHE_PATH}/user_content_feat.pkl')

    save_pickle(config_dict, config_file)

    # Each remaining state object is pickled to '<CACHE_PATH>/<its variable name>.pkl'.
    # (user_order_in_session_dict is intentionally not persisted.)
    _persisted_state = (
        ('user_cnt_dict', user_cnt_dict),
        ('user_pos_cnt_dict', user_pos_cnt_dict),
        ('user_part_cnt_dict', user_part_cnt_dict),
        ('user_part_pos_cnt_dict', user_part_pos_cnt_dict),
        ('user_content_redo_cnt_dict', user_content_redo_cnt_dict),
        ('user_content_mean_sum_dict', user_content_mean_sum_dict),
        ('user_consecutive_pos_cnt_dict', user_consecutive_pos_cnt_dict),
        ('user_target_win25_dict', user_target_win25_dict),
        ('user_content_mean_win10_dict', user_content_mean_win10_dict),
        ('user_explanation_cnt_dict', user_explanation_cnt_dict),
        ('user_explanation_pos_cnt_dict', user_explanation_pos_cnt_dict),
        ('user_elapse_time_sum_dict', user_elapse_time_sum_dict),
        ('user_elapse_time_win10_dict', user_elapse_time_win10_dict),
        ('user_last_timestamp_dict', user_last_timestamp_dict),
        ('user_last_task_dict', user_last_task_dict),
        ('user_content_win5_dict', user_content_win5_dict),
        ('user_part_win10_dict', user_part_win10_dict),
        ('bundle_state_dict', bundle_state_dict),
        ('user_cum_time_dict', user_cum_time_dict),
        ('user_timespan_win10_dict', user_timespan_win10_dict),
        ('user_tags_cnt_dict', user_tags_cnt_dict),
        ('user_tags_pos_cnt_dict', user_tags_pos_cnt_dict),
        ('user_continue_quest_cnt_dict', user_continue_quest_cnt_dict),
    )
    for _state_name, _state_obj in _persisted_state:
        save_pickle(_state_obj, f'{CACHE_PATH}/{_state_name}.pkl')
    del _persisted_state, _state_name, _state_obj


########################################################################################################################
##### Inference
class Iter_Valid(object):
    """Offline stand-in for the competition test iterator.

    Iterating yields ``(df, sample_df)`` pairs. ``df`` is the next batch of rows of
    the source frame; its first row carries the previous batch's answers in
    ``prior_group_responses`` / ``prior_group_answers_correct`` (``"[]"`` for the
    first batch). ``sample_df`` holds ``row_id`` and a zero ``answered_correctly``
    for the question rows (``content_type_id == 0``) of that same batch.

    Args:
        df: frame with ``row_id``, ``user_id``, ``task_container_id``,
            ``content_type_id``, ``user_answer`` and ``answered_correctly`` columns.
            It is re-indexed into a new frame; the caller's object is not modified.
        max_user: maximum number of distinct users placed into one batch.
    """

    def __init__(self, df, max_user=1000):
        df = df.reset_index(drop=True)
        self.df = df
        # Answers are kept as strings because they are only ever joined into text.
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # Question rows only. The index keeps the row labels of ``self.df`` so that
        # batches can later be cut out by label.
        self.sample_df = df.loc[df['content_type_id'] == 0, ['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        """Cut out rows ``pre_start .. self.current - 1`` as one batch.

        The previous batch's answers are written into the first row of the batch,
        and the answers passed in are remembered for the next call.

        Returns:
            ``(df, sample_df)`` copies for the batch.
        """
        df = self.df[pre_start:self.current].copy()
        # ``self.sample_df`` has had lecture rows removed, so a positional slice with
        # full-frame offsets would pick rows from the wrong batch. Slice by label
        # (inclusive on both ends) instead.
        sample_df = self.sample_df.loc[pre_start:self.current - 1].copy()
        df.loc[pre_start, 'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start, 'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (
                    crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # known user (not the previous user, or a different task container
                # with both rows being questions): the batch ends before this row
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                # The batch is full: only rows that continue the last user's
                # task container, or are lectures of that user, may still join.
                if crr_user_id == pre_added_user and (
                        crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        # End of data: flush whatever is left, then stop.
        if pre_start < self.current:
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()


if not OFFLINE:
    # Online: use the competition environment for batches and submission.
    import riiideducation

    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
else:
    # Offline: replay the validation pickle and collect the predictions locally.
    target_df = pd.read_pickle(f'{MY_DATA_PATH}/valid.pickle')
    iter_test = Iter_Valid(target_df, max_user=1000)
    predicted = []


    def set_predict(df):
        """Collect one batch of predictions in the module-level ``predicted`` list."""
        predicted.append(df)

# Batch-by-batch inference. For every batch: (1) fold the now-revealed answers of
# the previous batch into the user state, (2) build features for the current
# batch, (3) predict and hand the predictions to set_predict.
previous_test_df = None
idx = 0
for (test_df, sample_prediction_df) in iter_test:
    print('idx: ', idx)
    # Placeholder target; the real answers only arrive with the next batch.
    test_df[TARGET] = 0
    # Rescale both time columns by 1000 (presumably ms -> s; confirm against the
    # preprocessing applied to the training data).
    test_df['prior_question_elapsed_time'] //= 1000
    test_df['timestamp'] /= 1000
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('int8')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean).astype(np.int32)
#     test_df.loc[test_df['content_id'] > 13522, 'content_id'] = 13522
    test_df = reduce_mem_usage(test_df, verbose=True)
    # test_df['day'] = test_df['timestamp'] // 1000 // 60 // 60# // 24
    # Question rows only: these are the rows that receive a prediction.
    ques_test = test_df.loc[test_df.content_type_id == False, ['row_id', 'content_id']].reset_index(drop=True)
    ques_test = reduce_mem_usage(ques_test, verbose=True)
    if previous_test_df is not None:
        # print('np.array(eval(test_df["prior_group_answers_correct"].iloc[0])):\n', np.array(eval(test_df["prior_group_answers_correct"].iloc[0])))
        # The first row of the current batch carries the previous batch's answers
        # as a list literal in text form.
        # NOTE(review): eval() executes whatever text the iterator supplies;
        # ast.literal_eval would parse the same list literal without running code.
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])

        # Must run before the features of the current batch are computed.
        update_user_feats(
            df=previous_test_df, user_content_pos_cnt_dict=user_content_pos_cnt_dict,
            user_pos_cnt_dict=user_pos_cnt_dict,
            user_part_pos_cnt_dict=user_part_pos_cnt_dict,
            user_consecutive_pos_cnt_dict=user_consecutive_pos_cnt_dict,
            user_target_win25_dict=user_target_win25_dict,
            user_explanation_pos_cnt_dict=user_explanation_pos_cnt_dict,
            user_tags_pos_cnt_dict=user_tags_pos_cnt_dict,
        )
    # Remember this batch (all rows, lectures included) for the next iteration.
    previous_test_df = test_df.copy(deep=True)

    state_feat_test = get_stat_feat(df=test_df.copy(deep=True), feat_cols=static_feat_cols)
    state_feat_test = reduce_mem_usage(state_feat_test, verbose=True)
    # for _part in part_elapse_time_mean_dict:
    #     state_feat_test.loc[state_feat_test.part == _part, ['prior_question_elapsed_time']] = \
    #         state_feat_test.loc[state_feat_test.part == _part, ['prior_question_elapsed_time']].fillna(part_elapse_time_mean_dict[_part])

    # Same state dictionaries as in training, but with update=False / isTrain=False.
    # NOTE(review): the exact effect of these two flags is defined in
    # make_user_loop_features, which is outside this section - confirm there.
    user_feat_test = make_user_loop_features(
        df=test_df, content_target_mean_dict=content_target_mean_dict,
        user_cnt_dict=user_cnt_dict,
        user_pos_cnt_dict=user_pos_cnt_dict,
        user_part_cnt_dict=user_part_cnt_dict,
        user_part_pos_cnt_dict=user_part_pos_cnt_dict,
        user_content_cnt_dict=user_content_cnt_dict,
        user_content_pos_cnt_dict=user_content_pos_cnt_dict,
        user_content_redo_cnt_dict=user_content_redo_cnt_dict,
        user_content_mean_sum_dict=user_content_mean_sum_dict,
        user_consecutive_pos_cnt_dict=user_consecutive_pos_cnt_dict,
        user_target_win25_dict=user_target_win25_dict,
        user_content_mean_win10_dict=user_content_mean_win10_dict,
        user_explanation_cnt_dict=user_explanation_cnt_dict,
        user_explanation_pos_cnt_dict=user_explanation_pos_cnt_dict,
        user_elapse_time_sum_dict=user_elapse_time_sum_dict,
        user_elapse_time_win10_dict=user_elapse_time_win10_dict,
        user_last_timestamp_dict=user_last_timestamp_dict,
        user_last_task_dict=user_last_task_dict,
        user_content_win5_dict=user_content_win5_dict,
        user_part_win10_dict=user_part_win10_dict,
        bundle_state_dict=bundle_state_dict,
        # user_order_in_session_dict=user_order_in_session_dict,
        user_cum_time_dict=user_cum_time_dict,
        user_timespan_win10_dict=user_timespan_win10_dict,
        user_tags_cnt_dict=user_tags_cnt_dict,
        user_tags_pos_cnt_dict=user_tags_pos_cnt_dict,
        user_continue_quest_cnt_dict=user_continue_quest_cnt_dict,
        update=False, isTrain=False
    )
    test_feat = merge_features(df=ques_test,
                               static_feat=state_feat_test,
                               content_feat=content_feat_test,
                               content_feat2=content_feat2_test,
                               # part_feat=part_feat_test,
                               user_feat=user_feat_test)
    # test_feat = reduce_mem_usage(test_feat, verbose=True)

    # Predict from a plain 2-D array with one column per entry of feat_cols.
    ques_test[TARGET] = model.predict(test_feat[feat_cols].values.reshape(len(test_feat), len(feat_cols)))
    set_predict(ques_test[['row_id', TARGET]])

    idx += 1
    # Offline runs stop after five batches.
    if OFFLINE:
        if idx >= 5:
            break

# auc: 0.7265
# auc: 0.7274
# auc: 0.7527 add user 2feat
# auc: 0.7652
# auc: 0.7728 add user_last_timespan
# auc: 0.7733 add user_part_last_timespan
# auc: 0.7732 num_leaves32-->64 (all data auc: 0.7812, LB 0.780)
# auc: 0.7734 add user_lect_cnt
# auc: 0.7738 add user_part_lect_cnt
# auc: 0.7741 add user_last_target
# auc: 0.7743 add user_part_last_target
# auc: 0.7744 add user_last_part
# auc: 0.7752 add tag1234
# auc: 0.7755 add content_cnt
# auc: 0.7764 add user_last_task_diff
# auc: 0.7766 add content_bundle_same
# auc: 0.7775 add user_part_elapsed_time_mean
# auc: 0.7778 add user_part_last_elapsed_time_diff
# auc: 0.7781 add user_last_content_type
# auc: 0.7783 add user_same_content
# auc: 0.7784 add user_content_repeat_num
# auc: 0.7789 add tags_w2v_feat

# auc: 0.7742 del user_content_cnt
# auc: 0.7745 add user_lect_cnt_rate
# auc: 0.7794 add user_content_mean_mean all data auc: 0.7877
# auc: 0.7835 add user_content_cnt (all data auc: 0.7916, LB ??)
# auc: 0.7839 add user_last_pos_timespan
# auc: 0.7841 add user_part_last_pos_timespan

# auc: 0.7826 del tags_w2v (all data auc: 0.7916, LB 0.789)
# auc: 0.7828 add user_part_mean_mean
# auc: 0.7834 add part_content_num (all data auc: 0.7919, LB ??)
# auc: 0.7832 time/1000
# auc: 0.7834 explanation pos_cnt, target_mean
# auc: 0.7839 add user_pos_cnt 5/10/30
# auc: 0.7941 add_user_times
# auc: 0.7845 add user_tags cnt pos_rate
# auc: 0.7850 prior_question_elapsed_time fillna
# auc: 0.7853 add timestamp_pos_cnt_mean
# auc: 0.7858 add content_id
# auc: 0.7864 change params
# auc: 0.7872 change bugs
# auc: 0.7877 add user_continue_pos_cnt
# auc: 0.7866 del tags_feat
# auc: 0.7884 add user_5min_cnt user_5min_pos_cnt user_5min_target_mean
# auc: 0.7888 add user_30min_cnt user_30min_pos_cnt user_30min_target_mean
# auc: 0.7890 add user_2min_cnt user_2min_pos_cnt user_2min_target_mean
# auc: 0.7894 add content_timestamp_diff_median
# auc: 0.7898 add tags_feat

##############################
### new

# auc: 0.7626 user_cnt/pos_cnt/rate, user_content_cnt/pos_cnt/rate part prior_question_elapsed_time content_id
# auc: 0.7638 add user_continue_quest_cnt
# auc: 0.7675 add user_part_cnt/pos_cnt/rate
# auc: 0.7680 add user_content_redo_cnt
# auc: 0.7752 add user_content_mean_mean
# auc: 0.7757 add user_consecutive_pos_cnt
# auc: 0.7760 add user_pos_cnt_win25
# auc: 0.7762 add user_content_mean_win10
# auc: 0.7761 add user_explanation_mean/rate
# auc: 0.7764 add user_elapse_time_mean
# auc: 0.7764 add user_elapse_time_mean_win10
# auc: 0.7857 add user_last_timespan
# auc: 0.7867 add user_last_task_diff
# auc: 0.7865 del user_content_pos_cnt
# auc: 0.7872 add user_content_win5
# auc: 0.7873 add user_part_win10
# auc: 0.7874 del content_user_mean_mean content_explanation_mean
# auc: 0.7884 change timediff
# auc: 0.7884 change content_timespan_median
# auc: 0.7876 add user_order_in_session
# auc: 0.7887 add user_tags_cnt_mean user_tags_pos_rate
# auc: 0.7888 del user_order_in_session
# auc: 0.7894 add user_cum_time
# auc: 0.7901 add user_timespan_win10_mean(all data auc: 0.7980)