## Reading Data and Importing Libraries ##

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb

import random
import os


In [None]:
# Run configuration flags, in order: kernel, debug, feature_engineering.
init_state = [1, 0, 1]
kernel, debug, feature_engineering = init_state

# Seed shared by every random operation in this notebook.
SEED = 123

# Function to seed everything
def seed_everything(seed):
    """Seed the stdlib and NumPy global RNGs and export the seed as PYTHONHASHSEED.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``os.environ['PYTHONHASHSEED']``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Apply the notebook-wide seed to the global RNGs.
seed_everything(SEED)


In [None]:
def add_features(df, answer_correctly_u_count_dict, answer_u_count_dict, answer_uq_count_dict, answer_correct_uq_count_dict,
                 answer_incorrect_uq_count_dict, explanation_u_count_dict, elapsed_time_u_count_dict, answer_recent_n_correct_u_avg_dict,
                 answer_recent_u_timestamp_dict, answer_incorrect_recent_u_timestamp_dict, answer_q_count_dict, elapsed_time_q_total_dict,
                 recent_n=5, do_update=True, make_feature=True, inference=False):
    """Walk ``df`` row by row, emitting history features and/or updating the trackers.

    For every row the features are computed from the tracker state *before*
    that row is folded in, so a row never sees its own outcome.

    Args:
        df: Interaction rows. Must contain ``user_id``, ``content_id`` and
            ``timestamp``; unless ``inference`` is true it must also contain
            ``answered_correctly``, ``prior_question_had_explanation`` and
            ``prior_question_elapsed_time``.
        answer_correctly_u_count_dict: Per-user sum of ``answered_correctly``.
        answer_u_count_dict: Per-user number of processed rows.
        answer_uq_count_dict: Per-user, per-content number of processed rows.
        answer_correct_uq_count_dict: Accepted for interface compatibility; unused.
        answer_incorrect_uq_count_dict: Accepted for interface compatibility; unused.
        explanation_u_count_dict: Per-user sum of ``prior_question_had_explanation``.
        elapsed_time_u_count_dict: Per-user sum of ``prior_question_elapsed_time``.
        answer_recent_n_correct_u_avg_dict: Per-user running average of
            correctness, started once the user has ``recent_n`` rows.
        answer_recent_u_timestamp_dict: Per-user list holding the timestamp of
            the last processed row.
        answer_incorrect_recent_u_timestamp_dict: Per-user list holding the
            timestamp of the last row whose ``answered_correctly`` was falsy.
        answer_q_count_dict: Per-content number of processed rows.
        elapsed_time_q_total_dict: Per-content sum of ``prior_question_elapsed_time``.
        recent_n: Window size used by the running correctness average.
        do_update: When true, fold every row into the trackers (in place).
        make_feature: When true, compute the feature columns.
        inference: When true, only the three id/timestamp columns are read.
            The update step needs the label columns, so combine this with
            ``do_update=False``.

    All tracker arguments are indexed with keys that may not exist yet, so
    they must supply defaults for missing keys (the callers in this file pass
    ``defaultdict`` instances).

    Returns:
        ``df`` with the feature columns appended, or ``None`` when
        ``make_feature`` is false.
    """
    n_rows = len(df)
    answer_u_count = np.zeros(n_rows, dtype=np.int32)
    answer_correctly_u_count = np.zeros(n_rows, dtype=np.int32)
    answer_correctly_u_avg = np.zeros(n_rows, dtype=np.float32)
    explanation_u_avg = np.zeros(n_rows, dtype=np.float32)
    elapsed_time_u_avg = np.zeros(n_rows, dtype=np.float32)
    answer_recent_n_correct_u_avg = np.zeros(n_rows, dtype=np.float32)
    answer_recent_u_timestamp = np.zeros(n_rows, dtype=np.float32)
    answer_incorrect_recent_u_timestamp = np.zeros(n_rows, dtype=np.float32)
    elapsed_time_q_avg = np.zeros(n_rows, dtype=np.float32)
    answer_uq_count = np.zeros(n_rows, dtype=np.int32)

    if not inference:
        df_fields = ['user_id', 'content_id', 'answered_correctly', 'prior_question_had_explanation',
                     'prior_question_elapsed_time', 'timestamp']
    else:
        df_fields = ['user_id', 'content_id', 'timestamp']

    # Column name -> position inside each row of the extracted value matrix.
    fdict = {name: pos for pos, name in enumerate(df_fields)}
    user_col = fdict['user_id']
    content_col = fdict['content_id']
    timestamp_col = fdict['timestamp']

    for cnt, row in enumerate(tqdm(df[df_fields].values)):
        user = row[user_col]
        content = row[content_col]
        timestamp = row[timestamp_col]

        if make_feature:
            # User-level history.
            seen = answer_u_count_dict[user]
            n_correct = answer_correctly_u_count_dict[user]
            answer_u_count[cnt] = seen
            answer_correctly_u_count[cnt] = n_correct
            if seen:
                answer_correctly_u_avg[cnt] = n_correct / seen
                explanation_u_avg[cnt] = explanation_u_count_dict[user] / seen
                elapsed_time_u_avg[cnt] = elapsed_time_u_count_dict[user] / seen
            else:
                # No history yet: averages are undefined.
                answer_correctly_u_avg[cnt] = np.nan
                explanation_u_avg[cnt] = np.nan
                elapsed_time_u_avg[cnt] = np.nan

            # Question-level history.
            q_seen = answer_q_count_dict[content]
            elapsed_time_q_avg[cnt] = elapsed_time_q_total_dict[content] / q_seen if q_seen else np.nan

            # Until the running average exists, fall back to the overall average.
            if seen <= recent_n:
                answer_recent_n_correct_u_avg[cnt] = answer_correctly_u_avg[cnt]
            else:
                answer_recent_n_correct_u_avg[cnt] = answer_recent_n_correct_u_avg_dict[user]

            # Time since the user's previous row / previous incorrect row.
            last_seen = answer_recent_u_timestamp_dict[user]
            answer_recent_u_timestamp[cnt] = timestamp - last_seen[0] if last_seen else np.nan
            last_wrong = answer_incorrect_recent_u_timestamp_dict[user]
            answer_incorrect_recent_u_timestamp[cnt] = timestamp - last_wrong[0] if last_wrong else np.nan

            # User x question history.
            answer_uq_count[cnt] = answer_uq_count_dict[user][content]

        if do_update:
            # Read every label column first so a missing column fails before
            # any tracker has been touched for this row.
            answered_correctly = row[fdict['answered_correctly']]
            had_explanation = row[fdict['prior_question_had_explanation']]
            elapsed_time = row[fdict['prior_question_elapsed_time']]

            answer_u_count_dict[user] += 1
            answer_correctly_u_count_dict[user] += answered_correctly
            explanation_u_count_dict[user] += had_explanation
            elapsed_time_u_count_dict[user] += elapsed_time
            answer_q_count_dict[content] += 1
            elapsed_time_q_total_dict[content] += elapsed_time

            seen_now = answer_u_count_dict[user]
            if seen_now == recent_n:
                # Initialise the running average from the plain average.
                answer_recent_n_correct_u_avg_dict[user] = answer_correctly_u_count_dict[user] / seen_now
            elif seen_now > recent_n:
                answer_recent_n_correct_u_avg_dict[user] = (
                    answer_recent_n_correct_u_avg_dict[user] * (recent_n - 1) + answered_correctly
                ) / recent_n

            # Replace the stored "last row" timestamp.
            last_seen = answer_recent_u_timestamp_dict[user]
            if last_seen:
                last_seen.pop()
            last_seen.append(timestamp)

            answer_uq_count_dict[user][content] += 1

            if not answered_correctly:
                # Replace the stored "last incorrect row" timestamp.
                last_wrong = answer_incorrect_recent_u_timestamp_dict[user]
                if last_wrong:
                    last_wrong.pop()
                last_wrong.append(timestamp)

    if not make_feature:
        return None

    # The arrays are positional, so attach df's own index: concat(axis=1)
    # aligns on index labels and would otherwise misalign any frame that does
    # not carry a default 0..n-1 index.
    user_features_df = pd.DataFrame({'answer_correctly_u_count': answer_correctly_u_count, 'answer_u_count': answer_u_count,
                                     'answer_correctly_u_avg': answer_correctly_u_avg, 'answer_recent_n_correct_u_avg': answer_recent_n_correct_u_avg,
                                     'answer_recent_u_timestamp': answer_recent_u_timestamp, 'answer_incorrect_recent_u_timestamp': answer_incorrect_recent_u_timestamp,
                                     'answer_uq_count': answer_uq_count,
                                     'explanation_u_avg': explanation_u_avg,
                                     'elapsed_time_u_avg': elapsed_time_u_avg,
                                     'elapsed_time_q_avg': elapsed_time_q_avg
                                    }, index=df.index)

    df = pd.concat([df, user_features_df], axis=1)
    return df


In [None]:
def read_and_preprocess(kernel=False, debug=True, feature_engineering=True):
    """Load the CV split, clean it, and build history features for train and valid.

    Args:
        kernel: Chooses between the ``../input/...`` paths and the local
            ``./CVdata`` / ``./data`` paths.
        debug: When true, keep only the first 1,000,000 train rows and the
            first 10,000 valid rows.
        feature_engineering: When true (and ``debug`` is false), keep only the
            last 40,000,000 train rows.

    Returns:
        Tuple ``(train, valid, content_df, questions_df,
        prior_question_elapsed_time_mean, features_dicts)`` where
        ``features_dicts`` maps each tracker name to the tracker object after
        train and valid have been processed.
    """
    print("***** Read Data")
    if kernel:
        train_pickle, valid_pickle, question_file = (
            '../input/riiid-cross-validation-files/cv1_train.pickle',
            '../input/riiid-cross-validation-files/cv1_valid.pickle',
            '../input/riiid-test-answer-prediction/questions.csv',
        )
    else:
        train_pickle, valid_pickle, question_file = (
            './CVdata/cv1_train.pickle',
            './CVdata/cv1_valid.pickle',
            './data/questions.csv',
        )

    pickle_features = ['timestamp', 'row_id', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
    train = pd.read_pickle(train_pickle)[pickle_features]
    valid = pd.read_pickle(valid_pickle)[pickle_features]

    if debug:
        train, valid = train[:1000000], valid[:10000]
    elif feature_engineering:
        train = train.iloc[-40000000:]

    questions_df = pd.read_csv(question_file)[['question_id', 'part']]
    questions_df['part'] = questions_df['part'].astype(np.int32)

    def questions_with_part(frame):
        # Keep rows whose content_type_id is False, then attach the question's part.
        kept = frame.loc[frame.content_type_id == False].reset_index(drop=True)
        return pd.merge(kept, questions_df[['question_id', 'part']], left_on='content_id', right_on='question_id', how='left')

    train = questions_with_part(train)
    valid = questions_with_part(valid)

    # Missing elapsed times take the train mean; missing explanation flags become 0.
    prior_question_elapsed_time_mean = train.prior_question_elapsed_time.dropna().values.mean()
    for frame in (train, valid):
        frame['prior_question_elapsed_time'] = frame.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    for frame in (train, valid):
        frame['prior_question_had_explanation'] = frame.prior_question_had_explanation.fillna(False).astype('int8')

    # Mean correctness per content_id, computed on train and joined onto both frames.
    content_df = train[['content_id', 'answered_correctly']].groupby('content_id').mean().reset_index()
    content_df.columns = ['content_id', 'answer_correctly_q_avg']
    train = pd.merge(train, content_df, on=['content_id'], how="left")
    valid = pd.merge(valid, content_df, on=['content_id'], how="left")

    # Tracker state shared by both add_features passes. Keys are identical to
    # add_features' parameter names, so the mapping can be splatted as keywords.
    features_dicts = {
        'answer_correctly_u_count_dict': defaultdict(int),
        'answer_u_count_dict': defaultdict(int),
        'answer_uq_count_dict': defaultdict(lambda: defaultdict(int)),
        'answer_correct_uq_count_dict': defaultdict(lambda: defaultdict(int)),
        'answer_incorrect_uq_count_dict': defaultdict(lambda: defaultdict(int)),
        'explanation_u_count_dict': defaultdict(int),
        'elapsed_time_u_count_dict': defaultdict(int),
        'answer_recent_n_correct_u_avg_dict': defaultdict(float),
        'answer_recent_u_timestamp_dict': defaultdict(list),
        'answer_incorrect_recent_u_timestamp_dict': defaultdict(list),
        'answer_q_count_dict': defaultdict(int),
        'elapsed_time_q_total_dict': defaultdict(float)
    }

    print("***** Add Features")
    # Train first, then valid, so valid rows see the state left behind by train.
    train = add_features(train, **features_dicts)
    valid = add_features(valid, **features_dicts)
    gc.collect()

    return train, valid, content_df, questions_df, prior_question_elapsed_time_mean, features_dicts


In [None]:
def train_and_evaluate(train, valid, debug=True, feature_engineering = True):
    """Fit a binary LightGBM model on ``train`` and report AUC on ``valid``.

    Args:
        train: Training frame containing the target and every feature column.
        valid: Validation frame with the same columns.
        debug: When false (together with ``feature_engineering``), ``train``
            is down-sampled before fitting.
        feature_engineering: See ``debug``.

    Returns:
        Tuple ``(TARGET, FEATS, model)``: the target column name, the list of
        feature names used, and the trained booster.

    Side effects:
        Columns outside ``FEATS`` are dropped *in place*. ``valid`` is always
        modified; ``train`` is modified whenever it was not replaced by a
        sample. After the call the caller's frames no longer carry the target.
    """
    print("***** Train")
    TARGET = 'answered_correctly'
    FEATS = ['answer_correctly_u_count','answer_u_count','answer_correctly_u_avg','answer_correctly_q_avg','prior_question_elapsed_time',
             'prior_question_had_explanation', 'part']
    new_FEATS = ['answer_uq_count', 'explanation_u_avg', 'elapsed_time_u_avg', 'answer_recent_n_correct_u_avg','answer_recent_u_timestamp',
                 'answer_incorrect_recent_u_timestamp','elapsed_time_q_avg']
    FEATS = FEATS+new_FEATS
    print('Features for train: ', FEATS)

    drop_cols = list(set(train.columns)-set(FEATS))
    print("Drop_cols: ", drop_cols)

    if feature_engineering and not debug:
        # Clamp the sample size so a smaller frame does not make sample() raise.
        train = train.sample(min(15000000, len(train)), random_state = SEED)
    print("Data size: ", len(train), len(valid))
    # Grab the targets before the in-place drop removes them.
    y_train = train[TARGET]
    y_valid = valid[TARGET]
    train.drop(drop_cols, axis=1, inplace=True)
    valid.drop(drop_cols, axis=1, inplace=True)
    _ = gc.collect()

    lgb_train = lgb.Dataset(train[FEATS], y_train)
    lgb_valid = lgb.Dataset(valid[FEATS], y_valid)
    del train, y_train

    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0. Older releases expose the logging callback
    # as `print_evaluation` instead of `log_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    model = lgb.train(
                        {'objective': 'binary'},
                        lgb_train,
                        valid_sets=[lgb_train, lgb_valid],
                        num_boost_round=10000,
                        callbacks=[lgb.early_stopping(10), log_evaluation(100)]
                    )
    print('auc:', roc_auc_score(y_valid, model.predict(valid[FEATS])))
    _ = lgb.plot_importance(model)
    return TARGET, FEATS, model


In [None]:
def inference(TARGET, FEATS, model,content_df, questions_df, prior_question_elapsed_time_mean, features_dicts):
    """Run the submission loop: predict each test batch, then learn from its labels.

    For every batch yielded by the competition environment the function
    first folds the previous batch (now labelled through the current batch's
    ``prior_group_answers_correct`` cell) into the trackers, then builds
    features for the current batch without updating the trackers, and finally
    submits the model's predictions.

    Args:
        TARGET: Name of the prediction / label column.
        FEATS: Feature column names passed to ``model.predict``.
        model: Trained model exposing ``predict``.
        content_df: Per-``content_id`` frame merged onto each batch.
        questions_df: Question metadata merged on ``content_id == question_id``.
        prior_question_elapsed_time_mean: Fill value for missing
            ``prior_question_elapsed_time``.
        features_dicts: Tracker objects keyed by ``add_features`` parameter
            name; they are updated in place as labels arrive.
    """
    import ast

    import riiideducation

    tracker_names = (
        'answer_correctly_u_count_dict',
        'answer_u_count_dict',
        'answer_uq_count_dict',
        'answer_correct_uq_count_dict',
        'answer_incorrect_uq_count_dict',
        'explanation_u_count_dict',
        'elapsed_time_u_count_dict',
        'answer_recent_n_correct_u_avg_dict',
        'answer_recent_u_timestamp_dict',
        'answer_incorrect_recent_u_timestamp_dict',
        'answer_q_count_dict',
        'elapsed_time_q_total_dict',
    )
    # Look each tracker up by name (KeyError on a missing one) and pass them
    # to add_features as keywords so the ordering cannot drift.
    trackers = {name: features_dicts[name] for name in tracker_names}

    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The first cell holds the previous batch's labels as a list
            # literal; literal_eval parses it without executing arbitrary code.
            previous_test_df[TARGET] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
            previous_test_df = previous_test_df[previous_test_df['content_type_id'] == 0].reset_index(drop=True)
            add_features(previous_test_df, **trackers, make_feature=False)
        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        # Fill the real column (not a side column): both the model input and
        # the next tracker update read `prior_question_elapsed_time`, and a
        # NaN left here would poison the running elapsed-time sums.
        test_df['prior_question_elapsed_time'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
        # Keep the unfiltered batch: the labels delivered with the next batch
        # are assigned to it before the content_type_id filter is applied.
        previous_test_df = test_df.copy()
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
        test_df = add_features(test_df, **trackers, do_update=False, inference=True)
        test_df = pd.merge(test_df, content_df, on='content_id',  how="left")
        test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
        test_df[TARGET] =  model.predict(test_df[FEATS])

        set_predict(test_df[['row_id', TARGET]])


In [None]:
# Load and featurise the data using the flags set at the top of the notebook.
train, valid,content_df, questions_df, prior_question_elapsed_time_mean, features_dicts = read_and_preprocess(kernel=kernel, debug = debug, feature_engineering = feature_engineering)
# Show the last ten rows of the featurised training frame.
train.tail(10)


In [None]:
# Fit the model; FEATURES is the feature list that inference must reuse.
TARGET, FEATURES, model = train_and_evaluate(train, valid, feature_engineering = feature_engineering, debug=debug)


In [None]:
# Run the submission loop only when the `kernel` flag is set.
if kernel:
    inference(TARGET, FEATURES, model, content_df, questions_df, prior_question_elapsed_time_mean, features_dicts)
