# Comments
Thanks to tito for this great script https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering

* Creating predictive features is very important; here I just used 14 features and 15M data points to train the model.
* The dataset is too big to preprocess comfortably in Python with a for loop; there are other tools and frameworks (SQL, Spark, Apache Beam, Dask) that make feature engineering much faster, but if we are smart and build predictive features it's OK to just use for loops.
* Forward feature engineering seems a good technique to try on this problem: create one new feature that you think could be predictive, run the pipeline, and check whether the validation score increases. If it does, the feature is predictive and you should add it. Be careful when you only get a minor improvement; sometimes it is better to discard that feature because it will slow down your experimentation process.

In [None]:
# Row budgets for the experiment (all counts are rows).
datasize = 30_000_000    # read_and_preprocess keeps only the last `datasize` raw training rows
statesize = 20_000_000   # train[:-statesize] only warms up the state; train[-statesize:] gets features
samplesize = 8_000_000   # rows sampled for fitting; original note: need lower than datasize - statesize

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm import tqdm
import lightgbm as lgb
import riiideducation
import matplotlib.pyplot as plt
import seaborn as sns

import random
import os

In [None]:
# Seed shared by the RNG setup below, the training-row sampling and the LightGBM params
SEED = 456


def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Sets ``PYTHONHASHSEED`` in the environment and seeds both NumPy's global
    generator and the standard-library ``random`` module with ``seed``.
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # presumably only affects child processes - confirm if hash order matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)


# Function for user stats with loops
def add_features(df,
                 answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect,
                 answered_correctly_q_sum, elapsed_time_q_sum, explanation_q_sum, answered_correctly_uq,
                 uccm_sum, cucm_sum, user_part_sum, user_part_count, bundle_sum, bundle_count, ubcm_sum, timestamp_u_correct,
                 cucm0_sum, cucm0_count, cucm1_sum, cucm1_count,
                 uccm0_sum, uccm0_count, uccm1_sum, uccm1_count,
                 assign, update = True):
    """Replay ``df`` row by row, emitting history features and advancing the state.

    For every row the features are computed from the state dictionaries as they
    stand *before* that row, then the dictionaries are updated in place.

    Parameters
    ----------
    df : DataFrame
        Must contain ``user_id``, ``answered_correctly``, ``content_id``,
        ``prior_question_elapsed_time``, ``prior_question_had_explanation``,
        ``timestamp``, ``part``, ``bundle_id`` and ``task_container_id``.
    *_u_sum, *_q_sum, uccm*, cucm*, user_part_*, bundle_*, ubcm_sum : dict-like
        Running sums / counts keyed by user id, content id, ``(user, part)`` or
        bundle id. They are read with ``d[key]`` for unseen keys, so they are
        expected to behave like ``defaultdict(int)``.
    timestamp_u, timestamp_u_incorrect, timestamp_u_correct : dict-like of list
        Per-user timestamps: the last three interactions, the last incorrect
        answer and the last correct answer (expected ``defaultdict(list)``).
    answered_correctly_uq : dict-like of dict-like
        ``[user][content]`` -> number of earlier attempts.
    assign : bool
        When true, return ``df`` with the feature columns appended; otherwise
        only the state is advanced and ``0`` is returned.
    update : bool
        When true the label-dependent state (answer sums, correct/incorrect
        timestamps, per-answer counts and sums) is updated from
        ``answered_correctly``. Pass ``False`` when the labels are not known
        yet; label-independent state is still advanced.

    Returns
    -------
    DataFrame or int
        ``df`` (index reset) plus the feature columns when ``assign`` is true,
        else ``0``.
    """
    n_rows = len(df)

    if assign:
        def nan_column(dtype = np.float32):
            # Features start as NaN and are only overwritten when history exists.
            return np.full(n_rows, np.nan, dtype = dtype)

        # User features
        answered_correctly_u_avg = nan_column()
        elapsed_time_u_avg = nan_column()
        explanation_u_avg = nan_column()
        timestamp_u_recency_1 = nan_column()
        timestamp_u_recency_2 = nan_column()
        timestamp_u_recency_3 = nan_column()
        recency_columns = (timestamp_u_recency_1, timestamp_u_recency_2, timestamp_u_recency_3)
        timestamp_u_incorrect_recency = nan_column()
        tucr = nan_column()
        # Question features
        answered_correctly_q_avg = nan_column()
        elapsed_time_q_avg = nan_column()
        explanation_q_avg = nan_column()
        # User x question
        answered_correctly_uq_count = np.zeros(n_rows, dtype = np.int32)
        # Cross means
        uccm_mean = nan_column()
        cucm_mean = nan_column()
        user_part_mean = nan_column()
        ubcm_mean = nan_column()
        bundle_mean = nan_column(np.float16)
        # Counts (0 when there is no history)
        uco = np.zeros(n_rows, dtype = np.int16)
        upco = np.zeros(n_rows, dtype = np.int16)
        # Cross means split by answer
        uccm0_mean = nan_column(np.float16)
        uccm1_mean = nan_column(np.float16)
        cucm0_mean = nan_column(np.float16)
        cucm1_mean = nan_column(np.float16)

    columns = ['user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation',
               'timestamp', 'part', 'bundle_id', 'task_container_id']
    for num, row in enumerate(tqdm(df[columns].values)):
        user, answer, content = row[0], row[1], row[2]
        elapsed, explained, ts = row[3], row[4], row[5]
        user_part = (row[0], row[6])
        bundle = row[7]

        # Snapshot of the history *before* this row. Every cross statistic
        # below uses these values, so the current answer never leaks into its
        # own row and the result matches what update_features() accumulates
        # from the emitted answered_correctly_u_avg / answered_correctly_q_avg
        # columns (missing history counts as 0.5).
        user_seen = uccm0_count[user] + uccm1_count[user]
        question_seen = cucm0_count[content] + cucm1_count[content]
        bundle_seen = bundle_count[bundle]
        user_prior_avg = answered_correctly_u_sum[user] / user_seen if user_seen != 0 else 0.5
        question_prior_avg = answered_correctly_q_sum[content] / question_seen if question_seen != 0 else 0.5
        bundle_prior_avg = bundle_sum[bundle] / bundle_seen if bundle_seen != 0 else 0.5

        if assign:
            # ---- user features ------------------------------------------------
            if user_seen != 0:
                answered_correctly_u_avg[num] = answered_correctly_u_sum[user] / user_seen
                elapsed_time_u_avg[num] = elapsed_time_u_sum[user] / user_seen
                explanation_u_avg[num] = explanation_u_sum[user] / user_seen
                uccm_mean[num] = uccm_sum[user] / user_seen
                ubcm_mean[num] = ubcm_sum[user] / user_seen
                uco[num] = user_seen

            part_seen = user_part_count[user_part]
            if part_seen != 0:
                user_part_mean[num] = user_part_sum[user_part] / part_seen
                upco[num] = part_seen

            # recency_1 is the gap to the latest interaction, recency_3 to the
            # third latest; columns without enough history stay NaN.
            history = timestamp_u[user]
            for offset, column in enumerate(recency_columns[:len(history)]):
                column[num] = ts - history[-1 - offset]

            last_incorrect = timestamp_u_incorrect[user]
            if len(last_incorrect) != 0:
                timestamp_u_incorrect_recency[num] = ts - last_incorrect[0]

            last_correct = timestamp_u_correct[user]
            if len(last_correct) != 0:
                tucr[num] = ts - last_correct[0]

            # ---- question features --------------------------------------------
            if question_seen != 0:
                answered_correctly_q_avg[num] = answered_correctly_q_sum[content] / question_seen
                elapsed_time_q_avg[num] = elapsed_time_q_sum[content] / question_seen
                explanation_q_avg[num] = explanation_q_sum[content] / question_seen
                cucm_mean[num] = cucm_sum[content] / question_seen

            if bundle_seen != 0:
                bundle_mean[num] = bundle_sum[bundle] / bundle_seen

            # ---- cross means split by answer ----------------------------------
            if cucm0_count[content] != 0:
                cucm0_mean[num] = cucm0_sum[content] / cucm0_count[content]
            if cucm1_count[content] != 0:
                cucm1_mean[num] = cucm1_sum[content] / cucm1_count[content]
            if uccm0_count[user] != 0:
                uccm0_mean[num] = uccm0_sum[user] / uccm0_count[user]
            if uccm1_count[user] != 0:
                uccm1_mean[num] = uccm1_sum[user] / uccm1_count[user]

            # ---- user x question ----------------------------------------------
            answered_correctly_uq_count[num] = answered_correctly_uq[user][content]

        # ---- label-independent state updates -----------------------------------
        uccm_sum[user] += question_prior_avg
        cucm_sum[content] += user_prior_avg
        ubcm_sum[user] += bundle_prior_avg

        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(explained)
        recent = timestamp_u[user]
        if len(recent) == 3:
            # Keep only the three latest interactions.
            recent.pop(0)
        recent.append(ts)
        user_part_count[user_part] += 1

        elapsed_time_q_sum[content] += elapsed
        explanation_q_sum[content] += int(explained)
        bundle_count[bundle] += 1

        answered_correctly_uq[user][content] += 1

        # ---- label-dependent state updates -------------------------------------
        if update:
            answered_correctly_u_sum[user] += answer
            answered_correctly_q_sum[content] += answer
            bundle_sum[bundle] += answer
            user_part_sum[user_part] += answer

            if answer == 1:
                last_correct = timestamp_u_correct[user]
                if len(last_correct) == 1:
                    last_correct.pop(0)
                last_correct.append(ts)

                cucm1_count[content] += 1
                cucm1_sum[content] += user_prior_avg
                uccm1_count[user] += 1
                uccm1_sum[user] += question_prior_avg
            elif answer == 0:
                last_incorrect = timestamp_u_incorrect[user]
                if len(last_incorrect) == 1:
                    last_incorrect.pop(0)
                last_incorrect.append(ts)

                cucm0_count[content] += 1
                cucm0_sum[content] += user_prior_avg
                uccm0_count[user] += 1
                uccm0_sum[user] += question_prior_avg

    if not assign:
        return 0

    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg, 'elapsed_time_u_avg': elapsed_time_u_avg, 'explanation_u_avg': explanation_u_avg,
                            'answered_correctly_q_avg': answered_correctly_q_avg, 'elapsed_time_q_avg': elapsed_time_q_avg, 'explanation_q_avg': explanation_q_avg,
                            'answered_correctly_uq_count': answered_correctly_uq_count, 'timestamp_u_recency_1': timestamp_u_recency_1, 'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3, 'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency,
                            'uccm' : uccm_mean,
                            'cucm' : cucm_mean,
                            'upm' : user_part_mean,
                            'ubcm' : ubcm_mean,
                            'user_count' : uco,
                            'user_part_count' : upco,
                            'bm' : bundle_mean,
                            'tucr': tucr,
                            'cucm0' : cucm0_mean,
                            'cucm1' : cucm1_mean,
                            'uccm0' : uccm0_mean,
                            'uccm1' : uccm1_mean,
                           })
    # Align positionally: the feature arrays are in row order, not index order.
    df = df.reset_index(drop=True)
    user_df = user_df.reset_index(drop=True)
    return pd.concat([df, user_df], axis = 1)
        
def update_features(df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect, user_part_sum, bundle_sum,
                    cucm0_sum, cucm0_count, cucm1_sum, cucm1_count,
                    uccm0_sum, uccm0_count, uccm1_sum, uccm1_count):
    """Fold the answers of an already-featurised batch into the state dicts.

    ``df`` must carry ``user_id``, ``answered_correctly``, ``content_id``,
    ``content_type_id``, ``timestamp``, ``part``, ``bundle_id`` and the two
    feature columns ``answered_correctly_u_avg`` / ``answered_correctly_q_avg``.
    All dictionaries are mutated in place; nothing is returned.
    """
    wanted = ['user_id', 'answered_correctly', 'content_id', 'content_type_id', 'timestamp', 'part', 'bundle_id' ,
              'answered_correctly_u_avg', 'answered_correctly_q_avg']
    for user, answer, content, content_type, ts, part, bundle, user_avg, question_avg in df[wanted].values:
        if content_type == 0:
            # Answer sums are only advanced for rows flagged content_type_id == 0.
            answered_correctly_u_sum[user] += answer
            if answer == 0:
                # Single-slot history: remember only the latest incorrect answer.
                latest_wrong = timestamp_u_incorrect[user]
                if len(latest_wrong) == 1:
                    latest_wrong.pop(0)
                latest_wrong.append(ts)
            answered_correctly_q_sum[content] += answer
            bundle_sum[bundle] += answer
            user_part_sum[(user, part)] += answer

        # Choose the per-answer accumulators; any other answer value is ignored.
        if answer == 1:
            content_count, content_sum = cucm1_count, cucm1_sum
            user_count, user_sum = uccm1_count, uccm1_sum
        elif answer == 0:
            content_count, content_sum = cucm0_count, cucm0_sum
            user_count, user_sum = uccm0_count, uccm0_sum
        else:
            continue

        # A missing (NaN) average contributes the neutral value 0.5.
        content_count[content] += 1
        content_sum[content] += 0.5 if np.isnan(user_avg) else user_avg
        user_count[user] += 1
        user_sum[user] += 0.5 if np.isnan(question_avg) else question_avg

    return

def read_and_preprocess(feature_engineering = False):
    """Load the CV split, clean it and build the loop features and state.

    Steps: read the train/valid pickles, optionally keep only the last
    ``datasize`` training rows, drop lecture rows, fill missing values, join
    ``part`` / ``bundle_id`` from ``questions.csv``, then replay the data
    through ``add_features``:

    * ``train[:-statesize]`` only warms up the state dictionaries,
    * ``train[-statesize:]`` and ``valid`` get feature columns.

    Parameters
    ----------
    feature_engineering : bool
        When true, only the last ``datasize`` rows of the training pickle are
        used (done to limit memory use).

    Returns
    -------
    tuple
        ``(train, valid, questions_df, prior_question_elapsed_time_mean,
        features_dicts)`` where ``features_dicts`` maps names to the state
        dictionaries as left after processing ``valid``.
    """
    train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
    valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
    question_file = '../input/riiid-test-answer-prediction/questions.csv'

    # Read data
    fields_needed = ['timestamp', 'user_id', 'answered_correctly', 'content_id', 'content_type_id',
                     'prior_question_elapsed_time', 'prior_question_had_explanation', 'task_container_id']
    train = pd.read_pickle(train_pickle)[fields_needed]
    valid = pd.read_pickle(valid_pickle)[fields_needed]
    print('train shape', train.shape)
    print('valid shape', valid.shape)
    # Keep only the most recent rows to avoid RAM problems
    if feature_engineering:
        train = train.iloc[-datasize:]

    # Filter by content_type_id to discard lectures
    train = train.loc[train.content_type_id == False].reset_index(drop = True)
    valid = valid.loc[valid.content_type_id == False].reset_index(drop = True)

    # Changing dtype to avoid lightgbm error
    train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

    # Fill prior question elapsed time with the training mean.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated in pandas and does not modify
    # the frame once Copy-on-Write is active.
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().mean()
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # Merge with question dataframe
    questions_df = pd.read_csv(question_file)
    questions_df['part'] = questions_df['part'].astype(np.int32)
    questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)
    question_meta = questions_df[['question_id', 'part', 'bundle_id']]

    train = pd.merge(train, question_meta, left_on = 'content_id', right_on = 'question_id', how = 'left')
    valid = pd.merge(valid, question_meta, left_on = 'content_id', right_on = 'question_id', how = 'left')

    # User dictionaries
    answered_correctly_u_sum = defaultdict(int)
    elapsed_time_u_sum = defaultdict(int)
    explanation_u_sum = defaultdict(int)
    timestamp_u = defaultdict(list)
    timestamp_u_incorrect = defaultdict(list)
    timestamp_u_correct = defaultdict(list)

    # Question dictionaries
    answered_correctly_q_sum = defaultdict(int)
    elapsed_time_q_sum = defaultdict(int)
    explanation_q_sum = defaultdict(int)

    # User x question dictionary
    answered_correctly_uq = defaultdict(lambda: defaultdict(int))

    # Per-user sums of question averages (overall and split by answer 0 / 1)
    uccm_sum = defaultdict(int)
    uccm0_sum = defaultdict(int)
    uccm0_count = defaultdict(int)
    uccm1_sum = defaultdict(int)
    uccm1_count = defaultdict(int)

    # Per-question sums of user averages (overall and split by answer 0 / 1)
    cucm_sum = defaultdict(int)
    cucm0_sum = defaultdict(int)
    cucm0_count = defaultdict(int)
    cucm1_sum = defaultdict(int)
    cucm1_count = defaultdict(int)

    # (user, part) sum / count
    user_part_sum = defaultdict(int)
    user_part_count = defaultdict(int)
    # Bundle sum / count and per-user sum of bundle averages
    bundle_sum = defaultdict(int)
    bundle_count = defaultdict(int)
    ubcm_sum = defaultdict(int)

    def run_add_features(frame, assign):
        # Single place that wires the shared state into add_features.
        return add_features(frame,
                            answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect,
                            answered_correctly_q_sum, elapsed_time_q_sum, explanation_q_sum, answered_correctly_uq,
                            uccm_sum, cucm_sum, user_part_sum, user_part_count, bundle_sum, bundle_count, ubcm_sum, timestamp_u_correct,
                            cucm0_sum, cucm0_count, cucm1_sum, cucm1_count,
                            uccm0_sum, uccm0_count, uccm1_sum, uccm1_count,
                            assign = assign)

    print('User feature calculation started...')
    print('\n')

    temp = train[:-statesize].copy()
    train = train[-statesize:]

    gc.collect()
    # Warm-up pass: state only, no feature columns. The frame is released
    # right afterwards to keep peak memory down.
    run_add_features(temp, assign = False)
    del temp

    gc.collect()
    train = run_add_features(train, assign = True)
    gc.collect()

    valid = run_add_features(valid, assign = True)

    gc.collect()
    print('User feature calculation completed...')
    print('\n')

    features_dicts = {
        'answered_correctly_u_sum': answered_correctly_u_sum,
        'elapsed_time_u_sum': elapsed_time_u_sum,
        'explanation_u_sum': explanation_u_sum,
        'answered_correctly_q_sum': answered_correctly_q_sum,
        'elapsed_time_q_sum': elapsed_time_q_sum,
        'explanation_q_sum': explanation_q_sum,
        'answered_correctly_uq': answered_correctly_uq,
        'timestamp_u': timestamp_u,
        'timestamp_u_incorrect': timestamp_u_incorrect,
        'uccm_sum' : uccm_sum,
        'cucm_sum' : cucm_sum,
        'ups' : user_part_sum,
        'upc' : user_part_count,
        'bundle_sum' : bundle_sum,
        'bundle_count' : bundle_count,
        'ubcm_sum' : ubcm_sum,
        'timestamp_u_correct': timestamp_u_correct,
        'cucm0_sum': cucm0_sum,
        'cucm0_count' : cucm0_count,
        'cucm1_sum' : cucm1_sum,
        'cucm1_count': cucm1_count,
        'uccm0_sum': uccm0_sum,
        'uccm0_count' : uccm0_count,
        'uccm1_sum' : uccm1_sum,
        'uccm1_count': uccm1_count,
    }

    return train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts

# Function for training and evaluation
def train_and_evaluate(train, valid, feature_engineering = False):
    """Fit a LightGBM binary classifier and report validation AUC.

    Parameters
    ----------
    train, valid : DataFrame
        Frames holding the target ``answered_correctly`` and every column in
        ``FEATURES``. Columns not in ``FEATURES`` are dropped **in place** (to
        save memory) and ``bundle_id`` is converted to ``category``, so the
        caller's ``valid`` (and ``train`` when ``feature_engineering`` is
        false) is modified.
    feature_engineering : bool
        When true, train on a random sample of at most ``samplesize`` rows.

    Returns
    -------
    tuple
        ``(TARGET, FEATURES, model)``: target column name, feature list and
        the trained booster.
    """
    TARGET = 'answered_correctly'
    # Features to train and predict
    FEATURES = ['prior_question_elapsed_time', 'prior_question_had_explanation', 'part', 'answered_correctly_u_avg', 'elapsed_time_u_avg', 'explanation_u_avg',
                'answered_correctly_q_avg', 'elapsed_time_q_avg', 'explanation_q_avg', 'answered_correctly_uq_count', 'timestamp_u_recency_1', 'timestamp_u_recency_2', 'timestamp_u_recency_3',
                'timestamp_u_incorrect_recency',
                'uccm', 'cucm', 'upm', 'ubcm', 'user_count', 'user_part_count', 'bm', 'tucr',
                'cucm0', 'cucm1', 'uccm0', 'uccm1', 'bundle_id'
               ]

    # Sample the training data to experiment faster
    if feature_engineering:
        # Cap at the available rows: DataFrame.sample raises when asked for
        # more rows than exist (without replacement).
        train = train.sample(min(samplesize, len(train)), random_state = SEED)
    gc.collect()
    print(f'Traning with {train.shape[0]} rows and {len(FEATURES)} features')
    drop_cols = list(set(train.columns) - set(FEATURES))
    # Pull the targets out before their column is dropped below.
    y_train = train[TARGET]
    y_val = valid[TARGET]
    # Drop unnecessary columns
    train.drop(drop_cols, axis = 1, inplace = True)
    valid.drop(drop_cols, axis = 1, inplace = True)
    gc.collect()

    train['bundle_id'] = train['bundle_id'].astype('category')
    valid['bundle_id'] = valid['bundle_id'].astype('category')

    lgb_train = lgb.Dataset(train[FEATURES], y_train)
    lgb_valid = lgb.Dataset(valid[FEATURES], y_val)
    del train, y_train
    gc.collect()

    params = {'objective': 'binary',
              'seed': SEED,
              'metric': 'auc',
              'num_leaves': 256,
              'feature_fraction': 0.75,
              'bagging_freq': 10,
              'bagging_fraction': 0.80
             }

    # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
    # removed from lgb.train in LightGBM 4.0; switch to callbacks if the
    # installed version is upgraded.
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        num_boost_round = 1000,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 10,
        verbose_eval = 50
    )

    print('Our Roc Auc score for the validation data is:', roc_auc_score(y_val, model.predict(valid[FEATURES])))

    feature_importance = model.feature_importance()
    feature_importance = pd.DataFrame({'Features': FEATURES, 'Importance': feature_importance}).sort_values('Importance', ascending = False)

    fig = plt.figure(figsize = (10, 10))
    fig.suptitle('Feature Importance', fontsize = 20)
    plt.tick_params(axis = 'x', labelsize = 12)
    plt.tick_params(axis = 'y', labelsize = 12)
    plt.xlabel('Importance', fontsize = 15)
    plt.ylabel('Features', fontsize = 15)
    sns.barplot(x = feature_importance['Importance'], y = feature_importance['Features'], orient = 'h')

    lgb.plot_importance(model, importance_type = 'gain', figsize = (14, 10))
    lgb.plot_importance(model, importance_type = 'split', figsize = (14, 10))

    plt.show()

    return TARGET, FEATURES, model

# Using time series api that simulates production predictions
def inference(TARGET, FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts):
    """Predict batch by batch through the ``riiideducation`` time-series API.

    For each test batch: first fold the now-revealed answers of the previous
    batch into the state, then preprocess the new batch like the training
    data, compute features without touching label-dependent state, predict and
    submit.

    Parameters
    ----------
    TARGET : str
        Name of the target / prediction column.
    FEATURES : list of str
        Feature columns fed to ``model.predict``.
    model
        Trained model exposing ``predict``.
    questions_df : DataFrame
        Question metadata with ``question_id``, ``part`` and ``bundle_id``.
    prior_question_elapsed_time_mean : float
        Fill value for missing ``prior_question_elapsed_time``.
    features_dicts : dict
        State dictionaries as returned by ``read_and_preprocess``; they are
        mutated in place.
    """
    # Get feature dict
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    answered_correctly_uq = features_dicts['answered_correctly_uq']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    timestamp_u_correct = features_dicts['timestamp_u_correct']
    uccm_sum = features_dicts['uccm_sum']
    cucm_sum = features_dicts['cucm_sum']
    user_part_sum = features_dicts['ups']
    user_part_count = features_dicts['upc']
    bundle_sum = features_dicts['bundle_sum']
    bundle_count = features_dicts['bundle_count']
    ubcm_sum = features_dicts['ubcm_sum']

    uccm0_sum = features_dicts['uccm0_sum']
    uccm0_count = features_dicts['uccm0_count']
    uccm1_sum = features_dicts['uccm1_sum']
    uccm1_count = features_dicts['uccm1_count']

    cucm0_sum = features_dicts['cucm0_sum']
    cucm0_count = features_dicts['cucm0_count']
    cucm1_sum = features_dicts['cucm1_sum']
    cucm1_count = features_dicts['cucm1_count']

    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

    question_meta = questions_df[['question_id', 'part', 'bundle_id']]

    previous_test_df = None
    previous_is_question = None  # row mask of the previous raw batch (True = question)
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # NOTE(review): eval() runs on a string supplied by the API; it is
            # kept as-is, but a literal parser would be safer if the source is
            # ever untrusted.
            prior_answers = np.asarray(eval(test_df["prior_group_answers_correct"].iloc[0]))
            # The answer list covers every row of the previous batch, lectures
            # included, while previous_test_df only holds its question rows.
            previous_test_df[TARGET] = prior_answers[previous_is_question]
            update_features(previous_test_df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect, user_part_sum, bundle_sum,
                            cucm0_sum, cucm0_count, cucm1_sum, cucm1_count,
                            uccm0_sum, uccm0_count, uccm1_sum, uccm1_count)
            # update_features() does not receive timestamp_u_correct, so keep
            # the "last correct answer" state fresh here, as add_features does
            # during training (single-slot list per user).
            for user, answer, ts in previous_test_df[['user_id', TARGET, 'timestamp']].values:
                if answer == 1:
                    timestamp_u_correct[user] = [ts]

        previous_is_question = (test_df['content_type_id'] == 0).values
        test_df = test_df[previous_is_question].reset_index(drop = True)
        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        # Assign back rather than chained inplace fillna, which pandas deprecates.
        test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
        test_df = pd.merge(test_df, question_meta, left_on = 'content_id', right_on = 'question_id', how = 'left')
        # Placeholder label: add_features reads the column, but with
        # update = False it is never used to change the state.
        test_df[TARGET] = 0
        test_df = add_features(test_df,
                               answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect,
                               answered_correctly_q_sum, elapsed_time_q_sum, explanation_q_sum, answered_correctly_uq,
                               uccm_sum, cucm_sum, user_part_sum, user_part_count, bundle_sum, bundle_count, ubcm_sum, timestamp_u_correct,
                               cucm0_sum, cucm0_count, cucm1_sum, cucm1_count,
                               uccm0_sum, uccm0_count, uccm1_sum, uccm1_count,
                               assign = True, update = False)
        test_df[TARGET] =  model.predict(test_df[FEATURES])

        # Keep the featurised question rows (already carrying part / bundle_id
        # and the *_avg columns) for the state update on the next iteration.
        previous_test_df = test_df.copy()
        set_predict(test_df[['row_id', TARGET]])

    print('Job Done')
    
# Build the feature frames; the state dictionaries come back alongside them.
(train, valid, questions_df,
 prior_question_elapsed_time_mean, features_dicts) = read_and_preprocess(feature_engineering = True)

# The state is only consumed by inference(), which is disabled below, so it is
# released before training.
del features_dicts
gc.collect()

TARGET, FEATURES, model = train_and_evaluate(train, valid, feature_engineering = True)

model.save_model('lgb_model.txt', num_iteration = model.best_iteration)
# model = lgb.Booster(model_file='lgb_model.txt')

# NOTE: re-enabling this call also requires keeping features_dicts alive above.
#inference(TARGET, FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts)