# Comments
Thanks to tito for this great script. Reference notebooks:

https://www.kaggle.com/ragnar123/riiid-model-lgbm

https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering

In [None]:
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd
import random
import riiideducation
import seaborn as sns

from collections import defaultdict
from sklearn.metrics import roc_auc_score
from tqdm import tqdm


# Input file locations
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
question_file = '../input/riiid-test-answer-prediction/questions.csv'
contents_feat_file = '../input/riiid-offline-features/content_feats.pkl'
parts_feat_file = '../input/riiid-offline-features/part_feats.pkl'
question_tags_file = '../input/riiid-offline-features/question_tags_feat.csv'

# Debug switch: when enabled, far fewer rows are used for feature
# engineering and for training.
isDebug = False
if isDebug:
    feature_engineering_rows, training_rows = 4_000_000, 150_000
else:
    feature_engineering_rows, training_rows = 40_000_000, 10_000_000

print(f'Number of rows for feature engineering: {str(feature_engineering_rows)}')
print(f'Number of rows for training: {str(training_rows)}')

# Column dtypes applied when reading the question-tag feature CSV
question_tags_df_dtypes = dict(
    question_id=np.int64,
    tags_lsi=np.int8,
    tag_acc_max=np.float16,
    tag_count=np.int8,
    tag_acc_min=np.float16,
)

In [None]:
# Seed shared by sampling and by the LightGBM parameters
SEED = 123


def seed_everything(seed):
    """Seed the `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: integer seed; its string form is written to the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)


# Function for user stats with loops
def add_features(
        df, answered_correctly_u_count, answered_correctly_u_sum,
        elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect,
        answered_correctly_q_count, answered_correctly_q_sum, elapsed_time_q_sum,
        explanation_q_sum, answered_correctly_uq, part_user_count, part_user_sum,
        answered_correctly_ut, answered_correctly_ut_sum,
        update=True
):
    """Add running user / question / part / tag features to ``df`` row by row.

    For every row the features are first read from the state dictionaries
    (so they only reflect rows processed earlier), then the dictionaries are
    updated with the current row.

    Args:
        df: frame with the columns ``user_id``, ``answered_correctly``,
            ``content_id``, ``prior_question_elapsed_time``,
            ``prior_question_had_explanation``, ``timestamp``, ``part`` and
            ``tags`` (whitespace-separated tag string; a non-string value
            such as NaN is treated as "no tags").
        answered_correctly_u_count, answered_correctly_u_sum,
        elapsed_time_u_sum, explanation_u_sum: per-user counters.
        timestamp_u: per-user list holding the last (up to three) timestamps.
        timestamp_u_incorrect: per-user list holding the timestamp of the
            last incorrect answer.
        answered_correctly_q_count, answered_correctly_q_sum,
        elapsed_time_q_sum, explanation_q_sum: per-question counters.
        answered_correctly_uq: nested user -> question attempt counter.
        part_user_count, part_user_sum: nested user -> part counters.
        answered_correctly_ut, answered_correctly_ut_sum: nested
            user -> tag counters.
        update: when True the target-dependent sums (and the incorrect
            timestamp) are updated from ``answered_correctly``; when False
            only the target-independent counters are updated.

    Returns:
        ``df`` itself, with the feature columns added in place.
    """
    # Fallback accuracy used wherever no history exists yet.
    default_accuracy = .637
    n_rows = len(df)

    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = np.zeros(n_rows, dtype=np.float16)
    elapsed_time_u_avg = np.zeros(n_rows, dtype=np.float16)
    explanation_u_avg = np.zeros(n_rows, dtype=np.float16)
    # Timestamp gaps are stored as float32: float16 cannot represent values
    # above 65504, so larger gaps would be turned into inf.
    timestamp_u_recency_1 = np.zeros(n_rows, dtype=np.float32)
    timestamp_u_recency_2 = np.zeros(n_rows, dtype=np.float32)
    timestamp_u_recency_3 = np.zeros(n_rows, dtype=np.float32)
    timestamp_u_incorrect_recency = np.zeros(n_rows, dtype=np.float32)
    # -----------------------------------------------------------------------
    # Question features
    answered_correctly_q_avg = np.zeros(n_rows, dtype=np.float16)
    elapsed_time_q_avg = np.zeros(n_rows, dtype=np.float16)
    explanation_q_avg = np.zeros(n_rows, dtype=np.float16)
    # -----------------------------------------------------------------------
    # User-Question
    answered_correctly_uq_count = np.zeros(n_rows, dtype=np.int16)
    # -----------------------------------------------------------------------
    # Part-User
    part_u_count = np.zeros(n_rows, dtype=np.int32)
    part_u_mean = np.zeros(n_rows, dtype=np.float16)
    # -----------------------------------------------------------------------
    # User-Tags (the count sums several per-tag counters, hence int32)
    answered_correctly_ut_count = np.zeros(n_rows, dtype=np.int32)
    answered_correctly_ut_avg_max = np.zeros(n_rows, dtype=np.float16)
    answered_correctly_ut_avg_min = np.zeros(n_rows, dtype=np.float16)
    # -----------------------------------------------------------------------

    columns = [
        'user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time',
        'prior_question_had_explanation', 'timestamp', 'part', 'tags'
    ]
    rows = zip(*(df[col].tolist() for col in columns))

    for num, (user, target, content, elapsed, explained, timestamp, part, raw_tags) in enumerate(rows):

        # ------------------------------------------------------------------
        # Client features assignment
        user_seen = answered_correctly_u_count[user]
        if user_seen != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[user] / user_seen
            elapsed_time_u_avg[num] = elapsed_time_u_sum[user] / user_seen
            explanation_u_avg[num] = explanation_u_sum[user] / user_seen
        else:
            answered_correctly_u_avg[num] = default_accuracy
            elapsed_time_u_avg[num] = np.nan
            explanation_u_avg[num] = np.nan

        # Timestamp features assignment: gap to the 1st/2nd/3rd most recent
        # previous interaction, NaN when that interaction does not exist.
        history = timestamp_u[user]
        n_history = len(history)
        timestamp_u_recency_1[num] = timestamp - history[-1] if n_history >= 1 else np.nan
        timestamp_u_recency_2[num] = timestamp - history[-2] if n_history >= 2 else np.nan
        timestamp_u_recency_3[num] = timestamp - history[-3] if n_history >= 3 else np.nan

        wrong_history = timestamp_u_incorrect[user]
        if wrong_history:
            timestamp_u_incorrect_recency[num] = timestamp - wrong_history[-1]
        else:
            timestamp_u_incorrect_recency[num] = np.nan

        # ------------------------------------------------------------------
        # Question features assignment
        question_seen = answered_correctly_q_count[content]
        if question_seen != 0:
            answered_correctly_q_avg[num] = answered_correctly_q_sum[content] / question_seen
            elapsed_time_q_avg[num] = elapsed_time_q_sum[content] / question_seen
            explanation_q_avg[num] = explanation_q_sum[content] / question_seen
        else:
            answered_correctly_q_avg[num] = np.nan
            elapsed_time_q_avg[num] = np.nan
            explanation_q_avg[num] = np.nan

        # ------------------------------------------------------------------
        # Client-Question assignment
        answered_correctly_uq_count[num] = answered_correctly_uq[user][content]

        # ------------------------------------------------------------------
        # Part-User features assignment
        part_seen = part_user_count[user][part]
        part_u_count[num] = part_seen
        if part_seen == 0:
            part_u_mean[num] = default_accuracy
        else:
            part_u_mean[num] = part_user_sum[user][part] / part_seen

        # ------------------------------------------------------------------
        # Tag-User features assignment
        # A missing tags value is not a string (e.g. NaN): treat as no tags.
        tags = raw_tags.split() if isinstance(raw_tags, str) else []
        user_tag_counts = answered_correctly_ut[user]
        user_tag_sums = answered_correctly_ut_sum[user]

        tag_means = []
        tag_total = 0
        for tag in tags:
            tag_seen = user_tag_counts[tag]
            tag_sum = user_tag_sums[tag]
            tag_means.append(default_accuracy if tag_seen == 0 else tag_sum / tag_seen)
            tag_total += tag_seen

        answered_correctly_ut_count[num] = tag_total
        # With no tags at all, fall back to the prior instead of failing.
        answered_correctly_ut_avg_max[num] = max(tag_means, default=default_accuracy)
        answered_correctly_ut_avg_min[num] = min(tag_means, default=default_accuracy)

        # ------------------------------------------------------------------
        # Target-independent updates (always applied)
        answered_correctly_u_count[user] += 1
        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(explained)
        if len(history) >= 3:
            history.pop(0)  # keep only the three latest timestamps
        history.append(timestamp)

        answered_correctly_q_count[content] += 1
        elapsed_time_q_sum[content] += elapsed
        explanation_q_sum[content] += int(explained)

        answered_correctly_uq[user][content] += 1
        part_user_count[user][part] += 1
        for tag in tags:
            user_tag_counts[tag] += 1

        # ------------------------------------------------------------------
        # Target-dependent updates (skipped when the target is not known yet)
        if update:
            answered_correctly_u_sum[user] += target
            if target == 0:
                if wrong_history:
                    wrong_history.pop(0)  # keep only the latest incorrect answer
                wrong_history.append(timestamp)

            answered_correctly_q_sum[content] += target
            part_user_sum[user][part] += target
            for tag in tags:
                user_tag_sums[tag] += target

    user_dict = {
        'answered_correctly_u_avg': answered_correctly_u_avg,
        'elapsed_time_u_avg': elapsed_time_u_avg,
        'explanation_u_avg': explanation_u_avg,
        'answered_correctly_q_avg': answered_correctly_q_avg,
        'elapsed_time_q_avg': elapsed_time_q_avg,
        'explanation_q_avg': explanation_q_avg,
        'answered_correctly_uq_count': answered_correctly_uq_count,
        'timestamp_u_recency_1': timestamp_u_recency_1,
        'timestamp_u_recency_2': timestamp_u_recency_2,
        'timestamp_u_recency_3': timestamp_u_recency_3,
        'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency,
        'part_u_count': part_u_count,
        'part_u_mean': part_u_mean,
        'answered_correctly_ut_count': answered_correctly_ut_count,
        'answered_correctly_ut_avg_max': answered_correctly_ut_avg_max,
        'answered_correctly_ut_avg_min': answered_correctly_ut_avg_min,
    }

    for k, v in user_dict.items():
        df[k] = v

    return df
 
        
def update_features(
        df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect,
        part_user_sum, answered_correctly_ut_sum
):
    """Apply the target-dependent state updates once the answers are known.

    Rows whose ``content_type_id`` is not 0 are skipped. For every other row
    the correctness sums per user, per question, per user/part and per
    user/tag are increased by ``answered_correctly``, and the user's
    last-incorrect timestamp is replaced when the answer was wrong.

    Args:
        df: frame with the columns ``user_id``, ``answered_correctly``,
            ``content_id``, ``content_type_id``, ``timestamp``, ``part`` and
            ``tags`` (a non-string ``tags`` value such as NaN is treated as
            "no tags").
        answered_correctly_u_sum: per-user sum of correct answers.
        answered_correctly_q_sum: per-question sum of correct answers.
        timestamp_u_incorrect: per-user list with the last incorrect timestamp.
        part_user_sum: nested user -> part sum of correct answers.
        answered_correctly_ut_sum: nested user -> tag sum of correct answers.

    Returns:
        None; the dictionaries are modified in place.
    """
    columns = [
        'user_id', 'answered_correctly', 'content_id', 'content_type_id', 'timestamp', 'part', 'tags'
    ]
    rows = zip(*(df[col].tolist() for col in columns))

    for user, target, content, content_type, timestamp, part, raw_tags in rows:
        if content_type != 0:
            continue

        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_sum[user] += target
        if target == 0:
            wrong_history = timestamp_u_incorrect[user]
            if wrong_history:
                wrong_history.pop(0)  # keep only the latest incorrect answer
            wrong_history.append(timestamp)

        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_sum[content] += target

        # ------------------------------------------------------------------
        # Part features updates
        part_user_sum[user][part] += target

        # ------------------------------------------------------------------
        # Tag features updates (missing tags are not strings: nothing to do)
        if isinstance(raw_tags, str):
            user_tag_sums = answered_correctly_ut_sum[user]
            for tag in raw_tags.split():
                user_tag_sums[tag] += target

    return


def defaultdictInt():
    """Return a fresh ``defaultdict`` whose missing keys default to ``0``.

    Defined as a named module-level function so it can serve as the factory
    of an outer ``defaultdict``.
    """
    counter = defaultdict(int)
    return counter


def read_and_preprocess(feature_engineering=False):
    """Load the train/valid pickles, clean them and build every feature.

    Steps: read the needed columns, optionally keep only the last
    ``feature_engineering_rows`` training rows, drop lectures, rescale and
    impute ``prior_question_elapsed_time``, merge ``part``/``tags`` from the
    questions file, run ``add_features`` over train then valid (sharing the
    same state dictionaries), and finally join the offline content and
    question-tag features on ``content_id``.

    Args:
        feature_engineering: when True the training frame is truncated to its
            last ``feature_engineering_rows`` rows before any processing.

    Returns:
        Tuple ``(train, valid, questions_df, prior_question_elapsed_time_mean,
        features_dicts)`` where ``features_dicts`` maps each state-dictionary
        name to the dictionary as left after processing train and valid.
    """
    # Read data
    fields_needed = [
        'timestamp', 'user_id', 'answered_correctly', 'content_id', 'content_type_id',
        'prior_question_elapsed_time', 'prior_question_had_explanation'
    ]
    train = pd.read_pickle(train_pickle)[fields_needed]
    valid = pd.read_pickle(valid_pickle)[fields_needed]
    # Keep only the last rows of the training data to avoid RAM problems
    if feature_engineering:
        train = train.iloc[-feature_engineering_rows:]

    # Filter by content_type_id to discard lectures
    train = train.loc[train.content_type_id == False].reset_index(drop=True)
    valid = valid.loc[valid.content_type_id == False].reset_index(drop=True)

    # Reduce scale to avoid numerical overflow
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'] / 10000.
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'] / 10000.

    # Changing dtype to avoid lightgbm error
    train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

    # Fill prior question elapsed time with the (training) mean.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form does not modify the frame
    # when pandas Copy-on-Write is active.
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().mean()
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # Question metadata
    questions_df = pd.read_csv(question_file)
    questions_df['part'] = questions_df['part'].astype(np.int32)
    questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)

    # Merge questions.csv
    question_cols = questions_df[['question_id', 'part', 'tags']]
    train = pd.merge(train, question_cols, left_on='content_id', right_on='question_id', how='left')
    valid = pd.merge(valid, question_cols, left_on='content_id', right_on='question_id', how='left')

    # Running state shared by train, valid and (later) inference.
    # The keys are exactly the keyword names of add_features' state arguments.
    features_dicts = {
        # Client dictionaries
        'answered_correctly_u_count': defaultdict(int),
        'answered_correctly_u_sum': defaultdict(int),
        'elapsed_time_u_sum': defaultdict(int),
        'explanation_u_sum': defaultdict(int),
        # Question dictionaries
        'answered_correctly_q_count': defaultdict(int),
        'answered_correctly_q_sum': defaultdict(int),
        'elapsed_time_q_sum': defaultdict(int),
        'explanation_q_sum': defaultdict(int),
        # Client-Question dictionary
        'answered_correctly_uq': defaultdict(defaultdictInt),
        # Timestamps
        'timestamp_u': defaultdict(list),
        'timestamp_u_incorrect': defaultdict(list),
        # User-Part dictionaries
        'part_user_count': defaultdict(defaultdictInt),
        'part_user_sum': defaultdict(defaultdictInt),
        # User-Tags dictionaries
        'answered_correctly_ut': defaultdict(defaultdictInt),
        'answered_correctly_ut_sum': defaultdict(defaultdictInt),
    }

    print('User feature calculation started...')
    print('\n')
    # Train first, then valid, so valid features see the training history.
    train = add_features(train, **features_dicts)
    valid = add_features(valid, **features_dicts)
    gc.collect()
    print('User feature calculation completed...')
    print('\n')

    # Merge offline features (joined on content_id)
    train = train.set_index('content_id')
    valid = valid.set_index('content_id')

    # Content features
    with open(contents_feat_file, 'rb') as content_file:
        content_feat = pickle.load(content_file)
    train = train.join(content_feat)
    valid = valid.join(content_feat)

    # Question tag features
    question_tags_df = pd.read_csv(question_tags_file, dtype=question_tags_df_dtypes)
    question_tags_df.set_index('question_id', inplace=True)
    train = train.join(question_tags_df)
    valid = valid.join(question_tags_df)

    train = train.reset_index()
    valid = valid.reset_index()

    return train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts

In [None]:
# Run a garbage-collection pass before the preprocessing cell below.
gc.collect()

In [None]:
%%time
# Build the train/valid frames and the running state dictionaries;
# feature_engineering=True truncates train to its last feature_engineering_rows rows.
train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts = read_and_preprocess(feature_engineering=True)

In [None]:
# %%time
# # if os.path.exists(r'./tmp'):
# #     os.mkdir(r'./tmp')
# for col, feat in features_dicts.items():
#     pickle.dump(feat, open(f'./{col}.pkl', 'wb'))
    
#     features_dicts[col] = None
    
# gc.collect()

In [None]:
# Print a summary (columns, dtypes, memory usage) of the training frame.
train.info()

In [None]:
# Derived features: differences between already-computed feature columns
def features4features(df):
    """Add pairwise-difference columns to ``df`` in place and return it.

    Each new column is ``df[left] - df[right]``; the columns are created in
    the order listed below.
    """
    difference_specs = (
        # (new column, left operand, right operand)
        ('correctness_u_q_avg_diff', 'answered_correctly_u_avg', 'answered_correctly_q_avg'),
        ('correctness_u_pu_avg_diff', 'answered_correctly_u_avg', 'part_u_mean'),
        ('correctness_u_uv_avg_max_diff', 'answered_correctly_u_avg', 'answered_correctly_ut_avg_max'),
        ('correctness_u_uv_avg_min_diff', 'answered_correctly_u_avg', 'answered_correctly_ut_avg_min'),
        ('correctness_uv_avg_max_min_diff', 'answered_correctly_ut_avg_max', 'answered_correctly_ut_avg_min'),
        ('correctness_u_t_avg_max_diff', 'answered_correctly_u_avg', 'tag_acc_max'),
        ('correctness_u_t_avg_min_diff', 'answered_correctly_u_avg', 'tag_acc_min'),
        ('elapsed_time_u_q_diff', 'elapsed_time_u_avg', 'elapsed_time_q_avg'),
        ('explanation_u_q_diff', 'explanation_u_avg', 'explanation_q_avg'),
    )
    for new_column, left, right in difference_specs:
        df[new_column] = df[left] - df[right]

    return df


# Function for training and evaluation
def train_and_evaluate(train, valid, feature_engineering=False):
    """Train a LightGBM binary classifier and report the validation AUC.

    Side effects on the arguments: the categorical columns of ``train`` and
    ``valid`` are converted to ``category`` dtype in place, and the columns
    not used as features are deleted from ``valid`` in place (and from
    ``train`` too, unless it was replaced by a sample first).

    Args:
        train: training frame containing the target and all base features.
        valid: validation frame with the same columns.
        feature_engineering: when True, train on a random sample of at most
            ``training_rows`` rows instead of the full frame.

    Returns:
        Tuple ``(TARGET, FEATURES, CATEGORICAL_FEATURES, model)``; ``FEATURES``
        is the column index of the final training frame (base features plus
        the derived difference features).
    """
    TARGET = 'answered_correctly'
    # Features to train and predict
    FEATURES = [
        'prior_question_elapsed_time', 'answered_correctly_u_avg', 'part',
        # 'prior_question_had_explanation',
        'elapsed_time_u_avg', 'explanation_u_avg', 'answered_correctly_q_avg',
        'elapsed_time_q_avg', 'explanation_q_avg', 'answered_correctly_uq_count',
        'timestamp_u_recency_1', 'timestamp_u_recency_2', 'timestamp_u_recency_3',
        'timestamp_u_incorrect_recency', 'part_u_count', 'part_u_mean',
        'answered_correctly_ut_count', 'answered_correctly_ut_avg_max',
        'answered_correctly_ut_avg_min',
        # offlines:
        'question_elapsed_time_mean', 'question_had_explanation_mean',
        'question_correctly_q_count', 'question_correctly_q_mean',
        # 'part_elapsed_time_mean','part_had_explanation_mean','part_correctly_q_mean',
        'tags_lsi', 'tag_acc_max', 'tag_acc_min',
        # 'tag_count'
    ]

    CATEGORICAL_FEATURES = [
        'tags_lsi', 'part'
    ]

    for col in CATEGORICAL_FEATURES:
        train[col] = train[col].astype('category')
        valid[col] = valid[col].astype('category')

    # Delete some training data to experiment faster.
    # The sample size is clamped: DataFrame.sample raises when asked for more
    # rows than the frame holds (sampling is without replacement).
    if feature_engineering:
        train = train.sample(min(training_rows, len(train)), random_state=SEED)

    gc.collect()

    print(f'Traning with {train.shape[0]} rows and {len(FEATURES)} features')

    drop_cols = list(set(train.columns) - set(FEATURES))
    y_train = train[TARGET]
    y_val = valid[TARGET]

    # Drop unnecessary columns
    for col in drop_cols:
        del train[col], valid[col]
    gc.collect()

    # Create features of features
    train = features4features(train)
    valid = features4features(valid)

    # From here on the feature list is whatever columns are left
    FEATURES = train.columns

    # To lgb dataset format
    lgb_train = lgb.Dataset(train[FEATURES], y_train, categorical_feature=CATEGORICAL_FEATURES)
    lgb_valid = lgb.Dataset(valid[FEATURES], y_val, categorical_feature=CATEGORICAL_FEATURES)

    del y_train, train
    gc.collect()

    params = {
        'objective': 'binary',
        'seed': SEED,
        'learning_rate': .1,
        'max_depth': 15,
        'metric': 'auc',
        'num_leaves': 300,
        'feature_fraction': 0.75,
        'bagging_freq': 10,
        # 'bagging_fraction': 0.80,
        'subsample': 0.80,
    }

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # are believed to be replaced by callbacks in recent LightGBM releases -
    # confirm against the installed version.
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=10,
        verbose_eval=50,
    )

    print('Our Roc Auc score for the validation data is:', roc_auc_score(y_val, model.predict(valid[FEATURES])))

    feature_importance = model.feature_importance()
    feature_importance = pd.DataFrame({'Features': FEATURES, 'Importance': feature_importance}).sort_values('Importance', ascending = False)

    fig = plt.figure(figsize=(10, 10))
    fig.suptitle('Feature Importance', fontsize=20)
    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)
    plt.xlabel('Importance', fontsize=15)
    plt.ylabel('Features', fontsize=15)
    sns.barplot(x=feature_importance['Importance'], y=feature_importance['Features'], orient='h')
    plt.show()

    return TARGET, FEATURES, CATEGORICAL_FEATURES, model

In [None]:
# %%time
# TARGET, FEATURES, CATEGORICAL_FEATURES, model = train_and_evaluate(train, valid, feature_engineering=True)

In [None]:
# Inline version of train_and_evaluate: working on the notebook-level
# `train` / `valid` names lets the full training frame be replaced by its
# sample and deleted once the LightGBM datasets are built.
TARGET = 'answered_correctly'
# Features to train and predict
FEATURES = [
    'prior_question_elapsed_time', 'answered_correctly_u_avg', 'part',
    # 'prior_question_had_explanation',
    'elapsed_time_u_avg', 'explanation_u_avg', 'answered_correctly_q_avg',
    'elapsed_time_q_avg', 'explanation_q_avg', 'answered_correctly_uq_count',
    'timestamp_u_recency_1', 'timestamp_u_recency_2', 'timestamp_u_recency_3',
    'timestamp_u_incorrect_recency', 'part_u_count', 'part_u_mean',
    'answered_correctly_ut_count', 'answered_correctly_ut_avg_max',
    'answered_correctly_ut_avg_min',
    # offlines:
    'question_elapsed_time_mean', 'question_had_explanation_mean',
    'question_correctly_q_count', 'question_correctly_q_mean',
    # 'part_elapsed_time_mean','part_had_explanation_mean','part_correctly_q_mean',
    'tags_lsi', 'tag_acc_max', 'tag_acc_min',
    # 'tag_count'
]

CATEGORICAL_FEATURES = [
    'tags_lsi', 'part'
]

for col in CATEGORICAL_FEATURES:
    train[col] = train[col].astype('category')
    valid[col] = valid[col].astype('category')

# Delete some training data to experiment faster.
# The sample size is clamped: DataFrame.sample raises when asked for more rows
# than the frame holds (sampling is without replacement).
train = train.sample(min(training_rows, len(train)), random_state=SEED)

gc.collect()

print(f'Traning with {train.shape[0]} rows and {len(FEATURES)} features')

drop_cols = list(set(train.columns) - set(FEATURES))
y_train = train[TARGET]
y_val = valid[TARGET]

# Drop unnecessary columns
for col in drop_cols:
    del train[col], valid[col]
gc.collect()

# Create features of features
train = features4features(train)
valid = features4features(valid)

# From here on the feature list is whatever columns are left
FEATURES = train.columns

# To lgb dataset format
lgb_train = lgb.Dataset(train[FEATURES], y_train, categorical_feature=CATEGORICAL_FEATURES)
lgb_valid = lgb.Dataset(valid[FEATURES], y_val, categorical_feature=CATEGORICAL_FEATURES)

del y_train, train
gc.collect()

params = {
    'objective': 'binary',
    'seed': SEED,
    'learning_rate': .1,
    'max_depth': 15,
    'metric': 'auc',
    'num_leaves': 300,
    'feature_fraction': 0.75,
    'bagging_freq': 10,
    # 'bagging_fraction': 0.80,
    'subsample': 0.80,
}


model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    early_stopping_rounds=10,
    verbose_eval=50,
)

print('Our Roc Auc score for the validation data is:', roc_auc_score(y_val, model.predict(valid[FEATURES])))

feature_importance = model.feature_importance()
feature_importance = pd.DataFrame({'Features': FEATURES, 'Importance': feature_importance}).sort_values('Importance', ascending = False)

fig = plt.figure(figsize=(10, 10))
fig.suptitle('Feature Importance', fontsize=20)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.xlabel('Importance', fontsize=15)
plt.ylabel('Features', fontsize=15)
sns.barplot(x=feature_importance['Importance'], y=feature_importance['Features'], orient='h')
plt.show()

In [None]:
# Using time series api that simulates production predictions
def inference(TARGET, FEATURES, CATEGORICAL_FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts):
    """Predict every batch served by the riiideducation test iterator.

    For each batch: the answers of the previous batch (delivered with the
    current one) are first used to update the target-dependent state, then
    the batch is preprocessed like the training data, its features are
    computed with ``add_features(update=False)``, and the predictions are
    submitted through the environment.

    Args:
        TARGET: name of the target / prediction column.
        FEATURES: feature columns passed to ``model.predict``.
        CATEGORICAL_FEATURES: columns cast to ``category`` before predicting.
        model: trained model exposing ``predict``.
        questions_df: question metadata providing ``question_id``, ``part``
            and ``tags``.
        prior_question_elapsed_time_mean: value used to fill missing
            ``prior_question_elapsed_time``.
        features_dicts: mapping from state-dictionary name to the running
            state dictionary; the dictionaries are updated in place.
    """
    # Local import: only needed to parse the answers of the previous batch.
    import ast

    # Get feature dict
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    answered_correctly_uq = features_dicts['answered_correctly_uq']
    part_user_count = features_dicts['part_user_count']
    part_user_sum = features_dicts['part_user_sum']
    answered_correctly_ut = features_dicts['answered_correctly_ut']
    answered_correctly_ut_sum = features_dicts['answered_correctly_ut_sum']

    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

    # Get offline features (closing the pickle file once it is loaded)
    with open(contents_feat_file, 'rb') as content_file:
        content_feat = pickle.load(content_file)
    question_tags_df = pd.read_csv(question_tags_file, dtype=question_tags_df_dtypes)
    question_tags_df.set_index('question_id', inplace=True)

    question_cols = questions_df[['question_id', 'part', 'tags']]

    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The first row of a batch carries the previous batch's answers as
            # text; parse it as a literal rather than evaluating it as code.
            previous_test_df[TARGET] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
            update_features(
                previous_test_df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect,
                part_user_sum, answered_correctly_ut_sum
            )

        test_df = pd.merge(test_df, question_cols, left_on='content_id', right_on='question_id', how='left')

        # Keep the unfiltered batch so its length matches the answer list of the next one
        previous_test_df = test_df.copy()

        test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)

        test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'] / 10000.
        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        # Assign back instead of a chained fillna(inplace=True), which does
        # not modify the frame when pandas Copy-on-Write is active.
        test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

        # Merge offline features
        test_df.set_index('content_id', inplace=True)

        # Content features
        test_df = test_df.join(content_feat)

        # Question tag features
        test_df = test_df.join(question_tags_df)

        test_df.reset_index(inplace=True)

        # add_features reads the target column; it is ignored with update=False
        test_df[TARGET] = 0
        test_df = add_features(
            test_df, answered_correctly_u_count, answered_correctly_u_sum,
            elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect,
            answered_correctly_q_count, answered_correctly_q_sum, elapsed_time_q_sum,
            explanation_q_sum, answered_correctly_uq, part_user_count, part_user_sum,
            answered_correctly_ut, answered_correctly_ut_sum,
            update=False
        )
        for col in CATEGORICAL_FEATURES:
            test_df[col] = test_df[col].astype('category')

        test_df = features4features(test_df)
        test_df[TARGET] =  model.predict(test_df[FEATURES])
        set_predict(test_df[['row_id', TARGET]])

    print('Job Done')

In [None]:
# %%time
# for col, _ in features_dicts.items():
#     features_dicts[col] = pickle.load(open(f'./{col}.pkl', 'rb'))
    
#     os.remove(f'./{col}.pkl')
# gc.collect()

In [None]:
%%time
# Run the prediction loop with the trained model and the state dictionaries built during preprocessing.
inference(TARGET, FEATURES, CATEGORICAL_FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts)