In Kaggle competitions, as a beginner, there is an urge to create a model and test it quickly. Also, very often when we attempt to write code from scratch, we see that the results are not as good as what we get when we 'copy and edit' a popular notebook. 

One of the learnings has been the importance of using some kind of 'unit test' at each stage of coding. Simple errors occur for some corner cases, or, even though we initially printed and checked the output of the code, a small later change affects the data and goes unnoticed.  

To start with, we can use a simple 'assert' to ensure data correctness at each step. So for each method/function, write the assert statements, and decide which values need to be checked (with assert) even before executing the method. 

We saw an increase of roughly .04 points (from .7x to .74) after fixing the errors in the methods.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
import gc
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm
import pickle


In [None]:
##
# Read the data and the question information. 
# train_df is loaded from a pre-pickled file; question_static_info comes from questions.csv
# and is later joined to the interactions for its 'tags' and 'part' columns.
# NOTE: unpickling can execute code embedded in the file, so only point read_pickle at a
# dataset you trust.
##
train_df = pd.read_pickle("/kaggle/input/riid-pickle-file/train.pkl")
question_static_info = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")


In [None]:
# Notebook-wide settings.
# RUN_UNIT_TEST: when 1, the assert-based checks that follow each definition are executed.
RUN_UNIT_TEST = 1
# File the trained model is pickled to and later loaded from.
MODEL_PICKLE_FILENAME = 'lgb_2.pkl'
# Negative slice start applied to train_df: only the trailing rows are kept.
TRAIN_DATA_LIMIT = -25_000_000
# Upper bound applied to every user's maximum task_container_id when deriving 'cut_off'.
LAST_ENTRIES_PER_USER_CNT = 600

In [None]:
##
# Build one row of statistics per user: median elapsed time, number of correct answers,
# number of answered questions, number of lectures and the share of correct answers.
##
def calculate_user_summary_data (data):
    """Summarise the activity of every user that appears in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'user_id', 'content_type_id', 'answered_correctly' and
        'prior_question_elapsed_time'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'user_id' (order of first appearance) with the columns
        'user_mean_time', 'user_correct_count', 'user_all_count', 'lecture_count'
        and 'user_performance'. Users without any question row get 0 for the first
        three columns, which makes their 'user_performance' NaN (0 / 0).
    """
    summary = pd.DataFrame()
    summary['user_id'] = data['user_id'].unique()

    # Lectures are counted on the unfiltered data: summing content_type_id counts the
    # rows where it is truthy.
    lectures_per_user = data.groupby('user_id').agg({'content_type_id': 'sum'})
    lectures_per_user.columns = ['lecture_count']
    lecture_lookup = lectures_per_user['lecture_count'].to_dict()

    # Everything else is computed on question rows only, so lecture rows cannot
    # distort the answer counts.
    questions_only = data[data['content_type_id'] == False]
    per_user = questions_only.groupby('user_id').agg(
        {'answered_correctly': ['sum', 'count'], 'prior_question_elapsed_time': 'median'}
    )
    per_user.columns = ['user_correct_count', 'user_all_count', 'user_mean_time']
    correct_lookup = per_user['user_correct_count'].to_dict()
    attempts_lookup = per_user['user_all_count'].to_dict()
    # Despite the column name, this is the median (see the aggregation above).
    time_lookup = per_user['user_mean_time'].to_dict()

    def lookup_or_zero(table):
        # Users missing from a table (e.g. lecture-only users) fall back to 0.
        return summary['user_id'].apply(lambda uid: table.get(uid, 0))

    summary['user_mean_time'] = lookup_or_zero(time_lookup)
    summary['user_correct_count'] = lookup_or_zero(correct_lookup)
    summary['user_all_count'] = lookup_or_zero(attempts_lookup)
    summary['lecture_count'] = lookup_or_zero(lecture_lookup)
    summary['user_performance'] = summary['user_correct_count'] / summary['user_all_count']
    return summary.set_index('user_id')


if RUN_UNIT_TEST == 1:
    # Simple asserts confirm that the method really returns the expected numbers.
    # History: the lecture count was first returned as zero (the code filtered on
    # content_type_id == 0 before counting). Once that was fixed, the assert on
    # 'user_correct_count' failed, because answered_correctly is -1 for lectures.
    # The asserts surfaced both bugs right here rather than much later in the notebook.
    punit_data = train_df[train_df['user_id'].isin([115, 5382])]
    punit_data = punit_data.join(question_static_info[['tags', 'part']], on="content_id")
    punit_user_info = calculate_user_summary_data(punit_data)
    assert punit_user_info.loc[115, 'user_all_count'] == 46
    assert punit_user_info.loc[5382, 'lecture_count'] == 3
    assert punit_user_info.loc[5382, 'user_correct_count'] == 84
    assert punit_user_info.loc[5382, 'user_mean_time'] == 25000


In [None]:
##
# Build one row of statistics per question: correct answers, total answers and their ratio.
##
def calculate_question_summary_data (data ):    
    """Summarise how often each question in ``data`` was answered and answered correctly.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'content_id', 'content_type_id' and 'answered_correctly'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'question_id' (order of first appearance) with the columns
        'question_correct_count', 'question_all_count' and 'question_performance'.
        Rows whose content_type_id is not 0/False (lectures) are excluded.
    """
    questions_only = data[data['content_type_id'] == False]

    per_question = questions_only.groupby('content_id').agg({'answered_correctly': ['sum', 'count']})
    per_question.columns = ['question_correct_count', 'question_all_count']

    summary = pd.DataFrame()
    summary['question_id'] = questions_only['content_id'].unique()
    for column in ('question_correct_count', 'question_all_count'):
        table = per_question[column].to_dict()
        # Every id comes from the same filtered frame, so the lookup cannot miss.
        summary[column] = summary['question_id'].apply(lambda qid: table[qid])

    summary['question_performance'] = summary['question_correct_count'] / summary['question_all_count']
    return summary.set_index('question_id')



if RUN_UNIT_TEST == 1:
    # The key check: lecture rows must not leak into the question summary, so a
    # lecture content_id (16736) must be absent from the result.
    punit_data = train_df[train_df['content_id'].isin([5000, 16736, 6808])]
    punit_question_info = calculate_question_summary_data(punit_data)
    assert punit_question_info.loc[5000, 'question_correct_count'] == 11091
    assert punit_question_info.loc[6808, 'question_all_count'] == 17252
    assert 16736 not in punit_question_info.index

In [None]:

##
# Build one row of statistics per 'user_part' key: correct answers, total answers and ratio.
##
def calculate_user_part_summary_data(data) : 
    """Summarise answers per 'user_part' key found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'user_part', 'content_type_id' and 'answered_correctly'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'user_part' (order of first appearance) with the columns
        'user_part_correct_count', 'user_part_all_count' and 'user_part_performance'.
        Rows whose content_type_id is not 0/False (lectures) are excluded.
    """
    questions_only = data[data['content_type_id'] == False]

    per_key = questions_only.groupby('user_part').agg({'answered_correctly': ['sum', 'count']})
    per_key.columns = ['user_part_correct_count', 'user_part_all_count']

    summary = pd.DataFrame()
    summary['user_part'] = questions_only['user_part'].unique()
    for column in ('user_part_correct_count', 'user_part_all_count'):
        table = per_key[column].to_dict()
        # Every key comes from the same filtered frame, so the lookup cannot miss.
        summary[column] = summary['user_part'].apply(lambda key: table[key])

    summary['user_part_performance'] = summary['user_part_correct_count'] / summary['user_part_all_count']
    return summary.set_index('user_part')



if RUN_UNIT_TEST == 1:
    # Rebuild the 'user_part' key the same way the training data does, then check a
    # few known per-user/per-part totals.
    punit_data = train_df[train_df['user_id'].isin([115, 5382])]
    punit_data = punit_data.join(question_static_info[['tags', 'part']], on="content_id")
    punit_data['user_part'] = punit_data['user_id'].astype(str) + " " + punit_data['part'].astype(str)
    punit_user_part_info = calculate_user_part_summary_data(punit_data)
    assert punit_user_part_info.loc['5382 5.0', 'user_part_correct_count'] == 51
    assert punit_user_part_info.loc['5382 2.0', 'user_part_all_count'] == 32
    assert punit_user_part_info.loc['115 5.0', 'user_part_performance'] == 1



In [None]:
##
# Run all three summary builders on the same data.
##
def calculate_summary_data (data):
    """Return the (user, question, user-part) summary frames computed from ``data``."""
    return (
        calculate_user_summary_data(data),
        calculate_question_summary_data(data),
        calculate_user_part_summary_data(data),
    )

In [None]:
##
# Attach question metadata and the per-user split markers used to separate the rows that
# feed the statistics from the rows used for train/validation.
# For every user: cut_off = min(max task_container_id, LAST_ENTRIES_PER_USER_CNT) and
# random_count is a random integer in [cut_off, max task_container_id], so the split
# point differs from user to user and from run to run.
# NOTE(review): cut_off is later used as a lower bound on task_container_id, which keeps
# the containers at or above the cut-off rather than "the last N entries" - confirm
# this is the intended window.
## 

import random
def add_addln_info_to_data (train_df, question_static_info ):
    """Return ``train_df`` extended with question metadata and split columns.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs the columns 'user_id', 'content_id' and 'task_container_id'.
    question_static_info : pandas.DataFrame
        Provides 'tags' and 'part'; it is joined on its index against 'content_id'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns 'tags', 'part' (both -1 where no match),
        'user_part', 'max_task_container_id', 'random_count' and 'cut_off'.
    """
    enriched = train_df.join(question_static_info[['tags', 'part']], on="content_id")
    for column in ('tags', 'part'):
        enriched[column] = enriched[column].fillna(-1)
    # Key used for the per-user, per-part statistics.
    enriched['user_part'] = enriched['user_id'].astype(str) + " " + enriched['part'].astype(str)

    per_user = (
        enriched.groupby('user_id')
        .agg({'task_container_id': 'max'})
        .rename(columns={'task_container_id': 'max_task_container_id'})
    )
    per_user['cut_off'] = per_user['max_task_container_id'].apply(
        lambda max_id: LAST_ENTRIES_PER_USER_CNT if max_id >= LAST_ENTRIES_PER_USER_CNT else max_id
    )
    # Unseeded on purpose: each run draws a different split point per user.
    per_user['random_count'] = per_user[['cut_off', 'max_task_container_id']].apply(
        lambda bounds: random.randint(bounds['cut_off'], bounds['max_task_container_id']),
        axis=1,
    )
    return enriched.join(per_user[['max_task_container_id', 'random_count', 'cut_off']], on='user_id')



if RUN_UNIT_TEST == 1:
    # Small hand-built frames are enough to exercise the method.
    punit_df = pd.DataFrame(
        [
            {'user_id': 1, 'content_id': 1, 'answered_correctly': 1, 'content_type_id': 0, 'task_container_id': 1},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 0, 'task_container_id': 1},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 1, 'task_container_id': 2},
            {'user_id': 2, 'content_id': 1, 'answered_correctly': 0, 'content_type_id': 0, 'task_container_id': 1},
            {'user_id': 3, 'content_id': 2, 'answered_correctly': 1, 'content_type_id': 0, 'task_container_id': 5550},
        ],
        index=[0, 1, 2, 3, 4],
    )

    # The join uses this frame's index (1, 2), not its 'question_id' column.
    punit_question_static_info = pd.DataFrame(
        [
            {'question_id': 1, 'tags': "23 24", 'part': 1},
            {'question_id': 12, 'tags': "25", 'part': 2},
        ],
        index=[1, 2],
    )

    punit_addln_data = add_addln_info_to_data(punit_df, punit_question_static_info)
    assert punit_addln_data.loc[0, 'tags'] == "23 24"
    assert punit_addln_data.loc[0, 'part'] == 1
    assert punit_addln_data.loc[0, 'user_part'] == "1 1"
    assert punit_addln_data.loc[0, 'cut_off'] == 2
    assert punit_addln_data.loc[4, 'cut_off'] == LAST_ENTRIES_PER_USER_CNT
    assert punit_addln_data.loc[4, 'max_task_container_id'] == 5550

In [None]:
##
# Attach the summary statistics to each row by joining on user_part / user_id / content_id.
##
def merge_summary_info_with_data_joinmthd (train_data_val, user_info, question_info, user_part_info ):
    """Left-join the three summary frames onto ``train_data_val`` and return the result.

    Each summary frame is matched through its index: ``user_part_info`` against
    'user_part', ``user_info`` against 'user_id' and ``question_info`` against
    'content_id'. Rows without a match get NaN in the joined columns. The input
    frame itself is left unchanged.
    """
    join_plan = (
        (user_part_info, ['user_part_correct_count', 'user_part_performance'], 'user_part'),
        (user_info, ['user_correct_count', 'lecture_count', 'user_performance', 'user_mean_time'], 'user_id'),
        (question_info, ['question_correct_count', 'question_performance'], 'content_id'),
    )
    merged = train_data_val
    for summary, columns, key in join_plan:
        merged = merged.join(summary[columns], on=key)
    return merged


def create_dictionaries(user_info, question_info, user_part_info) :
    """Convert the summary columns used as features into plain lookup dictionaries.

    Returns a dict that maps each feature name to ``{index label: value}`` taken from
    the matching column of ``user_part_info``, ``user_info`` or ``question_info``.
    """
    sources = (
        ('user_part_correct_count', user_part_info),
        ('user_part_performance', user_part_info),
        ('user_correct_count', user_info),
        ('lecture_count', user_info),
        ('user_performance', user_info),
        ('user_mean_time', user_info),
        ('question_correct_count', question_info),
        ('question_performance', question_info),
    )
    return {feature: frame[feature].to_dict() for feature, frame in sources}
    
def merge_summary_info_with_data(train_data_val, dict_dict ):
    """Add the summary features to ``train_data_val`` through dictionary lookups.

    Parameters
    ----------
    train_data_val : pandas.DataFrame
        Needs the key columns 'user_part', 'user_id' and 'content_id'. The feature
        columns are written into this frame in place.
    dict_dict : dict
        Feature name -> lookup dictionary, as produced by ``create_dictionaries``.

    Returns
    -------
    pandas.DataFrame
        The same ``train_data_val`` object with eight feature columns added. Keys
        missing from a lookup get 0 for the count features and NaN for the
        performance / time features.

    Raises
    ------
    KeyError
        If ``dict_dict`` lacks one of the expected feature names.
    """
    # (feature column, key column, value used when the key is unknown)
    # np.nan is used instead of the np.NAN alias, which NumPy 2.0 removed.
    feature_plan = (
        ('user_part_correct_count', 'user_part', 0),
        ('user_part_performance', 'user_part', np.nan),
        ('user_correct_count', 'user_id', 0),
        ('lecture_count', 'user_id', 0),
        ('user_performance', 'user_id', np.nan),
        ('user_mean_time', 'user_id', np.nan),
        ('question_correct_count', 'content_id', 0),
        ('question_performance', 'content_id', np.nan),
    )

    for feature, key_column, fallback in feature_plan:
        lookup = dict_dict[feature]
        # The lambda runs inside this iteration, so it sees this iteration's
        # lookup and fallback.
        train_data_val[feature] = train_data_val[key_column].apply(
            lambda key: lookup.get(key, fallback)
        )

    return train_data_val

if RUN_UNIT_TEST == 1:
    # Interaction rows: users 1 and 2 are known to the summaries below, user 3 is not.
    punit_df = pd.DataFrame(
        [
            {'user_id': 1, 'content_id': 1, 'answered_correctly': 1, 'content_type_id': 0, 'task_container_id': 1, 'tags': "23 24", "part": 1, 'user_part': "1 1"},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 0, 'task_container_id': 1, 'tags': '25', 'part': 2, 'user_part': "1 2"},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 1, 'task_container_id': 2, 'tags': '25', 'part': 2, 'user_part': "1 2"},
            {'user_id': 2, 'content_id': 1, 'answered_correctly': 0, 'content_type_id': 0, 'task_container_id': 1, 'tags': "23 24", "part": 1, 'user_part': "2 1"},
            {'user_id': 3, 'content_id': 2, 'answered_correctly': 1, 'content_type_id': 0, 'task_container_id': 5550, 'tags': '25', 'part': 2, 'user_part': "3 2"},
        ],
        index=[0, 1, 2, 3, 4],
    )

    punit_question_info = pd.DataFrame(
        [
            {'question_all_count': 100, 'question_correct_count': 55, 'question_performance': .55},
            {'question_all_count': 200, 'question_correct_count': 50, 'question_performance': .25},
        ],
        index=[1, 2],
    )
    punit_user_info = pd.DataFrame(
        [
            {'user_all_count': 100, 'user_correct_count': 50, 'user_performance': .50, 'lecture_count': 12, 'user_mean_time': 30000},
            {'user_all_count': 200, 'user_correct_count': 50, 'user_performance': .25, 'lecture_count': 19, 'user_mean_time': 25000},
        ],
        index=[1, 2],
    )
    punit_user_part_info = pd.DataFrame(
        [
            {'user_part_all_count': 10, 'user_part_correct_count': 7, 'user_part_performance': .70},
            {'user_part_all_count': 20, 'user_part_correct_count': 5, 'user_part_performance': .25},
            {'user_part_all_count': 10, 'user_part_correct_count': 1, 'user_part_performance': .1},
        ],
        index=["1 1", "2 1", "1 2"],
    )

    dict_dict = create_dictionaries(punit_user_info, punit_question_info, punit_user_part_info)
    punit_result = merge_summary_info_with_data(punit_df, dict_dict)

    # Row 0: every key is known, so every feature comes straight from the summaries.
    assert punit_result.loc[0, 'user_performance'] == .50
    assert punit_result.loc[0, 'user_part_performance'] == .70
    assert punit_result.loc[0, 'question_performance'] == .55
    assert punit_result.loc[0, 'question_correct_count'] == 55
    assert punit_result.loc[0, 'user_correct_count'] == 50
    assert punit_result.loc[0, 'lecture_count'] == 12
    assert punit_result.loc[0, 'user_part_correct_count'] == 7

    # Row 4: unknown user and user_part give NaN; the question is still known.
    assert np.isnan(punit_result.loc[4, 'user_performance'])
    assert np.isnan(punit_result.loc[4, 'user_part_performance'])
    assert punit_result.loc[4, 'question_performance'] == .25


In [None]:
##
# Handle the missing values in the data before training 
##
def handle_missing_vals (train_data_val, user_performance_fill=None, question_performance_fill=None,
                         mean_time_fill=26000):
    """Fill the gaps in the feature columns of ``train_data_val`` in place.

    Parameters
    ----------
    train_data_val : pandas.DataFrame
        Frame that holds the eight feature columns; it is modified in place.
    user_performance_fill : float, optional
        Value for missing 'user_performance'. Defaults to the median of that
        column in ``train_data_val``. Pass a precomputed value when the frame is
        too small for its own median to be meaningful (e.g. a single test batch).
    question_performance_fill : float, optional
        Same as above for 'question_performance'.
    mean_time_fill : float, optional
        Value for missing 'user_mean_time' (default 26000).

    Returns
    -------
    pandas.DataFrame
        The same ``train_data_val`` object. Count columns are filled with 0, and a
        missing 'user_part_performance' falls back to the row's (already filled)
        'user_performance'.
    """
    count_columns = ('user_correct_count', 'lecture_count',
                     'question_correct_count', 'user_part_correct_count')
    for column in count_columns:
        train_data_val[column] = train_data_val[column].fillna(0)
    train_data_val['user_mean_time'] = train_data_val['user_mean_time'].fillna(mean_time_fill)

    if user_performance_fill is None:
        user_performance_fill = train_data_val['user_performance'].median()
    if question_performance_fill is None:
        question_performance_fill = train_data_val['question_performance'].median()
    train_data_val['user_performance'] = train_data_val['user_performance'].fillna(user_performance_fill)
    train_data_val['question_performance'] = train_data_val['question_performance'].fillna(question_performance_fill)

    # Vectorised per-row fallback; both Series share the frame's index. This replaces
    # a row-wise apply(axis=1) that gave the same values far more slowly.
    train_data_val['user_part_performance'] = train_data_val['user_part_performance'].fillna(
        train_data_val['user_performance']
    )

    return train_data_val



In [None]:
##
# Keep only the trailing rows selected by TRAIN_DATA_LIMIT (a negative slice start) and
# add the extra columns needed for the train/val split and for feature engineering.
##
FEATURE_COLUMNS = ['user_performance', 'user_mean_time', 'lecture_count', \
                'user_correct_count', 'question_performance', \
                'question_correct_count', 'user_part_performance', 'user_part_correct_count']
TARGET_COLUMN = 'answered_correctly'


# The slice is left open-ended: the previous [TRAIN_DATA_LIMIT:-1] form silently dropped
# the final row of the data.
train_df = train_df[TRAIN_DATA_LIMIT:]
train_df = add_addln_info_to_data ( train_df, question_static_info  )
unique_users = train_df['user_id'].unique()
# reset_index keeps the old row labels as an 'index' column.
train_df = train_df.reset_index()
train_df

In [None]:
%%time

##
# Rows at or above the user's cut_off but below the user's random split point feed the
# summary statistics.
##
initial_data = (
    train_df
    .loc[lambda frame: frame['task_container_id'] >= frame['cut_off']]
    .loc[lambda frame: frame['task_container_id'] < frame['random_count']]
    .reset_index()
)
user_info, question_info, user_part_info = calculate_summary_data(initial_data)

##
# Question rows at or above the random split point form the train/val set; the summary
# features are attached and the gaps filled.
##
train_data_val = (
    train_df
    .loc[lambda frame: frame['task_container_id'] >= frame['cut_off']]
    .loc[lambda frame: frame['task_container_id'] >= frame['random_count']]
    .loc[lambda frame: frame['content_type_id'] == 0]
    .reset_index()
)
train_data_val = handle_missing_vals(
    merge_summary_info_with_data_joinmthd(train_data_val, user_info, question_info, user_part_info)
)

train_data_val

In [None]:
##
# Split by user: 90% of the users go to training, the rest to validation.
##

from random import sample 

len_unique_user = len(unique_users)
train_cnt = int(len_unique_user*.9)
val_cnt = len_unique_user-train_cnt

def custom_cv():
    """Yield three (train, validation) frame pairs, each from a fresh user shuffle.

    Every fold shuffles all users, sends the first ``train_cnt`` to training and the
    remainder to validation, then selects the matching rows of ``train_data_val``.
    The shuffle is unseeded, so every call produces different folds.
    """
    for _ in range(3):
        shuffled_users = sample(unique_users.tolist(), len_unique_user)
        train_users = shuffled_users[:train_cnt]
        val_users = shuffled_users[train_cnt:]
        yield (
            train_data_val[train_data_val['user_id'].isin(train_users)],
            train_data_val[train_data_val['user_id'].isin(val_users)],
        )

# Smoke check: pull the first fold and print it.
ccv = custom_cv()
i = next(ccv)
print(i)

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Train on the first fold of a fresh user-level split.
train_data, val_data = next(custom_cv())

# This below code for creating/training the model has been taken from https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering

lgb_train = lgb.Dataset(data=train_data[FEATURE_COLUMNS], label=train_data['answered_correctly'])
lgb_valid = lgb.Dataset(data=val_data[FEATURE_COLUMNS], label=val_data['answered_correctly'])

# NOTE(review): LightGBM 4.x removed the early_stopping_rounds / verbose_eval keyword
# arguments in favour of callbacks - confirm the installed version before upgrading.
model = lgb.train(
    params={'objective': 'binary'},
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_valid],
    early_stopping_rounds=10,
    verbose_eval=100,
)


In [None]:
# Persist the trained model. The context manager guarantees the file is flushed and
# closed even if pickling fails part-way.
with open(MODEL_PICKLE_FILENAME, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
##
# Check on a sample test data 
##
from sklearn.metrics import roc_auc_score, accuracy_score

# Reload the model written above; the context manager closes the file handle.
# NOTE: pickle.load can execute code embedded in the file - only load files you trust.
with open(MODEL_PICKLE_FILENAME, 'rb') as model_file:
    model = pickle.load(model_file)

# NOTE(review): custom_cv() reshuffles the users on every call, so this hold-out set is
# not the validation split used during training and will overlap with users the model
# was trained on - treat the printed AUC as optimistic.
temp_data, test_data = next(custom_cv())
result = model.predict ( test_data[FEATURE_COLUMNS] )
print ( roc_auc_score(test_data['answered_correctly'], result )) 

In [None]:
# Drop the large training-time frames before inference and force a collection.
del (
    train_df,
    train_data,
    val_data,
    temp_data,
    test_data,
    user_info,
    question_info,
    user_part_info,
    train_data_val,
    initial_data,
)
gc.collect()

In [None]:
import sys
# Snapshot the namespace first: the loop below creates new names while it runs.
local_vars = list(locals().items())
for var, obj in local_vars:
    size = sys.getsizeof(obj)
    if size <= 1e7:
        continue
    # Only objects whose reported size is above 10 MB are listed.
    print(f'{var:<18}{size/1e6:>10,.1f} MB')

# Inference

In [None]:
##
# Update the existing summary data based on the results obtained during testing in each batch 
##
def update_user_info(df, user_info):
    """Return ``user_info`` with the question rows of ``df`` folded in.

    Parameters
    ----------
    df : pandas.DataFrame
        Batch with the columns 'user_id', 'answered_correctly' and 'content_type_id'.
        Only rows whose content_type_id equals 0 are used.
    user_info : pandas.DataFrame
        Per-user summary indexed by user id with at least 'user_correct_count' and
        'user_all_count'. It is not modified; use the returned frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``user_info`` where known users have 'user_correct_count' increased by
        the sum of their 'answered_correctly' values and 'user_all_count' by their
        number of question rows. Users not seen before are appended (in order of
        first appearance) with lecture_count 0 and user_mean_time 26000.
        'user_performance' is recomputed for every row.

    Notes
    -----
    Lecture rows are skipped entirely, so 'lecture_count' of known users never changes.
    """
    # Aggregate the batch per user first, so the summary frame is touched once per
    # column instead of once per row.
    correct_delta = {}
    attempt_delta = {}
    for user_id, answered, content_type in df[['user_id', 'answered_correctly', 'content_type_id']].values:
        if content_type != 0:
            continue
        uid = int(user_id)
        correct_delta[uid] = correct_delta.get(uid, 0) + answered
        attempt_delta[uid] = attempt_delta.get(uid, 0) + 1

    updated = user_info.copy()
    if correct_delta:
        correct = pd.Series(correct_delta)
        attempts = pd.Series(attempt_delta)
        known = correct.index.isin(updated.index)

        if known.any():
            for column, delta in (('user_correct_count', correct), ('user_all_count', attempts)):
                # Whole-column assignment lets the dtype widen cleanly when the
                # increments are floats.
                aligned = delta[known].reindex(updated.index, fill_value=0)
                updated[column] = updated[column] + aligned

        if (~known).any():
            additions = pd.DataFrame(
                {
                    'user_correct_count': correct[~known],
                    'user_all_count': attempts[~known],
                    'lecture_count': 0,
                    'user_mean_time': 26000,
                },
                index=correct.index[~known],
            ).rename_axis(updated.index.name)
            # A single concat replaces DataFrame.append (removed in pandas 2.0) and
            # avoids re-copying the whole frame once per new user.
            updated = pd.concat([updated, additions])

    updated['user_performance'] = updated['user_correct_count'] / updated['user_all_count']
    return updated




if RUN_UNIT_TEST == 1:
    # Batch: two question rows and one lecture row for known user 1, one question row
    # for user 2, who is not in the summary yet.
    punit_df = pd.DataFrame(
        [
            {'user_id': 1, 'content_id': 1, 'answered_correctly': 1, 'content_type_id': 0},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 0},
            {'user_id': 1, 'content_id': 2, 'answered_correctly': 0, 'content_type_id': 1},
            {'user_id': 2, 'content_id': 1, 'answered_correctly': 0, 'content_type_id': 0},
        ],
        index=[0, 1, 2, 3],
    )

    punit_user_info = pd.DataFrame(
        [
            {'user_correct_count': 11, 'user_all_count': 20, 'user_mean_time': 30000, 'user_performance': 0.5, 'lecture_count': 3},
            {'user_correct_count': 5, 'user_all_count': 5, 'user_mean_time': 20000, 'user_performance': 1.0, 'lecture_count': 4},
        ],
        index=[1, 3],
    )

    punit_new = update_user_info(punit_df, punit_user_info)
    # Known user: only the two question rows are counted.
    assert punit_new.loc[1, 'user_all_count'] == 22
    # New user: created with the defaults.
    assert punit_new.loc[2, 'user_correct_count'] == 0
    assert punit_new.loc[2, 'user_all_count'] == 1
    assert punit_new.loc[2, 'user_mean_time'] == 26000
    # User absent from the batch: untouched.
    assert punit_new.loc[3, 'user_correct_count'] == 5
    assert punit_new.loc[3, 'user_all_count'] == 5
    assert punit_new.loc[3, 'user_mean_time'] == 20000
    assert punit_new.loc[3, 'user_performance'] == 1.0
    

In [None]:
##
# Load the summary information that was precalculated for the data; this saves time.
# It was also observed that better results are obtained when the entire data is used
# to calculate the summary.
##
user_info, question_info, user_part_info = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        "/kaggle/input/user-info-with-mean-time/user_info_final-2.parquet",
        "/kaggle/input/riidinferenceinput1/question_summary.parquet",
        "/kaggle/input/riidinferenceinput1/user_part_info_summary.parquet",
    )
)

In [None]:
# Lookups derived from the per-user summary.
user_performance_median = user_info['user_performance'].median()
user_correct_count_dict = user_info['user_correct_count'].to_dict()
user_performance_dict = user_info['user_performance'].to_dict()
user_lecture_dict = user_info['lecture_count'].to_dict()
user_mean_time_dict = user_info['user_mean_time'].to_dict()

# Lookups derived from the per-question summary.
question_performance_median = question_info['question_performance'].median()
question_correct_count_dict = question_info['question_correct_count'].to_dict()
question_performance_dict = question_info['question_performance'].to_dict()

# Lookups derived from the per-user, per-part summary.
user_part_correct_count_dict = user_part_info['user_part_correct_count'].to_dict()
user_part_performance_dict = user_part_info['user_part_performance'].to_dict()


In [None]:
import riiideducation

# Create the competition environment and the iterator that the inference loop consumes;
# each item it yields is unpacked there as (test_df, sample_prediction_df).
# NOTE(review): presumably make_env() may only be called once per session - confirm
# against the competition API before re-running this cell.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
from datetime import datetime
import random
print ( datetime.now() , "Starting Inference")
# The feature lookups are built once, before the first batch.
dict_dict = create_dictionaries(user_info,question_info, user_part_info)
for (test_df, sample_prediction_df) in iter_test:

    # Only question rows are scored. Taking a copy makes the later assignment of the
    # prediction column write to an independent frame rather than to a filtered slice.
    test_df = test_df[test_df['content_type_id'] == 0].copy()

    print ( datetime.now(), test_df.shape[0] )
    test_users = test_df['user_id'].unique()

    # Build the same key columns the summaries were computed with.
    X_val = test_df[['user_id', 'content_id']]
    X_val = X_val.join(question_static_info['part'], on="content_id" )
    X_val['part'] = X_val['part'].fillna(-1)
    # Format the part as a float ("5.0", "-1.0") whatever dtype the join produced.
    # Appending ".0" to the string form only worked while the column stayed integer.
    X_val['user_part'] = X_val['user_id'].astype(str) + " " + X_val['part'].astype(float).astype(str)

    X_val = merge_summary_info_with_data(X_val, dict_dict)
    X_val = handle_missing_vals (X_val )

    print ( datetime.now() , "Merged the data ")

    X_val = X_val[FEATURE_COLUMNS]
    result = model.predict(X_val)

    print( datetime.now(), "Prediction complete for this batch")

    test_df[TARGET_COLUMN] = result
    env.predict(test_df[['row_id', 'answered_correctly']])

    # NOTE(review): at this point 'answered_correctly' holds the predicted
    # probabilities, not the true answers, so the counts accumulated here are
    # estimates. Also, dict_dict was built before the loop and is never refreshed,
    # so these updates do not reach the features of later batches - confirm whether
    # that is intended.
    user_info = update_user_info(test_df, user_info)


    
    