In [None]:
#Imports
import numpy as np       #Numpy for numerical computations
import pandas as pd      #Pandas for data manipulations
import riiideducation    #Package for the competition API
import seaborn as sns    #Seaborn for data vizualisation
import os
import gc                #For garbage collector

#List every file available in the competition input directory
for folder, _, names in os.walk('/kaggle/input/riiid-test-answer-prediction'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
#Reading data: the training set is loaded from a pickle to read it faster (15 seconds more or less)
#NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted dataset
full_train = pd.read_pickle("../input/train-gzip/riiid_train.gzip")
#Question and lecture metadata shipped with the competition
#(lectures is loaded but not used in the cells visible in this notebook)
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

In [None]:
#Keep only the columns needed for the feature engineering below
full_train = full_train.loc[:, ['row_id','user_id','content_id','content_type_id','answered_correctly']]
#Validation set: the last 4 rows of every user
test = full_train.groupby('user_id').tail(4)
#Training set: the last 500 rows of every user, minus the rows held out for validation
train = full_train.groupby('user_id').tail(500).drop(index=test.index)

In [None]:
#Per-question statistics computed on question rows only (content_type_id == 0):
#mean correctness, number of answers and number of correct answers.
#A single groupby replaces the two separate ones, and the sum is taken directly
#instead of being rebuilt as average * count (which adds floating point error).
question_answers = full_train.loc[full_train['content_type_id'] == 0].groupby('content_id')['answered_correctly']
question_df = pd.DataFrame({
    'question_average': question_answers.mean(),
    'question_count': question_answers.size(),
    'question_sum': question_answers.sum(),
})
#Outer join with the questions metadata so that questions never seen in train get a row too
#NOTE(review): the join is on the row index of `questions` - presumably row i describes question id i; verify
question_df = question_df.join(questions,how='outer')[['question_average','question_count','question_sum']]
#Filling with default value
#(assigned back instead of column.fillna(inplace=True): the chained in-place form
# does not modify question_df when pandas Copy-on-Write is active)
question_df = question_df.fillna(0)
#Cleaning for memory management
del question_answers
gc.collect()

In [None]:
#Dropping full_train now that train, test and question_df have been derived from it
#(the full frame is too big to keep in memory)
del full_train
gc.collect()

In [None]:
#Joining the question statistics (question_average, question_count, question_sum) to the train data;
#content_id is matched against the index of question_df
train = train.join(question_df,on=['content_id'], rsuffix='_question')

In [None]:
#Global fallback values, computed on question rows only:
# - mean_user: mean of the per-user average correctness
# - mean_question: mean of the per-question average correctness
#They fill the running averages below for rows that have no history yet
mean_user = train.loc[train['content_type_id'] == False].groupby(['user_id'])['answered_correctly'].mean().mean()
mean_question = train.loc[train['content_type_id'] == False].groupby(['content_id'])['answered_correctly'].mean().mean()

In [None]:
#Computing the average for the questions that the user answered to in the past
#shift() moves each question_average one question row down within the user,
#so every row only sees the questions that came before it
train['user_shift_question'] = train.loc[train['content_type_id'] == False].groupby(['user_id'])['question_average'].shift()
#Running sum of those shifted averages and running number of previous question rows
cumulated_question = train.loc[train['content_type_id'] == False].groupby(['user_id'])['user_shift_question'].agg(['cumsum','cumcount'])
train.loc[train['content_type_id'] == False,'average_past_questions'] = cumulated_question['cumsum'] / cumulated_question['cumcount']
#Rows without history (first question of a user, non-question rows) fall back to the global mean.
#The result is assigned back: column.fillna(inplace=True) is a chained in-place call
#that does not modify train when pandas Copy-on-Write is active
train['average_past_questions'] = train['average_past_questions'].fillna(mean_question)
train.drop(['user_shift_question','question_count','question_sum'],axis=1,inplace=True)
del cumulated_question

In [None]:
#Computing the average for each user he or she has until now
#shift() moves each answer one question row down within the user, so a row never sees its own target
train['user_shift'] = train.loc[train['content_type_id'] == False].groupby(['user_id'])['answered_correctly'].shift()
#Running number of correct previous answers and running number of previous question rows
cumulated = train.loc[train['content_type_id'] == False].groupby(['user_id'])['user_shift'].agg(['cumsum', 'cumcount'])
train.loc[train['content_type_id'] == False,'answered_correctly_user_average'] = cumulated['cumsum'] / cumulated['cumcount']
#Rows without history (first question of a user, non-question rows) fall back to the global mean.
#The result is assigned back: column.fillna(inplace=True) is a chained in-place call
#that does not modify train when pandas Copy-on-Write is active
train['answered_correctly_user_average'] = train['answered_correctly_user_average'].fillna(mean_user)
train.drop(columns=['user_shift'], inplace=True)
del cumulated

In [None]:
#Data for user average: the running average stored on the last question row of each user,
#i.e. the average over all of the user's question rows in train except the last one
user_average = pd.DataFrame(train.loc[train['content_type_id'] == 0].groupby(['user_id'])['answered_correctly_user_average'].last()).rename(columns={'answered_correctly_user_average':'user_average'})
#Data for user count: number of question rows minus one, i.e. the number of answers behind user_average
user_count = pd.DataFrame(train.loc[train['content_type_id'] == 0].groupby(['user_id']).size() - 1,columns=['user_count'])

In [None]:
#Per-user "performance": mean correctness of the user minus the mean average of the questions answered.
#This is what works best at the moment, but it is not based on past events only:
#the means are taken over all the question rows of the user in train, which is a target leakage.
#To avoid this leakage, use the commented lines below instead (they have led to a lower result so far)
tmp = train.loc[train['content_type_id'] == False].groupby(['user_id']).mean()
user_performance = pd.DataFrame(tmp['answered_correctly'] - tmp['question_average'], columns=['performance'])
del tmp

# train['performance'] = train['answered_correctly_user_average'] - train['average_past_questions']
# user_performance = pd.DataFrame(train.loc[train['content_type_id'] == False].groupby(['user_id'])['performance'].last())

In [None]:
#One row per user: performance, user_average, user_count and the number of
#correct answers rebuilt as average * count
user_df = (
    user_performance
    .join(user_average)
    .join(user_count)
    .assign(user_sum=lambda frame: frame['user_average'] * frame['user_count'])
)
#The intermediate frames are no longer needed
del user_performance, user_count, user_average
gc.collect()

In [None]:
#Utility function to get the sum of question's average that a user has in a new set
def question_average_sum_by_user(df,question_df):
    """Sum, per user, the average of every question the user meets in ``df``.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``content_id`` and ``content_type_id`` columns.
    question_df : DataFrame indexed by question id (unique index) with a
        ``question_average`` column.

    Returns
    -------
    DataFrame indexed by user id (sorted, unnamed index) with a single float
    column ``question_average_sum``. Users that only have non-question rows in
    ``df`` get 0.0. Question ids that are unknown to ``question_df``, or whose
    average is NaN, are skipped instead of raising or poisoning the sum.
    """
    lookup = question_df['question_average']
    #Only question rows (content_type_id == 0) contribute to the sum
    answered = df.loc[df['content_type_id'] == 0, ['user_id','content_id']]
    #Plain arrays are used so that a non-unique index on df cannot break the alignment
    per_row = pd.DataFrame({
        'user_id': answered['user_id'].to_numpy(),
        'question_average_sum': answered['content_id'].map(lookup).to_numpy(),
    })
    #Grouping by the column name (not a one-element list) keeps scalar user ids as index
    sums = per_row.groupby('user_id')['question_average_sum'].sum()
    #Every user present in df gets a row, even without any question
    all_users = pd.Index(df['user_id'].unique()).sort_values()
    result = sums.reindex(all_users, fill_value=0.0).astype(float).to_frame()
    result.index.name = None
    return result

In [None]:
#Utility function to add the answers of the prior_df into it to update performance and question average
def add_answers_to_prior_df(current_df,prior_df):
    """Return a copy of ``prior_df`` with the answers reported by ``current_df``.

    The first row of ``current_df['prior_group_answers_correct']`` is read as the
    textual form of a list (e.g. ``"[1, 0, 1]"``). When that list has one entry
    per row of ``prior_df`` it is stored in a new ``answered_correctly_response``
    column; otherwise the copy is returned without that column.

    Parameters
    ----------
    current_df : DataFrame of the current batch (only read when prior_df has rows).
    prior_df : DataFrame of the previous batch; it is not modified.

    Returns
    -------
    DataFrame, a copy of ``prior_df``.

    Raises
    ------
    ValueError, SyntaxError : when the text is not a plain Python literal.
    """
    #Local import so that this cell does not depend on the notebook's import cell
    import ast

    prior_df_ = prior_df.copy()
    if (prior_df.shape[0] > 0):
        raw_answers = current_df.iloc[0]['prior_group_answers_correct']
        #literal_eval only accepts literals, unlike eval which would execute any expression
        answers = ast.literal_eval(raw_answers) if isinstance(raw_answers, str) else raw_answers
        if (len(answers) == prior_df.shape[0]):
            prior_df_['answered_correctly_response'] = answers
    return prior_df_

In [None]:
#Updating the question dataframe (especially for question average) for a new set of questions
def build_question_df(prior_df,question_df):
    """Fold the answers of ``prior_df`` into the per-question statistics.

    Parameters
    ----------
    prior_df : DataFrame of the previous batch with ``content_id``,
        ``content_type_id`` and ``answered_correctly_response`` columns.
    question_df : DataFrame indexed by question id with ``question_average``,
        ``question_count`` and ``question_sum`` columns; it is not modified.

    Returns
    -------
    DataFrame with the same rows and columns as ``question_df`` and updated
    statistics. ``question_df`` itself is returned when ``prior_df`` is empty
    or carries no ``answered_correctly_response`` column (nothing to add).
    Questions of ``prior_df`` that have no row in ``question_df`` are ignored.
    """
    if (prior_df.shape[0] == 0) or ('answered_correctly_response' not in prior_df.columns):
        return question_df

    #Correct answers and number of answers per question in the prior batch (question rows only)
    responses = prior_df.loc[prior_df['content_type_id'] == 0]\
                    .groupby('content_id')['answered_correctly_response']
    sum_prior = responses.sum().rename('question_sum_previous')
    count_prior = responses.size().rename('question_count_previous')

    #Left join: question_df keeps exactly its own rows
    updated = question_df.join(sum_prior).join(count_prior)

    #Filling null values (assigned back: chained in-place fillna is a no-op under Copy-on-Write)
    stat_cols = ['question_average','question_count','question_sum',
                 'question_sum_previous','question_count_previous']
    updated[stat_cols] = updated[stat_cols].fillna(0)

    #Updating values
    updated['question_sum'] = updated['question_sum'] + updated['question_sum_previous']
    updated['question_count'] = updated['question_count'] + updated['question_count_previous']
    #A question that still has no answer keeps its previous average instead of becoming 0/0 = NaN
    updated['question_average'] = (updated['question_sum'] / updated['question_count'])\
        .where(updated['question_count'] > 0, updated['question_average'])

    return updated.drop(columns=['question_count_previous','question_sum_previous'])

In [None]:
#Updating the user dataframe (especially for question average) for a new set of questions
def build_user_df(prior_df,user_df,question_df):
    """Fold the answers of ``prior_df`` into the per-user statistics.

    Parameters
    ----------
    prior_df : DataFrame of the previous batch with ``user_id``, ``content_id``,
        ``content_type_id`` and ``answered_correctly_response`` columns.
    user_df : DataFrame indexed by user id with ``performance``, ``user_average``,
        ``user_count`` and ``user_sum`` columns; it is not modified.
    question_df : per-question statistics, passed to ``question_average_sum_by_user``.

    Returns
    -------
    DataFrame with the columns of ``user_df``, one row per already known user plus
    one row per user that answered a question in ``prior_df``. ``user_df`` itself
    is returned when ``prior_df`` is empty or carries no
    ``answered_correctly_response`` column (nothing to add).
    """
    if (prior_df.shape[0] == 0) or ('answered_correctly_response' not in prior_df.columns):
        return user_df

    #Correct answers and number of answers per user in the prior batch (question rows only)
    responses = prior_df.loc[prior_df['content_type_id'] == 0]\
                    .groupby('user_id')['answered_correctly_response']
    sum_prior = responses.sum().rename('user_sum_previous')
    count_prior = responses.size().rename('user_count_previous')

    #Outer join on the sums so that users seen for the first time get a row
    updated = user_df.join(sum_prior,how='outer').join(count_prior)

    #Filling null values (assigned back: chained in-place fillna is a no-op under Copy-on-Write)
    stat_cols = ['performance','user_average','user_count','user_sum',
                 'user_count_previous','user_sum_previous']
    updated[stat_cols] = updated[stat_cols].fillna(0)

    #Sum of the question averages of the questions each user had in prior
    updated = updated.join(question_average_sum_by_user(prior_df,question_df))
    updated['question_average_sum'] = updated['question_average_sum'].fillna(0)

    #Updating values
    new_count = updated['user_count'] + updated['user_count_previous']
    new_sum = updated['user_sum'] + updated['user_sum_previous']
    #user_sum - performance * user_count rebuilds the sum of question averages met so far
    #(performance being average correctness minus average question average)
    expected_sum = updated['user_sum'] - updated['performance'] * updated['user_count'] \
        + updated['question_average_sum']
    #Users that still have no counted answer keep their previous values instead of 0/0 = NaN
    has_answers = new_count > 0
    new_average = (new_sum / new_count).where(has_answers, updated['user_average'])
    new_performance = (new_average - expected_sum / new_count).where(has_answers, updated['performance'])

    updated['user_sum'] = new_sum
    updated['user_count'] = new_count
    updated['user_average'] = new_average
    updated['performance'] = new_performance

    return updated.drop(columns=['user_sum_previous','user_count_previous','question_average_sum'])

In [None]:
#Main code, initializing the dataframes
#Both batches start empty, so the three calls below leave the data unchanged:
#add_answers_to_prior_df returns an empty copy and the two build_* functions
#return their input when prior_df has no rows
prior_df = pd.DataFrame()
current_df = pd.DataFrame()
prior_df = add_answers_to_prior_df(current_df,prior_df)
question_df = build_question_df(prior_df,question_df)
user_df = build_user_df(prior_df,user_df,question_df)

In [None]:
#Display the prior batch (still empty at this point)
prior_df

In [None]:
#Display the current batch (still empty at this point)
current_df

In [None]:
#Display the per-question statistics
question_df

In [None]:
#Display the per-user statistics
user_df

In [None]:
#Column holding the label
TARGET_COL = ['answered_correctly']
#Columns returned by data_transform; row_id comes first so that the modelling cells
#can drop it with .iloc[:,1:] and train on performance and question_average only
FEATURE_COLS = ['row_id', 'performance', 'question_average']

In [None]:
#Back to the raw columns only (the helper columns built above are dropped);
#reset_index() keeps the previous index as an extra 'index' column
train = train[['row_id','user_id','content_id', 'content_type_id', 'answered_correctly']].reset_index()

In [None]:
def data_transform(df, is_training = True, is_validation = True): 
    """Attach the question and user statistics to ``df`` and keep the model columns.

    Reads the module-level ``question_df``, ``user_df``, ``FEATURE_COLS`` and
    ``TARGET_COL``. Only question rows (``content_type_id == False``) are returned.
    The target column is included when ``is_training`` or ``is_validation`` is
    true; otherwise only the feature columns are returned.
    """
    #Average mark of each question, matched on content_id
    with_question = df.join(question_df['question_average'],on=['content_id'],rsuffix='_question_average')

    #Statistics of each user, matched on user_id
    user_stats = user_df[['performance','user_average', 'user_count']]
    with_user = with_question.join(user_stats,on=['user_id'],rsuffix='_right')

    #Non-question rows are not predicted
    only_questions = with_user.loc[with_user['content_type_id'] == False]

    #Labelled data keeps the target next to the features
    if not (is_training or is_validation):
        return only_questions[FEATURE_COLS]
    return only_questions[FEATURE_COLS + TARGET_COL]

In [None]:
#Preview of the train data before the transformation
train.head()

In [None]:
%%time
#Transforming the train data (default flags: the target column is kept)
train = data_transform(train)

In [None]:
#Preview of the train data after the transformation
train.head()

In [None]:
%%time
#Transforming the test data (is_validation=True, so the target column is kept)
test = data_transform(test,False,True)

# Model
We use a LightGBM model, as proposed by many users.
I have not dug into parameter tuning yet.

In [None]:
#Building the final train and test sets for lightgbm
#X_* still carry row_id as first column (see FEATURE_COLS); y_* are one-column frames
X_train = train[FEATURE_COLS]
y_train = train[TARGET_COL]
X_test = test[FEATURE_COLS]
y_test = test[TARGET_COL]

In [None]:
import lightgbm as lgb
#LightGBM parameters: binary classification evaluated with AUC
#NOTE(review): 'num_iterations' here and num_boost_round in the lgb.train call both set
#the number of boosting rounds (100 in both places) - keep them in sync or drop one
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.025,
    'max_bin': 1000,
    'num_leaves': 80,
    'num_iterations' : 100
}
#.iloc[:,1:] drops row_id (first column) so that it is not used as a feature
lgb_train = lgb.Dataset(X_train.iloc[:,1:],y_train)
lgb_val = lgb.Dataset(X_test.iloc[:,1:],y_test)

In [None]:
#The train frames are no longer needed once lgb_train has been created
#(X_test and y_test are kept for the evaluation cells below)
del X_train,y_train
gc.collect()

In [None]:
#Training on lgb_train while reporting the metric on both the train and validation sets
#NOTE(review): the verbose_eval and early_stopping_rounds keyword arguments look specific to
#older LightGBM releases (newer ones expect callbacks) - confirm against the installed version
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train,lgb_val],
    verbose_eval=1,
    num_boost_round=100,
    early_stopping_rounds=10
)

In [None]:
%%time
#Predicting the validation rows (row_id dropped); the result is a one-column frame (column 0)
#indexed like X_test
predictions = pd.DataFrame(model.predict(X_test.iloc[:,1:]),index=X_test.index)

In [None]:
from sklearn.metrics import roc_auc_score
#AUC of the predictions on the validation set
roc_auc_score(y_test,predictions[0])

In [None]:
%%time
#Competition API: the loop below receives the test data from iter_test and answers with env.predict
#(presumably one batch per iteration - see the competition documentation)
env = riiideducation.make_env()
iter_test = env.iter_test()
#Flag: 0 until the first batch has been processed, 1 afterwards
iter_nb = 0

for (current_df, sample_prediction_df) in iter_test:
    if (iter_nb != 0):
        #From the second batch on: the current batch carries the answers of the previous one,
        #which are attached to prior_df and folded into the question and user statistics
        prior_df = add_answers_to_prior_df(current_df,prior_df)
        question_df = build_question_df(prior_df,question_df)
        user_df = build_user_df(prior_df,user_df,question_df)
        
    #Untransformed copy of the batch: it is the prior batch of the next iteration
    prior_df = current_df.copy()
    #Question rows with FEATURE_COLS only (no target available at prediction time)
    current_df = data_transform(current_df,False,False)
    #.iloc[:,1:] drops row_id so that the model sees the same columns as in training
    current_df['answered_correctly'] = model.predict(current_df.iloc[:,1:])
    iter_nb = 1
    #Sending the predictions of this batch
    env.predict(current_df.loc[:, ['row_id', 'answered_correctly']])