In [None]:
#Imports
import numpy as np       #Numpy for numerical computations
import pandas as pd      #Pandas for data manipulations
import riiideducation    #Package for the competition API
import seaborn as sns    #Seaborn for data vizualisation
import os
import gc

#List every file available under the competition input directory (nothing is loaded here)
for dirname, _, filenames in os.walk('/kaggle/input/riiid-test-answer-prediction'):
    for filename in filenames:
        #Print the full path so it can be copied into the read_* calls
        print(os.path.join(dirname, filename))

In [None]:
#Reading data
#Full training set, stored as a gzip-compressed pickle
#NOTE(review): unpickling executes arbitrary code - only load pickles from a trusted dataset
full_train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
#Question and lecture tables shipped with the competition
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

# A first glance at the data

## Train set

The columns in the train file are described as:
* row_id: (int64) ID code for the row.
* timestamp: (int64) the time in milliseconds between this user interaction and the first event completion from that user.
* user_id: (int32) ID code for the user.
* content_id: (int16) ID code for the user interaction
* content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
* task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.
* user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.
* answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.
* prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.
* prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback

In [None]:
#Pre-built cross-validation split number 1 (train part and validation part)
train = pd.read_pickle('../input/riidcv/cv1_train.pickle')
test = pd.read_pickle('../input/riidcv/cv1_valid.pickle')
#Display the shapes of both parts
train.shape,test.shape

In [None]:
#Answers to question rows of the full training set, grouped by question (filter computed once)
question_groups = full_train.loc[full_train['content_type_id'] == 0].groupby(['content_id'])['answered_correctly']
#Dictionary for questions average (share of correct answers per question)
question_average = question_groups.mean().to_frame('question_average')
#Dictionary for questions count (number of times each question was answered)
question_count = question_groups.size().to_frame('question_count')
#Joining average and count
question_df = question_average.join(question_count)
#Computing sum as product of average and count
question_df['question_sum'] = question_df['question_average'] * question_df['question_count']
#Outer join with the questions table so that questions never seen in training also get a row.
#NOTE(review): the join is index-to-index, i.e. it assumes the row position in questions.csv
#equals the question id - confirm against the file
question_df = question_df.join(questions,how='outer')[['question_average','question_count','question_sum']]
#Filling with default value. Assigning the result back (instead of a chained
#df[col].fillna(..., inplace=True)) keeps working when pandas copy-on-write is enabled
question_df = question_df.fillna(0)
#Cleaning for memory management
del question_groups,question_average,question_count
gc.collect()

In [None]:
#Attach the per-question statistics (question_average, question_count, question_sum) to every
#train row, matching train['content_id'] against the index of question_df.
#NOTE(review): the same join is executed again in a later cell of this notebook
train = train.join(question_df,on=['content_id'], rsuffix='_question')


In [None]:
#Cleaning full_train to keep only train and test set (as it is too big)
#From here on full_train is no longer defined; question_df already holds what was derived from it
del full_train
gc.collect()

In [None]:
#Joining the average mark for the question to the train data.
#Guarded: when the statistics are already present (this join also appears in an earlier cell, and
#the cell may be re-run) a second join would only append redundant '_question'-suffixed copies
#of every statistic column and waste memory
if 'question_average' not in train.columns:
    train = train.join(question_df,on=['content_id'], rsuffix='_question')

In [None]:
def reduce_memory_usage(train_data):
    """Turn ``prior_question_had_explanation`` into a plain boolean column.

    Missing values are treated as False. The frame is modified in place and the
    same object is returned. No other column is converted.
    """
    column = 'prior_question_had_explanation'
    without_nulls = train_data[column].fillna(False)
    train_data[column] = without_nulls.astype('bool')
    return train_data

What we need is :

1) A dictionary for the users with:
    a) The number of questions answered
    b) The number of correct answers
    c) The sum of the average correctness of answers for the questions answered
    
2) A dictionary for the questions with:
    a) The number of times the question has been asked
    b) The average percentage of correct answers for the question

Then we can compute the user performance as the user's average correctness minus the average correctness (over all users) of the questions she/he was asked.

We can update the user performance after a batch of questions in the following way :

1) For each question, update the number of times the question was asked and the average percentage of correct answers for the question.

2) For each user, add the average correctness of each question to the sum of average correctness, add the number of correct answers and the number of questions.

Recompute the user performance: the user's average correctness minus the average correctness of all users on the same questions.

In [None]:
#Global fallback values, computed on question rows only:
#mean of the per-user accuracies and mean of the per-question accuracies
is_question_row = train['content_type_id'] == False
mean_user = train.loc[is_question_row].groupby(['user_id'])['answered_correctly'].mean().mean()
mean_question = train.loc[is_question_row].groupby(['content_id'])['answered_correctly'].mean().mean()
del is_question_row

In [None]:
#Display the mean of the per-user accuracies
mean_user

In [None]:
#Display the mean of the per-question accuracies
mean_question

In [None]:
#For each question row, the question_average of the same user's previous question row
#(lecture rows are excluded from the computation and therefore stay NaN).
#NOTE(review): the next cell recomputes this exact column before using it
train['user_shift_question'] = train.loc[train['content_type_id'] == False].groupby(['user_id'])['question_average'].shift()

In [None]:
#Average correctness (question_average) of the questions each user answered BEFORE the current row
is_question = train['content_type_id'] == False
#Shift by one inside each user so that a row only sees the questions that precede it
train['user_shift_question'] = train.loc[is_question].groupby(['user_id'])['question_average'].shift()
past_questions = train.loc[is_question].groupby(['user_id'])['user_shift_question']
#On a user's k-th question row (k = 0, 1, ...) the cumulative sum covers exactly k earlier
#questions and cumcount (0-based) equals k, so the mean is cumsum / cumcount.
#The previous "+ 1" in the denominator under-estimated the average and disagreed with both the
#user-average computation and the incremental update in build_user_df.
#NOTE(review): models fitted on the old feature values should be retrained
train.loc[is_question,'average_past_questions'] = past_questions.cumsum() / past_questions.cumcount()
train.drop(columns=['user_shift_question'],inplace=True)
#A user's first question (NaN / 0) and lecture rows fall back to the global question mean.
#Assigned back instead of a chained inplace fillna, which is a no-op under pandas copy-on-write
train['average_past_questions'] = train['average_past_questions'].fillna(mean_question)
del is_question,past_questions

In [None]:
#Computing the average for each user depending only on past events
is_question = train['content_type_id'] == False
#Shift by one inside each user so the current answer never leaks into its own feature
train['user_shift'] = train.loc[is_question].groupby(['user_id'])['answered_correctly'].shift()
past_answers = train.loc[is_question].groupby(['user_id'])['user_shift']
#Running number of correct past answers divided by the number of past answers (0-based cumcount)
train.loc[is_question,'answered_correctly_user_average'] = past_answers.cumsum() / past_answers.cumcount()
#A user's first question and lecture rows have no history: use the global user mean.
#Assigned back instead of a chained inplace fillna, which is a no-op under pandas copy-on-write
train['answered_correctly_user_average'] = train['answered_correctly_user_average'].fillna(mean_user)
train.drop(columns=['user_shift'], inplace=True)
del is_question,past_answers

In [None]:
#Question rows of the train set, grouped per user
question_rows_by_user = train.loc[train['content_type_id'] == 0].groupby(['user_id'])
#Dictionary for user average: last available running average of each user
user_average = question_rows_by_user['answered_correctly_user_average'].last().to_frame('user_average')
#Dictionary for user count: number of question rows of the user minus one
user_count = (question_rows_by_user.size() - 1).to_frame('user_count')
del question_rows_by_user

In [None]:
#Performance of a user before the current question: accuracy on past questions
#(answered_correctly_user_average) minus the average correctness of the questions already
#answered (average_past_questions). Both inputs only use earlier events.
#The unused per-user mean over all columns (tmp) was dropped: it was never read, cost a full
#aggregation, and fails on non-numeric columns with recent pandas versions
train['performance_before'] = train['answered_correctly_user_average'] - train['average_past_questions']
#Keep the latest value of each user as her/his current performance
user_performance = pd.DataFrame(train.loc[train['content_type_id'] == 0].groupby(['user_id'])['performance_before'].last()).rename(columns={'performance_before':'performance'})

In [None]:
#Force a garbage collection pass after the intermediate objects were deleted
gc.collect()

In [None]:
#One row per user: performance, user_average, user_count ...
user_df = user_performance.join(user_average).join(user_count)
#... plus the number of correct answers, rebuilt as average * count
user_df = user_df.assign(user_sum=user_df['user_average'] * user_df['user_count'])

A student had a lecture for the given question if the tag of the lecture is one of the tags of the question

tag[lect] in tags[question]

In [None]:
def question_average_sum_by_user(df,question_df):
    """Sum, per user, the ``question_average`` of the questions found in ``df``.

    Parameters
    ----------
    df : DataFrame with the columns ``user_id``, ``content_id`` and ``content_type_id``.
    question_df : DataFrame indexed by question id with a ``question_average`` column.

    Returns
    -------
    DataFrame indexed by user id (sorted, unnamed index) with a single column
    ``question_average_sum``. Users that only have lecture rows get 0.0.

    Notes
    -----
    Vectorised replacement of the former row-by-row loop. Iterating over
    ``groupby(['user_id'])`` yields 1-tuples as keys in recent pandas versions, which
    produced an index that could never be joined back on the user id.
    A content id missing from ``question_df`` contributes nothing to the sum
    (the loop version raised a KeyError).
    """
    is_question = df['content_type_id'] == False
    #Look up the average of every row's content id, then neutralise lecture rows
    averages = df['content_id'].map(question_df['question_average']).where(is_question, 0.0)
    sums = averages.astype('float64').groupby(df['user_id']).sum()
    return sums.rename_axis(None).to_frame('question_average_sum')

In [None]:
def add_answers_to_prior_df(current_df,prior_df):
    """Return a copy of ``prior_df`` completed with the answers revealed by ``current_df``.

    The first row of ``current_df`` carries, in ``prior_group_answers_correct``, the textual
    list of correctness flags of the previous batch. When that list has exactly one entry per
    row of ``prior_df`` it is stored in a new column ``answered_correctly_response``;
    otherwise the copy is returned without that column. ``prior_df`` itself is never modified,
    and an empty ``prior_df`` is returned (copied) untouched.

    Raises ValueError / SyntaxError when the text is not a plain Python literal.
    """
    import ast

    prior_df_ = prior_df.copy()
    if (prior_df.shape[0] > 0):
        #SECURITY: this text comes from outside the notebook. It used to be passed to eval(),
        #which executes arbitrary expressions; literal_eval only accepts plain literals
        val = ast.literal_eval(current_df.iloc[0]['prior_group_answers_correct'])
        if (len(val) == prior_df.shape[0]):
            prior_df_['answered_correctly_response'] = val
    return prior_df_

In [None]:
def build_question_df(prior_df,question_df):
    """Fold the answers of the previous batch into the per-question statistics.

    Parameters
    ----------
    prior_df : previous batch with the columns ``content_id``, ``content_type_id`` and
        ``answered_correctly_response``. An empty frame means "nothing to add".
    question_df : DataFrame indexed by question id with the columns ``question_average``,
        ``question_count`` and ``question_sum``.

    Returns
    -------
    ``question_df`` itself when ``prior_df`` is empty, otherwise a new DataFrame with the same
    three columns where sum and count include the batch and average = sum / count.
    Questions that have never been answered keep an average of 0 (instead of the NaN that
    0 / 0 would produce), consistent with the default used for missing values.
    """
    if (prior_df.shape[0] == 0):
        return question_df

    #Question rows of the batch, grouped by question
    prior_questions = prior_df.loc[prior_df['content_type_id'] == 0].groupby(['content_id'])
    #Correct answers and number of answers per question in the batch
    question_sum_prior = prior_questions['answered_correctly_response'].sum().to_frame('question_sum')
    question_count_prior = prior_questions.size().to_frame('question_count')

    #Batch columns clash with the running ones and therefore receive the '_previous' suffix
    updated = question_df.join(question_sum_prior,rsuffix='_previous').join(question_count_prior,rsuffix='_previous')

    #Filling null values. Assigned back instead of a chained inplace fillna on a column, which
    #is a no-op under pandas copy-on-write
    fill_cols = ['question_average','question_count','question_sum','question_sum_previous','question_count_previous']
    updated[fill_cols] = updated[fill_cols].fillna(0)

    #Updating values
    updated['question_sum'] = updated['question_sum'] + updated['question_sum_previous']
    updated['question_count'] = updated['question_count'] + updated['question_count_previous']
    #0 / 0 for never-answered questions is mapped back to the default 0
    updated['question_average'] = (updated['question_sum'] / updated['question_count']).fillna(0)

    return updated.drop(columns=['question_count_previous','question_sum_previous'])

In [None]:
def build_user_df(prior_df,user_df,question_df):
    """Fold the answers of the previous batch into the per-user statistics.

    Parameters
    ----------
    prior_df : previous batch with the columns ``user_id``, ``content_id``,
        ``content_type_id`` and ``answered_correctly_response``. Empty means "nothing to add".
    user_df : DataFrame indexed by user id with the columns ``performance``,
        ``user_average``, ``user_count`` and ``user_sum``.
    question_df : per-question statistics, forwarded to ``question_average_sum_by_user``.

    Returns
    -------
    ``user_df`` itself when ``prior_df`` is empty, otherwise a new DataFrame with the same four
    columns, extended with the users first seen in the batch. A user whose total count is
    still 0 gets NaN averages (division by zero); they are reset to 0 on the next call.
    """
    if (prior_df.shape[0] == 0):
        return user_df

    #Question rows of the batch, grouped by user
    prior_questions = prior_df.loc[prior_df['content_type_id'] == 0].groupby(['user_id'])
    #Correct answers and number of answers per user in the batch
    user_sum_prior = prior_questions['answered_correctly_response'].sum().to_frame('user_sum')
    user_count_prior = prior_questions.size().to_frame('user_count')

    #Outer join so that new users get a row; batch columns receive the '_previous' suffix
    user_df = user_df.join(user_sum_prior,how='outer',rsuffix='_previous').join(user_count_prior,rsuffix='_previous')

    #Filling null values. Assigned back instead of a chained inplace fillna on a column, which
    #is a no-op under pandas copy-on-write
    fill_cols = ['performance','user_average','user_count','user_sum','user_count_previous','user_sum_previous']
    user_df[fill_cols] = user_df[fill_cols].fillna(0)

    #Sum of the average correctness of the questions each user had in the batch
    user_df = user_df.join(question_average_sum_by_user(prior_df,question_df))
    user_df['question_average_sum'] = user_df['question_average_sum'].fillna(0)

    #user_sum - performance * user_count equals user_count * (user_average - performance),
    #i.e. the sum of question averages accumulated so far; the batch sum is added and the
    #total is divided by the new number of answered questions
    user_df['user_mean_performance'] = (user_df['user_sum'] - user_df['performance'] * user_df['user_count'] + user_df['question_average_sum']) / (user_df['user_count'] + user_df['user_count_previous'])
    user_df['user_sum'] = user_df['user_sum'] + user_df['user_sum_previous']
    user_df['user_count'] = user_df['user_count'] + user_df['user_count_previous']
    user_df['user_average'] = user_df['user_sum'] / user_df['user_count']
    user_df['performance'] = user_df['user_average'] - user_df['user_mean_performance']

    return user_df.drop(columns=['user_sum_previous','user_count_previous','question_average_sum','user_mean_performance'])

In [None]:
#Main code
#Start with empty frames: each helper below returns its input unchanged (or an empty copy)
#when prior_df has no rows, so this run leaves question_df and user_df as they are
prior_df = pd.DataFrame()
current_df = pd.DataFrame()
prior_df = add_answers_to_prior_df(current_df,prior_df)
question_df = build_question_df(prior_df,question_df)
user_df = build_user_df(prior_df,user_df,question_df)

In [None]:
#Column holding the label
TARGET_COL = ['answered_correctly']
#Columns returned by data_transform; row_id comes first and is sliced off (iloc[:,1:])
#before the frame is handed to the models
FEATURE_COLS = ['row_id', 'performance','user_count', 'question_average']

In [None]:
def data_transform(df, is_training = True, is_validation = True):
    """Attach the question and user statistics to ``df`` and keep the model columns.

    Reads the notebook-level ``question_df``, ``user_df``, ``FEATURE_COLS`` and ``TARGET_COL``.
    Lecture rows are dropped. The target column is included when ``is_training`` or
    ``is_validation`` is true; otherwise only the feature columns are returned.
    """
    #Average mark of each question, looked up through content_id
    enriched = df.join(question_df['question_average'],on=['content_id'],rsuffix='_question_average')
    #Current statistics of each user, looked up through user_id
    enriched = enriched.join(user_df[['performance','user_average', 'user_count']],on=['user_id'],rsuffix='_right')
    #Flag users with fewer than 20 recorded answers
    enriched['is_beginning'] = enriched['user_count'] < 20

    #Only question rows are kept
    question_rows = enriched.loc[enriched['content_type_id'] == False]

    wanted = FEATURE_COLS + TARGET_COL if (is_training or is_validation) else FEATURE_COLS
    return question_rows[wanted]

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

#One saved LightGBM booster per input dataset, loaded from its text dump
#(presumably one per cross-validation fold, given the cvN_valid files they are scored on below)
model1 = lgb.Booster(model_file='../input/cvlgbm/model.txt')
model2 = lgb.Booster(model_file='../input/cv2lgb/model.txt')
model3 = lgb.Booster(model_file='../input/cv3riid/model.txt')
model4 = lgb.Booster(model_file='../input/riidcv4/model.txt')


In [None]:
#Split 1: AUC of model1 on its validation part (row_id, the first feature column, is not fed to the model)
test = data_transform(pd.read_pickle('../input/riidcv/cv1_valid.pickle'), False)
X_test, y_test = test[FEATURE_COLS], test[TARGET_COL]
fold_scores = model1.predict(X_test.iloc[:, 1:])
predictions = pd.DataFrame(fold_scores, index=X_test.index)
roc_auc_score(y_test, predictions[0])

In [None]:
#Split 2: AUC of model2 on its validation part (row_id, the first feature column, is not fed to the model)
test = data_transform(pd.read_pickle('../input/riidcv/cv2_valid.pickle'), False)
X_test, y_test = test[FEATURE_COLS], test[TARGET_COL]
fold_scores = model2.predict(X_test.iloc[:, 1:])
predictions = pd.DataFrame(fold_scores, index=X_test.index)
roc_auc_score(y_test, predictions[0])


In [None]:
#Split 3: AUC of model3 on its validation part (row_id, the first feature column, is not fed to the model)
test = data_transform(pd.read_pickle('../input/riidcv/cv3_valid.pickle'), False)
X_test, y_test = test[FEATURE_COLS], test[TARGET_COL]
fold_scores = model3.predict(X_test.iloc[:, 1:])
predictions = pd.DataFrame(fold_scores, index=X_test.index)
roc_auc_score(y_test, predictions[0])


In [None]:
#Split 4: AUC of model4 on its validation part (row_id, the first feature column, is not fed to the model)
test = data_transform(pd.read_pickle('../input/riidcv/cv4_valid.pickle'), False)
X_test, y_test = test[FEATURE_COLS], test[TARGET_COL]
fold_scores = model4.predict(X_test.iloc[:, 1:])
predictions = pd.DataFrame(fold_scores, index=X_test.index)
roc_auc_score(y_test, predictions[0])


In [None]:
#Competition API: batches are served one at a time and must be answered in order
env = riiideducation.make_env()
iter_test = env.iter_test()
iter_nb = 0

for (current_df, sample_prediction_df) in iter_test:
    #From the second batch on, the new batch reveals the answers of the previous one:
    #fold them into the question and user statistics before predicting
    if (iter_nb != 0):
        prior_df = add_answers_to_prior_df(current_df,prior_df)
        question_df = build_question_df(prior_df,question_df)
        user_df = build_user_df(prior_df,user_df,question_df)

    #Keep the raw batch for the next iteration, then build the feature columns only
    prior_df = current_df.copy()
    current_df = data_transform(current_df,False,False)

    #Every model receives the same features (all columns except the leading row_id)
    features = current_df.iloc[:,1:]
    fold_predictions = [model.predict(features) for model in (model3, model4, model1, model2)]
    #Plain average of the four models, summed in the order model3, model4, model1, model2
    current_df['answered_correctly'] = 0.25*(fold_predictions[0]+fold_predictions[1]+fold_predictions[2]+fold_predictions[3])

    iter_nb = 1
    env.predict(current_df.loc[:, ['row_id', 'answered_correctly']])