# Riiid! Answer Correctness Prediction - Intro
Team: Rohit

Author: Rohit Kumar Hansdah

Last edited: 20-Dec-2020

This model was designed to use an LSTM network, with an L1 penalty on the recurrent and input weights for regularization. Since the data is formatted as a time series, inputs are built on a per-user basis. When training, a number of samples is pulled from each user. When testing, a database is saved and updated at each group/batch, and it takes new users into account. This database is stored in the variable 'test_user_dict'.

The hyperparameters are: the number of time steps in a series (variable timesteps), the number of samples per user (variable sampleperuser), the number of neurons in the LSTM (variable lstm_neurons), the learning rate (variable eta), and the number of training epochs (variable totalepoch).

In [None]:
# Script for competition "Riiid! Answer Correctness Prediction"
# Competition summary can be found here:
# https://www.kaggle.com/c/riiid-test-answer-prediction/overview

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#Walk the input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import riiideducation

# Training Set Processing

In [None]:
#Load the first subset_size rows of the training csv file
#Every column gets an explicit dtype so the frame uses as little memory as possible
subset_size = 5 * 10**6
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=subset_size,
    dtype=train_dtypes,
)
print(train_df.dtypes)
print('From .csv loading train_df: rows, columns =', train_df.shape)

In [None]:
#Quick glimpse at the training data: display the first 20 rows
train_df.head(20)

In [None]:
#Keep only the rows whose content_type_id is 0, renumber the index,
#then drop the columns that are not used past this point
train_df = (
    train_df
    .loc[lambda frame: frame['content_type_id'] == 0]
    .reset_index(drop=True)
    .drop(columns=['row_id', 'content_type_id', 'user_answer'])
)
print('After dropping train_df: rows, columns =', train_df.shape)

In [None]:
#Define data frame preprocessor (removes nans and converts bools)
#mean_pqet is the mean of prior_question_elapsed_time over the loaded training rows;
#df_preprocess uses it as the fill value for missing elapsed times
mean_pqet = train_df['prior_question_elapsed_time'].mean()
def df_preprocess(in_df):
    """Fill missing values of ``in_df`` in place and encode the explanation flag as 0/1.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with the columns 'prior_question_elapsed_time' and
        'prior_question_had_explanation'. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Missing elapsed times are replaced by the module-level ``mean_pqet``,
    missing explanation flags count as False, and any value still missing
    in any other column is replaced by 0.
    """
    #Assign the filled column back to the frame. Calling fillna(inplace=True) on the
    #selected column works on an intermediate object, so it is not guaranteed to
    #change in_df (pandas warns about it and it has no effect under copy-on-write)
    in_df['prior_question_elapsed_time'] = in_df[
        'prior_question_elapsed_time'
    ].fillna(value=mean_pqet)
    #Missing flag -> False, then True/False -> 1/0
    in_df['prior_question_had_explanation'] = in_df[
        'prior_question_had_explanation'
    ].fillna(value=False).astype(bool).map({True:1, False:0})
    #Catch-all for any remaining missing value
    in_df.fillna(value=0, inplace=True)
    
#Apply the preprocessing to the training frame (train_df is modified in place)
df_preprocess(train_df)
print('After preprocessing train_df: rows, columns', train_df.shape)

In [None]:
#Group by user and how many rows they have in the dataset
train_by_user_id = train_df.groupby(by='user_id')
user_list = list(train_by_user_id.groups.keys())

#Count the rows of every user once and reuse the result for all three statistics.
#size() gives one number per user, so no positional [0] lookup on a labelled
#Series is needed (that kind of lookup is deprecated in recent pandas versions).
#train_df has no missing values left after df_preprocess, so size() equals the
#per-column count() that was printed before.
rows_per_user = train_by_user_id.size()
print('There are', len(user_list), 'users')
print('Average rows per user:', rows_per_user.mean())
print('Min rows of one user:', rows_per_user.min())
print('Max rows of one user:', rows_per_user.max())

#Delete all large variables not used past this point to conserve memory
del train_by_user_id, rows_per_user

In [None]:
#Group by question, and create dictionary
#Dictionary has the question's mean, standard deviation, and skew of answered_correctly
train_by_content_id = train_df.groupby(by='content_id')
#The aggregations are named by string. Passing np.mean / np.std is deprecated in
#pandas: today they are silently mapped to the pandas methods (sample standard
#deviation), but a future version will call the NumPy functions directly, which
#would change the std values.
content_id_dict = train_by_content_id.agg({'answered_correctly': ['mean', 'std', 'skew']})
content_id_dict.columns = ['content_mean', 'content_std', 'content_skew']
#Groups too small for std/skew produce NaN; store 0 instead
content_id_dict = content_id_dict.fillna(value=0)

#Delete all large variables not used past this point to conserve memory
del train_by_content_id

In [None]:
#Group by container id, and create dictionary
#Dictionary has the container's mean, standard deviation, and skew of answered_correctly
train_by_container = train_df.groupby(by='task_container_id')
#The aggregations are named by string. Passing np.mean / np.std is deprecated in
#pandas: today they are silently mapped to the pandas methods (sample standard
#deviation), but a future version will call the NumPy functions directly, which
#would change the std values.
container_dict = train_by_container.agg({'answered_correctly': ['mean', 'std', 'skew']})
container_dict.columns = ['container_mean', 'container_std', 'container_skew']
#Groups too small for std/skew produce NaN; store 0 instead
container_dict = container_dict.fillna(value=0)

#Delete all large variables not used past this point to conserve memory
del train_by_container

In [None]:
#Read the questions metadata and build the part/tags dictionary from it:
#unused columns are dropped, missing values become 0 and question_id is
#renamed so that it can be merged on content_id
questions_df = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
    .drop(columns=['bundle_id', 'correct_answer'])
    .fillna(value=0)
    .rename(columns={'question_id': 'content_id'})
)

#Each question is given exactly 5 tag slots (tag_list[k] is the column for slot k)
#tag_counts records how many questions carry each tag
tag_list = [[] for _ in range(5)]
tag_counts = {}
for thisstr in questions_df['tags']:
    temp_list = sorted(int(token) for token in str(thisstr).split(' '))
    for thistag in temp_list:
        tag_counts[thistag] = tag_counts.get(thistag, 0) + 1
    #Repeat the sorted tags until at least 5 values exist, then use the first 5
    temp_list = temp_list * math.ceil(5/len(temp_list))
    for slot, column in enumerate(tag_list):
        column.append(temp_list[slot])

for i, column in enumerate(tag_list):
    questions_df['tag_'+str(i)] = column
questions_df = questions_df.drop(columns=['tags'])

In [None]:
#Plot, for every tag in ascending order, the number of questions that carry it
ordered_tags = sorted(tag_counts)
questions_per_tag = [tag_counts[tag] for tag in ordered_tags]
plt.figure(figsize=(12,6))
plt.plot(ordered_tags, questions_per_tag)
plt.title('Tag Count')
plt.xlabel('Tag')
plt.ylabel('Count of questions with that tag')
plt.grid(True)

In [None]:
#Define function to merge all dictionaries onto dataset
def merge_all_dict(in_df):
    """Return ``in_df`` left-joined with the three lookup tables.

    The tables are the module-level ``content_id_dict`` (joined on
    'content_id'), ``container_dict`` (joined on 'task_container_id') and
    ``questions_df`` (joined on 'content_id'), applied in that order.
    Values that are missing after the joins are replaced by 0.
    ``in_df`` itself is not modified.
    """
    lookups = (
        (content_id_dict, 'content_id'),
        (container_dict, 'task_container_id'),
        (questions_df, 'content_id'),
    )
    merged = in_df
    for table, key in lookups:
        merged = merged.merge(table, how='left', on=key)
    return merged.fillna(value=0)
    
#Attach the content, container and question columns to every training row
train_df = merge_all_dict(train_df)
print('After merge, (rows, columns) =', train_df.shape)

In [None]:
#Column names that make up one time step of the input vector
#The columns that are passed through the scaler come first
scaled_labels = ['prior_question_elapsed_time', 'content_id', 'task_container_id', 'part']
tag_labels = ['tag_'+str(i) for i in range(5)]
stat_labels = [
    'content_mean', 'content_std', 'content_skew',
    'container_mean', 'container_std', 'container_skew',
]
prior_labels = ['prior_question_had_explanation', 'prior_answer_correct']
ts_labels = [*scaled_labels, *tag_labels, *stat_labels, *prior_labels]
#Number of scaled columns and number of tag columns
sindex, tindex = len(scaled_labels), len(tag_labels)

#Define input shapes: length of one time series and number of values per time step
timesteps = 100
features = len(ts_labels)

In [None]:
#Create scaler for first set of labels
#The scaler is fitted once on the scaled_labels columns of the training frame;
#build_ts_array applies it later through scaler.transform
print('Creating scaler for preprocessing:')
scaler = StandardScaler()
scaler.fit(train_df[scaled_labels].to_numpy())

In [None]:
#Define function that will get the relevant features for time series
#Will scale all features that need it
def build_ts_array(in_df):
    """Return the feature matrix of ``in_df``, one row per row of the frame.

    The columns follow ``ts_labels``. All of them except the last are read
    from ``in_df``: the first ``sindex`` columns are passed through
    ``scaler.transform`` and the following ``tindex`` tag columns are divided
    by ``len(tag_counts)``. The last column is 'answered_correctly' moved
    down by one row, with 0.5 in the first row.
    """
    tag_stop = sindex + tindex
    feature_np = in_df[ts_labels[:-1]].to_numpy()
    feature_np[:, :sindex] = scaler.transform(feature_np[:, :sindex])
    feature_np[:, sindex:tag_stop] = feature_np[:, sindex:tag_stop] / len(tag_counts)
    #Prior answer correct needs to be shifted by 1 row, so that each row only
    #sees the answer of the row before it
    prior_answer = in_df['answered_correctly'].shift(periods=1, axis=0, fill_value=0.5)
    prior_column = prior_answer.to_numpy()[:, np.newaxis]
    return np.concatenate((feature_np, prior_column), axis=1)

In [None]:
#Define function that returns samples of a given user
#Data will be front padded if the number of rows is less than timesteps
def user_sample_array(in_df, inuser, numsamples):
    """Build time-series samples from the rows of one user.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with a 'user_id' column, an 'answered_correctly' column and the
        columns read by ``build_ts_array``.
    inuser
        The user id whose rows are selected from ``in_df``.
    numsamples : int
        Number of windows the user's rows are divided into. The window ends
        are spaced ``ceil(rows / numsamples)`` rows apart, starting at the
        last row and moving backwards.

    Returns
    -------
    (x_list, y_list)
        ``x_list`` holds one array of ``timesteps`` rows per sample, padded
        with zero rows at the front when the user has fewer rows than that.
        ``y_list`` holds 'answered_correctly' of the last row of each sample.
        Both lists are empty when ``in_df`` has no row for ``inuser``.
    """
    temp_df = in_df.loc[in_df.user_id==inuser].reset_index(drop=True)
    x_list = []
    y_list = []
    total_rows = len(temp_df)
    #No rows for this user: without this guard the step below would be 0,
    #which range() rejects with a ValueError
    if total_rows == 0:
        return x_list, y_list
    step = math.ceil(total_rows / numsamples)
    #.iloc includes the first argument, but excludes the second
    #Thus range starts at len() instead of len()-1
    for i in range(total_rows, 0, -step):
        next_x = build_ts_array(temp_df.iloc[max(0, i-timesteps):i])
        next_y = temp_df['answered_correctly'].iloc[i-1]
        if (len(next_x) < timesteps):
            next_x = np.vstack((
                np.zeros((timesteps - len(next_x), features)),
                next_x
            ))
        x_list.append(next_x)
        y_list.append(next_y)
    return x_list, y_list

In [None]:
#Create the training set
x_train_list = []
y_train_list = []

#Building the set, each sample is the time series of one user
time_start = time.time()
print('Start creating training set at', time.ctime())
sampleperuser = 10
#Walk over the groupby groups so that every user only touches their own rows.
#Passing the whole train_df for every user makes user_sample_array scan all
#rows once per user (number of users x number of rows comparisons).
#Groups are visited in ascending user_id order.
for thisuser, user_rows in train_df.groupby('user_id', sort=True):
    new_x_list, new_y_list = user_sample_array(user_rows, thisuser, sampleperuser)
    x_train_list.extend(new_x_list)
    y_train_list.extend(new_y_list)

x_train = np.array(x_train_list)
y_train = np.array(y_train_list)

time_finish = time.time()
print('Finished at', time.ctime())
print('Time elapsed:', int(time_finish - time_start), 'sec')
print('Shape of x_train:', x_train.shape)
print('Shape of y_train:', y_train.shape)

#Delete all large variables not used past this point to conserve memory
del x_train_list, y_train_list

In [None]:
#Show example array for training: for the first sample, print every feature
#name followed by its values in the last 5 time steps
for column, thislabel in enumerate(ts_labels):
    last_steps = x_train[0, -5:, column]
    cells = ''.join('{:>8.4f}'.format(val) + ' ' for val in last_steps)
    print('{:>32}'.format(thislabel) + ': ' + cells)

In [None]:
#Shuffle the samples and hold out one tenth of them for validation
#The fixed random_state makes the split reproducible
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=17, shuffle=True
)
print('After split:')
print('# of training data:', len(x_train))
print('# of validation data:', len(x_val))

In [None]:
#Delete all large variables not used past this point to conserve memory
#The samples now live in x_train/x_val, so the merged training frame can go
del train_df

# Model Creation and Training

In [None]:
#Define hyperparameters
#Learning rate of the optimizer and number of training epochs
eta, totalepoch = 0.1, 30

#Number of neurons in the LSTM layer
lstm_neurons = 60

#L1 penalty factors for the LSTM input weights and recurrent weights
input_reg = lstm_reg = 1e-5

In [None]:
#Build model: masking layer -> one LSTM layer with L1-regularized weights -> one sigmoid output
from tensorflow.keras.optimizers import Adam

model = keras.models.Sequential([
    #Time steps whose values all equal mask_value are skipped; user_sample_array
    #pads short series with all-zero rows at the front
    layers.Masking(mask_value=0., input_shape=(timesteps, features)),
    layers.LSTM(
        lstm_neurons,
        kernel_regularizer=regularizers.l1(input_reg),
        recurrent_regularizer=regularizers.l1(lstm_reg),
    ),
    layers.Dense(1, activation='sigmoid'),
])

opt_adam = Adam(learning_rate=eta)
model.compile(optimizer=opt_adam, loss='mse', metrics=['accuracy'])
model.summary()

In [None]:
#Fit model to training set and report how long the training took
#verbose=0 keeps Keras silent; the per-epoch numbers are kept in history
time_start = time.time()
print('Training start at', time.ctime())
fit_options = dict(
    batch_size=1024,
    epochs=totalepoch,
    validation_data=(x_val, y_val),
    verbose=0,
)
history = model.fit(x_train, y_train, **fit_options)
time_finish = time.time()
print('Training finished at', time.ctime())
print('Time elapsed:', int(time_finish - time_start), 'sec')

In [None]:
#Plot training and validation accuracy per epoch
acc_ax = pd.DataFrame(history.history)[['accuracy','val_accuracy']].plot(figsize=(12,6))
acc_ax.grid(True)
acc_ax.set_ylim(0.6, 0.8)
acc_ax.set_title('Accuracy')
plt.show()

In [None]:
#Plot training and validation loss per epoch
loss_ax = pd.DataFrame(history.history)[['loss','val_loss']].plot(figsize=(12,6))
loss_ax.grid(True)
loss_ax.set_ylim(0.1, 0.3)
loss_ax.set_title('Loss/Error')
plt.show()

In [None]:
#Apply model on the validation set, and show ROC curve
y_val_pred = model.predict(x_val).ravel()
fpr_val, tpr_val, thresh_val = roc_curve(y_val.astype(np.uint8), y_val_pred)
auc_val = auc(fpr_val, tpr_val)

roc_fig, roc_ax = plt.subplots(figsize=(12,6))
#Dashed diagonal as a visual reference line
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_val, tpr_val, label='Val (area = {:.5f})'.format(auc_val))
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc='best')
plt.show()

In [None]:
#Show example prediction: expected answer next to the model output
#for the first 15 validation samples
header_cells = ('{:>10}'.format('Expected'), '{:>11}'.format('Prediction'))
print(*header_cells)
for row in range(15):
    expected = y_val[row]
    predicted = y_val_pred[row]
    print('{:>10.7f}'.format(expected), '{:>11.7f}'.format(predicted))

In [None]:
#Get feature input weights (relative influence)
#For every feature, average the absolute values of a slice of the first weight
#array of the model, then express each average as a percentage of the largest one
weights = model.get_weights()
first_kernel = weights[0]
input_w = []
for row in range(len(ts_labels)):
    #NOTE(review): Keras documents the LSTM kernel as the four gate kernels placed
    #side by side, so a stride of 4 would pick columns from every gate rather than
    #from a single one - confirm this is the intended selection
    picked = np.abs(first_kernel[row, ::4])
    input_w.append(np.mean(picked))
scaled_w = 100 * (np.asarray(input_w) / max(input_w))

print('{:>30}'.format('FEATURE'), '{:>7}'.format('WEIGHT'), '{:>8}'.format('RELATIVE'))
for thislabel, val, relative in zip(ts_labels, input_w, scaled_w):
    print(
        '{:>30}'.format(thislabel),
        '{:>7.4f}'.format(val),
        '{:>8}'.format(relative.astype(int))
    )

In [None]:
#Delete all large variables not used past this point to conserve memory
#Only the fitted model is needed for the prediction phase
del x_train, y_train, x_val, y_val

# Model Predictions

In [None]:
# The environment can only be created once, per Riiid rules
# If you need to rerun the script, first kill the session/kernel
env = riiideducation.make_env()

#Load test data. Each batch of tests is along the principal axis, and must be followed in a strict order (to emulate the passage of time).
#An iteration of test must call 'env.predict()' else there will be an error
iter_test = env.iter_test()

#Apply model, and place results in prediction_df with the 'env.predict()' function, such as below
#>>env.predict(prediction_df)
#After the iter_test tuple is exhausted, the environment will output the final prediction in submission.csv

#test_user_dict is the per-user history: user_id -> dict holding
#  'rows': list of stored row ids, oldest first
#  row_id: list with the values of that stored row
#At most `timesteps` rows are kept per user
test_user_dict = {}
#User ids, row ids and merged frame of the previous group; they are combined
#with the answers that arrive together with the next group
old_user_list = []
old_row_list = []
old_test_df = pd.DataFrame({'empty': [0]})
for (test_df, prediction_df) in iter_test:
    
    #Preprocessing
    test_df = test_df.reset_index(drop=True)
    #Get answers from previous group
    #Only the first row of the group is read for this value
    #NOTE(review): eval() executes whatever text this cell contains, and the value comes
    #from the competition API. If it is always a plain list literal, ast.literal_eval
    #would parse it without executing code - confirm before changing
    pgac_list = eval(str(test_df['prior_group_answers_correct'].iloc[0]))
    #Drop columns
    test_df.drop(['prior_group_responses', 'prior_group_answers_correct'], axis=1, inplace=True)
    df_preprocess(test_df)
    #Get user list and row list for this group
    new_user_list = test_df.user_id.tolist()
    new_row_list = test_df.row_id.tolist()
    #Merge dictionaries
    test_df = merge_all_dict(test_df)
    #Create answer column
    #0.5 is a placeholder, the true answers of this group are not known yet
    test_df['answered_correctly'] = [0.5]*len(test_df)
    test_df.fillna(value=0, inplace=True)
    
    #Update answered_correctly in previous group, then insert to dictionary
    #Skipped when the answer list is empty (presumably the case for the very first group)
    if(pgac_list):
        #Pair every row id of the previous group with its answer; the merge appends
        #answered_correctly as the last column of old_test_df
        pgac_dict = pd.DataFrame.from_dict({'row_id': old_row_list, 'answered_correctly': pgac_list})
        old_test_df = old_test_df.merge(pgac_dict, how='left', on='row_id')
        for thisindex, (thisrow, thisuser) in enumerate(zip(old_row_list, old_user_list)):
            #Only rows with content_type_id == 0 are stored in the history
            if(old_test_df.iloc[thisindex]['content_type_id']==0):
                if(not(thisuser in test_user_dict)):
                    test_user_dict[thisuser] = {'rows': []}
                test_user_dict[thisuser][thisrow] = old_test_df.iloc[thisindex].tolist()
                test_user_dict[thisuser]['rows'].append(thisrow)
                #Keep at most `timesteps` rows per user: forget the oldest one
                if(len(test_user_dict[thisuser]['rows']) > timesteps):
                    row_remove = test_user_dict[thisuser]['rows'].pop(0)
                    test_user_dict[thisuser].pop(row_remove)    
    
    #Predict for each user in group
    #One input sample is built per row of the group
    x_test_list = []
    for thisindex, (thisrow, thisuser) in enumerate(zip(new_row_list, new_user_list)):
        if(thisuser in test_user_dict):
            #Shallow copy, so that removing 'rows' does not change the stored history
            temp_dict = dict(test_user_dict[thisuser])
            temp_dict.pop('rows')
            #Rebuild the stored rows as a frame; this relies on the stored value lists
            #having the same column order as the current test_df
            database_df = pd.DataFrame.from_dict(temp_dict, orient='index', columns=test_df.columns)
            #History first, current row last
            user_df = pd.concat([database_df, test_df.loc[test_df.row_id==thisrow]])
        else:
            #No history for this user yet: the sample consists of the current row only
            user_df = test_df.loc[test_df.row_id==thisrow]
        #With numsamples=1 exactly one sample is returned, ending at the last row of user_df
        new_test_x, new_test_y = user_sample_array(user_df, thisuser, 1)
        x_test_list.append(new_test_x[0])
        
    x_test = np.array(x_test_list)
    pred_list = model.predict(x_test)
    update_df = pd.DataFrame({'row_id': new_row_list, 'prediction': pred_list.ravel()})
    #Missing predictions fall back to 0.5
    update_df.fillna(0.5, inplace=True)

    #Save new -> old
    #answered_correctly is dropped here because the real answers are merged in by the next iteration
    old_test_df = test_df.drop(['answered_correctly'], axis=1)
    old_user_list = list(new_user_list)
    old_row_list = list(new_row_list)
    
    #Call env.predict()
    #Replace the answered_correctly column of prediction_df with the model output, matched on row_id
    prediction_df = prediction_df.merge(update_df, how='left', on='row_id')
    prediction_df.drop(['answered_correctly'], axis=1, inplace=True)
    prediction_df.rename(columns={'prediction': 'answered_correctly'}, inplace=True)
    prediction_df = prediction_df.astype({'answered_correctly': 'float64'})
    #Rows without a matching prediction get 0.5
    prediction_df.fillna(0.5, inplace=True)
    env.predict(prediction_df)