References:
#keras test example
https://www.kaggle.com/code1110/riiid-keras-logisitc-regression-for-analytics


#pytorch baseline
https://www.kaggle.com/maunish/riiid-super-cool-eda-and-pytorch-baseline

In [None]:
# Remove any files a previous run left in the skorch checkpoint directory
# (/kaggle/working/skorch_chk); `-f` keeps this quiet when nothing is there.
!rm -rf  /kaggle/working/skorch_chk/*

In [None]:
# Install skorch 0.9.0 from a wheel file shipped in an attached Kaggle input dataset.
!pip install /kaggle/input/skorch090/skorch-0.9.0-py3-none-any.whl

In [None]:
import os
import sys
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split

from skorch import NeuralNetClassifier
from skorch.helper import predefined_split
from skorch.callbacks import ProgressBar, EarlyStopping, Checkpoint
from skorch.dataset import Dataset
from skorch.callbacks import LRScheduler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim
import torch.nn.functional as F

from glob import glob

# custom
import riiideducation

#supress warnings
import warnings
warnings.filterwarnings("ignore")

# Use CUDA when PyTorch reports an available GPU, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Mini-batch size; reused further down as CFG.batch_size.
GENERAL_BATCH_SIZE = 2400
# Number of consecutive rows that make up one input window for the LSTM.
TIME_STEP_SIZE = 19

In [None]:
# Id columns that the model feeds through embedding layers.
CATEGORAL_COLUMNS = ['content_id', 'task_container_id']
# Numeric columns that are concatenated with the embeddings as LSTM input.
CONTINUOUS_COLUMNS = ['prior_question_had_explanation','prior_question_elapsed_time']

In [None]:
# Location of the competition CSV files (only used by the commented-out CSV loader below).
folder_path = '../input/riiid-test-answer-prediction/'
train_csv = folder_path + 'train.csv'
# test_csv =  folder_path + 'example_test.csv'
# lec_csv  =  folder_path + 'lectures.csv'
# que_csv =   folder_path + 'questions.csv'
# sample_csv =    folder_path + 'example_sample_submission.csv'

# Columns kept from the training data; the keys are also used to select columns
# from the loaded frame. The dtypes apply only to the commented-out CSV loader.
dtype = {
    'content_id': 'int16',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_had_explanation': 'int8',
    'prior_question_elapsed_time': 'float32'
}

# Same as `dtype` without the target column; used to select and cast test-time features.
test_dtype = {
    'content_id': 'int16',
    'task_container_id': 'int16',
    'prior_question_had_explanation': 'int8',
    'prior_question_elapsed_time': 'float32'
}

# train_data = pd.read_csv(train_csv,
#                          usecols = dtype.keys(),
#                          dtype=dtype,
#                          low_memory=False,
#                          nrows=10**4)

# Load the training data from a parquet copy stored in a separate Kaggle dataset.
train_data = pd.read_parquet("../input/riiid-train-data-multiple-formats/riiid_train.parquet")
print("Train size:", train_data.shape)

# test_data = pd.read_csv(test_csv)
# lec_data = pd.read_csv(lec_csv)
# que_data = pd.read_csv(que_csv)
# sample = pd.read_csv(sample_csv)

In [None]:
train_data.head()

**Basic data fixing**

In [None]:
# Fill value for missing 'prior_question_elapsed_time': the floored mean of that
# column in the loaded training data (an earlier hard-coded 21000.0 is kept below
# for reference).
#train_data["prior_question_elapsed_time"].mean()

#TIME_MEAN_FOR_ELAPSED = 21000.0
TIME_MEAN_FOR_ELAPSED = np.floor(train_data["prior_question_elapsed_time"].mean())
TIME_MEAN_FOR_ELAPSED

In [None]:
def fill_missing_values(data):
    """Impute the two continuous feature columns of `data` and return it.

    The frame is modified in place (the same object is returned):
    - 'prior_question_had_explanation': missing values become 0 and the column
      is cast to int8.
    - 'prior_question_elapsed_time': missing values become TIME_MEAN_FOR_ELAPSED.
    """
    explanation_flag = data['prior_question_had_explanation'].fillna(0)
    data['prior_question_had_explanation'] = explanation_flag.astype(np.int8)

    elapsed_time = data["prior_question_elapsed_time"]
    data["prior_question_elapsed_time"] = elapsed_time.fillna(TIME_MEAN_FOR_ELAPSED)

    return data

In [None]:
def change_column_type_to_categorical(data):
    """Cast every column named in CATEGORAL_COLUMNS to pandas 'category' dtype.

    The frame is modified in place and the same object is returned.
    """
    as_category = data[CATEGORAL_COLUMNS].astype('category')
    data[CATEGORAL_COLUMNS] = as_category
    return data

In [None]:
# Mapper that rescales only 'prior_question_elapsed_time' into [-1, 1] with a MinMaxScaler.
datafram_mapper = DataFrameMapper([(train_data[['prior_question_elapsed_time']].columns, MinMaxScaler(feature_range=(-1, 1)))])

def normalize_data(mapper, data, is_train=True):
    """Replace 'prior_question_elapsed_time' with its scaled version.

    Args:
        mapper: object exposing `fit(frame)` (returning the fitted mapper) and
            `transform(frame)` (returning the scaled values for the column).
        data: DataFrame containing 'prior_question_elapsed_time'.
        is_train: when True the mapper is (re)fitted on `data` before
            transforming; pass False to reuse an already fitted mapper.

    Returns:
        A new DataFrame with the same index, in which the scaled column has
        been moved to the last position. `data` itself is not modified.
    """
    elapsed_column = 'prior_question_elapsed_time'

    if is_train:
        mapper = mapper.fit(data.copy())

    # The mapper works on copies so the caller's frame is never touched.
    scaled_frame = pd.DataFrame(
        mapper.transform(data.copy()),
        index=data.index,
        columns=data[[elapsed_column]].columns,
    )
    other_columns = data.drop(elapsed_column, axis=1)
    data = pd.concat([other_columns, scaled_frame], axis=1)

    # Release the intermediates eagerly; the frames involved can be large.
    del other_columns
    del scaled_frame
    gc.collect()

    return data

In [None]:
# Training preprocessing pipeline: keep only the columns listed in `dtype`,
# impute missing values, cast the id columns to 'category', then fit and apply
# the elapsed-time scaler (normalize_data fits because is_train defaults to True).
train_data = train_data[dtype.keys()]
train_data = fill_missing_values(train_data)
train_data = change_column_type_to_categorical(train_data)
train_data = normalize_data(datafram_mapper, train_data)

In [None]:
# Last TIME_STEP_SIZE - 1 preprocessed training rows (feature columns only).
# They are prepended to the first test batch in the prediction loop so that
# each of its rows has a full-length window.
last_train_rows_for_pred = train_data[test_dtype.keys()].tail(TIME_STEP_SIZE - 1)

In [None]:
train_data.head()

**Get categorical column embedding count**

In [None]:
# embedded_cols = {n: len(col.cat.categories) for n,col in train_data[CATEGORAL_COLUMNS].items()}
# embedded_cols

**Determining size of embedding**

In [None]:
#t = train_data['task_container_id'].astype(pd.CategoricalDtype(ordered=True))
#t = train_data['task_container_id'].astype(pd.CategoricalDtype(ordered=True))

In [None]:
#t.max()

In [None]:
# embedded_cols['content_id'] = 32737
# embedded_cols['task_container_id'] = 10000

In [None]:
# Number of rows in each categorical column's embedding table (hard-coded).
# NOTE(review): presumably max id + 1 for each column — confirm against the full data.
embedded_cols = {'content_id': 32737, 'task_container_id': 10000}

In [None]:
embedded_cols

In [None]:
# (n_categories, embedding_dim) per categorical column, in embedded_cols order.
# The dimension is half the category count (rounded up), capped at 50; the cap
# of 50 follows fastai. Other caps may be worth trying.
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
embedding_sizes

In [None]:
# Column index of the preprocessed training frame.
# NOTE(review): not referenced anywhere else in the visible notebook.
all_actual_features = train_data.columns

**Dataset and data loader**

In [None]:
class Riid_RNN_Dataset(Dataset):
    """Sliding-window dataset over consecutive rows of a preprocessed frame.

    Item `i` is `((X1, X2), y)` where
    - X1: int64 array (TIME_STEP_SIZE, len(CATEGORAL_COLUMNS)) of rows i .. i+T-1,
    - X2: float32 array (TIME_STEP_SIZE, len(CONTINUOUS_COLUMNS)) of the same rows,
    - y:  float32 array of shape (1,) holding 'answered_correctly' of the LAST
          row of the window.

    The label is aligned with the last row of the window because that is how
    the prediction loop uses the model: each test row is scored from a window
    that ends at (and includes) that row. Previously the label was taken from
    the row *after* the window, so training and inference disagreed by one step.
    """

    def __init__(self, data):
        """Extract the feature/target arrays from `data` (a DataFrame that has
        CATEGORAL_COLUMNS, CONTINUOUS_COLUMNS and 'answered_correctly')."""
        self.categorical_data = data.loc[:, CATEGORAL_COLUMNS].to_numpy(dtype=np.int64)
        self.continuous_data = data.loc[:, CONTINUOUS_COLUMNS].to_numpy(dtype=np.float32)
        self.targets = data['answered_correctly'].to_numpy(dtype=np.float32)

        # One window per row that has TIME_STEP_SIZE - 1 predecessors; clamp at
        # zero so a frame shorter than one window yields an empty dataset
        # instead of a negative length.
        self.data_length = max(0, len(self.targets) - TIME_STEP_SIZE + 1)

    def __getitem__(self, index):
        """Return the window starting at `index` and the label of its last row."""
        window_end = index + TIME_STEP_SIZE
        X1 = self.categorical_data[index:window_end]
        X2 = self.continuous_data[index:window_end]
        # np.newaxis keeps the target as shape (1,), matching the model's single output.
        y = self.targets[window_end - 1, np.newaxis]

        return (X1, X2), y

    def __len__(self):
        """Number of complete windows available."""
        return self.data_length

In [None]:
# 80/20 split without shuffling: the first 80% of rows train the model and the
# last 20% validate it, so every window is built from rows that were adjacent
# in the original frame.
split_train, split_validation = train_test_split(train_data, test_size=0.20, shuffle=False)

# Wrap each part in the sliding-window dataset defined above.
train_dataset = Riid_RNN_Dataset(split_train)
validation_dataset = Riid_RNN_Dataset(split_validation)

# train_dataloader = DataLoader(train_dataset, batch_size=GENERAL_BATCH_SIZE, shuffle=False, num_workers=0)
# validation_dataloader = DataLoader(validation_dataset, batch_size=GENERAL_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# test code
# dataloader = DataLoader(train_dataset, batch_size=GENERAL_BATCH_SIZE, shuffle=False, num_workers=0)
# for i, batch in enumerate(dataloader):
#         print(i, batch[0][0].shape, batch[0][1].shape, batch[1].shape)
#         #print(batch[0][1])
#         break

In [None]:
# Drop the notebook's reference to the full preprocessed frame and ask the
# garbage collector to reclaim whatever is no longer referenced.
del train_data

gc.collect()

**Model**

In [None]:
class RNN(nn.Module):
  """LSTM over windows of embedded categorical + raw continuous features.

  Each categorical column gets its own embedding table; the embeddings are
  concatenated, passed through dropout, concatenated with the continuous
  features and fed to a multi-layer LSTM. A linear layer maps the LSTM output
  at the last time step to `n_outputs` raw scores (no activation is applied).
  """

  def __init__(self, n_contineous_inputs, n_hidden, n_rnnlayers, n_outputs, embedding_sizes):
    """
    Args:
      n_contineous_inputs: number of continuous features per time step.
      n_hidden: LSTM hidden size.
      n_rnnlayers: number of stacked LSTM layers.
      n_outputs: size of the final linear layer's output.
      embedding_sizes: iterable of (n_categories, embedding_dim), one entry per
        categorical column, in the column order of the categorical input.
    """
    super(RNN, self).__init__()

    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])
    # Total width of all embeddings once concatenated.
    n_emb = sum(e.embedding_dim for e in self.embeddings)
    self.n_emb, self.n_cont = n_emb, n_contineous_inputs

    self.emb_drop = nn.Dropout(0.3)

    self.D = self.n_emb + self.n_cont  # LSTM input size per time step
    self.M = n_hidden                  # hidden units
    self.K = n_outputs                 # output size
    self.L = n_rnnlayers               # stacked LSTM layers

    self.rnn = nn.LSTM(
        input_size=self.D,
        hidden_size=self.M,
        num_layers=self.L,
        batch_first=True)
    self.fc = nn.Linear(self.M, self.K)

  def forward(self, X):
    """Compute raw scores for a batch of windows.

    Args:
      X: pair `(X_categorical, X_continuous)`; the code indexes X_categorical
        as [batch, time, column] (integer ids) and concatenates X_continuous
        along dim 2, so both are 3-D with matching batch and time sizes.

    Returns:
      Tensor of shape (batch, n_outputs) taken from the last time step.
    """
    X_categorical, X_continuous = X
    batch_size = X_categorical.size(0)

    # Zero initial hidden/cell states, created directly on the input's device
    # so the module does not depend on a notebook-level `device` variable.
    state_shape = (self.L, batch_size, self.M)
    h0 = torch.zeros(state_shape, device=X_categorical.device)
    c0 = torch.zeros(state_shape, device=X_categorical.device)

    # Embed each categorical column with its own table, then join on the feature axis.
    x = [embedding(X_categorical[:, :, col_idx]) for col_idx, embedding in enumerate(self.embeddings)]
    x = torch.cat(x, 2)
    x = self.emb_drop(x)

    # Append the continuous features after the embedded ones.
    x = torch.cat([x, X_continuous], 2)

    out, _ = self.rnn(x, (h0, c0))

    # Only the output at the final time step is used for the prediction.
    out = self.fc(out[:, -1, :])
    return out

  def get_unique_categorical_data(self, x, col_idx):
    """Return the distinct values of column `col_idx`, in first-seen order.

    `x` is flattened to rows of two columns before the lookup. This helper is
    not called by `forward`.
    NOTE(review): relies on np.unique accepting `x`; presumably only works for
    CPU tensors or arrays — confirm before using.
    """
    x = x.reshape(-1, 2)
    _, idx = np.unique(x[:, col_idx], return_index=True)
    return x[np.sort(idx), col_idx]

In [None]:
class CFG:
    # Model and training hyper-parameters, read when the skorch net is built.
    # Number of continuous features per time step.
    contineous_inputs=len(CONTINUOUS_COLUMNS)
    # LSTM hidden size.
    hidden_units=80
    # Number of stacked LSTM layers.
    layers=3
    # Size of the model output (a single score per window).
    target_output=1
    # Optimizer learning rate.
    lr=1e-2
    #lr=5e-2
    # Weight decay passed to the optimizer.
    weight_decay=1e-6
    batch_size=GENERAL_BATCH_SIZE
    # Maximum number of training epochs.
    epochs=5

In [None]:
# Early-stopping callback with a zero improvement threshold.
# NOTE(review): monitored metric and patience are left at skorch defaults — confirm they fit a 5-epoch run.
early_loss_stop = EarlyStopping(threshold=0)
# Checkpoint callback writing into 'skorch_chk'; the parameter file name
# includes the epoch number of the epoch being saved.
chkPoint = Checkpoint(dirname='skorch_chk',f_params='params_{last_epoch[epoch]}.pt')

In [None]:
 def iterator_train(dataset, **kwargs):
      """Build the training DataLoader for the dataset the caller hands in.

      Previously the `dataset` argument was ignored and the notebook-global
      `train_dataset` was always loaded; the argument is now honoured so the
      factory also works for any other dataset.
      """
      return DataLoader(dataset, **kwargs)

 def iterator_valid(dataset, **kwargs):
      """Build the validation/inference DataLoader for the dataset handed in.

      Previously the `dataset` argument was ignored and the notebook-global
      `validation_dataset` was always loaded, so iterating any other data
      would silently have yielded the validation rows instead.
      """
      return DataLoader(dataset, **kwargs)

In [None]:
# skorch wrapper around the RNN module; `module__*` arguments are forwarded to
# RNN.__init__.
net = NeuralNetClassifier(
    RNN,
    module__n_contineous_inputs=CFG.contineous_inputs,
    module__n_hidden=CFG.hidden_units,
    module__n_rnnlayers=CFG.layers,
    module__n_outputs=CFG.target_output,
    module__embedding_sizes=embedding_sizes,
    
    batch_size=CFG.batch_size,
    max_epochs=CFG.epochs,

    # any value besides '0' will give a runtime error in PyTorch 
    #(https://github.com/pytorch/pytorch/issues/28820)
    iterator_train__num_workers=0,
    iterator_valid__num_workers=0,
    # No shuffling: batches are drawn from the window datasets in index order.
    iterator_train__shuffle=False,
    iterator_valid__shuffle=False,

    # Custom DataLoader factories defined in the previous cell.
    iterator_train = iterator_train,
    iterator_valid = iterator_valid,
    
    # Validate on the pre-built validation dataset instead of an internal split.
    train_split=predefined_split(validation_dataset),
    #train_split=None,

    # The module returns raw scores (no sigmoid), hence the with-logits loss.
    criterion=nn.BCEWithLogitsLoss,
    optimizer=torch.optim.Adam,
    lr=CFG.lr,
    optimizer__weight_decay=CFG.weight_decay,
    callbacks=[
        ProgressBar(), 
        early_loss_stop,
        chkPoint,
         # Scale the learning rate by 0.15 when the monitored value plateaus.
         ('lr_scheduler',
                 LRScheduler(policy=optim.lr_scheduler.ReduceLROnPlateau,
                             mode='min', 
                             factor=0.15, 
                             patience=3, 
                             verbose=True, 
                             eps=1e-4))
    ],
    device=device  # comment to train on cpu
)

In [None]:
# Train the network. y=None because each dataset item already carries its own
# target (items are ((X1, X2), y) tuples).
net.fit(train_dataset, y=None)
#net.fit(np.zeros(len(train_dataset)), y=None)

In [None]:
# The datasets are deliberately kept alive (deletion is commented out);
# only run a garbage-collection pass after training.
# del train_dataset
# del validation_dataset

gc.collect()

In [None]:
# Plot training & validation loss values
def plot_history(history):
    """Draw the per-epoch training and validation loss curves on one chart.

    Args:
        history: object indexable as `history[:, key]` that yields the
            per-epoch values for 'train_loss' and 'valid_loss'.
    """
    for loss_key in ('train_loss', 'valid_loss'):
        plt.plot(history[:, loss_key])

    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    # Legend entries follow the plotting order above.
    plt.legend(['Train', 'Val'], loc='upper right', frameon=False)
    plt.show()
    
plot_history(net.history)

**Load the best run**

In [None]:
# Re-initialise the network and restore the parameters saved by the
# Checkpoint callback, so prediction uses the checkpointed weights rather than
# the weights left in memory after the final epoch.
# NOTE(review): which epoch gets restored depends on when the callback last saved — confirm.

net.initialize()
net.load_params(checkpoint=chkPoint)
#test_outputs = net.evaluation_step(X_test).sigmoid().cpu().numpy()

**Prediction**

In [None]:
# Create the competition environment and the iterator that yields
# (test batch, sample prediction frame) pairs.
env = riiideducation.make_env()
iter_test = env.iter_test()

# True until the first test batch has been handled; the prediction loop uses it
# to decide which rows to prepend as history.
first_test_pred = True

In [None]:
def make_datframe_compatible(test_dataframe):
    """Return a copy of `test_dataframe` reduced to the model's feature columns.

    Only the columns named in `test_dtype` are kept, and each is cast to the
    dtype listed there. The input frame is left untouched.
    """
    feature_frame = test_dataframe[test_dtype.keys()].copy()
    return feature_frame.astype(test_dtype)

In [None]:
def create_final_test_dataset(data):
    """Turn a frame of consecutive rows into overlapping model input windows.

    Window `i` covers rows i .. i + TIME_STEP_SIZE - 1, so a frame with `n` rows
    yields `n - TIME_STEP_SIZE + 1` windows (none if the frame is shorter than
    one window).

    Args:
        data: DataFrame containing CATEGORAL_COLUMNS and CONTINUOUS_COLUMNS.

    Returns:
        Tuple `(categorical, continuous)`:
        - categorical: int64 array (n_windows, TIME_STEP_SIZE, len(CATEGORAL_COLUMNS))
        - continuous: float32 array (n_windows, TIME_STEP_SIZE, len(CONTINUOUS_COLUMNS))
    """
    categorical_data = data.loc[:, CATEGORAL_COLUMNS].to_numpy(dtype=np.int64)
    continuous_data = data.loc[:, CONTINUOUS_COLUMNS].to_numpy(dtype=np.float32)

    # Clamp at zero: a negative count would otherwise become a negative array
    # dimension and raise.
    n_windows = max(0, len(data) - TIME_STEP_SIZE + 1)

    # window_rows[i, t] = i + t is the source row for step t of window i.
    # Indexing with it builds all windows at once and takes the feature width
    # from the arrays themselves instead of assuming two columns per group.
    window_rows = np.arange(n_windows)[:, np.newaxis] + np.arange(TIME_STEP_SIZE)[np.newaxis, :]

    return (categorical_data[window_rows], continuous_data[window_rows])

In [None]:
# Score every test batch delivered by the competition API.
for test_df, sample_prediction_df in iter_test:
    # Same preprocessing as for training, except that the scaler fitted on the
    # training data is reused (is_train=False) instead of being refitted on
    # each test batch, which would scale every batch differently.
    test_dataset = fill_missing_values(test_df)
    test_dataset = make_datframe_compatible(test_dataset)
    test_dataset = change_column_type_to_categorical(test_dataset)
    test_dataset = normalize_data(datafram_mapper, test_dataset, is_train=False)

    # Prepend exactly TIME_STEP_SIZE - 1 history rows so that the number of
    # windows equals the number of rows in this batch: the tail of the training
    # data for the first batch, the rolling history afterwards.
    history_rows = last_train_rows_for_pred if first_test_pred else previous_test_dataset
    merged_test_dataset = pd.concat([history_rows, test_dataset], ignore_index=True)
    first_test_pred = False

    print(f'shape of final dataset: {merged_test_dataset.shape}')

    # Roll the history forward from the merged frame (not just the current
    # batch) so it always holds TIME_STEP_SIZE - 1 rows, even for tiny batches.
    previous_test_dataset = merged_test_dataset.tail(TIME_STEP_SIZE - 1).copy()

    merged_test_dataset = create_final_test_dataset(merged_test_dataset)

    # The module returns raw scores; sigmoid turns them into probabilities.
    # ravel() flattens the (n, 1) output into one value per test row.
    predictions = net.evaluation_step(merged_test_dataset).sigmoid().cpu().numpy()
    test_df['answered_correctly'] = predictions.ravel()

    # Submit only rows with content_type_id == 0.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])