In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import gc
from typing import List
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from torch.utils.data import DataLoader, random_split
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

import riiideducation
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time

# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
userActions = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
# Question metadata; later cells read its `question_id`, `tags`, `bundle_id`,
# `part` and `correct_answer` columns.
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

print("Train size:", userActions.shape)

In [None]:
# Keep only the rows whose content_type_id is 0 (the actions related to
# questions) and cap the working set at the first `numberOfActions` of them.
numberOfActions = 1000000
userActions = userActions.loc[userActions['content_type_id'] == 0].head(numberOfActions)
userActions.head()

In [None]:
# The correct answer option is not used as a model feature, so drop it.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
questions = questions.drop(columns=['correct_answer'], errors='ignore')
questions.head()

In [None]:
# Run a full garbage-collection pass before the tag preprocessing below.
gc.collect()

# Preprocess Question Tags

In [None]:
# Running maximum / minimum tag id seen so far by tagsToArray. The maximum is
# later used to size the fixed-width tag encoding (maximumTag + 1 slots).
maximumTag = -math.inf
minimumTag = math.inf
def tagsToArray(x: str) -> List[int]:
    """Parse a whitespace-separated tag string into a list of integer tag ids.

    Side effect: widens the module-level ``minimumTag`` / ``maximumTag`` range
    so that it covers every tag id returned.

    Args:
        x: Tag string such as ``"51 131 162"``. A missing value (``None`` or a
            float NaN) stands for "no tags".

    Returns:
        The tag ids in their original order; ``[]`` for a missing, empty or
        whitespace-only value.

    Raises:
        ValueError: if a token is not an integer literal.
    """
    global maximumTag
    global minimumTag
    # An identity check against np.nan misses None and NaN objects that are
    # not the np.nan singleton, so test the value instead.
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return []
    res = [int(tag) for tag in x.split()]
    # max()/min() with a single non-iterable argument raise TypeError, so the
    # running range is only updated when at least one tag was parsed.
    if res:
        maximumTag = max(maximumTag, *res)
        minimumTag = min(minimumTag, *res)
    return res
    

# Replace the raw tag strings with parsed integer lists; as a side effect this
# also fills in minimumTag / maximumTag.
# NOTE(review): re-running this cell fails because the column then holds lists,
# which have no .split() - reload `questions` first.
questions.tags = questions.tags.apply(tagsToArray)

questions.head()

In [None]:
# Report the tag-id range gathered while parsing the question tags.
print(f'min tag: {minimumTag}')
print(f'max tag: {maximumTag}')

In [None]:
def convertTagsToEncodedArray(x: List[int], tagCount: int = None) -> np.ndarray:
    """Encode a list of tag ids as a fixed-width 0/1 (multi-hot) vector.

    Args:
        x: Tag ids to switch on. Duplicates are harmless; an empty list
            yields an all-zero vector.
        tagCount: Length of the returned vector. When omitted it defaults to
            ``maximumTag + 1`` (read from the module-level ``maximumTag``),
            which is the behaviour existing callers rely on.

    Returns:
        A 1-D float64 numpy array of length ``tagCount`` holding 1.0 at every
        index listed in ``x`` and 0.0 elsewhere.

    Raises:
        IndexError: if a tag id does not fit into ``tagCount`` slots.
    """
    if tagCount is None:
        # One slot per tag id in 0..maximumTag (reading a global needs no
        # `global` statement).
        tagCount = maximumTag + 1
    encoded = np.zeros(tagCount)
    for tag in x:
        encoded[tag] = 1
    return encoded


# Replace each tag list with its fixed-width 0/1 vector of length maximumTag + 1.
questions.tags = questions.tags.apply(convertTagsToEncodedArray)

questions.head()

# Create DataSet

In [None]:

# Columns of the interaction frame that are converted to float and passed to
# the model as its "interaction" input.
interactionFeatures = ['timestamp', 'content_type_id', 'task_container_id', 'prior_question_elapsed_time', 'prior_question_had_explanation']
# Columns of the questions frame that are passed to the model as its
# "question" input.
questionFeatures = ['bundle_id', 'part']

def toInt(x):
    """Convert a scalar to ``int``, mapping unusable values to the sentinel -1.

    Args:
        x: A scalar such as an int, float, bool, ``None`` or NaN.

    Returns:
        ``int(x)`` (floats are truncated toward zero, booleans become 0/1),
        or ``-1`` when ``x`` is missing (``None`` / NaN / NA) or infinite.
    """
    # pd.isna already covers None and NaN. Comparing with `== np.nan` can never
    # be true because NaN is not equal to anything, itself included.
    if pd.isna(x):
        return -1
    try:
        return int(x)
    except OverflowError:
        # int(+/-inf) raises OverflowError; treat infinities like missing
        # values, matching the inf -> NaN -> -1 replacement done by the cleaner.
        return -1

def cleanUpUserInteractions(userInteractions, garbageCollect=False):
    """Return a cleaned copy of the question rows of an interaction frame.

    Steps:
      1. keep only rows with ``content_type_id == 0``;
      2. coerce ``content_type_id``, ``prior_question_elapsed_time``,
         ``timestamp`` and ``prior_question_had_explanation`` element-wise
         with ``toInt`` (missing values become -1);
      3. replace +/-inf with NaN and fill every remaining NaN with -1;
      4. cast the ``interactionFeatures`` columns to float.

    Args:
        userInteractions (pandas.DataFrame): Frame containing at least
            ``content_type_id`` and the ``interactionFeatures`` columns.
            It is not modified.
        garbageCollect (bool): When True, run ``gc.collect()`` after each step.

    Returns:
        pandas.DataFrame: A new frame holding only the cleaned question rows.
    """
    # .copy() makes the result independent of the caller's frame, so the
    # column assignments below never write into (or warn about) a slice.
    interactions = userInteractions.loc[userInteractions['content_type_id'] == 0].copy()

    if garbageCollect:
        gc.collect()

    # Each column is converted exactly once; toInt is applied per element.
    for column in ('content_type_id', 'prior_question_elapsed_time', 'timestamp', 'prior_question_had_explanation'):
        interactions[column] = interactions[column].apply(toInt)
        if garbageCollect:
            gc.collect()

    interactions = interactions.replace([np.inf, -np.inf], np.nan)
    interactions = interactions.fillna(-1)

    if garbageCollect:
        gc.collect()

    interactions[interactionFeatures] = interactions[interactionFeatures].astype('float').fillna(value = -1)

    if garbageCollect:
        gc.collect()

    return interactions
    

class UserDataset(Dataset):
    """Dataset pairing each question interaction with its question features.

    Items are lists of tensors:

    * training mode: ``[interaction, question, tags, label]``
    * test mode (after :meth:`switchToTest`): ``[interaction, question, tags]``

    ``interaction`` holds the ``interactionFeatures`` columns of the row,
    ``question`` the ``questionFeatures`` of the referenced question, ``tags``
    the encoded tag vector of that question and ``label`` a length-1 long
    tensor with the row's ``answered_correctly`` value.

    Args:
        userInteractions (pandas.DataFrame): Training interactions. Rows with
            ``answered_correctly == -1`` are dropped and the remainder is
            passed through ``cleanUpUserInteractions``.
        questions (pandas.DataFrame): Question metadata with ``question_id``,
            the ``questionFeatures`` columns and a ``tags`` column that
            already holds encoded numpy vectors.
    """
    def __init__(self, userInteractions, questions):
        self.isTest = False
        cleaned = cleanUpUserInteractions(userInteractions[userInteractions.answered_correctly != -1], garbageCollect=True)
        # Measure the length AFTER filtering/cleaning so __len__ can never
        # exceed the number of rows that are actually stored.
        self.length = len(cleaned)
        # Keep the positional indexer; __getitem__ addresses rows by position.
        self.userInteractions = cleaned.iloc
        gc.collect()
        # question_id -> feature tensor / encoded tag tensor lookup tables
        self.questions = {}
        self.tags = {}
        for _, row in questions.iterrows():
            question_id = int(row.question_id)
            self.questions[question_id] = torch.from_numpy(row[questionFeatures].astype('float').values).float()
            self.tags[question_id] = torch.from_numpy(row.tags.astype('float')).float()
        gc.collect()

        # Fallback tensors returned for question ids missing from the tables
        self.blankTags = torch.from_numpy(convertTagsToEncodedArray([])).float()
        self.blankQuestion = torch.from_numpy(np.ones(len(questionFeatures),)).float()

    def getData(self, row):
        """Build the ``(interaction, question, tags)`` tensors for one row.

        Unknown question ids fall back to ``blankQuestion`` / ``blankTags``.
        """
        questionId = int(row.content_id)
        return torch.from_numpy(row[interactionFeatures].astype('float').values).float(), self.questions.get(questionId, self.blankQuestion), self.tags.get(questionId, self.blankTags)

    def __len__(self):
        """Number of rows currently served by the dataset."""
        return self.length

    def switchToTest(self, df):
        """Serve the rows of ``df`` from now on, without labels.

        ``df`` is used as given; callers clean it beforehand.
        """
        self.isTest = True
        self.length = len(df)
        self.userInteractions = df.iloc

    def __getitem__(self, idx):
        """Return the tensors for the row at position ``idx`` (see class doc)."""
        row = self.userInteractions[idx]
        # Route both modes through getData so they share the float cast and
        # the unknown-question fallback instead of raising KeyError.
        interaction, question, tags = self.getData(row)
        if self.isTest:
            return [interaction, question, tags]
        return [interaction, question, tags, torch.Tensor([row.answered_correctly]).long()]

# Run a garbage-collection pass at the end of the dataset-definition cell.
gc.collect()

# Model

In [None]:
# Build the dataset from the sampled question interactions and the question
# frame whose `tags` column already holds the encoded vectors.
trainData = UserDataset(userActions, questions)

In [None]:
batch_size = 64

gc.collect()

# Derive the split sizes from the dataset itself: random_split requires the
# lengths to add up to len(trainData), which is not guaranteed to equal
# numberOfActions (for example when fewer question rows were available).
totalCount = len(trainData)
validationCount = int(totalCount * 0.1)

# 90 % of the rows are used for training, 10 % for validation.
train_dataset, val_dataset = random_split(trainData, [totalCount - validationCount, validationCount])
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size,
                        shuffle=False)

In [None]:
class Predictor(nn.Module):
    """Two stacked auto-encoders with a 2-way classification head.

    * ``tagEncoder`` / ``tagDecoder`` compress the tag vector to 4 values and
      reconstruct it.
    * ``predictorEncoder`` / ``predictorDecoder`` compress the concatenation
      of (encoded tags, question features, interaction features) to 16 values
      and reconstruct it.
    * ``predictor`` maps the 16-value code to 2 class logits.

    Args:
        tagCount: Width of the tag vector.
        questionFeatureCount: Number of question features per sample.
        interactionFeatureCount: Number of interaction features per sample.
    """

    def __init__(self, tagCount: int, questionFeatureCount: int, interactionFeatureCount: int):
        super().__init__()

        latentTagWidth = 4
        latentPredictorWidth = 16
        # Width of the vector that enters the second auto-encoder
        combinedWidth = latentTagWidth + questionFeatureCount + interactionFeatureCount

        self.tagEncoder = nn.Sequential(
            nn.Linear(tagCount, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 128),
            nn.Dropout(),
            nn.ReLU(inplace=True),
            nn.Linear(128, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, latentTagWidth),
        )

        self.tagDecoder = nn.Sequential(
            nn.Linear(latentTagWidth, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 128),
            nn.Dropout(),
            nn.ReLU(inplace=True),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, tagCount),
        )

        self.predictorEncoder = nn.Sequential(
            nn.Linear(combinedWidth, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 64),
            nn.Dropout(),
            nn.ReLU(inplace=True),
            nn.Linear(64, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, latentPredictorWidth),
        )

        self.predictorDecoder = nn.Sequential(
            nn.Linear(latentPredictorWidth, 16),
            nn.ReLU(inplace=True),
            nn.Linear(16, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, combinedWidth),
        )

        self.predictor = nn.Sequential(
            nn.Linear(latentPredictorWidth, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 16),
            nn.ReLU(inplace=True),
            nn.Linear(16, 2),
        )

    def forward(self, interaction, question, tags):
        """Run both auto-encoders and the classification head.

        Returns:
            tuple: ``(prediction, decodedTag, predictOn, predictDecoded)`` -
            the class logits, the reconstructed tag vector, the concatenated
            input of the second auto-encoder and its reconstruction.
        """
        tagCode = self.tagEncoder(tags)
        tagReconstruction = self.tagDecoder(tagCode)
        # Concatenate along the last (feature) dimension
        combined = torch.cat([tagCode, question, interaction], dim=-1)
        combinedCode = self.predictorEncoder(combined)
        combinedReconstruction = self.predictorDecoder(combinedCode)
        logits = self.predictor(combinedCode)
        return logits, tagReconstruction, combined, combinedReconstruction


In [None]:
# Shared loss modules: MSE for the two reconstructions, cross-entropy for the
# 2-way classification head.
lossBase = nn.MSELoss()
classificationLoss = nn.CrossEntropyLoss()

def loss_criterion(tags, decodedTags, tagRegularizationTerm, predictionOriginal, predictionDecoded, predictionRegularizationTerm, answeredCorrectly, prediction):
    """Combine both weighted reconstruction losses with the classification loss.

    Args:
        tags, decodedTags: Tag vectors and their reconstruction.
        tagRegularizationTerm: Weight of the tag reconstruction MSE.
        predictionOriginal, predictionDecoded: Input of the second
            auto-encoder and its reconstruction.
        predictionRegularizationTerm: Weight of that reconstruction MSE.
        answeredCorrectly: Class labels with a trailing axis of size 1, which
            is squeezed away before the cross-entropy.
        prediction: Class logits.

    Returns:
        Scalar tensor: weighted tag MSE + weighted feature MSE + cross-entropy.
    """
    tagReconstruction = lossBase(tags, decodedTags) * tagRegularizationTerm
    featureReconstruction = lossBase(predictionOriginal, predictionDecoded) * predictionRegularizationTerm
    labels = answeredCorrectly.squeeze(1)
    return tagReconstruction + featureReconstruction + classificationLoss(prediction, labels)

# Train

In [None]:
# Weights of the two auto-encoder reconstruction terms in the total loss
tagRegularizationTerm = 0.001
predictionRegularizationTerm = 0.001
epochs = 3
# Training stops early once the validation accuracy (as a fraction) exceeds this
earlyStopAccuracy = 0.65

model = Predictor(maximumTag + 1, len(questionFeatures), len(interactionFeatures)).cuda()
model_optimizer = torch.optim.Adam(
    model.parameters(), lr=0.0001, weight_decay=1e-5)

# Per-epoch histories: mean batch loss and accuracy in percent
train_loss = []
train_acc = []
val_loss = []
val_acc = []

gc.collect()

for epoch in range(epochs):
    # ---------------- training pass ----------------
    model.train()
    running_acc = 0.0
    batch_loss = []
    for index, (interaction, question, tags, y) in enumerate(train_loader):
        # Tensors go to the GPU directly; the deprecated Variable wrapper is
        # no longer needed for autograd.
        interaction = interaction.cuda()
        question = question.cuda()
        tags = tags.cuda()
        y = y.cuda()
        # ===================forward=====================
        prediction, decodedTag, predictOn, predictDecoded = model(interaction, question, tags)
        loss = loss_criterion(tags, decodedTag, tagRegularizationTerm, predictOn, predictDecoded, predictionRegularizationTerm, y, prediction)
        # ===================backward====================
        model_optimizer.zero_grad()
        loss.backward()
        model_optimizer.step()

        # statistics
        batch_loss.append(loss.item())

        # y has shape (batch, 1), so give the predicted class the same shape
        out = torch.argmax(prediction.detach(), dim=1).unsqueeze(1)
        assert out.shape == y.shape
        running_acc += (out == y).sum().item()

        if index % 50000 == 0:
            gc.collect()
    train_loss.append(np.mean(batch_loss))
    train_acc.append(running_acc*100/len(train_dataset))
    print(f"Train loss {epoch+1}: {train_loss[-1]},Train Acc:{running_acc*100/len(train_dataset)}%")

    # ---------------- validation pass ----------------
    model.eval()
    batch_loss = []
    correct = 0.0
    with torch.no_grad():
        for interaction, question, tags, y in val_loader:
            interaction = interaction.cuda()
            question = question.cuda()
            tags = tags.cuda()
            y = y.cuda()

            prediction, decodedTag, predictOn, predictDecoded = model(interaction, question, tags)
            loss = loss_criterion(tags, decodedTag, tagRegularizationTerm, predictOn, predictDecoded, predictionRegularizationTerm, y, prediction)

            batch_loss.append(loss.item())

            out = torch.argmax(prediction, dim=1).unsqueeze(1)
            correct += (y == out).sum().item()
    val_loss.append(np.mean(batch_loss))
    val_acc.append(correct*100/len(val_dataset))
    print(f"Val accuracy:{correct*100/len(val_dataset)}% Val loss:{np.mean(batch_loss)}")

    if correct/len(val_dataset) > earlyStopAccuracy:
        break

    gc.collect()



In [None]:
# Dump the per-epoch metric histories recorded by the training loop, in the
# order: training loss, training accuracy, validation loss, validation accuracy.
for history in (train_loss, train_acc, val_loss, val_acc):
    print(history)

# Submission

In [None]:
# Create the competition environment; the submission cell iterates over
# env.iter_test() and answers each batch through env.predict().
env = riiideducation.make_env()
gc.collect()

In [None]:
# Inference over the competition's test iterator: every batch is cleaned,
# scored in mini-batches and answered through env.predict().
model.eval()
with torch.no_grad():
    for (test_df, sample_prediction_df) in env.iter_test():
        # Keeps only the question rows and normalises the feature columns
        test_df = cleanUpUserInteractions(test_df, garbageCollect=True)
        # Re-point the existing dataset (and its question lookup tables) at
        # the test rows; it now yields items without labels.
        trainData.switchToTest(test_df)
        n = len(test_df)
        answeredCorrectly = np.zeros((n,))
        for index, (interactions, question, tags) in enumerate(DataLoader(trainData, batch_size=batch_size, shuffle=False)):
            interactions = interactions.cuda()
            question = question.cuda()
            tags = tags.cuda()
            logits, *rest = model(interactions, question, tags)
            # Submit P(answered_correctly = 1) instead of the hard argmax
            # label: the competition scores predicted probabilities (AUC),
            # and 0/1 labels throw away the ranking information.
            probability = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            # shuffle=False, so batch `index` covers rows [start, start + len)
            start = index * batch_size
            answeredCorrectly[start:start + len(probability)] = probability
        test_df['answered_correctly'] = answeredCorrectly
        env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])