In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
from pathlib import Path
from sklearn.metrics import roc_auc_score
        
from fastai.tabular.all import *
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm as tqdm
import datatable as dt
import gc

In [None]:
# Check fast ai version
import fastai
fastai.__version__

In [None]:
# Root directory of the competition dataset; stop immediately if it is not attached.
path = Path('/kaggle/input/riiid-test-answer-prediction')
assert path.exists()

### Initial Load

In [None]:
%%time
# Read the .jay training file with datatable and convert the result to a pandas DataFrame.
train_df = dt.fread("/kaggle/input/riidtrainjay/train.jay").to_pandas()

In [None]:
# Downcast the two time columns to a smaller float dtype to reduce memory use.
for time_col in ('timestamp', 'prior_question_elapsed_time'):
    train_df[time_col] = pd.to_numeric(train_df[time_col], downcast='float')

In [None]:
# Inspect column dtypes and memory usage after the downcast.
train_df.info()

### Feature generation

In [None]:
%%time

# Columns needed for feature generation and training; every other column is dropped.
kept_columns = ['timestamp', 'user_id', 'content_id', 'answered_correctly',
                'prior_question_elapsed_time', 'prior_question_had_explanation']
# Pass the labels through `columns=`: the positional `axis` argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.
train_df.drop(columns=train_df.columns.difference(kept_columns), inplace=True)
# Discard rows whose answered_correctly is -1 and renumber the index.
train_df = train_df.loc[train_df['answered_correctly'] != -1].reset_index(drop=True)

In [None]:
# Question metadata shipped with the competition data (provides `tags` and `bundle_id`).
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
# Order the interactions chronologically; the timestamp itself is not used afterwards.
train_df = train_df.sort_values(by='timestamp')
del train_df['timestamp']

In [None]:
# Give every distinct `tags` value its own integer id, numbered in order of first appearance.
tag_combo_values = questions_df['tags'].unique()
unique_tags_combos_keys = dict(zip(tag_combo_values, range(len(tag_combo_values))))
questions_df['tags_encoded'] = questions_df['tags'].apply(unique_tags_combos_keys.__getitem__)

# Lookup table indexed by question_id, with the id column stored as a compact integer dtype.
question_tags_df = questions_df[['question_id', 'tags_encoded']].set_index('question_id')
question_tags_df['tags_encoded'] = pd.to_numeric(question_tags_df['tags_encoded'], downcast='integer')

In [None]:
def extract_tag_factory(tag_pos):
    """Build an extractor for the tag at position `tag_pos` of a tags string.

    Args:
        tag_pos: zero-based position in the numerically sorted list of tags.

    Returns:
        A function that maps a whitespace-separated tag string to an int.
        It returns 255 when the value is not a string (e.g. NaN) or holds
        fewer than `tag_pos + 1` tags.
    """
    def extract_tag(x):
        if not isinstance(x, str):
            return 255
        # Sort as integers: a plain string sort would place '131' before '2'.
        tags = sorted(int(tag) for tag in x.split())
        return tags[tag_pos] if tag_pos < len(tags) else 255
    return extract_tag
        
# Store the first two extracted tags as uint8 columns tag_1 and tag_2.
for tag_pos in range(2):
    tag_col = f'tag_{tag_pos + 1}'
    questions_df[tag_col] = questions_df['tags'].apply(extract_tag_factory(tag_pos)).astype('uint8')


In [None]:
# Check the dtypes of the question table, including the new tags_encoded / tag_1 / tag_2 columns.
questions_df.info()

In [None]:
# Add bundle_id and the two tag columns to the per-question lookup table.
# Merging `on='question_id'` (an index level on the left, a column on the right) returns a frame
# with a fresh RangeIndex and question_id as a column. Re-index by the question_id values so that
# later `right_index=True` joins match on the real id rather than on row position; the column is
# kept because downstream code deletes it after merging.
question_tags_df = (
    question_tags_df
    .merge(questions_df[['question_id', 'bundle_id', 'tag_1', 'tag_2']], how='left', on='question_id')
    .set_index('question_id', drop=False)
    .rename_axis(None)
)

In [None]:
# Display the per-question lookup table (tags_encoded, bundle_id, tag_1, tag_2).
question_tags_df

In [None]:
# Attach the question metadata to every interaction: content_id on the left is matched
# against the index of question_tags_df on the right.
train_df = train_df.merge(question_tags_df, how='left', left_on='content_id', right_index=True)
# The merge also brings in a question_id column, which is not needed afterwards.
del train_df['question_id']
train_df

In [None]:
# Check dtypes and memory usage after the metadata merge.
train_df.info()

In [None]:
# Treat a missing "had explanation" flag as False and store the column as a plain bool.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value = False).astype(bool)

In [None]:
def seed_everything(seed=0):
    """Seed the Python, NumPy and (when installed) PyTorch random generators.

    Args:
        seed: integer seed applied to every generator.
    """
    # Imported locally so the function does not depend on `random` leaking
    # into the namespace through a star import elsewhere in the notebook.
    import random
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
    except ImportError:
        return
    # Model initialisation and batch shuffling draw from torch's generator.
    torch.manual_seed(seed)

seed_everything(42)

In [None]:
# The first tenth of the rows (earliest, given the timestamp sort) becomes the modelling frame X;
# the remaining rows are used to compute aggregate features.
thresh = int(len(train_df) * 0.1)
X = train_df.iloc[:thresh]
features_df = train_df.iloc[thresh:]

In [None]:
# Only X and features_df are used from here on; drop the train_df name and run the garbage collector.
del train_df
gc.collect()

In [None]:
# Display the rows reserved for aggregate-feature computation.
features_df

In [None]:
%%time

# Residual of each answer against the average correctness of its content_id:
# positive when the user did better than that question's average, negative otherwise.
content_id_answered_correctly_mean = (
    features_df.groupby('content_id')['answered_correctly']
    .mean()
    .to_frame('content_id_answered_correctly_mean')
)
features_df = features_df.merge(content_id_answered_correctly_mean, how='left', left_on='content_id', right_index=True)
# pop() returns the helper column and removes it from the frame in one step.
features_df['resid'] = features_df['answered_correctly'] - features_df.pop('content_id_answered_correctly_mean')

In [None]:
# The per-content mean was only needed to build `resid`; release it.
del content_id_answered_correctly_mean
gc.collect()

In [None]:
# Aggregations applied to answered_correctly in the groupbys below.
stats_funcs = ['mean', 'count', 'std', 'median', 'skew']

# Column names for the per-user aggregate frame: one per entry of stats_funcs, plus the mean of `resid`.
feature_user_cols = ['mean_user_accuracy', 'questions_answered', 'std_user_accuracy', 'median_user_accuracy', 'skew_user_accuracy', 'residual_content_mean']
# Column names for the per-content aggregate frame: one per entry of stats_funcs.
feature_content_cols = ['mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy']

In [None]:
%%time

# Per-user aggregates: every stats_funcs statistic of answered_correctly plus the mean residual,
# with the resulting columns renamed to feature_user_cols.
user_answers_df = features_df.groupby('user_id').agg(
    {'answered_correctly': stats_funcs, 'resid': ['mean']}
)
user_answers_df.columns = feature_user_cols

In [None]:
# Share of the distinct users in X that also have aggregated statistics in user_answers_df.
# The previous expression divided the two user counts, which is not an overlap measure
# and can exceed 1.
x_user_ids = pd.Series(X['user_id'].unique())
print(f"percentage of common users: {100 * x_user_ids.isin(user_answers_df.index).mean():.2f}%")

In [None]:
%%time

# Per-question aggregates: every stats_funcs statistic of answered_correctly,
# with the resulting columns renamed to feature_content_cols.
content_answers_df = features_df.groupby('content_id').agg(
    {'answered_correctly': stats_funcs}
)
content_answers_df.columns = feature_content_cols

In [None]:
# Mean of answered_correctly for each tag-combination id.
content_tags_df = (
    features_df.groupby('tags_encoded')
    .agg({'answered_correctly': ['mean']})
)
content_tags_df.columns = ['tags_encoded_answered_mean']
content_tags_df

In [None]:
# Bundle based stats on answered_correctly

# bundle_df = features_df.groupby('bundle_id').agg({'answered_correctly': ['mean']}).copy()
# bundle_df.columns = ['bundle_answered_mean']
# bundle_df

In [None]:
# All aggregates have been computed; release the large feature frame.
del features_df
gc.collect()

In [None]:
def merge_features(df):
    """Left-join the user, content and tag-combination aggregates onto `df`.

    Args:
        df: frame containing `user_id`, `content_id` and `tags_encoded` columns.

    Returns:
        A new frame with the columns of user_answers_df, content_answers_df and
        content_tags_df appended; rows without a match get NaN in those columns.
    """
    # The bundle-level aggregate (bundle_df) is currently disabled and therefore not joined.
    return (
        df.merge(user_answers_df, how='left', on='user_id')
          .merge(content_answers_df, how='left', on='content_id')
          .merge(content_tags_df, how='left', left_on='tags_encoded', right_index=True)
    )

X = merge_features(X)

In [None]:
# Model inputs: the user and content aggregates plus the raw and tag-derived columns.
features = feature_user_cols + feature_content_cols + ['prior_question_elapsed_time', 'prior_question_had_explanation', 'tags_encoded', 'tag_1', 'tag_2', 'tags_encoded_answered_mean']
# Name of the label column.
target = 'answered_correctly'

In [None]:
def replace_na_simple(x):
    """Baseline imputation: turn +/-inf into NaN, then fill every NaN with 0.

    Kept under its own name for comparison. It was previously also called
    `replace_na` and was silently shadowed by the definition below.
    """
    return x.replace([np.inf, -np.inf], np.nan).fillna(0)

# Fallback values for rows whose user has no entry in user_answers_df.
# Note the mix of statistics: median for some columns, mean for others.
mean_user_accuracy_default = user_answers_df['mean_user_accuracy'].median()
std_user_accuracy_default = user_answers_df['std_user_accuracy'].mean()
median_user_accuracy_default = user_answers_df['median_user_accuracy'].mean()
skew_user_accuracy_default = user_answers_df['skew_user_accuracy'].mean()
residual_content_mean_default = user_answers_df['residual_content_mean'].median()
questions_answered_default = user_answers_df['questions_answered'].median()

def replace_na(x):
    """Impute missing values, using population-level defaults for the user features.

    Args:
        x: DataFrame containing the six per-user feature columns.

    Returns:
        A new DataFrame in which +/-inf became NaN, the per-user columns were
        filled with their module-level defaults, and every remaining NaN with 0.

    Raises:
        KeyError: if one of the per-user feature columns is missing from `x`.
    """
    user_defaults = {
        'mean_user_accuracy': mean_user_accuracy_default,
        'std_user_accuracy': std_user_accuracy_default,
        'median_user_accuracy': median_user_accuracy_default,
        'skew_user_accuracy': skew_user_accuracy_default,
        'residual_content_mean': residual_content_mean_default,
        'questions_answered': questions_answered_default,
    }
    # replace() returns a new frame, so the column assignments below do not touch the caller's data.
    x = x.replace([np.inf, -np.inf], np.nan)
    for col, default in user_defaults.items():
        x[col] = x[col].fillna(default)
    return x.fillna(0)

In [None]:
# Keep only the model inputs and the label, impute missing values, then fix the dtypes
# of the explanation flag and of the target.
X = replace_na(X[features + [target]])
X = X.astype({'prior_question_had_explanation': np.int8, 'answered_correctly': np.float32})

In [None]:
# Downcast the aggregate feature columns to a smaller float dtype, then report the result.
float_feature_cols = (
    'mean_user_accuracy', 'questions_answered', 'std_user_accuracy', 'median_user_accuracy',
    'skew_user_accuracy', 'residual_content_mean',
    'mean_accuracy', 'std_accuracy', 'median_accuracy', 'skew_accuracy', 'tags_encoded_answered_mean',
)
for feature_col in float_feature_cols:
    X[feature_col] = pd.to_numeric(X[feature_col], downcast='float')

X.info()

##### Training

In [None]:
# Validation rows are the last tenth of X (by position).
# Alternative, random 10% hold-out: list(X.sample(frac=0.1, random_state=42).index)
n_valid = int(len(X) / 10)
valid_range = list(range(len(X) - n_valid, len(X)))

In [None]:
# Training hyper-parameters: number of epochs and mini-batch size.
EPOCHS=5
BATCH_SIZE=2048

In [None]:
%%time

# Build the tabular DataLoaders from X, declaring the categorical and continuous feature
# columns explicitly and using the rows in valid_range as the validation set.
dls = TabularDataLoaders.from_df(
    X,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=[
        'prior_question_had_explanation', 'tags_encoded', 'tag_1', 'tag_2',
    ],
    cont_names=[
        'mean_user_accuracy', 'questions_answered', 'std_user_accuracy', 'median_user_accuracy', 'skew_user_accuracy',
        'mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy',
        'prior_question_elapsed_time', 'residual_content_mean', 'tags_encoded_answered_mean',
    ],
    y_names='answered_correctly',
    valid_idx=valid_range,
    bs=BATCH_SIZE,
)

In [None]:
def my_auc(inp, targ):
    "ROC AUC of the predictions `inp` against the labels `targ`, computed with scikit-learn."
    flat_preds, flat_labels = flatten_check(inp, targ)
    y_true = flat_labels.cpu().numpy()
    y_score = flat_preds.cpu().numpy()
    return roc_auc_score(y_true, y_score)

In [None]:
from torch.nn import functional as F

def huber(inp,targ):
    "Smooth L1 (Huber-style) loss between `inp` and `targ`, both flattened first."
    flat_inp, flat_targ = flatten_check(inp, targ)
    return F.smooth_l1_loss(flat_inp, flat_targ)

def bce(inp,targ):
    "Binary cross-entropy between the probabilities `inp` and the labels `targ`, both flattened first."
    flat_inp, flat_targ = flatten_check(inp, targ)
    return F.binary_cross_entropy(flat_inp, flat_targ)

In [None]:
# Tabular model with two hidden layers (200 and 100 units), reporting my_auc as the metric.
learn = tabular_learner(dls, layers=[200,100], metrics=my_auc)

In [None]:
# Append a sigmoid so the model emits values in (0, 1), as F.binary_cross_entropy in `bce` requires,
# and use `bce` as the training loss.
learn.model.layers.add_module('sigmoid', nn.Sigmoid())
learn.loss_func = bce

In [None]:
# Run the learning-rate finder and keep its suggestions for the training call.
lr_find_res = learn.lr_find()

In [None]:
# Show the suggested learning rate that is passed to training below.
lr_find_res.lr_min

In [None]:
%%time

# One-cycle training for EPOCHS epochs. fit_one_cycle takes the peak learning rate as `lr_max`
# (it has no `lr` parameter), so the lr_find suggestion is passed under that name.
learn.fit_one_cycle(EPOCHS, lr_max=lr_find_res.lr_min)

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

In [None]:
# Plot the hyper-parameter schedule recorded during training.
learn.recorder.plot_sched()

In [None]:
def predict_batch(self, df):
    """Return the model predictions for every row of `df` as a NumPy array.

    Args:
        self: the learner. It must be passed explicitly because the function is
            attached to the instance with `setattr` and is therefore not bound.
        df: DataFrame holding the feature columns the learner's DataLoaders use.

    Returns:
        The predictions returned by `get_preds`, converted with `.numpy()`.
    """
    dl = self.dls.test_dl(df)
    # Cast the continuous features to float32 before inference.
    dl.dataset.conts = dl.dataset.conts.astype(np.float32)
    # Only the raw predictions are needed, so inputs and decoded predictions are not requested.
    preds = self.get_preds(dl=dl)[0]
    return preds.numpy()

setattr(learn, 'predict_batch', predict_batch)

In [None]:
%%time

# ROC AUC of the model on the first `sample_size` rows of X.
# NOTE(review): valid_range is the last tenth of X, so these leading rows are likely
# training rows — confirm this is the intended check.
sample_size = 2_000_000
sample_df = X.iloc[:sample_size]
preds = learn.predict_batch(learn, sample_df[features])
# Despite the name, this holds the true labels of the sampled rows.
y_pred_sample = sample_df[target].values
roc_auc_score(y_pred_sample, preds)

##### Prediction

In [None]:
import riiideducation

# Create the competition environment that serves test batches and receives predictions.
env = riiideducation.make_env()

In [None]:
# Iterator yielding (test_df, sample_prediction_df) pairs, consumed by the loop below.
iter_test = env.iter_test()

In [None]:
# Score every test batch with the same feature pipeline that was applied to X.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the question metadata, then the user / content / tag aggregates.
    test_df = test_df.merge(question_tags_df, how='left', left_on='content_id', right_index=True)
    test_df = merge_features(test_df)
    # Missing explanation flags count as False; the model was trained on the int8 form.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(value=False).astype(bool).astype(np.int8)
    )
    # replace_na already ends with fillna(0), so no further NaN handling is required.
    test_df = replace_na(test_df)

    # Flatten in case the predictions come back with a trailing singleton dimension.
    test_df['answered_correctly'] = np.ravel(learn.predict_batch(learn, test_df))
    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [None]:
# Rows of the last test batch whose user_id has no entry in user_answers_df,
# i.e. users for whom the default feature values are used.
test_df.loc[~test_df['user_id'].isin(user_answers_df.index)]