This notebook is heavily based on

- https://www.kaggle.com/ulrich07/riiid-keras-starter

Please upvote this notebook too.

Here I added a logistic regression model, both to gain better insight into the features and to add diversity to the ensemble.

# Libraries

In [None]:
# useful
import os
import random as rn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from logging import getLogger, Formatter, StreamHandler, FileHandler, INFO

# neural nets
from sklearn import linear_model
from tensorflow.keras.layers import (Dropout, BatchNormalization, Flatten, Convolution1D, Activation, Input, Dense, GaussianNoise, Lambda, Bidirectional,
                                     Add, AveragePooling1D, Multiply, GRU, GRUCell, LSTMCell, SimpleRNNCell, SimpleRNN, TimeDistributed, RNN,
                                     RepeatVector, Conv1D, MaxPooling1D, Concatenate, GlobalAveragePooling1D, UpSampling1D)
from tensorflow.keras.layers import Reshape, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy, mean_squared_error
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras import losses, models, optimizers
from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_addons as tfa

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Apply seaborn's "talk" plotting context and matplotlib's 'fivethirtyeight'
# style sheet for the figures drawn in this notebook.
sns.set_context("talk")
style.use('fivethirtyeight')

# custom
import riiideducation

# Config

In [None]:
# Seed passed to seed_everything() and to LogisticRegression(random_state=...).
SEED = 42
# Learning rate handed to the RectifiedAdam optimizer in make_ann().
LR = 0.008
# Batch size used by net.fit() and net.predict().
BATCH_SIZE = 24000
# Blend weights for the final prediction: [neural net, logistic regression].
WEIGHTS = [0.9, 0.1]
# Number of epochs for net.fit().
EPOCHS = 8
# Fraction of the training data passed to net.fit() as validation_split.
VAL_SIZE = 0.2

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow, and exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to all of the generators above.
    """
    # NOTE: the return annotation is ``None`` rather than ``NoReturn``;
    # ``NoReturn`` is reserved for functions that never return normally.
    rn.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

# Data organization

In [None]:
# PIVOT DATAFRAMES
# Precomputed per-content, per-task-container and per-user lookup tables.
piv1 = pd.read_csv("../input/riiid-fixed-infos/content.csv")
piv2 = pd.read_csv("../input/riiid-fixed-infos/task.csv")
piv3 = pd.read_csv("../input/riiid-fixed-infos/user.csv")

# Min-max scale each "*_sum" column in place so it spans [0, 1].
for df, col in ((piv1, "content_sum"),
                (piv2, "task_container_sum"),
                (piv3, "user_sum")):
    col_min = df[col].min()
    col_max = df[col].max()
    df[col] = (df[col] - col_min) / (col_max - col_min)

# Medians of the scaled columns; preprocess() uses them as fill values for
# rows that find no match in the corresponding pivot table.
m1 = piv1["content_sum"].median()
m2 = piv2["task_container_sum"].median()
m3 = piv3["user_sum"].median()


# OTHER CONSTANTS
# Name of the label column.
TARGET = "answered_correctly"
# Fill value for missing prior_question_elapsed_time entries.
TIME_MEAN = 21000.0
# Bounds used to rescale prior_question_elapsed_time into "duration".
TIME_MIN = 0.0
TIME_MAX = 300000.0
# Boolean -> numeric encoding for prior_question_had_explanation.
map_prior = {True: 1, False: 0}

In [None]:
def preprocess(df):
    """Attach the pivot-table features to *df* and derive the model inputs.

    Left-joins the content, task-container and user pivot tables onto *df*,
    fills the values that found no match, and adds three derived columns:
    ``duration``, ``prior_answer`` and ``score``.

    Args:
        df: Frame with ``content_id``, ``task_container_id``, ``user_id``,
            ``prior_question_elapsed_time`` and
            ``prior_question_had_explanation`` columns.

    Returns:
        The merged frame with the filled and derived columns.
    """
    # (pivot table, join key, embedding column, sum column, sum fill value)
    lookups = (
        (piv1, "content_id", "content_emb", "content_sum", m1),
        (piv2, "task_container_id", "task_container_emb", "task_container_sum", m2),
        (piv3, "user_id", "user_emb", "user_sum", m3),
    )
    for pivot, key, emb_col, sum_col, sum_fill in lookups:
        df = df.merge(pivot, how="left", on=key)
        # Unmatched keys get a neutral 0.5 embedding and the median sum.
        df[emb_col] = df[emb_col].fillna(0.5)
        df[sum_col] = df[sum_col].fillna(sum_fill)

    # Missing elapsed times fall back to TIME_MEAN before being rescaled.
    elapsed = df["prior_question_elapsed_time"].fillna(TIME_MEAN)
    df["prior_question_elapsed_time"] = elapsed
    df["duration"] = (elapsed - TIME_MIN) / (TIME_MAX - TIME_MIN)

    # True/False -> 1/0; anything map_prior does not cover becomes 0.5.
    had_explanation = df["prior_question_had_explanation"].map(map_prior)
    df["prior_answer"] = had_explanation.fillna(0.5)

    # Harmonic-mean style combination of the content and user embeddings;
    # epsilon keeps the denominator away from zero.
    epsilon = 1e-6
    content = df["content_emb"]
    user = df["user_emb"]
    df["score"] = 2 * content * user / (content + user + epsilon)
    return df
#=========

# Train

In [None]:
%%time
# Read at most the first 10**7 rows of the training file (capped by nrows).
tr = pd.read_csv("../input/riiid-test-answer-prediction/train.csv", 
                 low_memory=False, nrows=10**7)

In [None]:
%%time
# Merge the pivot features into the training frame and add the derived columns.
tr = preprocess(tr)

In [None]:
# Feature columns selected from the preprocessed frame; the same list feeds
# the neural net, the logistic regression and the test-time predictions.
FE = ["content_emb","content_sum" ,"task_container_emb", "task_container_sum",
      "user_emb", "user_sum","duration", "prior_answer","score"]

In [None]:
# Keep only the rows whose answered_correctly value is not -1, then split
# them into the feature matrix and the label vector.
answered = tr.answered_correctly != -1
x = tr.loc[answered, FE].values
y = tr.loc[answered, TARGET].values

# Fit Neural Net

In [None]:
def make_ann(n_in):
    """Build and compile the feed-forward binary classifier.

    The network stacks three hidden stages of 512, 256 and 128 units. Each
    stage is Dense(relu) -> Dropout(0.08) -> GaussianNoise(0.01) ->
    BatchNormalization, followed by a single sigmoid output unit.

    Args:
        n_in: Number of input features per sample.

    Returns:
        A Keras model compiled with label-smoothed binary cross-entropy, a
        RectifiedAdam optimizer wrapped in SWA, and an accuracy metric.
    """
    inp = Input(shape=(n_in,), name="inp")
    x = inp
    # Layers are created in the same order as the hand-unrolled version, so
    # auto-generated layer names are unchanged.
    for units in (512, 256, 128):
        x = Dense(units, activation='relu')(x)
        x = Dropout(0.08)(x)
        x = GaussianNoise(0.01)(x)
        x = BatchNormalization()(x)
    preds = Dense(1, activation="sigmoid", name="out")(x)
    model = models.Model(inp, preds)

    # `learning_rate` is the supported keyword; `lr` is only a deprecated
    # alias that newer Keras optimizers warn about or reject.
    opt = tfa.optimizers.RectifiedAdam(learning_rate=LR)
    opt = tfa.optimizers.SWA(opt)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05)
    model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
    return model
#===================

In [None]:
# Seed before building the network so the randomly initialised weights are
# reproducible from run to run.
seed_everything(SEED)
net = make_ann(x.shape[1])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
net.summary()

In [None]:
# Seed the RNGs, reset Keras' global state, then train the network for EPOCHS
# epochs while holding out a VAL_SIZE fraction of (x, y) for validation.
# NOTE(review): the optimizer is wrapped in tfa's SWA; the averaged weights
# presumably only take effect if assign_average_vars() is called after
# training, which this notebook never does. Confirm against the tfa docs.
seed_everything(SEED)
K.clear_session()
history = net.fit(x, y, validation_split=VAL_SIZE, batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
def plot_history(history):
    """Plot the training and validation loss curves of a Keras fit.

    Args:
        history: Object whose ``history`` mapping holds per-epoch ``'loss'``
            and ``'val_loss'`` sequences, such as the value returned by
            ``net.fit``.
    """
    curves = history.history
    # Train curve first, then validation, to match the legend order below.
    for key in ('loss', 'val_loss'):
        plt.plot(curves[key])
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Val'], loc='upper right', frameon=False)
    plt.show()


plot_history(history)

# Logistic regression

In [None]:
# Hyper-parameters for the logistic-regression model, which is fitted on the
# same (x, y) arrays as the neural net.
params = dict(
    C=8.0,
    solver="lbfgs",
    warm_start=False,
    max_iter=8000,
    fit_intercept=True,
    random_state=SEED,
    tol=1e-4,
    n_jobs=-1,
    verbose=1,
)
lin_model = linear_model.LogisticRegression(**params)
lin_model.fit(x, y)

In [None]:
# One row per feature, paired with its fitted logistic-regression coefficient.
fi = pd.DataFrame({
    'features': FE,
    'linear_weights': lin_model.coef_.ravel(),
})

# Bar chart of the coefficients, sorted in descending order.
ranked = fi.sort_values(by='linear_weights', ascending=False)
sns.barplot(x='linear_weights', y='features', data=ranked)

# Prediction

In [None]:
# Create the competition environment used to receive test batches and to
# submit predictions (via env.predict in the loop below).
env = riiideducation.make_env()
# Iterating this yields (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test()

In [None]:
# Score every test batch served by the competition API and submit the blend
# of the neural-net and logistic-regression predictions.
for test_df, sample_prediction_df in iter_test:
    test_df = preprocess(test_df)
    x_te = test_df[FE].values
    nn_pred = net.predict(x_te, batch_size=BATCH_SIZE, verbose=0)[:, 0]
    # Blend probabilities, not hard labels: predict() returns 0/1 class
    # labels, whereas column 1 of predict_proba() is the probability of
    # class 1, which is on the same scale as the network's sigmoid output.
    lin_pred = lin_model.predict_proba(x_te)[:, 1]
    test_df['answered_correctly'] = WEIGHTS[0] * nn_pred + WEIGHTS[1] * lin_pred
    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [None]:
# Show the first rows of the last test batch processed by the loop above,
# including its blended 'answered_correctly' column.
test_df.head()