In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer as SI


import tensorflow as tf
from tensorflow.keras import layers as L
from tensorflow.keras.models import Model
import kerastuner as kt
from kerastuner.tuners import RandomSearch

# Module-level transformers shared by every preprocess() call further down.
# StandardScaler: used there to rescale the two time columns.
ss = StandardScaler()
# SimpleImputer (median strategy): used there on the explanation flag.
si = SI(strategy="median")

# Loading Data

Data loading needs some care because the dataset is very large, so we specify the dtype of each column to keep memory usage low enough that we don't run out of RAM.

In [None]:
# Explicit, compact dtype for every CSV column so the loaded sample stays small in memory.
dtypes_dict = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# Read only the first 10**7 rows; the first column (row_id) becomes the index.
train_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    index_col=0,
    dtype=dtypes_dict,
    nrows=10**7,
)
train_df

We will just look at the target cols and will make a simple NN starter model using keras.

More on EDA and interesting findings can be found here (work in progress): https://www.kaggle.com/mrutyunjaybiswal/understanding-the-answer-correctness-eda

# Target and Features

> Target
- Did the `student(user_id)` answer the question correctly `(answered_correctly)` from 

> Features
- When? `(timestamp)`. Well, it's actually the time difference between the time of the attempt and the user's first interaction
- a particular `content(content_id)` or 
- `type of content(content_type_id)` or 
- `task container(task_container_id)` or 
- `how much time` did s/he take to answer the previous question (or question bundle) `prior_question_elapsed_time` or 
- had s/he referred to any explanation (or, say, any tutorial) for answering the previous question bundle `(prior_question_had_explanation)`.

In [None]:
# Name of the label column; it is used as `y` when the training arrays are built.
target_col = 'answered_correctly'

# let's see with how many classes we are dealing with
# (one row per distinct target value, with its frequency)
train_df[target_col].value_counts()

Ah, well! According to the host, `-1` means the value is missing (nan). So, we will discard those samples with `answered_correctly = -1`.

In [None]:
# Keep only the rows whose target differs from the -1 placeholder.
is_labelled = train_df[target_col].ne(-1)
working_data = train_df.loc[is_labelled]
working_data

Let's check whether there are any more nan values.

In [None]:
# Number of missing values in each column of the filtered frame.
working_data.isna().sum()

As we have seen so far, there are ~39.4K nan values still in our dataset. How can we deal with that? I am thinking of handling it by not handling it. Why do I say so? We might have some way to fill up the time elapsed column, but how would we do the same for the `prior_question_had_explanation` col? I do have an idea to deal with that, though: fill the nan value with the `max(no_of_false, no_of_true)` for each individual user. Well, I am skipping it for now.

For now, I am not dropping the nan values here; they are simply filled in a few cells further down.

In [None]:
# NOTE(review): this cell only displays the frame; no rows are removed here.
# The remaining NaNs in the prior_question_* columns are handled by a later fillna cell.
working_data

Now, let's talk about below feature columns:

- user_id
- content_id
- task_container_id

Look at their value counts.

In [None]:
# Cardinality of the three id columns.
n_users = working_data["user_id"].nunique()
n_contents = working_data["content_id"].nunique()
n_tasks = working_data["task_container_id"].nunique()

print("Number of unique users: ", n_users)
print("Number of unique content(or unique user interaction): ", n_contents)
print("Number of unique tasks(or batch of lectures): ", n_tasks)

Do you still want to treat them as categorical features? Well, I don't. I am thinking of creating features per individual user/content/task instead.

In [None]:
# Mean of the target per user, returned as a flat two-column frame (key, mean).
userGroup = working_data.groupby("user_id", as_index=False)[target_col].mean()
userGroup

In [None]:
# Mean of the target per content item, returned as a flat two-column frame (key, mean).
contentGroup = working_data.groupby("content_id", as_index=False)[target_col].mean()
contentGroup

In [None]:
# Mean of the target per task container, returned as a flat two-column frame (key, mean).
taskGroup = working_data.groupby("task_container_id", as_index=False)[target_col].mean()
taskGroup

So what I did was try to measure performance as the fraction of correct answers per user, per content and per task. Let's rename the cols and make them ready to merge with our working data. 

In [None]:
# Give each lookup table its final (key, score) column names, in place.
for _frame, _names in (
    (userGroup, ['user_id', 'user_performance']),
    (contentGroup, ['content_id', 'content_performance']),
    (taskGroup, ['task_container_id', 'task_performance']),
):
    _frame.columns = _names

In [None]:
# Move the current index into a regular column and renumber the rows from 0.
# The result is a new frame bound to the same name.
working_data = working_data.reset_index()
working_data

In [None]:
# Fill the remaining gaps so that no NaN reaches the model:
#   * elapsed time     -> 0
#   * explanation flag -> False
# The flag column is read with the nullable 'boolean' dtype, so a boolean fill value is the
# type-correct choice (an integer 0 may be rejected by newer pandas versions). False and 0
# are equivalent for the later True/False -> 1/0 mapping.
working_data.loc[:, "prior_question_elapsed_time"] = working_data['prior_question_elapsed_time'].fillna(0)
working_data.loc[:, "prior_question_had_explanation"] = working_data['prior_question_had_explanation'].fillna(False)
working_data

In [None]:
# Raw model inputs, the id columns used as merge keys, and the label.
features = ['timestamp', 'prior_question_elapsed_time', 'prior_question_had_explanation']
cat_cols = ['user_id', 'content_id', 'task_container_id']

# Independent copy restricted to exactly those columns, in that order.
wanted_cols = [*features, *cat_cols, target_col]
selected_data = working_data.loc[:, wanted_cols].copy()
selected_data

In [None]:
def preprocess(df):
    """
    Merge user, task and content performance and return df with selected features.

    The input frame is never modified: all work happens on a copy, so calling this
    function repeatedly on the same frame always starts from the same values
    (previously the rolling sums were written back into the caller's frame, so a
    second call smoothed already-smoothed data).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'prior_question_elapsed_time',
        'prior_question_had_explanation', 'user_id', 'content_id' and
        'task_container_id'. Any other columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the smoothed and rescaled time columns, the numeric
        explanation flag, and the 'user_performance', 'content_performance' and
        'task_performance' columns added.

    Notes
    -----
    NOTE(review): the module-level scaler `ss` and imputer `si` are re-fitted on
    every call, so each frame is standardised with its own statistics.
    """
    df = df.copy()

    # smooth both time columns with a centred 5-row rolling sum
    for col in ('timestamp', 'prior_question_elapsed_time'):
        df[col] = df[col].rolling(window=5, min_periods=1, center=True).sum()

    # attach the mean-correctness lookups; keys missing from a lookup get a neutral 0.5
    for lookup, key, perf_col in (
        (userGroup, 'user_id', 'user_performance'),
        (contentGroup, 'content_id', 'content_performance'),
        (taskGroup, 'task_container_id', 'task_performance'),
    ):
        df = df.merge(lookup, how='left', on=key)
        df[perf_col] = df[perf_col].fillna(0.5)

    # rescale the time values
    for col in ('timestamp', 'prior_question_elapsed_time'):
        df[col] = ss.fit_transform(df[col].values.reshape(-1, 1)).ravel()

    # True/False -> 1/0 (anything else becomes NaN), then impute the gaps with the median
    flag = 'prior_question_had_explanation'
    df[flag] = df[flag].map({True: 1, False: 0})
    df[flag] = si.fit_transform(df[flag].values.reshape(-1, 1)).ravel()

    return df

In [None]:
# Preview on a copy so this exploratory call cannot alter `selected_data`,
# which the next cell feeds through preprocess() again to build the training frame.
preprocess(selected_data.copy())

In [None]:
# The six model inputs, in the column order used to build the feature matrix.
final_features = [
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_performance',
    'content_performance',
    'task_performance',
]

# Engineered training frame: the six features followed by the label.
train_cols = [*final_features, target_col]
final_train = preprocess(selected_data)[train_cols]
final_train

# Modelling and Hyperparameter tuning with Keras Tuner

In [None]:
def build_model(hp):
    """
    Build and compile the tunable binary classifier.

    Architecture: 6 inputs -> two (Dense relu -> Dropout) stages -> 1 sigmoid unit.
    Both stages request the hyperparameters under the same names ('hidden_size',
    'dropout'); the Adam learning rate is requested as 'learning_rate'.

    Parameters
    ----------
    hp : object exposing `Int` and `Float` (the tuner's hyperparameter container).

    Returns
    -------
    A compiled `tf.keras.Model` with binary cross-entropy loss and an accuracy metric.
    """
    net_in = tf.keras.Input(shape=(6, ))
    hidden = net_in

    # two identical Dense -> Dropout stages
    for _ in range(2):
        units = hp.Int('hidden_size', 30, 100, step=10, default=50)
        hidden = tf.keras.layers.Dense(units, activation='relu')(hidden)
        drop_rate = hp.Float('dropout', 0, 0.5, step=0.1, default=0.5)
        hidden = tf.keras.layers.Dropout(drop_rate)(hidden)

    net_out = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)
    model = tf.keras.Model(net_in, net_out)

    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Random search over build_model's hyperparameters, ranked by validation accuracy:
# 5 trials, each trained 3 times.
search_config = dict(
    objective="val_accuracy",
    max_trials=5,
    executions_per_trial=3,
)
tuner = RandomSearch(build_model, **search_config)
tuner.search_space_summary()

# Cross-validation 

> I chose to use Stratified KFold cv with 5 splits. As there is a 2:1 imbalance between classes, this is my choice of cv strategy as of now.

In [None]:
# Split the training frame into the feature matrix and the label vector.
X = final_train.drop(columns=[target_col]).to_numpy()
y = final_train[target_col].to_numpy()

for _arr in (X, y):
    print(_arr.shape)

In [None]:
# Stratified 5-fold cross-validation with one hyperparameter search per fold.
cv = StratifiedKFold(n_splits=5)
params = {}  # fold index -> best hyperparameters found on that fold
models = {}  # fold index -> best trained Keras model of that fold
for i, (tr, val) in enumerate(cv.split(X, y)):
    print("===================")
    print(f"Fold: {i}")

    # A tuner stops creating trials once `max_trials` has been reached, so re-using a single
    # tuner would turn every fold after the first into a no-op that returns the fold-0 model.
    # Build a fresh tuner per fold, each in its own project so trial state is not shared.
    fold_tuner = RandomSearch(
        build_model,
        objective="val_accuracy",
        max_trials=5,
        executions_per_trial=3,
        project_name=f"cv_fold_{i}",
    )
    fold_tuner.search(X[tr], y[tr],
                      validation_data=(X[val], y[val]),
                      epochs=5,
                      callbacks=[tf.keras.callbacks.ModelCheckpoint(f"model_cv{i}.h5", save_best_only=True)])

    # the trained model goes into `models`, its hyperparameters into `params`
    # (the two assignments used to be swapped)
    models[i] = fold_tuner.get_best_models(1)[0]
    params[i] = fold_tuner.get_best_hyperparameters(1)[0]

# Prediction

In [None]:
# Competition inference API (imported here, right before its first use).
import riiideducation

# `env` receives the predictions via env.predict(); `iter_test` yields the
# (test_df, sample_prediction_df) batches consumed by the prediction loop.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Collect the trained fold models. Only objects that expose `.predict` are used, so the loop
# works whichever of the two cross-validation dicts (`models` / `params`) holds the Keras models.
fold_models = [m for m in (*models.values(), *params.values()) if hasattr(m, 'predict')]
if not fold_models:
    raise RuntimeError("No trained model found in `models` or `params`; run the cross-validation cell first.")

for test_df, sample_prediction_df in iter_test:
    # mirror the gap filling applied to the training data before feature engineering
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)

    test_df = preprocess(test_df)
    x_test = test_df[final_features].values

    # average the fold predictions; each predict() returns shape (n, 1), so flatten for the column
    y_preds = [model.predict(x_test, verbose=1) for model in fold_models]
    test_df[target_col] = (sum(y_preds) / len(y_preds)).ravel()

    # submit only the rows with content_type_id == 0
    env.predict(test_df.loc[test_df['content_type_id'] == 0,
               ['row_id', target_col]])

EOF!