In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.metrics import AUC

In [None]:
# Lookup tables that preprocess() joins on content_id, task_container_id and
# user_id respectively.
piv1 = pd.read_csv("../input/riiid-fixed-infos/content.csv")
piv2 = pd.read_csv("../input/riiid-fixed-infos/task.csv")
piv3 = pd.read_csv("../input/riiid-fixed-infos/user.csv")

# Min-max scale the "*_sum" column of each lookup table in place.
for df, col in ((piv1, "content_sum"), (piv2, "task_container_sum"), (piv3, "user_sum")):
    lowest = df[col].min()
    df[col] = (df[col] - lowest) / (df[col].max() - lowest)

# Medians of the scaled columns; preprocess() substitutes them for keys that
# have no match in the corresponding lookup table.
m1, m2, m3 = (
    piv1["content_sum"].median(),
    piv2["task_container_sum"].median(),
    piv3["user_sum"].median(),
)


# OTHER CONSTANTS
data_path = "../input/riiid-test-answer-prediction/train.csv"
TARGET = "answered_correctly"
# prior_question_elapsed_time handling in preprocess(): TIME_MEAN replaces
# missing values, TIME_MIN / TIME_MAX define the linear rescaling.
TIME_MEAN = 21000.0
TIME_MIN, TIME_MAX = 0.0, 300000.0
# Encoding applied to prior_question_had_explanation.
map_prior = {True: 1, False: 0}
# Keeps the denominator of the "score" feature away from zero.
epsilon = 1e-6
# Feature columns, in the order they are fed to the model.
FE = [
    "content_emb", "content_sum",
    "task_container_emb", "task_container_sum",
    "user_emb", "user_sum",
    "duration", "prior_answer", "score",
]

In [None]:
def preprocess(df):
    """Join the lookup tables onto ``df`` and derive the model features.

    The content, task-container and user tables (``piv1``, ``piv2``,
    ``piv3``) are left-joined in that order. After each join the matching
    ``*_emb`` column has its missing values replaced by 0.5 and the
    ``*_sum`` column by the table's median (``m1``, ``m2``, ``m3``).
    Three derived columns are then added:

    * ``duration``: ``prior_question_elapsed_time`` (missing -> ``TIME_MEAN``)
      rescaled linearly with ``TIME_MIN`` / ``TIME_MAX``.
    * ``prior_answer``: ``prior_question_had_explanation`` mapped through
      ``map_prior``, with unmapped / missing values set to 0.
    * ``score``: ``2*c*u / (c + u + epsilon)`` where ``c`` is ``content_emb``
      and ``u`` is ``user_emb``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``content_id``, ``task_container_id``, ``user_id``,
        ``prior_question_elapsed_time`` and ``prior_question_had_explanation``.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the feature columns added.
    """
    lookups = (
        (piv1, "content_id", "content", m1),
        (piv2, "task_container_id", "task_container", m2),
        (piv3, "user_id", "user", m3),
    )
    for table, key, prefix, sum_default in lookups:
        df = df.merge(table, how="left", on=key)
        emb_col, sum_col = prefix + "_emb", prefix + "_sum"
        df[emb_col] = df[emb_col].fillna(0.5)
        df[sum_col] = df[sum_col].fillna(sum_default)

    # The filled elapsed time is written back and also reused for "duration".
    elapsed = df["prior_question_elapsed_time"].fillna(TIME_MEAN)
    df["prior_question_elapsed_time"] = elapsed
    df["duration"] = (elapsed - TIME_MIN) / (TIME_MAX - TIME_MIN)

    df["prior_answer"] = df["prior_question_had_explanation"].map(map_prior).fillna(0)

    content, user = df["content_emb"], df["user_emb"]
    df["score"] = 2*content*user / (content + user + epsilon)
    return df

In [None]:
%%time
#tr = pd.read_csv("../input/riiid-test-answer-prediction/train.csv", low_memory=False, nrows=10**7)

# Count the lines in the training CSV (the total includes the header line).
# Summing over the file iterator also yields 0 for an empty file, whereas
# relying on a `for` loop target would leave `rows` unbound in that case.
with open(data_path) as fp:
    rows = sum(1 for _ in fp)
rows

In [None]:
%%time

import tensorflow as tf
from keras import backend as k

# Train the network incrementally: the training CSV is streamed in chunks,
# each chunk is preprocessed, and the same model keeps being fitted on it.
data_batch_size = 5*10**6   # rows read from the CSV per chunk
train_batch_size = 50_000   # mini-batch size passed to model.fit

batch = pd.read_csv(data_path, chunksize=data_batch_size)
for idx, ds in enumerate(batch):
    print('-'*20)
    print("Batch: {}".format(idx))
    print('-'*20)
    ds = preprocess(ds)
    # Keep only rows whose target is not -1 (presumably rows without a real
    # answer, e.g. lectures - TODO confirm against the dataset description).
    X = ds.loc[ds.answered_correctly!=-1, FE].values
    Y = ds.loc[ds.answered_correctly!=-1, TARGET].values
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cvscores = []
    input_dim = X.shape[1]
    # The preprocessed frame is no longer needed once X and Y are extracted.
    del ds
    if idx==0:
        # First chunk: build a fully connected network ending in a single
        # sigmoid unit.
        model = Sequential()
        model.add(Dense(60, input_dim=input_dim, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(180, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  ##AUC()])
    else:
        # Later chunks: continue from the model saved at the end of the
        # previous chunk.
        model = keras.models.load_model('riiid_model.h5')
    # NOTE(review): the same model instance is fitted in every fold without
    # being re-initialised, so validation rows of one fold are training rows
    # of the others. The printed scores track progress but are not
    # independent cross-validation estimates.
    for train, val in kfold.split(X, Y):
        # Fit the model on this fold's training indices
        model.fit(X[train], Y[train], epochs=5, batch_size=train_batch_size, verbose=0)
        # Evaluate on the held-out indices; index 1 is the metric reported
        # under model.metrics_names[1] (index 0 would be the loss).
        scores = model.evaluate(X[val], Y[val], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    # Mean and standard deviation of the fold scores for this chunk.
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
    # Persist the model so the next chunk can reload it.
    model.save('riiid_model.h5')
    del X, Y

In [None]:
import riiideducation

# Create the competition environment and the iterator over test batches;
# the inference loop below unpacks each item as (test_df, sample_prediction_df).
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Score every test batch and hand the predictions back to the environment.
for test_df, sample_prediction_df in iter_test:
    test_df = preprocess(test_df)
    # The network has a single output unit, so take column 0 of the result.
    test_df['answered_correctly'] = model.predict(
        test_df[FE].values, batch_size=50_000, verbose=0
    )[:, 0]
    # Only rows with content_type_id == 0 are submitted.
    submit_mask = test_df['content_type_id'] == 0
    env.predict(test_df.loc[submit_mask, ['row_id', 'answered_correctly']])