In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
# neural nets
import tensorflow as tf
import tensorflow.keras.models as M
import tensorflow.keras.layers as L
# riiid
import riiideducation

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Garbage collect 
### Just run me every now and then to avoid overuse of resources!! 

In [None]:
# Force a garbage-collection pass; the returned count is shown as the cell output.
gc.collect()

# Globals

In [None]:
# Root folder of the competition data; every file path below is derived from it.
INPUT_DIR = '/kaggle/input/riiid-test-answer-prediction/'
TRAIN_FILE, TEST_FILE, QUES_FILE, LEC_FILE = (
    os.path.join(INPUT_DIR, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'questions.csv', 'lectures.csv')
)

# Read Dataset

## Training Set

In [None]:
# Explicit, narrow dtypes for the loaded columns (presumably to limit memory use).
# 'boolean' is the nullable pandas dtype, so missing flags stay missing.
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Columns are selected by position; the indices must line up with the names above.
tr = pd.read_csv(TRAIN_FILE, usecols=[1, 2, 3, 4, 7, 8, 9], dtype=train_dtypes)

tr.head()

## Convert to Pickle

In [None]:
def ds_to_pickle(ds, ds_file, pkl_file):
    """Pickle a dataset, report source vs. pickle file sizes, and reload it.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataset to persist (anything exposing ``to_pickle`` works).
    ds_file : str
        Path of the file ``ds`` was loaded from; only its on-disk size is read.
    pkl_file : str
        Destination path of the pickle file.

    Returns
    -------
    pandas.DataFrame
        The dataset read back from ``pkl_file``.
    """
    ds.to_pickle(pkl_file)
    print("Saving to pkl file to save some space and time, take a look at some stats:")
    # Sizes are reported in megabytes (bytes * 1e-6), labelled with the real file names.
    print(os.path.basename(ds_file) + ":", os.stat(ds_file).st_size * 1e-6)
    print(os.path.basename(pkl_file) + ":", os.stat(pkl_file).st_size * 1e-6)
    # Reload the file that was just written rather than a hard-coded name, so the
    # function is correct for any destination path.
    return pd.read_pickle(pkl_file)
    
    

In [None]:
# Persist the training frame to 'tr.pkl' and rebind tr to the value the helper returns.
tr = ds_to_pickle(tr, TRAIN_FILE, 'tr.pkl')

In [None]:
# Summarise the column dtypes and memory footprint of the training frame.
tr.info()

# EDA

## Num Users

In [None]:
# Collect the distinct user ids once and derive the count from that list.
unique_user_ids = list(tr.user_id.unique())
total_num_users = len(unique_user_ids)
print('Total num users:', total_num_users)
print('Sneak peek of the list of all unique user ids... \
        \nMin:', min(unique_user_ids), '\nMax:', max(unique_user_ids),
        '\nFirst 10 user IDs:', unique_user_ids[:10])

## Total number of Questions

In [None]:
# Distinct content ids among rows with content_type_id == 0 (treated here as questions).
unique_ques = list(tr.loc[tr.content_type_id == 0].content_id.unique())
total_num_ques = len(unique_ques)
print('Total num ques:',total_num_ques)
print('Sneak peek of unique ques:', '\nMin:',min(unique_ques), '\nMax:',max(unique_ques),\
       '\nFirst 10 Ques:',unique_ques[:10])

# Reduce the dataset by dropping user_ids that answered very few questions

## How many questions did each user answer?

In [None]:
# Per-user count of question rows (content_type_id == 0).
# value_counts() carries each user_id as the index of its own count, so turning
# that index into a column keeps every count attached to the right user.
# (Pairing unique() with value_counts() positionally would mismatch them, because
# the former is in order of appearance and the latter is sorted by count.)
num_ques_per_user = (
    tr.loc[tr.content_type_id == 0]
    .user_id
    .value_counts()
    .rename_axis("user_id")
    .reset_index(name="num_ques_answered")
)
# Single-column frame of the counts, sorted ascending.
num_ques_answered = num_ques_per_user.sort_values('num_ques_answered')['num_ques_answered'].\
                                                    to_frame(name='num_ques_answered')


In [None]:
# Show the smallest and largest per-user question counts.
print(num_ques_answered.min(), num_ques_answered.max())

- The number of questions answered per user ranges from 1 to 17609.
  - Keep only the `user_id`s that answered more than 100 questions.

## Remove users with 100 or fewer answered questions from tr

In [None]:
def remove_user_by_num_ques_ans(num_ques_ans_thresh=100, tr=None):
    """Keep only the rows of users who answered more than a threshold of questions.

    Reads the module-level ``num_ques_per_user`` frame (columns ``user_id`` and
    ``num_ques_answered``) to decide which users qualify.

    Parameters
    ----------
    num_ques_ans_thresh : int
        Users must have strictly more than this many answered questions to be kept.
    tr : pandas.DataFrame
        Frame with a ``user_id`` column to filter. Required despite the ``None`` default.

    Returns
    -------
    tuple
        ``(per_user_counts, filtered_rows)``: the qualifying rows of
        ``num_ques_per_user`` with the count column renamed to
        ``'num_ques_answered_gt<thresh>'``, and the rows of ``tr`` belonging to
        those users.

    Raises
    ------
    ValueError
        If ``tr`` is not supplied.
    """
    if tr is None:
        raise ValueError("tr must be a DataFrame with a 'user_id' column")

    renamed_col = 'num_ques_answered_gt' + str(num_ques_ans_thresh)
    num_ques_per_user_gt_thresh = (
        num_ques_per_user
        .loc[num_ques_per_user.num_ques_answered > num_ques_ans_thresh]
        .rename(columns={'num_ques_answered': renamed_col})
    )
    # Boolean-mask selection: only rows whose user_id is among the qualifying users.
    new_tr = tr[tr['user_id'].isin(num_ques_per_user_gt_thresh['user_id'])]
    print(new_tr)
    return num_ques_per_user_gt_thresh, new_tr

In [None]:
# Filter with a threshold of 100: returns the qualifying per-user counts and the filtered rows.
num_ques_answered_gt_100, tr_user_ques_gt_100 = remove_user_by_num_ques_ans(100, tr=tr)

In [None]:
# Compare row counts before and after filtering.
new_num_rows = len(tr_user_ques_gt_100.index)
old_num_rows = len(tr.index)
# Share of the original rows that survived the filter, in percent.
pct_kept = new_num_rows*100/old_num_rows

print('Old rows:', old_num_rows, '\nNew rows:', new_num_rows, \
      '\nReduced to:', pct_kept,'% of original dataset size')
# Report the measured reduction rather than a hard-coded figure, so the message
# stays correct when the threshold or the data changes.
print(f"That's a {100 - pct_kept:.0f}% reduction, YAY!")

In [None]:
# Persist the filtered frame to a pickle file.
tr_user_ques_gt_100.to_pickle('tr_user_ans_gt_100_ques.pkl')

In [None]:
# Summary of the unfiltered frame, for comparison with the filtered one.
tr.info()

In [None]:
# Summary of the filtered frame.
tr_user_ques_gt_100.info()

In [None]:
# Unbind the name of the unfiltered frame; the filtered frame is bound to it afterwards.
del tr

In [None]:
# From here on, tr refers to the filtered frame (same object as tr_user_ques_gt_100).
tr = tr_user_ques_gt_100

# Pre-process Dataset

In [None]:
%%time
# Rows with answered_correctly == -1 are excluded from both averages.
answered_rows = tr.loc[tr.answered_correctly != -1]

# Mean of answered_correctly per content_id, exposed as the "content_emb" column.
piv1 = (
    answered_rows.groupby("content_id")["answered_correctly"]
    .mean()
    .rename("content_emb")
    .reset_index()
)

# Mean of answered_correctly per user_id, exposed as the "user_emb" column.
piv3 = (
    answered_rows.groupby("user_id")["answered_correctly"]
    .mean()
    .rename("user_emb")
    .reset_index()
)

# The filtered copy is only needed for the two aggregations above.
del answered_rows

In [None]:
# NOTE: despite its name, TIME_MEAN holds the *median* of prior_question_elapsed_time;
# preprocess() uses it as the fill value for missing elapsed times.
TIME_MEAN = tr.prior_question_elapsed_time.median()
# Min and max are used by preprocess() to rescale elapsed time into "duration".
TIME_MIN = tr.prior_question_elapsed_time.min()
TIME_MAX = tr.prior_question_elapsed_time.max()
# Printed in the order: median, max, min.
print(TIME_MEAN,TIME_MAX, TIME_MIN)
# Lookup that turns boolean flags into 1 / 0.
map_prior = {True:1, False:0}

In [None]:
def preprocess(df):
    """Attach the model features to a frame of interactions.

    Uses the module-level lookup tables ``piv1`` / ``piv3`` and the constants
    ``TIME_MEAN``, ``TIME_MIN``, ``TIME_MAX`` and ``map_prior``.

    Adds the columns ``content_emb``, ``user_emb``, ``duration`` and
    ``prior_answer``, fills missing ``prior_question_elapsed_time`` values, and
    returns the resulting frame (the left merges produce a new frame).
    """
    out = df
    # Left-join each lookup table; keys absent from the table get a neutral 0.5.
    lookups = (
        (piv1, "content_id", "content_emb"),
        (piv3, "user_id", "user_emb"),
    )
    for table, key, feature in lookups:
        out = out.merge(table, how="left", on=key)
        out[feature] = out[feature].fillna(0.5)

    # Fill missing elapsed times, then min-max scale them into "duration".
    elapsed = out["prior_question_elapsed_time"].fillna(TIME_MEAN)
    out["prior_question_elapsed_time"] = elapsed
    out["duration"] = (elapsed - TIME_MIN) / (TIME_MAX - TIME_MIN)

    # Map the explanation flag through map_prior; unmapped/missing values become 0.5.
    out["prior_answer"] = out["prior_question_had_explanation"].map(map_prior).fillna(0.5)
    return out

In [None]:
%%time
# Build the feature-augmented frame from the filtered training data.
tr_preprocessed = preprocess(tr)

In [None]:
# Feature columns fed to the model (all produced by preprocess()).
FE = ["content_emb",  "user_emb", "duration", "prior_answer"]
# Label column.
TARGET = "answered_correctly"

In [None]:
# Rows with answered_correctly == -1 carry no label and are excluded from training.
has_label = tr_preprocessed.answered_correctly != -1
x = tr_preprocessed.loc[has_label, FE].values
y = tr_preprocessed.loc[has_label, TARGET].values

# Build and Train a Model

In [None]:
# detect and init the TPU
# The three calls are order-dependent: resolve the cluster, connect to it,
# then initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
# instantiate a distribution strategy
# NOTE(review): this uses the `experimental` namespace; newer TensorFlow releases
# may expose the same class as tf.distribute.TPUStrategy — confirm against the
# installed version.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
def make_ann(n_in):
    """Build and compile the feed-forward classifier under the TPU strategy.

    Parameters
    ----------
    n_in : int
        Number of input features.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model named "ANN": two 100-unit ReLU layers followed by a
        single sigmoid output, trained with binary cross-entropy and Adam.
    """
    # The strategy scope has to be active while the layers are *created* and the
    # model is compiled. Wrapping only the `def` statement in the scope is not
    # enough: the body runs when make_ann() is called, which happens outside any
    # scope. Entering the scope here guarantees it for every call.
    with tpu_strategy.scope():
        inp = L.Input(shape=(n_in,), name="inp")
        d1 = L.Dense(100, activation="relu", name="d1")(inp)
        d2 = L.Dense(100, activation="relu", name="d2")(d1)
        preds = L.Dense(1, activation="sigmoid", name="preds")(d2)

        model = M.Model(inp, preds, name="ANN")
        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


In [None]:
# One input unit per feature column of x.
net = make_ann(x.shape[1])
# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
net.summary()

In [None]:
# Train for a single epoch with a 20% validation split and batches of 30,000 rows.
net.fit(x, y, validation_split=0.2, batch_size=30_000, epochs=1)

In [None]:
# Save the trained model in HDF5 format.
# NOTE(review): the file name mentions 50 epochs, but the fit call in this notebook
# uses epochs=1 — confirm which is intended.
net.save('min_100_ques_50_epochs.h5')