My basic EDA for this competition is available here: https://www.kaggle.com/yaroslavmavliutov/riiid-answer-correctness-prediction-basic-eda

## Import necessary libraries

In [None]:
# Install the datatable package from a wheel file stored under ../input
# (a local path, so no package index is contacted for this wheel).
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import pandas as pd
import datatable as dt
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

import riiideducation

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization,Dropout,Dense,Flatten,Conv1D
from tensorflow.keras.optimizers import Adam
from keras.metrics import BinaryAccuracy
from keras import backend as K

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/Keras.
warnings.filterwarnings("ignore")

### Check data available

We have 4 datasets at our disposal

In [None]:
# Show the files shipped in the competition input directory.
os.listdir('../input/riiid-test-answer-prediction')

In [None]:
# Load the competition tables. Each file is read exactly once
# (example_test.csv used to be read twice).
lectures_csv = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")
questions_csv = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")
example_test_csv = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")
# train.csv is parsed with datatable's fread and then converted to pandas,
# instead of pd.read_csv (presumably for speed on this large file).
train_csv = dt.fread("../input/riiid-test-answer-prediction/train.csv").to_pandas()

## Pre-Processing

In [None]:
# content_type_id is 0 when the event is a question posed to the user and 1
# when the user watched a lecture; answered_correctly is -1 (i.e. null) on
# lecture rows. Keep only the answered question rows, using one combined mask
# so no intermediate boolean arrays stay bound to notebook names.
train_csv = train_csv[
    (train_csv.content_type_id == 0) & (train_csv.answered_correctly != -1)
]

In [None]:
# Order all interactions by timestamp and renumber the rows 0..n-1; the
# train/validation split below relies on this order and on the fresh index.
train_csv = train_csv.sort_values(by='timestamp')
train_csv = train_csv.reset_index(drop=True)

In [None]:
# Preview the first five rows (head() defaults to n=5).
train_csv.head()

In [None]:
# Mean of answered_correctly per content_id over all question rows;
# merged into the test batches at inference time.
content_mean_final = (
    train_csv.groupby('content_id')['answered_correctly']
    .mean()
    .to_frame('answered_correctly_content_mean')
)

In [None]:
# Per-user statistics of answered_correctly over all question rows
# (mean, number of correct answers, number of answers); merged into the
# test batches at inference time.
user_mean_final = train_csv.groupby('user_id')['answered_correctly'].agg(
    answered_correctly_user_mean='mean',
    sum_correct='sum',
    count='count',
)

In [None]:
#saving value to fillna
elapsed_time_mean_final = train_csv.prior_question_elapsed_time.mean()

In [None]:
# timestamp (rows are already sorted by it) and content_type_id (only
# question rows remain) are not used as features below.
train_csv = train_csv.drop(columns=['timestamp', 'content_type_id'])

## Validation/Train datasets

In [None]:
# Hold out the 4 most recent interactions of every user as the validation set.
# Each round takes the last remaining row per user (keep='last' on the
# timestamp-sorted frame) and removes those rows from train_csv.
# The per-round frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
validation_rounds = []
for _ in range(4):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    validation_rounds.append(last_records)
validation = pd.concat(validation_rounds)
del validation_rounds

In [None]:
# Take the next 15 most recent interactions of every user as the training
# rows X. Each round takes the last remaining row per user (keep='last' on the
# timestamp-sorted frame) and removes those rows from train_csv; what is left
# in train_csv afterwards is only used to compute aggregate features.
# The per-round frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
train_rounds = []
for _ in range(15):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    train_rounds.append(last_records)
X = pd.concat(train_rounds)
del train_rounds

In [None]:
# Aggregate features computed only from the rows still left in train_csv.
# The expressions are chained so that no groupby object stays bound to a
# notebook name and keeps train_csv alive after it is deleted.

# Mean correctness per question.
results_c = (
    train_csv.groupby('content_id')['answered_correctly']
    .mean()
    .to_frame('answered_correctly_content_mean')
)

# Mean correctness, number of correct answers and number of answers per user.
results_u = (
    train_csv.groupby('user_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
    .rename(columns={'mean': 'answered_correctly_user_mean', 'sum': 'sum_correct'})
)

In [None]:
# Mean prior_question_elapsed_time over the remaining train_csv rows; used to
# fill missing values in X and X_val.
result_time_mean = train_csv['prior_question_elapsed_time'].mean()

In [None]:
#clearing memory
del(train_csv)

In [None]:
# Attach the per-user and per-question aggregates to the training rows.
# Left joins keep every row of X; unmatched users/questions yield NaN.
X = X.merge(results_u, how='left', on='user_id')
X = X.merge(results_c, how='left', on='content_id')

In [None]:
# Attach the same per-user and per-question aggregates to the validation rows.
# Left joins keep every validation row; unmatched keys yield NaN.
validation = (
    validation
    .merge(results_u, how='left', on='user_id')
    .merge(results_c, how='left', on='content_id')
)

In [None]:
# Separate the target column from the features.
# pop() removes the column from X and returns it as the training target.
y = X.pop('answered_correctly')

# validation itself keeps its target column; X_val is a copy without it.
X_val = validation.drop(columns='answered_correctly')
y_val = validation['answered_correctly']

In [None]:
# Display the columns currently present in the training frame.
X.columns

In [None]:
def _fill_and_encode(frame, encoder, elapsed_time_fill, fit):
    """Fill missing feature values and label-encode the explanation flag.

    Args:
        frame: DataFrame holding the raw feature columns.
        encoder: LabelEncoder used for prior_question_had_explanation.
        elapsed_time_fill: value substituted for a missing
            prior_question_elapsed_time.
        fit: when True the encoder is fitted on this frame; otherwise the
            already-fitted encoder is only applied.

    Returns:
        A new DataFrame with the gaps filled and the extra column
        prior_question_had_explanation_enc.
    """
    # A single non-inplace fillna: chained `df[col].fillna(..., inplace=True)`
    # is deprecated and does not modify `df` under pandas Copy-on-Write.
    frame = frame.fillna({
        'prior_question_had_explanation': False,
        # 0.5 is used as the value for an unseen user / question.
        'answered_correctly_user_mean': 0.5,
        'answered_correctly_content_mean': 0.5,
        'sum_correct': 0,
        'count': 0,
        'prior_question_elapsed_time': elapsed_time_fill,
    })
    explained = frame['prior_question_had_explanation']
    if fit:
        encoded = encoder.fit_transform(explained)
    else:
        encoded = encoder.transform(explained)
    frame['prior_question_had_explanation_enc'] = encoded
    return frame


lencoder = LabelEncoder()

# The encoder is fitted on the training rows only and merely applied to the
# validation rows, so both share one mapping (previously it was re-fitted on
# the validation data).
X = _fill_and_encode(X, lencoder, result_time_mean, fit=True)
X_val = _fill_and_encode(X_val, lencoder, result_time_mean, fit=False)

In [None]:
# Model inputs, in the order the scaler and the network will see them.
feature_columns = [
    'answered_correctly_user_mean',
    'answered_correctly_content_mean',
    'sum_correct',
    'count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]
X = X[feature_columns]
X_val = X_val[feature_columns]

In [None]:
# Standardise the features. The scaler is fitted on the training rows only
# and then applied unchanged to both training and validation rows.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_val = scaler.transform(X_val)

### CNN

In [None]:
# Start from a clean Keras session before building the network.
K.clear_session()

# Append a trailing axis of length 1 so every sample has shape
# (n_features, 1), the 3-D layout fed to the Conv1D layers.
X_train = X.reshape(X.shape + (1,))
X_test = X_val.reshape(X_val.shape + (1,))

# Two 1-D convolutions over the feature axis, then a small dense head with a
# sigmoid output.
model = Sequential([
    Conv1D(32, 2, activation='relu', input_shape=X_train.shape[1:]),
    Conv1D(64, 2, activation='relu', padding='causal'),
    Dropout(0.1),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.05),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train on the held-in rows only; the validation rows are scored separately
# below. verbose=2 prints one line per epoch.
history = model.fit(
    x=X_train,
    y=y,
    batch_size=50000,
    epochs=35,
    verbose=2,
)

In [None]:
# Validation labels as a plain NumPy array, and the model's predicted
# probabilities for the validation rows.
y_true = y_val.to_numpy()
y_pred = model.predict(X_test)

In [None]:
# ROC AUC of the predicted probabilities on the held-out validation rows.
roc_auc_score(y_true=y_true, y_score=y_pred)

## Prediction

In [None]:
# Create the riiideducation environment object; it supplies the test batches
# (env.iter_test) and receives the predictions (env.predict) below.
env = riiideducation.make_env()

In [None]:
# Iterable of test batches; the inference loop below unpacks each item as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

In [None]:
# Score every batch served by the environment and submit the predictions.
# The aggregates computed on the full training data (user_mean_final,
# content_mean_final, elapsed_time_mean_final) are used here, together with
# the encoder, scaler and model fitted above; nothing is re-fitted.
inference_features = [
    'answered_correctly_user_mean',
    'answered_correctly_content_mean',
    'sum_correct',
    'count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]
inference_fill_values = {
    # 0.5 is used as the value for an unseen user / question.
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_content_mean': 0.5,
    'sum_correct': 0,
    'count': 0,
    'prior_question_elapsed_time': elapsed_time_mean_final,
    'prior_question_had_explanation': False,
}

for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(user_mean_final, on='user_id', how='left')
    test_df = test_df.merge(content_mean_final, on='content_id', how='left')

    # One non-inplace fillna: chained `df[col].fillna(..., inplace=True)` is
    # deprecated and does not modify `df` under pandas Copy-on-Write.
    test_df = test_df.fillna(inference_fill_values)
    test_df['prior_question_had_explanation_enc'] = lencoder.transform(
        test_df['prior_question_had_explanation']
    )

    # Apply the already-fitted scaler and add the trailing channel axis the
    # CNN was trained with. A dedicated name is used so the notebook-level X
    # is not overwritten.
    batch_features = scaler.transform(test_df[inference_features])
    batch_features = batch_features.reshape(batch_features.shape + (1,))

    # model.predict returns an (n, 1) array; flatten it to one value per row
    # before storing it as a column.
    test_df['answered_correctly'] = model.predict(batch_features).ravel()

    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])