In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")

## EDA

In [None]:
os.listdir('../input/riiid-test-answer-prediction')

In [None]:
# Load the competition tables from ../input/riiid-test-answer-prediction.
lectCsv = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")
exampleTestCsv = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")
# Only the first 1,000,000 rows of train.csv are read (nrows), so every statistic
# and model below is computed on that subset rather than the full file.
trainCsv = pd.read_csv("../input/riiid-test-answer-prediction/train.csv", low_memory=False, nrows=1000000)
questionsCsv = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")

In [None]:
trainCsv.head()

In [None]:
trainCsv.info()

In [None]:
trainCsv.describe()

In [None]:
trainCsv.nunique()

In [None]:
trainCsv.isnull().sum()

In [None]:
#trainCsv["prior_question_elapsed_time"] = trainCsv.groupby(["user_id", "content_id"]).transform(lambda x: x.fillna(x.mean()))
#trainCsv["prior_question_had_explanation"] = trainCsv.groupby(["user_id", "content_id"]).transform(lambda x: x.fillna(x.mean()))

In [None]:
#trainCsv.isnull().sum()

In [None]:
# Histogram of the raw timestamp column, split into 50 bins.
trainCsv['timestamp'].hist(bins = 50)


In [None]:
# Count the non-null user_answer rows per user, then tally how many users share
# each of those per-user counts; the tallies are what gets plotted.
answers_per_user = trainCsv.groupby('user_id')['user_answer'].count()
users_per_answer_count = answers_per_user.value_counts()

plt.figure(figsize=(15, 7))
# Pass the values through the explicit `x=` keyword. A positional Series is
# interpreted as `x` by old seaborn releases but as `data` by seaborn >= 0.12,
# which silently changes the plot (the warning is hidden by filterwarnings above).
ax = sns.countplot(x=users_per_answer_count, palette="hls")
plt.title("Count of answers per user", fontsize=12)
plt.xticks(rotation=90, fontsize=13)
plt.ylabel('Number of answers')
plt.xlabel('Count of users')

In [None]:
# Bar chart of how often each user_answer value occurs in the loaded rows.
plt.figure(figsize=(15, 7))
# Explicit `x=` keeps the meaning stable across seaborn versions (a positional
# Series is treated as `data` from seaborn 0.12 onwards).
ax = sns.countplot(x=trainCsv.user_answer)
# The chart shows raw value frequencies, not a per-user mean, so the title and
# axis label say so.
plt.title("Distribution of user_answer values", fontsize=12)
plt.xticks(rotation=90, fontsize=13)
plt.ylabel('Frequency')
plt.xlabel('user_answer')

## Preprocessing

In [None]:
# content_type_id is 0 for a question posed to the user and 1 for a watched
# lecture; only question rows are kept.
is_question = trainCsv.content_type_id == 0
# answered_correctly == -1 is the null marker used for lectures; drop those too.
has_label = trainCsv.answered_correctly != -1
trainCsv = trainCsv.loc[is_question & has_label]

In [None]:
# Order rows by ascending timestamp and renumber the index from 0, so that the
# later "keep last row per user" steps pick the rows with the largest timestamps.
trainCsv = trainCsv.sort_values(by='timestamp').reset_index(drop=True)

In [None]:
trainCsv.head(5)

In [None]:
# Mean of answered_correctly per content_id over all retained question rows;
# merged into the test batches in the submission loop.
content_mean_final = (
    trainCsv
    .groupby('content_id')['answered_correctly']
    .mean()
    .to_frame("answered_correctly_content_mean")
)

In [None]:
# Per-user mean, sum and row count of answered_correctly over all retained
# question rows; merged into the test batches in the submission loop.
user_mean_final = trainCsv.groupby('user_id')['answered_correctly'].agg(['mean', 'sum', 'count'])
user_mean_final = user_mean_final.set_axis(
    ["answered_correctly_user_mean", 'sum_correct', 'count'],
    axis=1,
)

In [None]:
# Mean of prior_question_elapsed_time, stored so the submission loop can use it
# to fill missing values.
elapsed_time_mean_final = trainCsv['prior_question_elapsed_time'].mean()

In [None]:
# timestamp (already used for sorting) and content_type_id (already used for
# filtering) are not needed past this point.
trainCsv = trainCsv.drop(columns=['timestamp', 'content_type_id'])

In [None]:
# Hold out the (up to) 4 most recent rows of every user as the validation set.
# Each pass peels the current last row per user off `train_csv`, so the next
# pass sees the row before it. The loop must read from `train_csv`, not from
# `trainCsv`: otherwise the same last row is collected on every pass and only a
# single row per user is ever removed from the training pool.
train_csv = trainCsv
validation_parts = []
for _ in range(4):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    validation_parts.append(last_records)
# Concatenate once at the end; DataFrame.append no longer exists in pandas 2.x.
validation = pd.concat(validation_parts)

In [None]:
# Training sample: the (up to) 15 most recent rows per user that are still in
# `train_csv`. Each pass removes the rows it collected, so `train_csv` ends up
# holding only the older history that was not sampled.
training_parts = []
for _ in range(15):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    training_parts.append(last_records)
# Single concat instead of repeated DataFrame.append (removed in pandas 2.0 and
# quadratic when called in a loop).
X = pd.concat(training_parts)

In [None]:
# Aggregate features for training/validation are computed from `train_csv`,
# i.e. the history left over after the X and validation rows were peeled off.
# Computing them from `trainCsv` would fold each sampled row's own label into
# its user/content mean (target leakage) and would make the fillna defaults
# applied after the merge unreachable.
results_c = train_csv[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean'])
results_c.columns = ["answered_correctly_content_mean"]

results_u = train_csv[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'sum', 'count'])
results_u.columns = ["answered_correctly_user_mean", 'sum_correct', 'count']

In [None]:
results_c.head()

In [None]:
results_c.shape

In [None]:
results_u.head()

In [None]:
results_u.shape

In [None]:
# Mean prior_question_elapsed_time, used below to fill missing values in the
# training and validation features.
result_time_mean = trainCsv['prior_question_elapsed_time'].mean()

In [None]:
# Attach the per-user and per-content aggregates to every training row.
# Left joins keep rows whose user/content has no aggregate (they get NaN).
X = (
    X
    .merge(results_u, how="left", on=['user_id'])
    .merge(results_c, how="left", on=['content_id'])
)

In [None]:
X.head()

In [None]:
X.info()

In [None]:
X.shape

In [None]:
# Same aggregate joins as for the training rows, applied to the validation rows.
validation = (
    validation
    .merge(results_u, how="left", on=['user_id'])
    .merge(results_c, how="left", on=['content_id'])
)

In [None]:
validation.head()

In [None]:
# Separate the label column from the features for both splits.
y, X = X['answered_correctly'], X.drop(columns=['answered_correctly'])

y_val, X_val = validation['answered_correctly'], validation.drop(columns=['answered_correctly'])

In [None]:
X.shape

In [None]:
X.head()

In [None]:
X.columns

In [None]:
from sklearn.preprocessing import LabelEncoder
lencoder = LabelEncoder()

# Defaults for values that are missing after the merges: a neutral 0.5 for the
# mean features, 0 for the counters, "no explanation" for the flag and the
# stored mean for the elapsed time.
fill_values = {
    'prior_question_had_explanation': False,
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_content_mean': 0.5,
    'sum_correct': 0,
    'count': 0,
    'prior_question_elapsed_time': result_time_mean,
}

# Frame-level fillna with a column -> value mapping. The chained
# `frame[col].fillna(..., inplace=True)` form operates on a temporary Series
# and stops updating the frame under pandas copy-on-write.
X = X.fillna(fill_values)
# The encoder is fitted on the training split only ...
X['prior_question_had_explanation_enc'] = lencoder.fit_transform(X['prior_question_had_explanation'])

X_val = X_val.fillna(fill_values)
# ... and merely applied to validation, so both splits (and the submission
# loop, which also calls lencoder.transform) share one label -> code mapping.
X_val['prior_question_had_explanation_enc'] = lencoder.transform(X_val['prior_question_had_explanation'])

In [None]:
X.head()

In [None]:
X.info()

In [None]:
# Scatter-matrix of four training features to eyeball pairwise relationships.
from pandas.plotting import scatter_matrix
attributes = ["content_id","prior_question_elapsed_time","answered_correctly_user_mean", "answered_correctly_content_mean"]
scatter_matrix(X[attributes], figsize=(12, 8))

In [None]:
#import matplotlib.pyplot as plt
#X.hist(bins=50, figsize=(20,15))
#plt.show()

In [None]:
# The six model inputs, in the column order the scaler and networks will see.
feature_cols = [
    'answered_correctly_user_mean',
    'answered_correctly_content_mean',
    'sum_correct',
    'count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]
X = X[feature_cols]
X_val = X_val[feature_cols]

In [None]:
X

In [None]:
X_val

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training features only; validation (and later the
# submission batches) are transformed with that same fitted scaler.
scaler = StandardScaler()
# NOTE(review): from here on X / X_val are the scaler outputs, presumably NumPy
# arrays -- the later `.reshape(...)` calls rely on that. Confirm if sklearn's
# output configuration is ever changed.
X = scaler.fit_transform(X)
X_val = scaler.transform(X_val)

## Model: 1D-CNN

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization,Dropout,Dense,Flatten,Conv1D
from tensorflow.keras.optimizers import Adam
from keras.metrics import BinaryAccuracy
from keras import backend as K

In [None]:
X1 = X
X_val1 = X_val

In [None]:
K.clear_session()
# Conv1D needs a trailing channel axis: (rows, features) -> (rows, features, 1).
X_train = np.expand_dims(X1, axis=-1)
X_test = np.expand_dims(X_val1, axis=-1)

# Two stacked 1D convolutions over the feature axis, dropout, then a small dense
# head ending in a sigmoid unit. (padding='causal' was tried on the second
# convolution and left disabled.)
model = Sequential([
    Conv1D(32, 2, activation='relu', input_shape=X_train[0].shape),
    Conv1D(64, 2, activation='relu'),
    Dropout(0.5),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train the CNN for 100 epochs with batches of 50,000 rows, scoring the
# validation tensor after every epoch. The returned object keeps the per-epoch
# metrics in `hist.history`, which the plotting cell below reads.
hist = model.fit(X_train, y, epochs=100, batch_size=50000,validation_data =(X_test,y_val))
# NOTE(review): this prints the returned object itself, not the metric values.
print(hist)

In [None]:
import matplotlib.pyplot as plot

# One figure per metric: training curve first, validation curve second.
for train_key, val_key, figure_title, y_label in (
    ('accuracy', 'val_accuracy', 'Regularized Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plot.plot(hist.history[train_key])
    plot.plot(hist.history[val_key])
    plot.title(figure_title)
    plot.ylabel(y_label)
    plot.xlabel('Epoch')
    plot.legend(['Train', 'Test'], loc='upper left')
    plot.grid()
    plot.show()

In [None]:
# CNN outputs for the validation tensor, plus the validation labels as an array;
# both feed the ROC-AUC computation in the next cell.
y_pred = model.predict(X_test)
y_true = np.array(y_val)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC-AUC of the CNN on the validation split (labels first, scores second).
roc_auc_score(y_true, y_pred)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Probabilities from the sigmoid output, thresholded at 0.5 to get class labels.
# Sequential.predict_classes no longer exists in current TensorFlow releases;
# this is the equivalent rule for a single sigmoid unit.
y_proba = model.predict(X_test)
y_pred = (y_proba > 0.5).astype('int32').ravel()
# scikit-learn expects (y_true, y_pred). With the arguments swapped the matrix
# is transposed and precision/recall are exchanged in the report.
print('Confusion Matrix')
print(confusion_matrix(y_val, y_pred))
print('Classification Report')
print(classification_report(y_val, y_pred))

In [None]:
# Loss and accuracy of the CNN, first on the validation tensor, then on the
# training tensor. evaluate() returns the loss followed by the compiled metrics.
x1, y1 = X_test, y_val
score = model.evaluate(x1, y1, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

xtrain1, ytrain1 = X_train, y
score = model.evaluate(xtrain1, ytrain1, verbose=0)
train_loss, train_accuracy = score[0], score[1]
print('Train loss:', train_loss)
print('Train accuracy:', train_accuracy)

## RNN

In [None]:
X3 = X
X_val3 = X_val

In [None]:
K.clear_session()
# Recurrent layers read (rows, timesteps, channels): each of the feature columns
# becomes one timestep with a single channel.
X_train = np.expand_dims(X3, axis=-1)
X_test = np.expand_dims(X_val3, axis=-1)

In [None]:
import keras
# Two 100-unit SimpleRNN layers that emit the full sequence, each followed by
# dropout, then a single-unit SimpleRNN that emits only its last state.
modelRNN = keras.models.Sequential([
    keras.layers.SimpleRNN(100, return_sequences=True, input_shape=X_train[0].shape),
    Dropout(0.5),
    keras.layers.SimpleRNN(100, return_sequences=True),
    Dropout(0.5),
    # The output unit needs a sigmoid: SimpleRNN defaults to tanh, whose [-1, 1]
    # range is not a valid probability for binary_crossentropy.
    keras.layers.SimpleRNN(1, activation='sigmoid'),
])
modelRNN.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Train the RNN for 100 epochs with batches of 50,000 rows, scoring the
# validation tensor after every epoch; per-epoch metrics end up in
# `history1.history`.
history1 = modelRNN.fit(X_train, y, epochs=100, batch_size=50000,validation_data =(X_test,y_val))
# NOTE(review): this prints the returned object itself, not the metric values.
print(history1)

In [None]:
import matplotlib.pyplot as plot

# RNN learning curves: one figure for accuracy, one for loss, each with the
# training curve drawn before the validation curve.
for train_key, val_key, figure_title, y_label in (
    ('accuracy', 'val_accuracy', 'Regularized Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plot.plot(history1.history[train_key])
    plot.plot(history1.history[val_key])
    plot.title(figure_title)
    plot.ylabel(y_label)
    plot.xlabel('Epoch')
    plot.legend(['Train', 'Test'], loc='upper left')
    plot.grid()
    plot.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Threshold the RNN outputs at 0.5 to obtain class labels
# (Sequential.predict_classes no longer exists in current TensorFlow releases).
y_proba = modelRNN.predict(X_test)
y_pred = (y_proba > 0.5).astype('int32').ravel()
# scikit-learn expects (y_true, y_pred); the swapped order transposes the
# matrix and exchanges precision and recall.
print('Confusion Matrix')
print(confusion_matrix(y_val, y_pred))
print('Classification Report')
print(classification_report(y_val, y_pred))

In [None]:
# RNN outputs for the validation tensor and the validation labels as an array,
# consumed by the ROC-AUC cell below.
y_pred1 = modelRNN.predict(X_test)
y_true1 = np.array(y_val)


In [None]:
from sklearn.metrics import roc_auc_score
# ROC-AUC of the RNN on the validation split (labels first, scores second).
roc_auc_score(y_true1, y_pred1)

In [None]:
# Loss and accuracy of the RNN on the validation tensor, then on the training
# tensor. evaluate() returns the loss followed by the compiled metrics.
x2, y2 = X_test, y_val
score = modelRNN.evaluate(x2, y2, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

xtrain2, ytrain2 = X_train, y
score = modelRNN.evaluate(xtrain2, ytrain2, verbose=0)
train_loss, train_accuracy = score[0], score[1]
print('Train loss:', train_loss)
print('Train accuracy:', train_accuracy)

## LSTM

In [None]:
# LSTM inputs: the scaled feature matrices with a trailing channel axis added,
# i.e. (rows, features) -> (rows, features, 1).
X4, X_val4 = X, X_val
K.clear_session()
X_train4 = np.expand_dims(X4, axis=-1)
X_test4 = np.expand_dims(X_val4, axis=-1)

In [None]:
# Two stacked 20-unit LSTMs followed by one sigmoid unit. The second LSTM
# returns only its final state so the network emits a single probability per
# row. A TimeDistributed linear Dense on the full sequence would instead
# produce one unbounded value per timestep, which matches neither the
# (rows,) label vector nor binary_crossentropy's probability input.
modelLSTM = keras.models.Sequential([
    keras.layers.LSTM(20, return_sequences=True, input_shape=X_train4[0].shape),
    keras.layers.LSTM(20),
    keras.layers.Dense(1, activation='sigmoid'),
])
modelLSTM.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Train the LSTM for 150 epochs on this section's own tensors (X_train4 /
# X_test4) rather than on X_train / X_test left over from the RNN section, so
# the cell does not depend on that section having been run.
history2 = modelLSTM.fit(X_train4, y, epochs=150, batch_size=50000, validation_data=(X_test4, y_val))
print(history2)

In [None]:
# LSTM outputs for this section's validation tensor (X_test4, not the RNN
# section's X_test) and the validation labels as an array.
y_pred2 = modelLSTM.predict(X_test4)
y_true2 = np.array(y_val)

In [None]:
import matplotlib.pyplot as plot

# LSTM learning curves: accuracy figure, then loss figure; training curve is
# drawn before the validation curve in each.
for train_key, val_key, figure_title, y_label in (
    ('accuracy', 'val_accuracy', 'Regularized Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plot.plot(history2.history[train_key])
    plot.plot(history2.history[val_key])
    plot.title(figure_title)
    plot.ylabel(y_label)
    plot.xlabel('Epoch')
    plot.legend(['Train', 'Test'], loc='upper left')
    plot.grid()
    plot.show()

In [None]:
# Loss and accuracy of the LSTM on its own validation and training tensors
# (X_test4 / X_train4) instead of the RNN section's leftover X_test / X_train.
x3, y3 = X_test4, y_val
score = modelLSTM.evaluate(x3, y3, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

xtrain3, ytrain3 = X_train4, y
score = modelLSTM.evaluate(xtrain3, ytrain3, verbose=0)
print('Train loss:', score[0])
print('Train accuracy:', score[1])

## Parameter Tuning

Parameter tuning is done on the 1D-CNN model because its accuracy plot shows both the train and test curves underfitting.

In [None]:
X2 = X
X_val2 = X_val

In [None]:
# MaxPooling1D is imported from the same public tensorflow.keras.layers package
# as the other layers; the legacy keras.layers.convolutional path is not
# available in newer Keras releases. The layer is not used by the model below.
from tensorflow.keras.layers import MaxPooling1D

K.clear_session()
# Add the trailing channel axis expected by Conv1D.
X_train2 = X2.reshape(X2.shape[0], X2.shape[1], 1)
X_test2 = X_val2.reshape(X_val2.shape[0], X_val2.shape[1], 1)


# Wider variant of the first CNN: larger convolutions with kernel size 3, two
# dense layers, lighter dropout (0.35) after every block and a 10x smaller
# learning rate.
modelTuning = Sequential()
modelTuning.add(Conv1D(256, 3, activation='relu', input_shape=X_train2[0].shape))
modelTuning.add(Dropout(0.35))
modelTuning.add(Conv1D(128, 3, activation='relu', padding='valid'))
modelTuning.add(Dropout(0.35))
modelTuning.add(Flatten())
modelTuning.add(Dense(128, activation='relu'))
modelTuning.add(Dropout(0.35))
modelTuning.add(Dense(64, activation='relu'))
modelTuning.add(Dropout(0.35))
modelTuning.add(Dense(1, activation='sigmoid'))

modelTuning.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Train the tuned CNN for 100 epochs with batches of 100,000 rows. Both the
# training and validation tensors come from this section (X_train2 / X_test2);
# the training input previously came from another section's X_train.
hist3 = modelTuning.fit(X_train2, y, epochs=100, batch_size=100000, validation_data=(X_test2, y_val))
print(hist3)

In [None]:
import matplotlib.pyplot as plot

# Tuned-CNN learning curves: accuracy figure, then loss figure; training curve
# is drawn before the validation curve in each.
for train_key, val_key, figure_title, y_label in (
    ('accuracy', 'val_accuracy', 'Regularized Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plot.plot(hist3.history[train_key])
    plot.plot(hist3.history[val_key])
    plot.title(figure_title)
    plot.ylabel(y_label)
    plot.xlabel('Epoch')
    plot.legend(['Train', 'Test'], loc='upper left')
    plot.grid()
    plot.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Threshold the tuned CNN's sigmoid outputs at 0.5 to obtain class labels
# (Sequential.predict_classes no longer exists in current TensorFlow releases).
# Predictions use this section's validation tensor, X_test2.
y_proba = modelTuning.predict(X_test2)
y_pred = (y_proba > 0.5).astype('int32').ravel()
# scikit-learn expects (y_true, y_pred); the swapped order transposes the
# matrix and exchanges precision and recall.
print('Confusion Matrix')
print(confusion_matrix(y_val, y_pred))
print('Classification Report')
print(classification_report(y_val, y_pred))

In [None]:
# Tuned-CNN outputs for this section's validation tensor (X_test2) and the
# validation labels as an array, consumed by the ROC-AUC cell below.
y_pred3 = modelTuning.predict(X_test2)
y_true3 = np.array(y_val)


In [None]:
from sklearn.metrics import roc_auc_score
# ROC-AUC of the tuned CNN on the validation split (labels first, scores second).
roc_auc_score(y_true3, y_pred3)

In [None]:
# Loss and accuracy of the tuned CNN on its own validation and training tensors
# (X_test2 / X_train2) instead of another section's leftover X_test / X_train.
x3, y3 = X_test2, y_val
score = modelTuning.evaluate(x3, y3, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

xtrain3, ytrain3 = X_train2, y
score = modelTuning.evaluate(xtrain3, ytrain3, verbose=0)
print('Train loss:', score[0])
print('Train accuracy:', score[1])

## Parameter Tuning LSTM

In [None]:
# Tuned-LSTM inputs: the scaled feature matrices with a trailing channel axis,
# i.e. (rows, features) -> (rows, features, 1).
X5, X_val5 = X, X_val
K.clear_session()
X_train5 = np.expand_dims(X5, axis=-1)
X_test5 = np.expand_dims(X_val5, axis=-1)

In [None]:
# Two stacked RNN layers wrapping 50-unit LSTM cells, followed by one sigmoid
# unit. The second layer returns only its final state so the network emits a
# single probability per row (a TimeDistributed linear Dense would emit one
# unbounded value per timestep, which does not fit the labels or the loss).
# The input shape is taken from this section's tensor, X_train5.
modelLSTMTuned = keras.models.Sequential([
    keras.layers.RNN(keras.layers.LSTMCell(50), return_sequences=True,
                     input_shape=X_train5[0].shape),
    keras.layers.RNN(keras.layers.LSTMCell(50)),
    keras.layers.Dense(1, activation='sigmoid'),
])
modelLSTMTuned.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Train the tuned LSTM for 350 epochs on this section's own tensors (X_train5 /
# X_test5) rather than on X_train / X_test left over from an earlier section.
history5 = modelLSTMTuned.fit(X_train5, y, epochs=350, batch_size=50000, validation_data=(X_test5, y_val))
print(history5)

In [None]:
# Tuned-LSTM outputs for this section's validation tensor (X_test5) and the
# validation labels as an array.
y_pred5 = modelLSTMTuned.predict(X_test5)
y_true5 = np.array(y_val)

In [None]:
# Tuned-LSTM learning curves: accuracy figure, then loss figure; training curve
# is drawn before the validation curve in each.
for train_key, val_key, figure_title, y_label in (
    ('accuracy', 'val_accuracy', 'Regularized Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
):
    plot.plot(history5.history[train_key])
    plot.plot(history5.history[val_key])
    plot.title(figure_title)
    plot.ylabel(y_label)
    plot.xlabel('Epoch')
    plot.legend(['Train', 'Test'], loc='upper left')
    plot.grid()
    plot.show()

In [None]:
# Loss and accuracy of the tuned LSTM on its own validation and training
# tensors (X_test5 / X_train5) instead of an earlier section's X_test / X_train.
x5, y5 = X_test5, y_val
score = modelLSTMTuned.evaluate(x5, y5, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

xtrain5, ytrain5 = X_train5, y
score = modelLSTMTuned.evaluate(xtrain5, ytrain5, verbose=0)
print('Train loss:', score[0])
print('Train accuracy:', score[1])

In [None]:
#y_proba5 = modelLSTMTuned.predict(X_test) 
#y_pred5 = modelLSTMTuned.predict_classes(X_test)
#print('Confusion Matrix')
#print(confusion_matrix(y_pred5, y5))
#print('Classification Report')
#print(classification_report(y_pred5, y5))

## Submission

In [None]:
import riiideducation
env = riiideducation.make_env()

In [None]:
iter_test = env.iter_test()

In [None]:
# Model inputs in the same column order that the scaler was fitted with.
submission_features = ['answered_correctly_user_mean', 'answered_correctly_content_mean', 'sum_correct', 'count',
                       'prior_question_elapsed_time', 'prior_question_had_explanation_enc']

# For every test batch: attach the stored aggregates, impute, encode, scale,
# predict with the first CNN (`model`) and submit the question rows.
for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.merge(test_df, user_mean_final, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, content_mean_final, on=['content_id'],  how="left")

    # Frame-level fillna with a column -> value mapping; the chained
    # `frame[col].fillna(..., inplace=True)` form operates on a temporary
    # Series and stops updating the frame under pandas copy-on-write.
    test_df = test_df.fillna({
        'answered_correctly_user_mean': 0.5,
        'answered_correctly_content_mean': 0.5,
        'sum_correct': 0,
        'count': 0,
        'prior_question_elapsed_time': elapsed_time_mean_final,
        'prior_question_had_explanation': False,
    })
    test_df["prior_question_had_explanation_enc"] = lencoder.transform(test_df["prior_question_had_explanation"])

    # Apply (do not refit) the scaler. A dedicated name is used so the
    # notebook-level training matrix `X` is not overwritten by a test batch.
    X_submit = scaler.transform(test_df[submission_features])
    # predict() returns one column per output unit; flatten it to one value per row.
    batch_pred = model.predict(X_submit.reshape(X_submit.shape[0], X_submit.shape[1], 1))
    test_df['answered_correctly'] = batch_pred.ravel()

    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])