## Cài đặt các thư viện cần thiết

In [None]:
# Install datatable 0.11.0 from the wheel file stored under ../input.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import pandas as pd
import datatable as dt
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

import riiideducation

from sklearn.metrics import roc_auc_score,mean_squared_error,roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization,Dropout,Dense,Flatten,Conv1D, MaxPooling1D,LSTM,GRU
from tensorflow.keras.optimizers import Adam
from keras.metrics import BinaryAccuracy,MeanSquaredError
from keras import backend as K

In [None]:
import warnings
# Suppress every warning from here on. NOTE(review): this also hides
# deprecation notices raised by pandas / TensorFlow in the cells below.
warnings.filterwarnings("ignore")

### Kiểm tra dữ liệu đang có

In [None]:
# Show the entries of the competition input directory that the next cell reads from.
os.listdir('../input/riiid-test-answer-prediction')

In [None]:
# Training interactions: only the first 10**6 rows are read, through pandas.
train_csv = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    nrows=10**6,
    low_memory=False,
)

# The remaining tables are read with datatable and converted to pandas frames.
questions_csv = dt.fread(
    "../input/riiid-test-answer-prediction/questions.csv"
).to_pandas()
lectures_csv = dt.fread(
    "../input/riiid-test-answer-prediction/lectures.csv"
).to_pandas()
example_test_csv = dt.fread(
    "../input/riiid-test-answer-prediction/example_test.csv"
).to_pandas()

## Pre-Processing

In [None]:
# content_type_id is 0 when the event is a question posed to the user and 1
# when the user is watching a lecture; answered_correctly is -1 on lecture
# rows. Only question rows are kept.
train_csv = train_csv[
    (train_csv.content_type_id == 0) & (train_csv.answered_correctly != -1)
]

In [None]:
# Order rows by ascending timestamp and renumber the index from 0, so that
# "last row per user" in the later cells means the row with the largest timestamp.
train_csv = train_csv.sort_values(by='timestamp')
train_csv = train_csv.reset_index(drop=True)

In [None]:
# Preview the first five rows (head() defaults to n=5).
train_csv.head()

In [None]:
# Mean of answered_correctly per content_id over all loaded question rows,
# as a one-column frame indexed by content_id. Used at inference time.
content_mean_final = (
    train_csv.groupby('content_id')['answered_correctly']
    .mean()
    .to_frame(name="answered_correctly_content_mean")
)

In [None]:
# Per-user statistics of answered_correctly over all loaded question rows
# (mean, number correct, number answered), indexed by user_id.
# Used at inference time.
user_mean_final = train_csv.groupby('user_id')['answered_correctly'].agg(
    answered_correctly_user_mean='mean',
    sum_correct='sum',
    count='count',
)

In [None]:
# Fallback value used with fillna for missing prior_question_elapsed_time
# at inference time: the mean over all loaded question rows.
elapsed_time_mean_final = train_csv['prior_question_elapsed_time'].mean()

In [None]:
# Both columns have served their purpose (ordering and row filtering) and are
# not used as features below.
train_csv = train_csv.drop(columns=['timestamp', 'content_type_id'])

## Validation/Train datasets

In [None]:
# Validation set: in each of 4 rounds, take the last remaining row of every
# user and remove those rows from train_csv. A user therefore contributes up
# to 4 of their most recent rows.
#
# The per-round frames are collected and concatenated once. DataFrame.append
# was deprecated and then removed in pandas 2.0, and it re-copied the whole
# accumulated frame on every call.
validation_parts = []
for round_idx in range(4):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    validation_parts.append(last_records)
validation = pd.concat(validation_parts)

In [None]:
# Training sample: in each of 15 rounds, take the last row of every user that
# is still present in train_csv and remove those rows from it. Whatever stays
# in train_csv afterwards is the "history" used for the aggregate features.
#
# The per-round frames are collected and concatenated once. DataFrame.append
# was deprecated and then removed in pandas 2.0, and it re-copied the whole
# accumulated frame on every call.
train_parts = []
for round_idx in range(15):
    last_records = train_csv.drop_duplicates('user_id', keep='last')
    train_csv = train_csv[~train_csv.index.isin(last_records.index)]
    train_parts.append(last_records)
X = pd.concat(train_parts)

In [None]:
# Aggregates computed only on the rows left in train_csv after the
# validation / training samples were removed from it.

# Mean correctness per content_id, indexed by content_id.
results_c = (
    train_csv.groupby('content_id')['answered_correctly']
    .mean()
    .to_frame(name="answered_correctly_content_mean")
)

# Mean correctness, number correct and number answered per user_id.
results_u = train_csv.groupby('user_id')['answered_correctly'].agg(
    answered_correctly_user_mean='mean',
    sum_correct='sum',
    count='count',
)

In [None]:
# Mean prior_question_elapsed_time over the remaining history rows; used as
# the fill value for missing elapsed times in the train/validation features.
result_time_mean = train_csv['prior_question_elapsed_time'].mean()

In [None]:
# Attach the per-user and per-content aggregates to every training row.
# Left joins keep rows whose user/content has no history (values become NaN).
X = (
    X.merge(results_u, on=['user_id'], how="left")
     .merge(results_c, on=['content_id'], how="left")
)

In [None]:
# Attach the same per-user and per-content aggregates to the validation rows.
# Left joins keep rows whose user/content has no history (values become NaN).
validation = (
    validation.merge(results_u, on=['user_id'], how="left")
              .merge(results_c, on=['content_id'], how="left")
)

In [None]:
# Training target: take the label column out of X, leaving only the inputs.
y = X.pop('answered_correctly')

# Validation inputs and target; the `validation` frame itself keeps its label column.
X_val = validation.drop(columns=['answered_correctly'])
y_val = validation['answered_correctly']

In [None]:
# Display the column labels of the training frame at this point (inspection only).
X.columns

In [None]:
lencoder = LabelEncoder()

# Default used for each feature when its value is missing:
#   - explanation flag       -> False
#   - user / content mean    -> 0.5
#   - sum_correct / count    -> 0
#   - elapsed time           -> mean over the history rows
fill_values = {
    'prior_question_had_explanation': False,
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_content_mean': 0.5,
    'sum_correct': 0,
    'count': 0,
    'prior_question_elapsed_time': result_time_mean,
}

# One frame-level fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls. Those operate on a temporary
# Series and no longer update the frame once pandas Copy-on-Write is active.
X = X.fillna(fill_values)
# The encoder is fitted on the training data only.
X['prior_question_had_explanation_enc'] = lencoder.fit_transform(X['prior_question_had_explanation'])

X_val = X_val.fillna(fill_values)
# Reuse the mapping learned on the training data. Re-fitting here could give
# the validation set a different label -> integer mapping, and it would also
# change the encoder later applied to the test batches.
X_val['prior_question_had_explanation_enc'] = lencoder.transform(X_val['prior_question_had_explanation'])

In [None]:
# Model inputs, in the column order the scaler and the network are fitted on.
feature_columns = [
    'answered_correctly_user_mean',
    'answered_correctly_content_mean',
    'sum_correct',
    'count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]
X = X[feature_columns]
X_val = X_val[feature_columns]

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both training and validation features.
# After this cell X and X_val are NumPy arrays, not DataFrames.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_val = scaler.transform(X_val)

### CNN

In [None]:
from tensorflow.keras.layers import Conv1D

# Reset Keras' global state before building a new model.
K.clear_session()

# Append a trailing axis of length 1: (samples, features) -> (samples, features, 1).
X_train = np.expand_dims(X, axis=-1)
X_test = np.expand_dims(X_val, axis=-1)

# Stack of 1-D convolutions over the feature axis, followed by dense layers
# applied along the last axis, then flattened into a single sigmoid output.
model = Sequential([
    Conv1D(32, 2, activation='sigmoid', input_shape=X_train.shape[1:]),
    Conv1D(24, 2, activation='relu', padding='causal'),
    Conv1D(128, 2, activation='relu', padding='causal'),
    Conv1D(256, 2, activation='relu', padding='causal'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(28, activation='relu'),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train on the reshaped training features; `history` keeps the object returned by fit().
history = model.fit(
    x=X_train,
    y=y,
    batch_size=50000,
    epochs=30,
    verbose=2,
)

In [None]:
# Ground-truth validation labels as a NumPy array.
y_true = np.array(y_val)
# Model outputs for the validation features.
y_pred = model.predict(X_test)

In [None]:
# Area under the ROC curve of the validation predictions.
roc_auc_score(y_true=y_true, y_score=y_pred)

In [None]:
# ROC curve of the validation predictions; the thresholds are not needed.
fpr, tpr, _ = roc_curve(y_true=y_true, y_score=y_pred)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Mean squared error between the validation labels and the model outputs.
mean_squared_error(y_true=y_true, y_pred=y_pred)

## Prediction

In [None]:
# Create the environment object from the riiideducation module; the cells
# below call env.iter_test() and env.predict() on it.
env = riiideducation.make_env()

In [None]:
# Iterable of test batches; the loop below unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Model inputs, in the same column order the scaler and the network were fitted on.
inference_feature_columns = [
    'answered_correctly_user_mean',
    'answered_correctly_content_mean',
    'sum_correct',
    'count',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]

# Defaults for rows whose user/content was never seen or whose value is
# missing. They are loop-invariant, so they are built once.
inference_fill_values = {
    'answered_correctly_user_mean': 0.5,
    'answered_correctly_content_mean': 0.5,
    'sum_correct': 0,
    'count': 0,
    'prior_question_elapsed_time': elapsed_time_mean_final,
    'prior_question_had_explanation': False,
}

for (test_df, sample_prediction_df) in iter_test:
    # Attach the aggregates computed on the full loaded training data.
    test_df = pd.merge(test_df, user_mean_final, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, content_mean_final, on=['content_id'],  how="left")

    # Frame-level fillna with a per-column mapping. The chained
    # `test_df[col].fillna(..., inplace=True)` form acts on a temporary
    # Series and stops updating the frame under pandas Copy-on-Write.
    test_df = test_df.fillna(inference_fill_values)
    test_df["prior_question_had_explanation_enc"] = lencoder.transform(test_df["prior_question_had_explanation"])

    # Apply the already-fitted scaler (transform only, no re-fitting) and add
    # the trailing axis the network was built with. A dedicated name is used
    # so the training matrix `X` is not overwritten on every batch.
    batch_features = scaler.transform(test_df[inference_feature_columns])
    batch_input = batch_features.reshape(batch_features.shape[0], batch_features.shape[1], 1)
    # predict() yields one column per output unit; flatten it to one value per row.
    test_df['answered_correctly'] = model.predict(batch_input).ravel()

    # Submit predictions for question rows only (content_type_id == 0).
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])