In [None]:
# Install datatable from the wheel file under ../input. Both stdout and stderr are
# redirected to /dev/null, so a failed install is silent here and only shows up
# when `import datatable` runs.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
import riiideducation
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization,Dropout,Dense,Flatten,Conv1D
from tensorflow.keras.optimizers import Adam
from keras.metrics import BinaryAccuracy
from keras import backend as K

# Preprocess

In [None]:
# Column name -> dtype. The keys select which train.csv columns are read and the
# whole mapping is later passed to DataFrame.astype.
data_types_dict = dict(
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Name of the label column.
target = 'answered_correctly'

In [None]:
# Read only the columns named in data_types_dict and convert to pandas.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict.keys()),
).to_pandas()

# Drop rows whose target is -1 and renumber the index.
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Fill missing explanation flags by assignment. A chained
# `train_df[col].fillna(..., inplace=True)` operates on a temporary Series under
# pandas Copy-on-Write and is deprecated, so it is not relied on here.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False)
)

# Apply the compact dtypes only after the NaNs are gone (the 'bool' cast needs that).
train_df = train_df.astype(data_types_dict)

In [None]:
# Running per-user accuracy over strictly earlier rows:
#   lag               -> the previous row's target within the same user (NaN on the first row)
#   cumsum / cumcount -> sum of earlier targets divided by the number of earlier rows
# The first row of every user therefore comes out as NaN.
lag = train_df.groupby('user_id')[target].shift()
lag_by_user = lag.groupby(train_df['user_id'])
train_df['user_correctness'] = lag_by_user.cumsum() / lag_by_user.cumcount()

In [None]:
# Per-user and per-content totals of the target: 'sum' and 'count' columns,
# indexed by user_id and content_id respectively.
user_agg, content_agg = (
    train_df.groupby(key)[target].agg(['sum', 'count'])
    for key in ('user_id', 'content_id')
)

In [None]:
# Keep only the final 24 rows of every user, then renumber the index from 0.
train_df = (
    train_df
    .groupby('user_id')
    .tail(24)
    .reset_index(drop=True)
)

In [None]:
# Load the columns at positions 0 and 3 of questions.csv; the dtype mapping
# names them 'question_id' and 'part'.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'},
)

# Attach 'part' to each training row via content_id == question_id, then drop
# the redundant join key.
train_df = (
    train_df
    .merge(questions_df, how='left', left_on='content_id', right_on='question_id')
    .drop(columns=['question_id'])
)

In [None]:
# Look up per-content statistics from content_agg for every row:
#   content_count -> content_agg['count'] for the row's content_id
#   content_avg   -> content_agg['sum'] / content_agg['count'] for the row's content_id
content_ids = train_df['content_id']
content_mean_target = content_agg['sum'] / content_agg['count']
train_df['content_count'] = content_ids.map(content_agg['count']).astype('int32')
train_df['content_avg'] = content_ids.map(content_mean_target)

In [None]:
# Hold out the last rows of every user:
#   valid_df -> final 6 rows per user
#   last_df  -> final 1 row per user (these rows are also part of valid_df)
# Both are taken as explicit copies because later cells add and overwrite
# columns on them; writing to a slice of train_df raises SettingWithCopyWarning
# and is not guaranteed to behave consistently across pandas versions.
valid_df = train_df.groupby('user_id').tail(6).copy()
last_df = train_df.groupby('user_id').tail(1).copy()

# Remove the held-out rows from the training frame. This must happen after both
# tail() calls above, since they read from the un-split train_df.
train_df = train_df.drop(index=valid_df.index)

In [None]:
# Preview the first rows of the training frame.
train_df.head(n=5)

# CNN Training

In [None]:
def _fill_and_encode(df, encoder, elapsed_time_fill):
    """Return a copy of ``df`` with missing feature values filled and the explanation flag encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'prior_question_had_explanation',
        'prior_question_elapsed_time', 'user_correctness' and 'part'.
    encoder : LabelEncoder
        An already fitted encoder. It is only used via ``transform`` so that every
        frame receives the same label -> code mapping.
    elapsed_time_fill : float
        Value substituted for missing 'prior_question_elapsed_time'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the four columns filled and
        an added 'prior_question_had_explanation_enc' column.
    """
    had_explanation = df['prior_question_had_explanation'].fillna(False)
    # DataFrame.assign works on a copy, so this is safe even when `df` is a slice
    # of another frame, and it avoids chained `df[col].fillna(..., inplace=True)`.
    return df.assign(
        prior_question_had_explanation=had_explanation,
        prior_question_had_explanation_enc=encoder.transform(had_explanation),
        prior_question_elapsed_time=df['prior_question_elapsed_time'].fillna(elapsed_time_fill),
        user_correctness=df['user_correctness'].fillna(0.7),
        part=df['part'].fillna(4),
    )


# Fit the encoder once on the full label set of the boolean flag. Re-fitting it
# on each frame would let the mapping change whenever a frame happens to contain
# only one of the two values, and the encoder persisted later would then reflect
# whichever frame was processed last.
lencoder = LabelEncoder()
lencoder.fit([False, True])

# The elapsed-time fill value comes from the training frame only and is reused
# for the validation and last-row frames.
result_time_mean = train_df.prior_question_elapsed_time.mean()

train_df = _fill_and_encode(train_df, lencoder, result_time_mean)
valid_df = _fill_and_encode(valid_df, lencoder, result_time_mean)
last_df = _fill_and_encode(last_df, lencoder, result_time_mean)



In [None]:
# train_df[:5]

In [None]:
# Persist the per-user last rows, keeping the index as the first CSV column.
last_df.to_csv(path_or_buf='cnn_last_data.csv', index=True)

In [None]:
# Model inputs, in the column order used for both training and validation.
feature_cols = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
    'part',
    'user_correctness',
    'content_count',
    'content_avg',
]

X, y = train_df[feature_cols], train_df[target]
X_val, y_val = valid_df[feature_cols], valid_df[target]

In [None]:
# Per-column flag: True where the training features still contain a missing value.
X.isna().any()

In [None]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both training and validation features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_val = scaler.transform(X_val)

In [None]:
# Reset Keras' global state through tf.keras, the same namespace the model,
# layers and optimizer below come from (rather than the standalone `keras`
# backend), so re-running this cell starts from a clean state.
tf.keras.backend.clear_session()

# Fix TensorFlow's global seed so weight initialisation and dropout are
# repeatable between runs (GPU kernels may still introduce small differences).
SEED = 42
tf.random.set_seed(SEED)

# Conv1D expects (samples, steps, channels): add a trailing channel axis of size 1
# so each feature becomes one step with a single channel.
X_train = X.reshape(X.shape[0], X.shape[1], 1)
X_test = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

model = Sequential([
    # Two width-2 convolutions over the feature axis; the second pads causally
    # so it keeps the length produced by the first.
    Conv1D(64, 2, activation='relu', input_shape=X_train.shape[1:]),
    Conv1D(64, 2, activation='relu', padding='causal'),
    Dropout(0.1),

    Flatten(),

    # Fully connected head.
    Dense(256, activation='relu'),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.1),

    # Single sigmoid unit: probability of the positive class.
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train on the reshaped training features for 30 epochs in batches of 50,000 rows;
# verbose=2 prints one line per epoch. The returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y,
    batch_size=50000,
    epochs=30,
    verbose=2,
)

In [None]:
# Write the trained model to 'CNN.model' in the working directory.
model.save(filepath='CNN.model')

In [None]:
# y_pred = model.predict(X_test)

In [None]:
# y_true = np.array(y_val)

In [None]:
# roc_auc_score(y_true, y_pred)

# Inference

In [None]:
def _as_default_dict(series, dtype):
    """Cast ``series`` to ``dtype`` and return it as a defaultdict(int) keyed by its index."""
    return series.astype(dtype).to_dict(defaultdict(int))


# Running totals keyed by id; ids that are not present read as 0 through the
# defaultdict(int) factory.
# NOTE(review): the user totals are cast to int16 (max 32,767) - confirm no user
# exceeds that many rows, otherwise the cast would wrap.
user_sum_dict = _as_default_dict(user_agg['sum'], 'int16')
user_count_dict = _as_default_dict(user_agg['count'], 'int16')
content_sum_dict = _as_default_dict(content_agg['sum'], 'int32')
content_count_dict = _as_default_dict(content_agg['count'], 'int32')

In [None]:
import joblib

# Persist the lookup dictionaries.
joblib.dump(value=user_sum_dict, filename='user_sum_dict.pkl')
joblib.dump(value=user_count_dict, filename='user_count_dict.pkl')
joblib.dump(value=content_sum_dict, filename='content_sum_dict.pkl')
joblib.dump(value=content_count_dict, filename='content_count_dict.pkl')

# Persist the fitted preprocessing objects alongside them.
joblib.dump(value=lencoder, filename='exp_encoder.pkl')
joblib.dump(value=scaler, filename='cnn_scaler.pkl')


In [None]:
# try:
#     env = riiideducation.make_env()
# except:
# #     pass
# iter_test = env.iter_test()
# prior_test_df = None

In [None]:
# for (test_df, sample_prediction_df) in iter_test:
#     if prior_test_df is not None:
#         prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
#         prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
#         user_ids = prior_test_df['user_id'].values
#         content_ids = prior_test_df['content_id'].values
#         targets = prior_test_df[target].values
        
#         for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
#             user_sum_dict[user_id] += answered_correctly
#             user_count_dict[user_id] += 1
#             content_sum_dict[content_id] += answered_correctly
#             content_count_dict[content_id] += 1

#     prior_test_df = test_df.copy()
    
#     test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
#     test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
#     test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')  
#     test_df["prior_question_had_explanation_enc"] = lencoder.transform(test_df["prior_question_had_explanation"])
    
#     user_sum = np.zeros(len(test_df), dtype = np.int16)
#     user_count = np.zeros(len(test_df), dtype = np.int16)
#     content_sum = np.zeros(len(test_df), dtype = np.int32)
#     content_count = np.zeros(len(test_df), dtype = np.int32)
    
#     for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
#         user_sum[i] = user_sum_dict[user_id]
#         user_count[i] = user_count_dict[user_id]
#         content_sum[i] = content_sum_dict[content_id]
#         content_count[i] = content_count_dict[content_id]

#     test_df['user_correctness'] = user_sum / user_count
#     test_df['content_count'] = content_count
#     test_df['content_avg'] = content_sum / content_count
    

#     X = scaler.transform(test_df[['prior_question_elapsed_time','prior_question_had_explanation_enc','part','user_correctness','content_count','content_avg']])
    
#     test_df['answered_correctly'] = model.predict_proba(X.reshape(X.shape[0], X.shape[1], 1))
    
#     env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])