In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

with open('../input/pretrained-for-riiid/user_ele_dict.pkl', 'rb') as f:
    user_ele_dict = pickle.load(f)

with open('../input/pretrained-for-riiid/user_num_dict.pkl', 'rb') as f:
    user_num_dict = pickle.load(f)

with open('../input/pretrained-for-riiid/ques_ele_vec.pkl', 'rb') as f:
    ques_ele_vec = pickle.load(f)

with open('../input/pretrained-for-riiid/ques_num_vec.pkl', 'rb') as f:
    ques_num_vec = pickle.load(f)  

In [2]:
question_dtype = {
    'question_id':'int16',
    'tags':'object'
}
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv',
                             usecols = question_dtype.keys(), 
                             dtype = question_dtype)

questions_data.tags.fillna('92',inplace=True)

def gen_vec(row):
    row['vec'] = np.zeros(188)
    index_list = row.tags.split()
    for index_ in index_list:
        row.vec[int(index_)] = 1.0
    return row

questions_data = questions_data.apply(gen_vec, axis='columns')

In [3]:
train_dtypes_dict = {
    "row_id": "int64",
    #"timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    #"task_container_id": "int16",
    #"user_answer": "int8",
    "answered_correctly": "int8",
    #"prior_question_elapsed_time": "float32", 
    #"prior_question_had_explanation": "boolean"
}

data = pd.read_csv("../input/riiid-test-answer-prediction/train.csv",
                         nrows=10**5,
                         usecols = train_dtypes_dict.keys(),
                         dtype=train_dtypes_dict,
                         #index_col = 0,
                        )
data_q = data[data.content_type_id == 0]

In [4]:
del data
train_data_q = data_q[:int(9 /10 * len(data_q))]
valid_data_q = data_q[int(9 /10 * len(data_q)):]
del data_q

In [5]:
print(len(train_data_q),len(valid_data_q))

88363 9819


In [6]:
def pre_data(row_data, ele_dict, num_dict, ques_ele=ques_ele_vec,ques_num=ques_num_vec,q_data=questions_data):
    m = len(row_data)
    X = np.ones((m,188*3)) * 0.25
    y = np.zeros(m)
    i = 0
    ave = ques_ele/ques_num
    for index, row in row_data.iterrows():
        mask = q_data.vec[row.content_id]
        if row.user_id in ele_dict:
            X[i,:188] = np.nan_to_num(ele_dict[row.user_id]/num_dict[row.user_id],nan=0.25)
        X[i,188:188*2] = mask
        X[i,188*2:] = ave
        y[i] = row.answered_correctly
        i = i+1
    return X, y

In [7]:
X_valid, y_valid = pre_data(valid_data_q, user_ele_dict, user_num_dict)

  # Remove the CWD from sys.path while we load stuff.


In [8]:
from tensorflow import keras
from tensorflow.keras import layers

# 188*[3,2,1] acc7.0slow |188*[2,1] acc7.0+ 20epoch|
model = keras.Sequential([
    layers.Dense(units=188*2,input_shape=[188*3], activation='relu'),
    layers.Dense(units=188, activation='relu'),
    layers.Dense(units=1, activation='sigmoid')
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),#1e-2 can't descend 1e-4best
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [9]:
import math
import matplotlib.pyplot as plt
n = len(train_data_q)
batch_size = 256
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []
epochs = 20

for epoch in range(epochs):
    epoch_train_loss = []
    epoch_train_acc = []
    for batch_index in range(math.ceil(n/batch_size)):
        
        if batch_index*batch_size+batch_size < n:
            X_train, y_train = pre_data(train_data_q[batch_index*batch_size:batch_index*batch_size+batch_size], user_ele_dict, user_num_dict)
        else:
            X_train, y_train = pre_data(train_data_q[batch_index*batch_size:], user_ele_dict, user_num_dict)
        
        batch_loss = model.train_on_batch(X_train, y_train,reset_metrics=False,return_dict=True)
        epoch_train_loss.append(batch_loss['loss'])
        epoch_train_acc.append(batch_loss['binary_accuracy'])
        
    train_loss.append(np.mean(epoch_train_loss))
    train_acc.append(np.mean(epoch_train_acc))
    
    epoch_valid_loss = model.test_on_batch(X_valid, y_valid,reset_metrics=False, return_dict=True)
    valid_loss.append(epoch_valid_loss['loss'])
    valid_acc.append(epoch_valid_loss['binary_accuracy'])
    print("Epoch: ", epoch, 
          "train_loss: ", round(np.mean(epoch_train_loss),4),
          "valid_loss: ", round(epoch_valid_loss['loss'],4),
          "train_acc: ", round(np.mean(epoch_train_acc),4),
          "valid_acc: ", round(epoch_valid_loss['binary_accuracy'],4))

model.save("my_h5_model.h5")
print("model saved!")

  # Remove the CWD from sys.path while we load stuff.


Epoch:  0 train_loss:  0.6167 valid_loss:  0.6036 train_acc:  0.6781 valid_acc:  0.6904
Epoch:  1 train_loss:  0.6016 valid_loss:  0.5994 train_acc:  0.692 valid_acc:  0.6934
Epoch:  2 train_loss:  0.5984 valid_loss:  0.5974 train_acc:  0.6942 valid_acc:  0.6947
Epoch:  3 train_loss:  0.5967 valid_loss:  0.5959 train_acc:  0.6952 valid_acc:  0.6957
Epoch:  4 train_loss:  0.5954 valid_loss:  0.5948 train_acc:  0.6961 valid_acc:  0.6964
Epoch:  5 train_loss:  0.5943 valid_loss:  0.5939 train_acc:  0.6968 valid_acc:  0.697
Epoch:  6 train_loss:  0.5935 valid_loss:  0.5931 train_acc:  0.6972 valid_acc:  0.6974
Epoch:  7 train_loss:  0.5927 valid_loss:  0.5924 train_acc:  0.6977 valid_acc:  0.6978
Epoch:  8 train_loss:  0.5921 valid_loss:  0.5918 train_acc:  0.698 valid_acc:  0.6982
Epoch:  9 train_loss:  0.5914 valid_loss:  0.5912 train_acc:  0.6984 valid_acc:  0.6986
Epoch:  10 train_loss:  0.5909 valid_loss:  0.5907 train_acc:  0.6988 valid_acc:  0.6989
Epoch:  11 train_loss:  0.5904 val

In [None]:
plt.plot(train_loss)
plt.plot(valid_loss)

In [None]:
plt.plot(train_acc)
plt.plot(valid_acc)

In [None]:
# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_valid, y_valid),
#     batch_size=256,
#     epochs=20,
# )

In [None]:
# # convert the training history to a dataframe
# history_df = pd.DataFrame(history.history)
# # use Pandas native plot method
# #history_df['loss'].plot();
# history_df.loc[:, ['loss', 'val_loss']].plot()
# history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot()

In [None]:
test_sample = train_data_q[70000:]
X_test, y_test = pre_data(test_sample, user_ele_dict, user_num_dict)

In [None]:
predict_prob = model.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predict_prob)

In [None]:
# def predict_y(row_data, user_ele, user_num, ques_ele=ques_ele_vec, ques_num=ques_num_vec, q_data=questions_data):
#     y = np.ones(len(row_data))*0.25
#     i = 0
#     for index, row in row_data.iterrows():
#         mask = q_data.vec[row.content_id]
#         ques_prob_vec = ques_ele/ques_num
#         if row.user_id in user_ele.keys():
#             borrow_index = (user_num[row.user_id] == 0)
#             user_prob_vec = user_ele[row.user_id]/user_num[row.user_id]
#             user_prob_vec[borrow_index] = ques_prob_vec[borrow_index]
#             y[i] = sum(user_prob_vec * mask)/sum(mask)
#         else:
#             y[i] = sum(ques_prob_vec * mask)/sum(mask)
#         i = i + 1
#     return y

In [None]:
# import riiideducation
# env = riiideducation.make_env()
# iter_test = env.iter_test()
# for (test_df, sample_prediction_df) in iter_test:
#     test_df = test_df[test_df.content_type_id == 0]
#     test_df['answered_correctly'] = predict_y(test_df,user_ele_dict, user_num_dict)
#     #print('-----------------------------------------')
#     #print(test_df[['row_id', 'answered_correctly']])
#     #print('-----------------------------------------')
#     env.predict(test_df[['row_id', 'answered_correctly']])