In [1]:
import numpy as np
import pandas as pd
#from tqdm import tqdm
import pickle

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can run arbitrary code from the file; these
    # inputs are assumed to be trusted artifacts -- confirm their provenance.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed lookup objects used by the cells below.
user_ele_dict = _unpickle('../input/pretrained-for-riiid/user_ele_dict.pkl')
user_num_dict = _unpickle('../input/pretrained-for-riiid/user_num_dict.pkl')
ques_ele_vec = _unpickle('../input/pretrained-for-riiid/ques_ele_vec.pkl')
ques_num_vec = _unpickle('../input/pretrained-for-riiid/ques_num_vec.pkl')

In [2]:
# Only the question id and its tag string are read from questions.csv.
question_dtype = {
    'question_id': 'int16',
    'tags': 'object',
}
questions_data = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=list(question_dtype),
    dtype=question_dtype,
)

# Questions with no tags get the placeholder tag '92'.
# NOTE(review): why 92 was chosen is not visible here -- confirm.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `questions_data.tags`: that chained in-place
# form acts on an intermediate object and leaves the frame unchanged when
# pandas Copy-on-Write is active.
questions_data['tags'] = questions_data['tags'].fillna('92')

def gen_vec(row, n_tags=188):
    """Attach a multi-hot tag vector to a question row.

    ``row.tags`` is a whitespace-separated string of integer tag ids. A float
    vector of length ``n_tags`` holding 1.0 at every listed id and 0.0
    elsewhere is stored under ``row['vec']``.

    Parameters
    ----------
    row : pandas.Series
        One question row with a ``tags`` string; it is modified in place.
    n_tags : int, optional
        Length of the vector. Defaults to 188, the value that was previously
        hard-coded.

    Returns
    -------
    pandas.Series
        The same ``row`` with the additional ``vec`` entry.

    Raises
    ------
    IndexError
        If a tag id is out of bounds for a vector of length ``n_tags``.
    ValueError
        If a tag cannot be parsed as an integer.
    """
    tag_ids = np.array([int(tag) for tag in row.tags.split()], dtype=int)
    # Fill the array before storing it, so nothing is written through a value
    # that was read back out of the Series.
    vec = np.zeros(n_tags)
    vec[tag_ids] = 1.0
    row['vec'] = vec
    return row

# Run gen_vec on every row (axis=1 is the positional spelling of 'columns'),
# which adds a 'vec' entry to each question.
questions_data = questions_data.apply(gen_vec, axis=1)

In [3]:
# Column -> dtype for the subset of train.csv that is read below; columns
# that are not listed here are not loaded.
train_dtypes_dict = dict(
    row_id="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    answered_correctly="int8",
)

train_data = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    usecols=train_dtypes_dict.keys(),
    dtype=train_dtypes_dict,
)

# Keep only the rows whose content_type_id is 0
# (presumably the question rows, going by the variable name -- confirm).
train_data_q = train_data.loc[train_data["content_type_id"] == 0]

In [4]:
# Remove the name of the full frame; the following cells only use train_data_q.
del train_data

In [5]:
def pre_data(row_data, ele_dict, num_dict, q_data=questions_data, n_tags=188):
    """Build the feature matrix and label vector for a set of interaction rows.

    For each row the feature vector is ``ele_dict[user_id] / num_dict[user_id]``
    with NaN entries replaced by 0.5, multiplied element-wise by the ``vec``
    mask of the row's question. Users that are missing from ``ele_dict`` get
    0.5 in every position before masking.

    Parameters
    ----------
    row_data : pandas.DataFrame
        Must provide ``user_id``, ``content_id`` and ``answered_correctly``.
    ele_dict, num_dict : mapping
        Keyed by user id; the two values are divided element-wise.
        NOTE(review): presumably per-tag correct counts and attempt counts of
        length ``n_tags`` -- confirm against the code that built the pickles.
    q_data : pandas.DataFrame, optional
        Must have a ``vec`` column that is looked up by ``content_id`` label.
        The default is bound once, when this function is defined.
    n_tags : int, optional
        Width of each feature vector. Defaults to 188, the value that was
        previously hard-coded.

    Returns
    -------
    X : numpy.ndarray
        float64 array of shape ``(len(row_data), n_tags)``; note that this is
        8 * n_tags bytes per row.
    y : numpy.ndarray
        float64 array of shape ``(len(row_data),)`` with ``answered_correctly``.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported tqdm before the first call.
    from tqdm import tqdm

    m = len(row_data)
    X = np.full((m, n_tags), 0.5)

    # Pull the needed columns out once. DataFrame.iterrows() builds a Series
    # object for every row, which is very slow for millions of rows.
    user_ids = row_data['user_id'].to_numpy()
    content_ids = row_data['content_id'].to_numpy()
    y = row_data['answered_correctly'].to_numpy(dtype=float)
    tag_masks = q_data['vec']

    for i, (user_id, content_id) in enumerate(tqdm(zip(user_ids, content_ids), total=m)):
        mask = tag_masks[content_id]
        if user_id in ele_dict:
            # 0/0 produces NaN, which is mapped to the neutral value 0.5.
            X[i] = np.nan_to_num(ele_dict[user_id] / num_dict[user_id], nan=0.5) * mask
        else:
            X[i] *= mask
    return X, y

In [None]:
from tqdm import tqdm

# Draw the training and validation rows as ONE seeded sample and split it.
# sample() draws without replacement by default, so the two parts are
# disjoint: no row is used for both fitting and validation, and the run is
# reproducible.
N_TRAIN, N_VALID = 10**7, 10**5
sampled_rows = train_data_q.sample(n=N_TRAIN + N_VALID, random_state=0)

X_train, y_train = pre_data(sampled_rows.iloc[:N_TRAIN], user_ele_dict, user_num_dict)
print(X_train.shape, y_train.shape)
X_valid, y_valid = pre_data(sampled_rows.iloc[N_TRAIN:], user_ele_dict, user_num_dict)
print(X_valid.shape, y_valid.shape)

# The sampled rows are no longer needed once the arrays are built.
del sampled_rows

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# One sigmoid unit over the 188 input features.
model = keras.Sequential()
model.add(layers.Dense(units=1, input_shape=[188], activation='sigmoid'))

# Binary cross-entropy loss, Adam with a learning rate of 1e-2; no extra metrics.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-2),
)

In [None]:
# Train for 10 epochs with mini-batches of 256 rows; the validation arrays
# are passed along so their loss is recorded next to the training loss.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

In [None]:
# Turn the metrics recorded by fit() into a frame with one column per metric.
history_df = pd.DataFrame(history.history)
# Draw training and validation loss on the same axes (pandas plotting).
history_df[['loss', 'val_loss']].plot()

In [None]:
# Seeded so the evaluation sample is the same on every run.
# NOTE(review): these rows are drawn from the same frame as the training
# sample, so some of them may also have been used for fitting -- the score
# computed from them can be optimistic.
test_sample = train_data_q.sample(n=10**5, random_state=1)
X_test, y_test = pre_data(test_sample, user_ele_dict, user_num_dict)

In [None]:
# Model outputs for the evaluation rows (the model ends in a single sigmoid unit).
predict_prob = model.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the predictions against the true labels; as the last expression
# of the cell, the score is shown as the cell output.
roc_auc_score(y_test, predict_prob)

In [None]:
# def predict_y(row_data, user_ele, user_num, ques_ele=ques_ele_vec, ques_num=ques_num_vec, q_data=questions_data):
#     y = np.ones(len(row_data))*0.25
#     i = 0
#     for index, row in row_data.iterrows():
#         mask = q_data.vec[row.content_id]
#         ques_prob_vec = ques_ele/ques_num
#         if row.user_id in user_ele.keys():
#             borrow_index = (user_num[row.user_id] == 0)
#             user_prob_vec = user_ele[row.user_id]/user_num[row.user_id]
#             user_prob_vec[borrow_index] = ques_prob_vec[borrow_index]
#             y[i] = sum(user_prob_vec * mask)/sum(mask)
#         else:
#             y[i] = sum(ques_prob_vec * mask)/sum(mask)
#         i = i + 1
#     return y

In [None]:
# import riiideducation
# env = riiideducation.make_env()
# iter_test = env.iter_test()
# for (test_df, sample_prediction_df) in iter_test:
#     test_df = test_df[test_df.content_type_id == 0]
#     test_df['answered_correctly'] = predict_y(test_df,user_ele_dict, user_num_dict)
#     #print('-----------------------------------------')
#     #print(test_df[['row_id', 'answered_correctly']])
#     #print('-----------------------------------------')
#     env.predict(test_df[['row_id', 'answered_correctly']])