In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Column -> dtype map; its keys also serve as the column filter for the CSV read.
question_dtype = dict(question_id='int16', tags='object')
questions_data = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=question_dtype.keys(),
    dtype=question_dtype,
)

In [None]:
#print(questions_data.loc[questions_data.tags.isnull()])
# Rows with no tags receive the placeholder string '92' so that every row can
# later be split into tag ids.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `questions_data.tags`: that form is a chained
# assignment on an intermediate object and does not update the frame when
# pandas copy-on-write is active.
questions_data['tags'] = questions_data['tags'].fillna('92')

In [None]:
#tags_set = set()
#print(len(questions_data))
#for i in range(len(questions_data)):
#    tags_set = tags_set.union(questions_data.tags[i].split())
#print(tags_set)
#print(len(tags_set))

In [None]:
def gen_vec(row, n_tags=188):
    """Attach a multi-hot tag vector to a question row.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a ``tags`` entry: a whitespace-separated string of
        integer tag ids.
    n_tags : int, default 188
        Length of the generated vector. Every tag id must be smaller than
        this value.

    Returns
    -------
    pandas.Series
        The same ``row`` object, with a new ``'vec'`` entry holding a float
        array of length ``n_tags`` that is 1.0 at each listed tag id and 0.0
        elsewhere.

    Raises
    ------
    IndexError
        If a tag id is outside ``[-n_tags, n_tags)``.
    ValueError
        If a token in ``tags`` is not an integer literal.
    """
    # Build the vector completely before storing it on the row, so the row
    # never holds a half-filled array.
    tag_ids = np.array([int(tag) for tag in row.tags.split()], dtype=np.intp)
    vec = np.zeros(n_tags)
    vec[tag_ids] = 1.0
    row['vec'] = vec
    return row

# Row-wise apply: gen_vec receives each question as a Series and returns it
# with the extra 'vec' entry.
questions_data = questions_data.apply(gen_vec, axis=1)

In [None]:
# Preview the first rows to check that each question now carries a `vec` entry.
questions_data.head()

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted dataset source.
data = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
# Fill missing flags before the cast. NaN is truthy, so a bare astype('bool')
# would silently turn every missing value into True (and a nullable boolean
# column containing NA would raise instead). Missing is treated as False.
data['prior_question_had_explanation'] = (
    data['prior_question_had_explanation'].fillna(False).astype('bool')
)

In [None]:
# Keep only the rows whose content_type_id equals 0, then preview the result.
data = data.loc[data['content_type_id'] == 0]
data.head()

In [None]:
# Distinct user ids in order of first appearance, plus their string forms
# (the string form is what later cells use as dictionary keys).
user_id_array = data.user_id.unique()
user_id_str = [str(id_n) for id_n in user_id_array]
# Show a short preview only: echoing the whole list prints one entry per
# distinct user and floods the cell output.
user_id_str[:10]

In [None]:
# Number of users sampled (taken from the front of user_id_str) and number of
# trailing rows per user held out as the "future" set.
u_num_control = 300
t_v_control = 2

# Restrict the table to the sampled users once, so each per-user filter below
# scans a small frame instead of the full training table.
sampled_users = user_id_str[:u_num_control]
sampled_rows = data[data.user_id.isin([int(user) for user in sampled_users])]

record_parts = []
future_parts = []
user_r_rate = dict()
for record_user in sampled_users:
    user_rows = sampled_rows[sampled_rows.user_id == int(record_user)]
    # Explicit split position: the last t_v_control rows are held out. Unlike
    # the slice [:-t_v_control], this also works when t_v_control is 0.
    split_at = max(len(user_rows) - t_v_control, 0)
    user_record = user_rows.iloc[:split_at]
    record_parts.append(user_record)
    future_parts.append(user_rows.iloc[split_at:])
    # Mean of answered_correctly over this user's history rows; NaN when no
    # history rows remain after the hold-out.
    user_r_rate[record_user] = user_record.answered_correctly.mean()

# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# all previously collected rows on every iteration.
record_data = pd.concat(record_parts, axis=0) if record_parts else data.iloc[0:0]
future_data = pd.concat(future_parts, axis=0) if future_parts else data.iloc[0:0]
# Mean of answered_correctly over all history rows of the sampled users.
r_rate = record_data.answered_correctly.mean()

In [None]:
# Overall mean first, then the per-user mapping, one per line.
print(r_rate, user_r_rate, sep='\n')

In [None]:
def cal_vec(row,ele_dict,num_dict,q_data=questions_data):
    """Add one answered row to a user's per-tag accumulators, in place.

    ``num_dict[str(row.user_id)]`` is incremented by the question's tag
    vector, and ``ele_dict[str(row.user_id)]`` by that vector multiplied by
    ``row.answered_correctly``. Both dictionary entries must already exist.

    The tag vector is read as ``q_data.vec[row.content_id]``.
    NOTE(review): this presumably relies on q_data's index labels matching
    the content ids -- confirm.
    """
    user_key = str(row.user_id)
    tag_vec = q_data.vec[row.content_id]
    num_dict[user_key] += tag_vec
    ele_dict[user_key] += tag_vec * row.answered_correctly
    
# Per-tag running totals over the history rows. The *_ele_* arrays sum
# answered_correctly per tag, the *_num_* arrays count rows per tag; the dicts
# hold the same pair of arrays per user, keyed by str(user_id).
user_ele_dict = {}
user_num_dict = {}
ques_ele_vec, ques_num_vec = np.zeros(188), np.zeros(188)
for index, row in tqdm(record_data.iterrows()):
    tag_vec = questions_data.vec[row.content_id]
    ques_ele_vec += tag_vec * row.answered_correctly
    ques_num_vec += tag_vec
    user_key = str(row.user_id)
    if user_key not in user_ele_dict:
        # First row seen for this user: start both accumulators at zero.
        user_ele_dict[user_key] = np.zeros(188)
        user_num_dict[user_key] = np.zeros(188)
    cal_vec(row,user_ele_dict,user_num_dict)

In [None]:
len(user_num_dict)

In [None]:
def predict_y(row_data, user_ele, user_num, ques_ele=ques_ele_vec, ques_num=ques_num_vec, user_r_rate=user_r_rate,r_rate=r_rate,q_data=questions_data, default_prob=0.25):
    """Predict a correctness score for each row from per-tag success rates.

    Parameters
    ----------
    row_data : pandas.DataFrame
        Rows to score; each row must expose ``user_id`` and ``content_id``.
    user_ele, user_num : dict
        Per-user arrays keyed by ``str(user_id)``: per-tag sums of correct
        answers and per-tag row counts.
    ques_ele, ques_num : numpy.ndarray
        The same two per-tag totals accumulated over all users.
    user_r_rate : dict
        Per-user mean correctness, keyed by ``str(user_id)``.
    r_rate : float
        Mean correctness over all history rows.
    q_data : pandas.DataFrame
        Question table whose ``vec`` column holds each question's tag vector,
        read as ``q_data.vec[row.content_id]``.
    default_prob : float, default 0.25
        Value used for a tag with no observations at all, and the initial
        value of every output slot.

    Returns
    -------
    numpy.ndarray
        One score per row of ``row_data``, in iteration order: the mean, over
        the question's tags, of the per-tag success rate.

    Notes
    -----
    For a known user, tags that user has never seen borrow the global per-tag
    rate scaled by ``min(1, user_r_rate / r_rate)``. Users absent from
    ``user_ele`` use the global per-tag rates directly.
    """
    y = np.ones(len(row_data))*default_prob
    # Global per-tag rate does not depend on the row, so compute it once.
    # Tags never observed give 0/0; that is expected, so the warning is
    # silenced and the resulting NaN replaced by the fallback value.
    with np.errstate(divide='ignore', invalid='ignore'):
        ques_prob_vec = np.nan_to_num(ques_ele/ques_num, nan=default_prob)
    for i, (_, row) in enumerate(tqdm(row_data.iterrows(), total=len(row_data))):
        mask = q_data.vec[row.content_id]
        user_key = str(row.user_id)
        if user_key in user_ele:
            seen_counts = user_num[user_key]
            borrow_index = (seen_counts == 0)
            # 0/0 for unseen tags is overwritten on the next line.
            with np.errstate(divide='ignore', invalid='ignore'):
                user_prob_vec = user_ele[user_key]/seen_counts
            user_prob_vec[borrow_index] = ques_prob_vec[borrow_index] * min(1,user_r_rate[user_key]/r_rate)
            y[i] = (user_prob_vec * mask).sum()/mask.sum()
        else:
            y[i] = (ques_prob_vec * mask).sum()/mask.sum()
    return y

# Score the held-out rows with the per-user accumulators built from the history rows.
prob = predict_y(future_data, user_ele=user_ele_dict, user_num=user_num_dict)

In [None]:
# Ratio of one user's mean correctness to the overall mean; predict_y caps this
# same ratio at 1. Raises KeyError if '12741' is not among the sampled users.
user_r_rate['12741']/r_rate

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the predicted scores against the true answered_correctly values
# of the held-out rows.
holdout_labels = future_data.answered_correctly.values
print(roc_auc_score(holdout_labels, prob))

In [None]:
# Mean correctness of user 115 over that user's held-out rows.
future_data.loc[future_data['user_id'] == 115, 'answered_correctly'].mean()

In [None]:
# Inspect the stored tag vector found under index label 1014 of the `vec` column.
questions_data.vec[1014]

In [None]:
# Show the full row of the question whose question_id is 1014.
questions_data.loc[questions_data['question_id'] == 1014]

In [None]:
# The accumulators are keyed by str(user_id), so the lookup must use the
# string form; the integer key 2746 raises KeyError.
user_num_dict['2746']

In [None]:
# Manual re-computation of one prediction (user 2746, question 1014), without
# the per-user scaling factor that predict_y applies to borrowed tags.
# The accumulators are keyed by str(user_id); integer keys raise KeyError.
user_key = '2746'
mask = questions_data.vec[1014]
ques_prob_vec = np.nan_to_num(ques_ele_vec/ques_num_vec, nan=0.25)
borrow_index = (user_num_dict[user_key] == 0)
user_prob_vec = user_ele_dict[user_key]/user_num_dict[user_key]
# Tags this user never saw fall back to the global per-tag rate.
user_prob_vec[borrow_index] = ques_prob_vec[borrow_index]
sum(user_prob_vec * mask)/sum(mask)

In [None]:
# Row count per user as a plain array (value_counts orders it from most to fewest rows).
action_n_l = data['user_id'].value_counts().values

In [None]:
# Summary statistics of the number of rows per user.
data['user_id'].value_counts().describe()

In [None]:
import matplotlib.pyplot as plt
# Histogram of how many rows each user has in `data`.
action_n = data.user_id.value_counts()
# Explicit Axes so the plot can be labelled; a figure should be readable on its own.
fig, ax = plt.subplots(figsize=(12,6))
action_n.plot.hist(ax=ax)
ax.set(title="Distribution of rows per user",
       xlabel="Rows per user",
       ylabel="Number of users")
plt.show()

In [None]:
# Number of users with more than 2500 rows.
(action_n_l > 2500).sum()

In [None]:
# Summary statistics over all rows of user 24600.
data.loc[data['user_id'] == 24600].describe()

In [None]:
# Mean correctness of user 24600 over all of that user's rows in `data`.
data.loc[data['user_id'] == 24600, 'answered_correctly'].mean()

In [None]:
# The accumulators are keyed by str(user_id); the integer key 24600 raises
# KeyError. The lookup still fails if this user is not among the sampled users.
user_key = '24600'
print(user_ele_dict[user_key])
print(user_num_dict[user_key])
# Per-tag success rate; tags the user never saw show up as NaN (0/0).
print(user_ele_dict[user_key]/user_num_dict[user_key])