In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
question_dtype = {
    'question_id':'int16',
    'tags':'object'
}
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv',
                             usecols = question_dtype.keys(), 
                             dtype = question_dtype)

In [3]:
#print(questions_data.loc[questions_data.tags.isnull()])
questions_data.tags.fillna('92',inplace=True)

In [4]:
#tags_set = set()
#print(len(questions_data))
#for i in range(len(questions_data)):
#    tags_set = tags_set.union(questions_data.tags[i].split())
#print(tags_set)
#print(len(tags_set))

In [5]:
def gen_vec(row):
    row['vec'] = np.zeros(188)
    index_list = row.tags.split()
    for index_ in index_list:
        row.vec[int(index_)] = 1.0
    return row

questions_data = questions_data.apply(gen_vec, axis='columns')
#questions_data.head()

In [6]:
train_dtypes_dict = {
    "row_id": "int64",
    #"timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    #"task_container_id": "int16",
    #"user_answer": "int8",
    "answered_correctly": "int8",
    #"prior_question_elapsed_time": "float32", 
    #"prior_question_had_explanation": "boolean"
}

train_data = pd.read_csv("../input/riiid-test-answer-prediction/train.csv",
                         nrows=10**7,
                         usecols = train_dtypes_dict.keys(),
                         dtype=train_dtypes_dict,
                         #index_col = 0,
                        )

In [7]:
train_data_q = train_data[train_data.content_type_id == 0]
#print(len(train_data_q))
#print(len(train_data_q.user_id.unique()))
#print(train_data_q.head())

In [8]:
#del train_data
#train_data_q_sample = train_data_q#.sample(n=10**6, random_state=1)
#train_data_q_sample.head()

In [9]:
def cal_vec(train_row,ele_dict,num_dict,q_data=questions_data):
    num_dict[train_row.user_id] += q_data.vec[train_row.content_id]
    ele_dict[train_row.user_id] += q_data.vec[train_row.content_id] * train_row.answered_correctly

In [10]:
user_ele_dict = dict()
user_num_dict = dict()
ques_ele_vec = np.zeros(188)
ques_num_vec = np.zeros(188)
for index, row in train_data_q.iterrows():
    ques_ele_vec += questions_data.vec[row.content_id] * row.answered_correctly
    ques_num_vec += questions_data.vec[row.content_id]
    if row.user_id in user_ele_dict.keys():
        cal_vec(row,user_ele_dict,user_num_dict)
    else:
        user_ele_dict[row.user_id] = np.zeros(188)
        user_num_dict[row.user_id] = np.zeros(188)
        cal_vec(row,user_ele_dict,user_num_dict)

In [11]:
#print(len(user_ele_dict),len(user_num_dict))
#print(ques_ele_vec/ques_num_vec)

In [12]:
def predict_y(row_data, user_ele, user_num, ques_ele=ques_ele_vec, ques_num=ques_num_vec, q_data=questions_data):
    y = np.ones(len(row_data))*0.25
    i = 0
    for index, row in row_data.iterrows():
        mask = q_data.vec[row.content_id]
        ques_prob_vec = ques_ele/ques_num
        if row.user_id in user_ele.keys():
            borrow_index = (user_num[row.user_id] == 0)
            user_prob_vec = user_ele[row.user_id]/user_num[row.user_id]
            user_prob_vec[borrow_index] = ques_prob_vec[borrow_index]
            y[i] = sum(user_prob_vec * mask)/sum(mask)
        else:
            y[i] = sum(ques_prob_vec * mask)/sum(mask)
        i = i + 1
    return y

In [13]:
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df[test_df.content_type_id == 0]
    test_df['answered_correctly'] = predict_y(test_df,user_ele_dict, user_num_dict)
    #print('-----------------------------------------')
    #print(test_df[['row_id', 'answered_correctly']])
    #print('-----------------------------------------')
    env.predict(test_df[['row_id', 'answered_correctly']])

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


In [14]:
#test_sample = train_data_q.sample(n=10**6, random_state=99)
#prob = predict_y(test_sample,user_ele_dict, user_num_dict)

In [15]:
#from sklearn.metrics import roc_auc_score
#print(prob)
#print(test_sample.answered_correctly.values)
#print(roc_auc_score(test_sample.answered_correctly.values, prob))