In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
question_dtype = {
    'question_id':'int16',
    'tags':'object'
}
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv',
                             usecols = question_dtype.keys(), 
                             dtype = question_dtype)

In [3]:
print(questions_data.loc[questions_data.tags.isnull()])
questions_data.tags.fillna('92',inplace=True)

       question_id tags
10033        10033  NaN


In [4]:
tags_set = set()
print(len(questions_data))
for i in range(len(questions_data)):
    tags_set = tags_set.union(questions_data.tags[i].split())
print(tags_set)
print(len(tags_set))

13523
{'6', '26', '104', '167', '130', '187', '8', '82', '13', '48', '22', '61', '165', '19', '124', '142', '108', '11', '186', '14', '146', '176', '68', '132', '155', '83', '71', '158', '93', '111', '73', '62', '106', '90', '74', '125', '115', '149', '10', '116', '78', '164', '162', '181', '184', '86', '109', '15', '52', '119', '69', '96', '39', '156', '168', '45', '76', '136', '21', '148', '161', '28', '17', '99', '54', '151', '79', '145', '182', '40', '112', '94', '144', '87', '169', '174', '177', '64', '51', '25', '50', '152', '23', '33', '42', '92', '175', '185', '57', '128', '9', '47', '72', '103', '179', '102', '43', '173', '4', '91', '84', '7', '29', '12', '154', '63', '30', '80', '36', '178', '163', '31', '97', '34', '70', '56', '134', '127', '172', '166', '66', '107', '77', '5', '98', '143', '126', '55', '35', '59', '171', '95', '105', '139', '159', '16', '113', '24', '101', '67', '58', '49', '37', '110', '138', '137', '120', '147', '0', '160', '135', '2', '140', '46', '44', 

In [5]:
def gen_vec(row):
    row['vec'] = np.zeros(188)
    index_list = row.tags.split()
    for index_ in index_list:
        row.vec[int(index_)] = 1.0
    return row

questions_data = questions_data.apply(gen_vec, axis='columns')
questions_data.head()

Unnamed: 0,question_id,tags,vec
0,0,51 131 162 38,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,131 36 81,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,131 101 162 92,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,131 149 162 29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,131 5 162 38,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [6]:
train_dtypes_dict = {
    "row_id": "int64",
    #"timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    #"task_container_id": "int16",
    #"user_answer": "int8",
    "answered_correctly": "int8",
    #"prior_question_elapsed_time": "float32", 
    #"prior_question_had_explanation": "boolean"
}

train_data = pd.read_csv("../input/riiid-test-answer-prediction/train.csv",
                         #nrows=10**5,
                         usecols = train_dtypes_dict.keys(),
                         dtype=train_dtypes_dict,
                         #index_col = 0,
                        )

In [7]:
train_data_q = train_data[train_data.content_type_id == 0]
print(len(train_data_q))
print(len(train_data_q.user_id.unique()))
print(train_data_q.head())

99271300
393656
   row_id  user_id  content_id  content_type_id  answered_correctly
0       0      115        5692                0                   1
1       1      115        5716                0                   1
2       2      115         128                0                   1
3       3      115        7860                0                   1
4       4      115        7922                0                   1


In [10]:
#del train_data
train_data_q_sample = train_data_q.sample(n=10**6, random_state=1)
train_data_q_sample.head()

Unnamed: 0,row_id,user_id,content_id,content_type_id,answered_correctly
28002082,28002082,597853231,2910,0,1
52041514,52041514,1103993007,9183,0,0
94925272,94925272,2015536216,10419,0,1
35713452,35713452,762527375,5579,0,0
33515886,33515886,718268173,193,0,1


In [11]:
def cal_vec(train_row,ele_dict,num_dict,q_data=questions_data):
    num_dict[train_row.user_id] += q_data.vec[train_row.content_id]
    ele_dict[train_row.user_id] += q_data.vec[train_row.content_id] * train_row.answered_correctly

In [12]:
ele_dict = dict()
num_dict = dict()
for index, row in tqdm(train_data_q_sample.iterrows()):
    if row.user_id in ele_dict.keys():
        cal_vec(row,ele_dict,num_dict)
    else:
        ele_dict[row.user_id] = np.zeros(188)
        num_dict[row.user_id] = np.zeros(188)
        cal_vec(row,ele_dict,num_dict)

1000000it [03:52, 4306.19it/s]


In [13]:
print(len(ele_dict),len(num_dict))
#print(ele_dict[115], num_dict[115])

190675 190675


In [14]:
def predict_y(row_data, ele_dict, num_dict, q_data=questions_data):
    y = np.ones(len(row_data))*0.5
    i = 0
    for index, row in tqdm(row_data.iterrows()):
        mask = q_data.vec[row.content_id]
        if row.user_id in ele_dict:
            y[i] = sum(np.nan_to_num(ele_dict[row.user_id]/num_dict[row.user_id],nan=0.5) * mask)/sum(mask)
        i = i + 1
    return y

In [28]:
#test_sample = train_data_q.sample(n=10**5, random_state=6)
test_sample = train_data_q_sample.sample(n=10**5, random_state=7)
prob = predict_y(test_sample,ele_dict, num_dict)

  import sys
100000it [00:43, 2297.45it/s]


In [30]:
from sklearn.metrics import roc_auc_score
print(prob)
print(test_sample.answered_correctly.values)
print(roc_auc_score(test_sample.answered_correctly.values, prob))

[0.         0.83333333 0.77407407 ... 1.         0.         1.        ]
[0 1 1 ... 1 0 1]
0.9935607883394175
