In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

with open('../input/pretrained-for-riiid/user_ele_dict.pkl', 'rb') as f:
    user_ele_dict = pickle.load(f)

with open('../input/pretrained-for-riiid/user_num_dict.pkl', 'rb') as f:
    user_num_dict = pickle.load(f)


In [2]:
question_dtype = {
    'question_id':'int16',
    'tags':'object'
}
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv',
                             usecols = question_dtype.keys(), 
                             dtype = question_dtype)

questions_data.tags.fillna('92',inplace=True)

def gen_vec(row):
    row['vec'] = np.zeros(188)
    index_list = row.tags.split()
    for index_ in index_list:
        row.vec[int(index_)] = 1.0
    return row

questions_data = questions_data.apply(gen_vec, axis='columns')

In [3]:
questions_data

Unnamed: 0,question_id,tags,vec
0,0,51 131 162 38,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,131 36 81,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,131 101 162 92,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,131 149 162 29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,131 5 162 38,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
13518,13518,14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13519,13519,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
13520,13520,73,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13521,13521,125,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [4]:
data = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
data['prior_question_had_explanation'] = data['prior_question_had_explanation'].astype('bool')
data = data[data.content_type_id == 0]
data.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,False
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False


In [5]:
len(data)

99271300

In [None]:
#train_data = data.sample(n=1000000, random_state = 1)

In [None]:
train_data.head()

In [6]:
def pre_data(df, ele_dict=user_ele_dict, num_dict=user_num_dict, q_data=questions_data):
    m = len(df)
    X = np.zeros((m,188*2))
    y = np.zeros(m)
    i = 0
    for index, row in tqdm(df.iterrows()):
        mask = q_data.vec[row.content_id]
        X[i,:188] = np.nan_to_num(ele_dict[row.user_id]/num_dict[row.user_id],nan=0.25)
        X[i,188:] = mask
        y[i] = row.answered_correctly
        i = i+1
    return X, y

In [18]:
del X,y,X_big,y_big,X_valid,y_valid,X_train,y_train,lgb_train,lgb_eval

In [13]:
X, y = pre_data(data[2000000:3000000])

  
1000000it [03:57, 4206.86it/s]


In [14]:
from sklearn.model_selection import train_test_split
X_big, X_test, y_big, y_test = train_test_split(X, y, test_size=0.1, random_state=40)
X_train, X_valid, y_train, y_valid = train_test_split(X_big, y_big, test_size=0.1, random_state=10)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

(810000, 376) (810000,)
(90000, 376) (90000,)
(100000, 376) (100000,)


In [15]:
import lightgbm as lgb
params = {'objective': 'binary',
          'metric': 'auc',
          'seed': 2020,
          'learning_rate': 0.1, #default
          "boosting_type": "gbdt" #default
         }
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid)

In [10]:
model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)

Training until validation scores don't improve for 8 rounds
[50]	training's auc: 0.69697	valid_1's auc: 0.694217
[100]	training's auc: 0.727064	valid_1's auc: 0.722577
[150]	training's auc: 0.742927	valid_1's auc: 0.737319
[200]	training's auc: 0.753612	valid_1's auc: 0.746695
[250]	training's auc: 0.761646	valid_1's auc: 0.753349
[300]	training's auc: 0.767841	valid_1's auc: 0.758021
[350]	training's auc: 0.772919	valid_1's auc: 0.761418
[400]	training's auc: 0.777334	valid_1's auc: 0.764373
[450]	training's auc: 0.780584	valid_1's auc: 0.766284
[500]	training's auc: 0.783449	valid_1's auc: 0.767657
Early stopping, best iteration is:
[497]	training's auc: 0.783354	valid_1's auc: 0.767706


In [11]:
model.save_model('lgb-model.txt')

<lightgbm.basic.Booster at 0x7fe3c60a5ed0>

In [16]:
model = lgb.train(
    params, lgb_train,
    init_model = 'lgb-model.txt',
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)

Training until validation scores don't improve for 8 rounds
[500]	training's auc: 0.7832	valid_1's auc: 0.784416
[550]	training's auc: 0.789779	valid_1's auc: 0.789003
[600]	training's auc: 0.793093	valid_1's auc: 0.790484
[650]	training's auc: 0.795708	valid_1's auc: 0.791438
Early stopping, best iteration is:
[684]	training's auc: 0.797153	valid_1's auc: 0.79187


In [17]:
model.save_model('lgb-model.txt')

<lightgbm.basic.Booster at 0x7fe3c6ef3cd0>

In [19]:
predict_prob = model.predict(X_test)

In [21]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predict_prob)

0.7929800453001778

In [22]:
model = lgb.Booster(model_file='lgb-model.txt')

In [23]:
predict_prob = model.predict(X_test)

In [24]:
roc_auc_score(y_test, predict_prob)

0.7929800453001778

In [13]:
import joblib
# save model
joblib.dump(model, 'lgb.pkl')

['lgb.pkl']

In [14]:
gbm_pickle = joblib.load('lgb.pkl')

In [15]:
predict_prob = gbm_pickle.predict(X_test)

In [None]:
# def predict_y(row_data, user_ele, user_num, ques_ele=ques_ele_vec, ques_num=ques_num_vec, q_data=questions_data):
#     y = np.ones(len(row_data))*0.25
#     i = 0
#     for index, row in row_data.iterrows():
#         mask = q_data.vec[row.content_id]
#         ques_prob_vec = ques_ele/ques_num
#         if row.user_id in user_ele.keys():
#             borrow_index = (user_num[row.user_id] == 0)
#             user_prob_vec = user_ele[row.user_id]/user_num[row.user_id]
#             user_prob_vec[borrow_index] = ques_prob_vec[borrow_index]
#             y[i] = sum(user_prob_vec * mask)/sum(mask)
#         else:
#             y[i] = sum(ques_prob_vec * mask)/sum(mask)
#         i = i + 1
#     return y

In [None]:
# import riiideducation
# env = riiideducation.make_env()
# iter_test = env.iter_test()
# for (test_df, sample_prediction_df) in iter_test:
#     test_df = test_df[test_df.content_type_id == 0]
#     test_df['answered_correctly'] = predict_y(test_df,user_ele_dict, user_num_dict)
#     #print('-----------------------------------------')
#     #print(test_df[['row_id', 'answered_correctly']])
#     #print('-----------------------------------------')
#     env.predict(test_df[['row_id', 'answered_correctly']])