- This notebook is a continuation of [riiid-word2vec-with-content-id](https://www.kaggle.com/imazekishota/riiid-word2vec-with-content-id), which trains the word2vec vectors loaded below.
- Base Notebook: [Riiid! LGBM bagging2](https://www.kaggle.com/zephyrwang666/riiid-lgbm-bagging2)

# Import

In [None]:
# Debug switch: when True, the sampling step keeps only the first 10,000 rows.
DEBUG = True

In [None]:
# Install datatable from the wheel shipped in the attached input dataset;
# stdout and stderr of pip are discarded.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import numpy as np
import random
import pandas as pd
import datatable as dt
import gc
from tqdm.notebook import tqdm
from collections import defaultdict

from gensim.models import Word2Vec, KeyedVectors
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
from sklearn.metrics import roc_auc_score

In [None]:
# Exclusive upper bound of the lag loops: range(1, N) yields lag_word1 .. lag_word{N-1}.
N = 3

# Read Data

In [None]:
# Columns requested from train.csv, each paired with a dtype string.
# The entry user_answer='int8' is disabled, so that column is not requested.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Name of the label column used for training.
target = 'answered_correctly'

In [None]:
# Read only the columns named in data_types_dict (its keys), then hand the
# result over to pandas.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict),
).to_pandas()

In [None]:
# Delete lecture rows: keep only rows whose content_type_id is 0/False.
# Comparing with 0 gives the same mask whether the column was inferred as bool
# or as an integer, whereas `~` is a logical NOT only on a bool column (on an
# integer column it is a bitwise NOT and does not produce a boolean mask).
train_df = train_df[train_df['content_type_id'] == 0]
# Rebind instead of dropping in place, so the frame obtained from the
# boolean-mask selection above is never mutated.
train_df = train_df.drop('content_type_id', axis=1)
gc.collect()

# Feature Engineering

In [None]:
# Encode each row as a single integer "word": content_id plus 100000 times
# answered_correctly.
# NOTE(review): the sixth-digit encoding assumes content_id < 100000 and
# answered_correctly in {0, 1}, and that the column dtypes are wide enough to
# hold the sum — confirm against the loaded data.
train_df['word'] = train_df['content_id'] + train_df['answered_correctly'] * 100000  # the sixth digit carries answered_correctly

In [None]:
# Lag features: lag_word{n} is the same user's `word` from n rows earlier, or
# -1 when the user has no such earlier row.
for n in range(1, N):
    # Build the column in a single assignment. Calling fillna(inplace=True) on
    # train_df[col] mutates a temporary under pandas copy-on-write, leaving the
    # NaNs in place and making the int32 cast fail.
    train_df[f'lag_word{n}'] = (
        train_df.groupby('user_id')['word']
        .shift(n)
        .fillna(-1)
        .astype('int32')
    )

# sampling
# Take an independent copy of the row slice so the column assignments made
# later do not write into a slice of the full frame.
if DEBUG:
    train_df = train_df[:10000].copy()
else:
    train_df = train_df[1200*10000:2*1200*10000].copy()

train_df.head()

In [None]:
# Run a garbage-collection pass before the word2vec feature step.
gc.collect()

## word2vec Feature
- It takes a lot of time to create this feature...

In [None]:
# Load the pretrained vectors from a binary word2vec-format file. The lookups
# below address them with str(...) of the integer `word` values.
wv = KeyedVectors.load_word2vec_format('../input/riiid-word2vec/vec.pt', binary=True)

def cosine_similarity(v1, v2):
    """Return the cosine similarity of vectors ``v1`` and ``v2``.

    Args:
        v1: 1-D array-like of numbers.
        v2: 1-D array-like of numbers with the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (the ratio would otherwise be 0/0, i.e. NaN).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # Undefined direction: report "no similarity" instead of NaN.
        return 0.0
    return np.dot(v1, v2) / norm_product

def apply_cosine_similarity(row, is_correct=True):
    """Score one row: similarity between its lag words and a candidate word.

    The lag columns listed in the module-level ``cols`` are looked up in the
    module-level ``wv`` vectors and averaged. The candidate word is the row's
    ``content_id``, offset by 100000 when ``is_correct`` is true.

    Args:
        row: Mapping-like row exposing every column in ``cols`` plus
            ``'content_id'``.
        is_correct: When true, compare against the word ``content_id + 100000``;
            otherwise against the word ``content_id``.

    Returns:
        The cosine similarity between the mean lag vector and the candidate
        vector, or ``0`` when no lag word is usable or the candidate word has
        no vector.
    """
    lag_vectors = []
    for col in cols:
        if row[col] == -1:
            # -1 marks "no earlier row" for this lag.
            continue
        lag_word = str(row[col])
        # Skip lag words without a vector instead of raising KeyError; the
        # candidate word below is guarded the same way. `key in wv` is the
        # membership test KeyedVectors itself provides.
        if lag_word in wv:
            lag_vectors.append(wv.get_vector(lag_word))

    if is_correct:
        word = str(row['content_id'] + 100000)
    else:
        word = str(row['content_id'])

    if not lag_vectors or word not in wv:
        return 0

    mean_lag_vector = np.mean(lag_vectors, axis=0)
    return cosine_similarity(mean_lag_vector, wv.get_vector(word))

In [None]:
# Lag columns consumed by apply_cosine_similarity, which reads this
# module-level `cols` list.
cols = [f'lag_word{n}' for n in range(1, N)]

# Row-wise similarity of the lag words to the "answered correctly" word and to
# the "answered incorrectly" word of the current content_id.
# A single column label is used as the key: assigning a Series to a
# one-element list key (df[["name"]] = series) is list-key assignment and
# raises "Columns must be same length as key" on current pandas.
train_df['cos_sim_correct'] = (
    train_df[cols + ['content_id']]
    .apply(apply_cosine_similarity, axis=1)
    .astype('float32')
)
train_df['cos_sim_incorrect'] = (
    train_df[cols + ['content_id']]
    .apply(apply_cosine_similarity, axis=1, is_correct=False)
    .astype('float32')
)

# The lag/word helper columns are only needed to compute the similarities.
train_df.drop(cols + ['word'], axis=1, inplace=True)
train_df.head()

# Train

In [None]:
# Model input columns, in the order they are fed to the model, each paired
# with a dtype string.
features_dict = dict(
    timestamp='float16',
    content_id='int16',
    task_container_id='int16',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='int16',
    cos_sim_correct='float32',
    cos_sim_incorrect='float32',
)

# Columns passed to the model as categorical features.
categorical_columns = ['content_id', 'task_container_id']

# Ordered list of feature names (the keys of features_dict).
features = [*features_dict]

In [None]:
flag_lgbm = True
clfs = []
# NOTE(review): LightGBM only applies bagging_fraction when bagging_freq is
# non-zero, and no bagging_freq is set here — confirm whether bagging is
# intended before relying on this value.
params = {
    'num_leaves': 300,
    'max_bin': 450,
    'feature_fraction': 0.52,
    'bagging_fraction': 0.52,
    'objective': 'binary',
    'learning_rate': 0.05,
    "boosting_type": "gbdt",
    "metric": 'auc',
}
trains = []
valids = []
num = 1
for i in range(num):
    # Hold out every row of a random 8% of the users.
    users = train_df['user_id'].drop_duplicates().sample(frac=0.08)
    # Boolean selection keeps train_df's own index labels, which the drop
    # below relies on. The former merge(on=..., right_index=True) mixed two
    # mutually exclusive ways of naming the join key to get that effect.
    valid_df_newuser = train_df[train_df['user_id'].isin(users)]
    del users
    gc.collect()

    # Remove the held-out users, then move a random 10% of the remaining rows
    # into the validation set as well.
    train_df.drop(valid_df_newuser.index, inplace=True)
    valid_df = train_df.sample(frac=0.1)
    train_df.drop(valid_df.index, inplace=True)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    valid_df = pd.concat([valid_df, valid_df_newuser])
    del valid_df_newuser
    gc.collect()

    trains.append(train_df)
    valids.append(valid_df)
    print('train_df_clf length：',len(train_df))
    print('valid_df length：',len(valid_df))

In [None]:
# Drop the module-level names of both frames (the trains/valids lists still
# reference them), then run a garbage-collection pass.
del train_df, valid_df
gc.collect()

In [None]:
# Train one LightGBM model per prepared (train, valid) pair and plot its
# gain-based feature importance.
for i in range(num):
    X_train_np = trains[i][features].values.astype(np.float32)
    X_valid_np = valids[i][features].values.astype(np.float32)
    tr_data = lgb.Dataset(X_train_np, label=trains[i][target], feature_name=list(features))
    va_data = lgb.Dataset(X_valid_np, label=valids[i][target], feature_name=list(features))

    # Release only this iteration's frames. Deleting the `trains` / `valids`
    # lists themselves here would raise NameError on the next iteration
    # whenever num > 1.
    trains[i] = None
    valids[i] = None
    del X_train_np, X_valid_np
    gc.collect()

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0 in favour of callbacks; they
    # are kept for the LightGBM version this notebook was written against.
    model = lgb.train(
        params,
        tr_data,
        num_boost_round=5000,
        valid_sets=[tr_data, va_data],
        early_stopping_rounds=50,
        feature_name=features,
        categorical_feature=categorical_columns,
        verbose_eval=50
    )
    clfs.append(model)
    print('Training done!!!')

    fig, ax = plt.subplots(figsize=(15, 15))
    lgb.plot_importance(model, ax=ax, importance_type='gain', max_num_features=50)
    plt.show()

    del tr_data, va_data
    gc.collect()

# Every slot has been released above; drop the now-empty containers too.
del trains, valids