In [45]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from dataset import feature_engineering, custom_train_test_split, make_dataset
import os
import pandas as pd
from matplotlib import pyplot as plt
import torch


import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet

import torch
from tqdm import tqdm

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc

In [46]:
def get_metric(targets, preds, threshold=0.5):
    """Print AUC, accuracy, precision, recall and F1 for binary predictions.

    Parameters
    ----------
    targets : array-like
        Ground-truth binary labels.
    preds : array-like
        Predicted scores for the positive class. Lists are accepted; the
        values are converted to a NumPy array before thresholding.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1, others 0.

    Returns
    -------
    None
        The metrics are only printed, so calling this as the last line of a
        cell produces no extra ``Out[...]`` display.
    """
    preds = np.asarray(preds)
    # Threshold once and reuse the hard labels for every label-based metric.
    labels = np.where(preds >= threshold, 1, 0)

    # Local names avoid shadowing the `auc` function imported from sklearn.metrics.
    auc_value = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, labels)
    precision = precision_score(targets, labels)
    recall = recall_score(targets, labels)
    f1 = f1_score(targets, labels)

    print('auc :',auc_value)
    print('acc :',acc)
    print('precision :',precision)
    print('recall :',recall)
    # F1 used to be computed and then silently dropped; report it as well.
    print('f1 :',f1)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)

In [35]:
# Columns listed as categorical. Commented-out names are currently disabled;
# uncomment to include them.
cate_cols = [
    # "assessmentItemID",
    # "testId",
    # "KnowledgeTag",
    "hour",
    "dow",
    "i_head",
    "i_mid",
    "i_tail",
]

# Columns listed as continuous. Order matters: it fixes the column order of
# the feature matrix built from FEATS.
cont_cols = [
    "user_correct_answer",
    "user_total_answer",
    "user_acc",
    "t_elapsed",
    "cum_correct",
    # "last_problem",
    "head_term",
    # "left_asymptote",
    "elo_prob",
    "pkt",
    "u_head_mean",
    "u_head_count",
    "u_head_std",
    "u_head_elapsed",
    "i_mid_elapsed",
    "i_mid_mean",
    "i_mid_std",
    "i_mid_sum",
    "i_mid_count",
    "i_mid_tag_count",
    "assessment_mean",
    "assessment_sum",
    # "assessment_std",
    "tag_mean",
    "tag_sum",
    # "tag_std",
    "tail_mean",
    "tail_sum",
    # "tail_std",
    "hour_mean",
    "hour_sum",
    # "hour_std",
    "dow_mean",
    "dow_sum",
    # "dow_std",
    "tag_elapsed",
    "tag_elapsed_o",
    "tag_elapsed_x",
    "assessment_elapsed",
    "assessment_elapsed_o",
    "assessment_elapsed_x",
    "tail_elapsed",
    "tail_elapsed_o",
    "tail_elapsed_x",
]

# Full model input: categorical columns first, then continuous ones.
FEATS = [*cate_cols, *cont_cols]

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


In [18]:
# Load the full training frame from a pickle.
# NOTE(review): the absolute path is environment-specific, and unpickling can
# execute arbitrary code — only load files from a trusted source.
train_data = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/all.pkl')

# Number of distinct values per categorical column, in the same order as
# cate_cols. Series.nunique() already returns a scalar, which avoids calling
# int() on a one-element Series (deprecated in recent pandas).
categorical_dims = [int(train_data[cate].nunique()) for cate in cate_cols]

In [4]:
# Display the categorical column names.
# NOTE(review): the recorded output below lists only two names while the
# cate_cols definition in this notebook has five — the cell output looks
# stale; re-run after Restart & Run All.
cate_cols

['hour', 'dow']

In [5]:
# Display the number of distinct values per categorical column.
# NOTE(review): the recorded output below has two entries, which does not
# match the five-entry cate_cols definition in this notebook — likely stale.
categorical_dims

[24, 7]

In [31]:
def _make_tabnet(lr, gamma):
    """Build a TabNetClassifier with Adam + StepLR.

    The five base models differ only in the Adam learning rate (`lr`) and the
    StepLR decay factor (`gamma`); everything else is shared.
    """
    # NOTE(review): cate_cols / categorical_dims are computed in this notebook
    # but are not passed to the classifier here, so every column is fed as a
    # plain numeric input — confirm that this is intended.
    return TabNetClassifier(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=lr),
        scheduler_params={"step_size": 50, "gamma": gamma},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        device_name=DEVICE,
        mask_type='sparsemax',  # "sparsemax", entmax
    )


# Base learners for the blend: one (lr, gamma) combination each.
model01 = _make_tabnet(lr=0.001, gamma=0.9)
model02 = _make_tabnet(lr=0.003, gamma=0.9)
model03 = _make_tabnet(lr=0.005, gamma=0.9)
model04 = _make_tabnet(lr=0.001, gamma=0.5)
model05 = _make_tabnet(lr=0.005, gamma=0.7)

Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda


In [28]:
# User IDs that belong to the validation fold.
cv_valid_frame = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
valid_user = cv_valid_frame['userID'].unique()

# Index labels of the last row of each validation user in train_data.
in_valid_users = train_data['userID'].isin(valid_user)
valid_idx = train_data[in_valid_users].groupby('userID').tail(1).index



In [54]:
# Rows whose index label is in valid_idx form the validation set; every
# other row of train_data is used for training.
is_valid_row = train_data.index.isin(valid_idx)
train = train_data[~is_valid_row]
valid = train_data[is_valid_row]

# Feature frames (columns in FEATS order) and the matching label arrays.
X_train, y_train = train[FEATS], train.answerCode.values
X_valid, y_valid = valid[FEATS], valid.answerCode.values

In [43]:

# pytorch-tabnet works on NumPy arrays and rejects pandas DataFrames in fit
# ("apply X.values when calling fit"), so convert once up front. np.asarray is
# a no-op if the inputs are already arrays.
X_train_arr = np.asarray(X_train)
X_valid_arr = np.asarray(X_valid)

# Training settings shared by all five base models.
fit_kwargs = dict(
    patience=5,
    batch_size=2048,
    virtual_batch_size=128,
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=10,
    weights=1,  # per pytorch-tabnet docs: automated class-balanced sampling
)

# Fit every base model on the same data; the last eval set ('valid') is the
# one reported as best_valid_auc in the training log.
for tabnet_model in (model01, model02, model03, model04, model05):
    tabnet_model.fit(
        X_train_arr,
        y_train,
        eval_set=[
            (X_train_arr, y_train),
            (X_valid_arr, y_valid),
        ],
        **fit_kwargs,
    )

epoch 0  | loss: 0.56924 | train_auc: 0.82886 | valid_auc: 0.81743 |  0:02:22s
epoch 1  | loss: 0.50495 | train_auc: 0.84323 | valid_auc: 0.83397 |  0:04:48s
epoch 2  | loss: 0.48799 | train_auc: 0.85232 | valid_auc: 0.83912 |  0:07:14s
epoch 3  | loss: 0.47845 | train_auc: 0.85559 | valid_auc: 0.843   |  0:09:36s
epoch 4  | loss: 0.47537 | train_auc: 0.85749 | valid_auc: 0.84653 |  0:11:56s
epoch 5  | loss: 0.47291 | train_auc: 0.85879 | valid_auc: 0.85227 |  0:14:17s
epoch 6  | loss: 0.47092 | train_auc: 0.85947 | valid_auc: 0.85541 |  0:16:38s
epoch 7  | loss: 0.4693  | train_auc: 0.86032 | valid_auc: 0.85724 |  0:18:58s
epoch 8  | loss: 0.46815 | train_auc: 0.86077 | valid_auc: 0.85529 |  0:21:19s
epoch 9  | loss: 0.46743 | train_auc: 0.86135 | valid_auc: 0.85795 |  0:23:41s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_auc = 0.85795
Best weights from best epoch are automatically used!
epoch 0  | loss: 0.52666 | train_auc: 0.85059 | valid_auc:

In [49]:
# Feed a NumPy array to predict_proba, matching how the test set is scored
# (test[FEATS].values); X_valid itself stays a DataFrame for later cells.
valid_features = np.asarray(X_valid)

# Probability of the last class column from each base model.
valid_predict01 = model01.predict_proba(valid_features)[:,-1]
valid_predict02 = model02.predict_proba(valid_features)[:,-1]
valid_predict03 = model03.predict_proba(valid_features)[:,-1]
valid_predict04 = model04.predict_proba(valid_features)[:,-1]
valid_predict05 = model05.predict_proba(valid_features)[:,-1]

# Report validation metrics for model01..model05, in that order.
for valid_preds in (
    valid_predict01,
    valid_predict02,
    valid_predict03,
    valid_predict04,
    valid_predict05,
):
    get_metric(y_valid, valid_preds)

auc : 0.8579512604070305
acc : 0.7674731182795699
precision : 0.8571428571428571
recall : 0.6358695652173914
auc : 0.8602133441258095
acc : 0.771505376344086
precision : 0.8694029850746269
recall : 0.6331521739130435
auc : 0.8580668940795559
acc : 0.7567204301075269
precision : 0.850187265917603
recall : 0.6168478260869565
auc : 0.859548450508788
acc : 0.771505376344086
precision : 0.8694029850746269
recall : 0.6331521739130435
auc : 0.8612612742830712
acc : 0.7688172043010753
precision : 0.8426573426573427
recall : 0.654891304347826


In [50]:
# Load the test frame and keep only the rows whose answerCode is -1.
test_all = pd.read_pickle('/opt/ml/level2-dkt-level2-recsys-08/data_pkl/test_data-1.pkl')
needs_prediction = test_all['answerCode'] == -1
test = test_all[needs_prediction]

In [56]:
# Extract the feature matrix once and score it with every base model,
# keeping the probability of the last class column.
test_features = test[FEATS].values

test_predict01 = model01.predict_proba(test_features)[:, -1]
test_predict02 = model02.predict_proba(test_features)[:, -1]
test_predict03 = model03.predict_proba(test_features)[:, -1]
test_predict04 = model04.predict_proba(test_features)[:, -1]
test_predict05 = model05.predict_proba(test_features)[:, -1]

In [57]:
# Attach each base model's validation predictions as new columns on a copy
# of the validation features.
new_valid = X_valid.copy()
for column, values in (
    ('predict01', valid_predict01),
    ('predict02', valid_predict02),
    ('predict03', valid_predict03),
    ('predict04', valid_predict04),
    ('predict05', valid_predict05),
):
    new_valid.loc[:, column] = values

# Same for the test rows, on a copy of the filtered test frame.
new_test = test.copy()
for column, values in (
    ('predict01', test_predict01),
    ('predict02', test_predict02),
    ('predict03', test_predict03),
    ('predict04', test_predict04),
    ('predict05', test_predict05),
):
    new_test.loc[:, column] = values

In [211]:
# Names of the base-model prediction columns fed to the blender:
# 'predict01' through 'predict05'.
NEW_FEATS = [f'predict{model_no:02d}' for model_no in range(1, 6)]

In [219]:
# Blender: logistic regression over the five base-model prediction columns.
# NOTE(review): it is fitted on the validation rows themselves, so any score
# computed on new_valid afterwards is in-sample.
Final = LogisticRegression(max_iter=2000)
blend_inputs = new_valid[NEW_FEATS]
Final.fit(blend_inputs, y_valid)

LogisticRegression(max_iter=2000)

In [220]:
# Blended probability (last class column) for the validation rows.
blend_valid_proba = Final.predict_proba(new_valid[NEW_FEATS])
Final_valid_preds = blend_valid_proba[:, -1]

In [221]:
# Metrics of the blended predictions on the validation rows.
# NOTE(review): Final was fitted on these same rows (new_valid / y_valid), so
# these numbers are in-sample and likely optimistic.
get_metric(y_valid, Final_valid_preds)

auc : 0.8621502081406105
acc : 0.7728494623655914
precision : 0.7900874635568513
recall : 0.7364130434782609


In [201]:
# Blend the base-model test predictions and keep the last class column.
blend_test_proba = Final.predict_proba(new_test[NEW_FEATS])
Final_test_preds = blend_test_proba[:, -1]

# Write the submission file via test_to_csv under the name 'blending_tabnet4'.
test_to_csv(Final_test_preds, 'blending_tabnet4')