In [1]:
# Pinned install of pytorch-tabnet. `%pip` (instead of `!pip`) installs into the
# environment of the running kernel rather than whatever `pip` is first on PATH.
%pip install pytorch-tabnet==3.1.1

Collecting pytorch-tabnet==3.1.1
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from torch import nn
from pytorch_tabnet.tab_model  import TabNetClassifier 
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read the raw train and test tables from the local data directory.
train_path, test_path = 'data/train.csv', 'data/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick sanity check: row/column count followed by the first few rows.
print(train_df.shape)
train_df.head(3)


(501951, 35)


Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,...,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,...,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,...,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0


In [5]:
# Validation split by the month of `contents_open_dt`:
#   months before November -> train, November -> validation.
# Rows from any later month (if present) end up in neither split.
# The month is parsed once, vectorized, instead of building a pd.Timestamp per
# row in a Python lambda (and doing so twice).
open_month = pd.to_datetime(train_df['contents_open_dt']).dt.month

# .copy() so that later column edits do not touch (or warn about) train_df.
train = train_df[open_month < 11].copy()
val = train_df[open_month == 11].copy()
test = test_df.copy()

print(train.shape)
train.head(3)

(456972, 35)


Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,...,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,...,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,...,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0


간단한 Preprocessing


In [6]:
# Remove the variables that are not fed to TabNet from every frame.
# contents_rn / person_rn are the content and user numbers.
unused_columns = ['contents_open_dt','contents_rn','id','person_rn','person_prefer_f','person_prefer_g']
for df in [train, val, test]:
    df.drop(columns=unused_columns, inplace=True)

# Use one fixed (alphabetical) feature order for all three frames, with the
# target appended last for train/val. Multiplying by 1 turns the boolean
# *_match_yn columns into 0/1 integers and leaves integer columns unchanged.
columns = sorted(test.columns)
feature_and_target = columns + ['target']
train = train[feature_and_target] * 1
val = val[feature_and_target] * 1
test = test[columns] * 1

In [7]:
columns

['contents_attribute_a',
 'contents_attribute_c',
 'contents_attribute_d',
 'contents_attribute_e',
 'contents_attribute_h',
 'contents_attribute_i',
 'contents_attribute_j',
 'contents_attribute_j_1',
 'contents_attribute_k',
 'contents_attribute_l',
 'contents_attribute_m',
 'd_l_match_yn',
 'd_m_match_yn',
 'd_s_match_yn',
 'h_l_match_yn',
 'h_m_match_yn',
 'h_s_match_yn',
 'person_attribute_a',
 'person_attribute_a_1',
 'person_attribute_b',
 'person_prefer_c',
 'person_prefer_d_1',
 'person_prefer_d_2',
 'person_prefer_d_3',
 'person_prefer_e',
 'person_prefer_h_1',
 'person_prefer_h_2',
 'person_prefer_h_3']

In [8]:
train

Unnamed: 0,contents_attribute_a,contents_attribute_c,contents_attribute_d,contents_attribute_e,contents_attribute_h,contents_attribute_i,contents_attribute_j,contents_attribute_j_1,contents_attribute_k,contents_attribute_l,...,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,target
0,3,1,275,4,139,3,2,10,2,1608,...,3,5,275,370,369,8,4,95,59,1
1,3,1,275,4,133,1,1,5,2,1608,...,4,1,114,181,175,4,131,101,96,0
2,1,1,94,4,53,3,2,10,1,1600,...,3,5,464,175,452,3,54,263,56,0
3,3,1,275,3,74,1,1,5,2,1608,...,2,5,703,705,704,3,72,227,2,0
4,1,1,275,4,74,1,2,10,2,1608,...,4,5,275,370,369,4,214,210,209,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,3,1,147,5,65,3,1,5,2,354,...,2,2,1192,935,1228,3,59,4,95,1
501947,3,1,120,4,142,3,2,10,2,163,...,2,1,118,113,110,4,105,142,95,1
501948,1,1,147,7,65,3,1,5,2,438,...,4,1,147,46,145,4,59,127,139,1
501949,2,1,147,4,259,3,1,5,2,660,...,2,1,46,147,145,4,251,49,258,1


In [9]:
# cat_idxs: positional index of every categorical column. `target` is the last
#           column of `train`, so the indices also line up with the feature
#           matrix obtained by dropping it.
# cat_dims: number of distinct values each column has after label encoding,
#           plus one extra code reserved for values the encoder has not seen.
cat_idxs = []
cat_dims = []

for idx, col in enumerate(train.columns):
    # Columns whose name contains 'match' and the target are not label-encoded.
    if 'match' in col or col == 'target':
        continue

    # The encoder is fitted on the full raw training file (train_df), i.e. on
    # the train and validation rows together.
    le = LabelEncoder()
    le.fit(train_df[col].values)
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

    # Any value missing from the mapping gets the next free code.
    unknown_code = len(le_dict)

    # Vectorized lookup: unmapped values come back as NaN from .map and are
    # then filled with the unknown code (replaces a per-row Python lambda).
    for frame in (train, val, test):
        frame[col] = frame[col].map(le_dict).fillna(unknown_code).astype('int64')

    cat_idxs.append(idx)
    cat_dims.append(unknown_code + 1)

In [12]:
# The raw attribute codes are not consecutive; some numbers are skipped.
# Label encoding remaps every variable to categorical codes that start at 0.
# NOTE: le_dict is whatever the encoding loop assigned last, so this shows the
# mapping of the last encoded column only.
le_dict

{2: 0,
 4: 1,
 5: 2,
 6: 3,
 7: 4,
 8: 5,
 9: 6,
 10: 7,
 11: 8,
 12: 9,
 13: 10,
 14: 11,
 15: 12,
 16: 13,
 17: 14,
 18: 15,
 19: 16,
 20: 17,
 21: 18,
 22: 19,
 23: 20,
 24: 21,
 25: 22,
 26: 23,
 27: 24,
 28: 25,
 29: 26,
 31: 27,
 32: 28,
 33: 29,
 34: 30,
 35: 31,
 36: 32,
 37: 33,
 38: 34,
 39: 35,
 40: 36,
 41: 37,
 42: 38,
 43: 39,
 44: 40,
 45: 41,
 46: 42,
 47: 43,
 49: 44,
 50: 45,
 51: 46,
 52: 47,
 53: 48,
 54: 49,
 55: 50,
 56: 51,
 57: 52,
 59: 53,
 60: 54,
 61: 55,
 63: 56,
 64: 57,
 65: 58,
 66: 59,
 67: 60,
 68: 61,
 69: 62,
 70: 63,
 72: 64,
 73: 65,
 74: 66,
 75: 67,
 76: 68,
 77: 69,
 79: 70,
 80: 71,
 81: 72,
 82: 73,
 83: 74,
 84: 75,
 86: 76,
 87: 77,
 88: 78,
 89: 79,
 90: 80,
 91: 81,
 93: 82,
 95: 83,
 96: 84,
 97: 85,
 98: 86,
 99: 87,
 100: 88,
 101: 89,
 102: 90,
 103: 91,
 104: 92,
 105: 93,
 106: 94,
 107: 95,
 108: 96,
 109: 97,
 113: 98,
 114: 99,
 115: 100,
 116: 101,
 117: 102,
 118: 103,
 119: 104,
 120: 105,
 121: 106,
 122: 107,
 123: 108,
 124: 

In [13]:
# Compared with the train sample printed earlier, the values have changed:
# the non-match columns now hold label-encoded codes.
train.head(3)

Unnamed: 0,contents_attribute_a,contents_attribute_c,contents_attribute_d,contents_attribute_e,contents_attribute_h,contents_attribute_i,contents_attribute_j,contents_attribute_j_1,contents_attribute_k,contents_attribute_l,...,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,target
0,2,0,230,4,109,2,1,8,1,1423,...,3,4,238,322,313,8,1,83,53,1
1,2,0,230,4,103,0,0,4,1,1423,...,4,0,95,161,154,4,116,89,84,0
2,0,0,74,4,44,2,1,8,0,1415,...,3,4,402,155,373,3,49,237,51,0


In [14]:
# Turn each frame into plain NumPy arrays: features (everything except the
# target column) and, for train/val, the label vector.
target_col = 'target'
X_train, y_train = train.drop(columns=target_col).to_numpy(), train[target_col].to_numpy()
X_val, y_val = val.drop(columns=target_col).to_numpy(), val[target_col].to_numpy()
X_test = test.to_numpy()

# Validation pair kept under one name for convenience.
eval_set = (X_val, y_val)

In [16]:
X_train

array([[  2,   0, 230, ...,   1,  83,  53],
       [  2,   0, 230, ..., 116,  89,  84],
       [  0,   0,  74, ...,  49, 237,  51],
       ...,
       [  0,   0, 125, ...,  53, 112, 124],
       [  1,   0, 125, ..., 225,  44, 232],
       [  2,   0, 125, ..., 251,  27,  44]], dtype=int64)

In [18]:
cat_idxs

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]

In [19]:
cat_dims

[4,
 5,
 1066,
 13,
 251,
 4,
 3,
 10,
 3,
 1753,
 6,
 3,
 9,
 7,
 6,
 1094,
 1082,
 1044,
 13,
 280,
 280,
 280]

In [22]:
# Settings passed to the TabNet classifier; anything not listed here keeps the
# library default.
tabnet_params = {
    'cat_idxs': cat_idxs,                # positions of the categorical columns
    'cat_dims': cat_dims,                # number of codes per categorical column
    'cat_emb_dim': 3,                    # embedding size for the categorical columns
    'optimizer_fn': torch.optim.AdamW,   # any optimizer works here
    'mask_type': 'entmax',               # alternative: "sparsemax"
}
clf = TabNetClassifier(**tabnet_params)

Device used : cuda


In [23]:
class F1_Score(Metric):
    """Custom evaluation metric: F1 of the labels obtained by thresholding at 0.5."""

    def __init__(self):
        # Higher is better; the metric is reported under the name "f1".
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of `y_true` against thresholded `y_score`.

        `y_score` is indexed as a 2-D array; column 1 is compared with 0.5 and
        the resulting boolean mask is converted to 0/1 labels.
        """
        positive_score = y_score[:, 1]
        y_pred = (positive_score > 0.5) * 1
        return f1_score(y_true, y_pred)

In [24]:
# Fit on the training rows and report logloss and f1 on both the training and
# the validation set after every epoch. 'f1' presumably resolves to the custom
# F1_Score metric through its `_name` (the log below prints train_f1 / val_f1).
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'val'],
    eval_metric=['logloss', 'f1'],
    max_epochs=100,
    patience=2,
    batch_size=1024,
    virtual_batch_size=256,
    num_workers=1,
    drop_last=False,
)
# Experiment notes:
#   max_epochs=100, patience=2 (previous setting) -> best_val_f1 = 0.64264
#   max_epochs=500, patience=4 (previous setting) -> best_val_f1 = 0.6459
#
# However, the score on the public leaderboard dropped.
# Overfitting?

epoch 0  | loss: 0.68623 | train_logloss: 0.67736 | train_f1: 0.55951 | val_logloss: 0.68047 | val_f1: 0.54833 |  0:00:23s
epoch 1  | loss: 0.67304 | train_logloss: 0.6628  | train_f1: 0.64143 | val_logloss: 0.66983 | val_f1: 0.62861 |  0:00:46s
epoch 2  | loss: 0.65589 | train_logloss: 0.64792 | train_f1: 0.64619 | val_logloss: 0.65734 | val_f1: 0.62822 |  0:01:10s
epoch 3  | loss: 0.64645 | train_logloss: 0.64011 | train_f1: 0.66003 | val_logloss: 0.65508 | val_f1: 0.64027 |  0:01:33s
epoch 4  | loss: 0.6415  | train_logloss: 0.63307 | train_f1: 0.65627 | val_logloss: 0.65284 | val_f1: 0.63134 |  0:01:56s
epoch 5  | loss: 0.63746 | train_logloss: 0.62959 | train_f1: 0.66969 | val_logloss: 0.65299 | val_f1: 0.64264 |  0:02:20s
epoch 6  | loss: 0.6338  | train_logloss: 0.62624 | train_f1: 0.65991 | val_logloss: 0.64993 | val_f1: 0.62962 |  0:02:43s
epoch 7  | loss: 0.63113 | train_logloss: 0.62402 | train_f1: 0.66925 | val_logloss: 0.65036 | val_f1: 0.64015 |  0:03:06s
epoch 8  | loss:

In [39]:
# Class probabilities for the test rows; the output below shows two columns per row.
# Column 1 is the one thresholded at 0.5 elsewhere in this notebook
# (presumably the probability of target == 1 — confirm against the classifier's class order).
preds = clf.predict_proba(X_test)
preds

array([[0.6034333 , 0.39656663],
       [0.67914814, 0.32085183],
       [0.3777823 , 0.62221766],
       ...,
       [0.39370224, 0.6062978 ],
       [0.3009409 , 0.69905907],
       [0.27082747, 0.7291725 ]], dtype=float32)

In [40]:
preds[:,1]

array([0.39656663, 0.32085183, 0.62221766, ..., 0.6062978 , 0.69905907,
       0.7291725 ], dtype=float32)

In [41]:
# Threshold column 1 of the probability matrix at 0.5 to get hard 0/1 labels.
# The result is stored under its own name: rebinding `preds` to the 1-D label
# vector made this cell fail on a second run (`preds[:, 1]` on a 1-D array).
# The labels used for the submission are recomputed from `clf` in the next cell.
pred_labels = (preds[:, 1] > 0.5) * 1
pred_labels

array([0, 0, 1, ..., 1, 1, 1])

In [25]:
# Predict class probabilities for the test rows, then keep a 0/1 label per row
# by comparing column 1 with the 0.5 cut-off.
proba = clf.predict_proba(X_test)
preds = (proba[:, 1] > 0.5) * 1

In [27]:
# Start from the sample submission file and overwrite its target column with
# the predicted labels, then look at the last rows as a sanity check.
submission = pd.read_csv('data/sample_submission.csv').assign(target=preds)
submission.tail()

Unnamed: 0,id,target
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1
46403,46403,1


In [28]:
# Write the submission file. The output directory is created first because
# to_csv raises an OSError when the parent directory does not exist.
submission_path = Path('submit/share_tabnet2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)


### 진행해보며
- tabnet은 hyperparameter tuning 이 중요하다고 한다. 나중에 tabnet을 더 진행할 것이라면 sweep 같은 것을 고려해보아야 할 수 있다.
- 공유코드 그대로 사용한 것이기 때문에, 내가 하나씩 구현하며 tabnet 구조 등을 파악해볼 필요성도 있다.
- tabnet은 boosting과 DL의 중간다리 느낌이라고 한다. 일반적으로 정형 데이터에서는 트리 기반 부스팅 기법들(xgb, lgbm ...)이 성능이 좋다고 한다. 그런데 딥러닝 개념도 섞은 tabnet도 성능이 꽤 잘 나온다고 한다. 결국 데이터마다 다른 듯하다(데이터 바이 데이터).

https://bigwaveai.tistory.com/7?category=953606  
https://lv99.tistory.com/83