Wide & Deep baseline built with pytorch-widedeep (https://github.com/jrzaurin/pytorch-widedeep)

In [3]:
# One-time setup: uncomment the next line if pytorch-widedeep is not installed yet.
# !pip install pytorch-widedeep

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, Precision, F1Score

In [5]:
# Read the train and test CSV files from the working directory (train first, then test).
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Report the training-set size, then preview its first three rows.
print(train_df.shape)
train_df.head(3)


(501951, 35)


Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0


In [6]:
# Columns fed to the wide component, grouped by name prefix.
wide_cols = [
    # contents_attribute_*
    'contents_attribute_a', 'contents_attribute_c', 'contents_attribute_d',
    'contents_attribute_e', 'contents_attribute_h', 'contents_attribute_i',
    'contents_attribute_j', 'contents_attribute_j_1', 'contents_attribute_k',
    'contents_attribute_l', 'contents_attribute_m',
    # *_match_yn flags
    'd_l_match_yn', 'd_m_match_yn', 'd_s_match_yn',
    'h_l_match_yn', 'h_m_match_yn', 'h_s_match_yn',
    # person_attribute_*
    'person_attribute_a', 'person_attribute_a_1', 'person_attribute_b',
    # person_prefer_*
    'person_prefer_c', 'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
    'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
]

# No crossed-column features are used in this notebook.
crossed_cols = None

# The deep component embeds exactly the same columns, in the same order,
# as (column name, embedding dimension) pairs with a dimension of 16 each.
cat_embed_cols = [(col, 16) for col in wide_cols]

# No continuous features are used in this notebook.
continuous_cols = None

# Name of the label column in the training frame.
target_col = "target"

In [7]:
# Labels: pull the target column out of the training frame as a NumPy array.
target = train_df[target_col].values

# Wide component: fit the preprocessor on the training frame and encode it in one step.
wide_preprocessor = WidePreprocessor(
    wide_cols=wide_cols,
    crossed_cols=crossed_cols,
)
X_wide = wide_preprocessor.fit_transform(train_df)

# Deep tabular component: same fit-and-encode step for the embedding columns.
tab_preprocessor = TabPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
X_tab = tab_preprocessor.fit_transform(train_df)

In [8]:
# Preview the matrix returned by wide_preprocessor.fit_transform(train_df).
X_wide

array([[   1,    4,    8, ..., 6370, 6649, 6928],
       [   1,    4,    8, ..., 6371, 6650, 6929],
       [   2,    4,    9, ..., 6372, 6651, 6930],
       ...,
       [   2,    4,  172, ..., 6387, 6657, 6935],
       [   3,    4,  172, ..., 6410, 6660, 6933],
       [   1,    4,  172, ..., 6438, 6727, 6969]])

In [9]:
# Preview the matrix returned by tab_preprocessor.fit_transform(train_df).
X_tab

array([[  1,   1,   1, ...,   1,   1,   1],
       [  1,   1,   1, ...,   2,   2,   2],
       [  2,   1,   2, ...,   3,   3,   3],
       ...,
       [  2,   1, 165, ...,  18,   9,   8],
       [  3,   1, 165, ...,  41,  12,   6],
       [  1,   1, 165, ...,  69,  79,  42]])

In [10]:
# Wide branch: sized by the number of distinct encoded values present in X_wide.
wide = Wide(
    wide_dim=np.unique(X_wide).size,
    pred_dim=1,
)

# Deep branch: embeddings described by tab_preprocessor, followed by a 64 -> 32 MLP.
deeptabular = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[64, 32],
)

# Combine both branches into a single WideDeep model.
model = WideDeep(wide=wide, deeptabular=deeptabular)

In [11]:
# Display the assembled WideDeep module (wide and deeptabular components).
model

WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(7207, 1, padding_idx=0)
  )
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_embed_and_cont): CatEmbeddingsAndCont(
        (embed_layers): ModuleDict(
          (emb_layer_contents_attribute_a): Embedding(4, 16, padding_idx=0)
          (emb_layer_contents_attribute_c): Embedding(5, 16, padding_idx=0)
          (emb_layer_contents_attribute_d): Embedding(1066, 16, padding_idx=0)
          (emb_layer_contents_attribute_e): Embedding(13, 16, padding_idx=0)
          (emb_layer_contents_attribute_h): Embedding(251, 16, padding_idx=0)
          (emb_layer_contents_attribute_i): Embedding(4, 16, padding_idx=0)
          (emb_layer_contents_attribute_j): Embedding(3, 16, padding_idx=0)
          (emb_layer_contents_attribute_j_1): Embedding(10, 16, padding_idx=0)
          (emb_layer_contents_attribute_k): Embedding(3, 16, padding_idx=0)
          (emb_layer_contents_attribute_l): Embedding(1753, 16, padding_idx=0)
          (em

In [12]:
# Trainer with a binary objective that tracks F1, accuracy and precision.
trainer = Trainer(
    model,
    objective="binary",
    metrics=[F1Score, Accuracy, Precision],
)


In [13]:
# "Light" run: 5 epochs, batches of 64, with val_split=0.2 for validation.
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=5,
    batch_size=64,
    val_split=0.2,
)

epoch 1: 100%|██████████| 6275/6275 [02:35<00:00, 40.27it/s, loss=0.762, metrics={'f1': 0.5637, 'acc': 0.5607, 'prec': 0.5596}]
valid: 100%|██████████| 1569/1569 [00:29<00:00, 52.82it/s, loss=0.682, metrics={'f1': 0.5852, 'acc': 0.5882, 'prec': 0.5892}]
epoch 2: 100%|██████████| 6275/6275 [02:33<00:00, 40.88it/s, loss=0.676, metrics={'f1': 0.5993, 'acc': 0.5928, 'prec': 0.5896}]
valid: 100%|██████████| 1569/1569 [00:29<00:00, 53.21it/s, loss=0.664, metrics={'f1': 0.633, 'acc': 0.6036, 'prec': 0.5891}]
epoch 3: 100%|██████████| 6275/6275 [02:34<00:00, 40.74it/s, loss=0.658, metrics={'f1': 0.6153, 'acc': 0.607, 'prec': 0.6023}]
valid: 100%|██████████| 1569/1569 [00:29<00:00, 53.86it/s, loss=0.656, metrics={'f1': 0.616, 'acc': 0.6094, 'prec': 0.6055}]
epoch 4: 100%|██████████| 6275/6275 [02:33<00:00, 40.90it/s, loss=0.65, metrics={'f1': 0.6247, 'acc': 0.6151, 'prec': 0.6092}]
valid: 100%|██████████| 1569/1569 [00:28<00:00, 55.76it/s, loss=0.655, metrics={'f1': 0.5917, 'acc': 0.6089, 'prec

In [14]:
# Encode the test frame with the already-fitted preprocessors (transform only, no re-fit).
X_wide_te = wide_preprocessor.transform(test_df)
X_tab_te = tab_preprocessor.transform(test_df)

# Predict on the encoded test matrices with the trained model.
preds = trainer.predict(
    X_wide=X_wide_te,
    X_tab=X_tab_te,
)

predict: 100%|██████████| 726/726 [00:07<00:00, 94.59it/s]


In [18]:
# Preview the predictions returned by trainer.predict for the test set.
preds

array([0, 1, 1, ..., 1, 1, 1])

In [20]:
# Load the submission template and fill its 'target' column with the predictions.
submission = pd.read_csv('sample_submission.csv').assign(target=preds)

# Preview the last rows as a sanity check.
submission.tail()

Unnamed: 0,id,target
46399,46399,1
46400,46400,0
46401,46401,1
46402,46402,1
46403,46403,1


In [22]:
# Write the light-run submission to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv('widedeep_light.csv', index=False)

- Big model: longer training (more epochs) with a larger batch size

In [23]:
# "Heavy" run: rebuild the model from scratch and train for 100 epochs with
# batches of 256. The architecture (hidden dims [64, 32]) is the same as in
# the light run; only the training schedule differs.
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deeptabular=deeptabular)

# A fresh Trainer is needed so training starts from the newly built model.
trainer = Trainer(model, objective="binary", metrics=[F1Score, Accuracy, Precision])

# NOTE(review): 100 epochs with no early stopping - consider adding a callback
# if the validation metrics plateau; confirm against the full training log.
trainer.fit(
    X_wide=X_wide, X_tab=X_tab, target=target, n_epochs=100, batch_size=256, val_split=0.2
)

# Reuse the test matrices (X_wide_te / X_tab_te) encoded in an earlier cell.
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

# Fill the submission template and write it out.
submission = pd.read_csv('sample_submission.csv')
submission['target'] = preds
submission.to_csv('widedeep_heavy.csv', index=False)

# Kept as the cell's last expression so the preview is actually rendered;
# in the middle of the cell its result was silently discarded.
submission.tail()

epoch 1: 100%|██████████| 1569/1569 [00:45<00:00, 34.18it/s, loss=0.792, metrics={'f1': 0.5525, 'acc': 0.5506, 'prec': 0.5498}]
valid: 100%|██████████| 393/393 [00:07<00:00, 50.60it/s, loss=0.707, metrics={'f1': 0.5444, 'acc': 0.5755, 'prec': 0.587}]
epoch 2: 100%|██████████| 1569/1569 [00:45<00:00, 34.28it/s, loss=0.701, metrics={'f1': 0.5841, 'acc': 0.5789, 'prec': 0.5767}]
valid: 100%|██████████| 393/393 [00:08<00:00, 48.74it/s, loss=0.674, metrics={'f1': 0.6095, 'acc': 0.5945, 'prec': 0.5874}]
epoch 3: 100%|██████████| 1569/1569 [00:46<00:00, 33.49it/s, loss=0.675, metrics={'f1': 0.6008, 'acc': 0.5938, 'prec': 0.5903}]
valid: 100%|██████████| 393/393 [00:08<00:00, 48.14it/s, loss=0.663, metrics={'f1': 0.6245, 'acc': 0.6017, 'prec': 0.5904}]
epoch 4: 100%|██████████| 1569/1569 [00:46<00:00, 33.39it/s, loss=0.663, metrics={'f1': 0.611, 'acc': 0.6027, 'prec': 0.5982}]
valid: 100%|██████████| 393/393 [00:08<00:00, 46.84it/s, loss=0.658, metrics={'f1': 0.6325, 'acc': 0.6059, 'prec': 0.5

In [None]:
# "Middle" run: rebuild the model from scratch and train for 50 epochs with
# batches of 256, using hidden dims [64, 32].
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
deeptabular = TabMlp(
    mlp_hidden_dims=[64, 32],
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=continuous_cols,
)
model = WideDeep(wide=wide, deeptabular=deeptabular)

# A fresh Trainer is needed so training starts from the newly built model.
trainer = Trainer(model, objective="binary", metrics=[F1Score, Accuracy, Precision])

trainer.fit(
    X_wide=X_wide, X_tab=X_tab, target=target, n_epochs=50, batch_size=256, val_split=0.2
)

# Reuse the test matrices (X_wide_te / X_tab_te) encoded in an earlier cell.
preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

# Fill the submission template and write it out.
submission = pd.read_csv('sample_submission.csv')
submission['target'] = preds
submission.to_csv('widedeep_middle.csv', index=False)

# Kept as the cell's last expression so the preview is actually rendered;
# in the middle of the cell its result was silently discarded.
submission.tail()

epoch 1: 100%|██████████| 1569/1569 [00:46<00:00, 33.84it/s, loss=0.803, metrics={'f1': 0.5497, 'acc': 0.5482, 'prec': 0.5476}]
valid: 100%|██████████| 393/393 [00:08<00:00, 47.54it/s, loss=0.71, metrics={'f1': 0.5976, 'acc': 0.5753, 'prec': 0.5675}]
epoch 2: 100%|██████████| 1569/1569 [00:46<00:00, 33.43it/s, loss=0.703, metrics={'f1': 0.5808, 'acc': 0.577, 'prec': 0.5754}]
valid: 100%|██████████| 393/393 [00:08<00:00, 46.72it/s, loss=0.678, metrics={'f1': 0.6222, 'acc': 0.5917, 'prec': 0.5787}]
epoch 3: 100%|██████████| 1569/1569 [00:46<00:00, 33.50it/s, loss=0.677, metrics={'f1': 0.5972, 'acc': 0.5911, 'prec': 0.5882}]
valid: 100%|██████████| 393/393 [00:08<00:00, 47.66it/s, loss=0.666, metrics={'f1': 0.6062, 'acc': 0.5978, 'prec': 0.5935}]
epoch 4: 100%|██████████| 1569/1569 [00:47<00:00, 33.34it/s, loss=0.664, metrics={'f1': 0.6079, 'acc': 0.6007, 'prec': 0.5968}]
valid: 100%|██████████| 393/393 [00:08<00:00, 48.47it/s, loss=0.659, metrics={'f1': 0.6254, 'acc': 0.6055, 'prec': 0.5