In [4]:
!pip install pytorch-widedeep
!pip install -r widedeep_requirements.txt







In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, Precision, F1Score

In [6]:
# Read the two CSV splits from the local `data/` directory.
test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/train.csv')

# NOTE(review): the preview below shows an 'Unnamed: 0' column that mirrors
# 'id' in the first rows - it looks like a saved index; confirm whether it
# should be dropped or read with index_col=0.
# Print (rows, columns) first, then show the first three training rows.
print(train_df.shape)
train_df.head(3)


(501951, 35)


Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,...,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,...,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,...,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0


In [8]:
# Columns passed to the wide component (see WidePreprocessor below).
wide_cols = [
    'contents_attribute_a',
    'contents_attribute_c',
    'contents_attribute_d',
    'contents_attribute_e',
    'contents_attribute_h',
    'contents_attribute_i',
    'contents_attribute_j',
    'contents_attribute_j_1',
    'contents_attribute_k',
    'contents_attribute_l',
    'contents_attribute_m',
    'd_l_match_yn',
    'd_m_match_yn',
    'd_s_match_yn',
    'h_l_match_yn',
    'h_m_match_yn',
    'h_s_match_yn',
    'person_attribute_a',
    'person_attribute_a_1',
    'person_attribute_b',
    'person_prefer_c',
    'person_prefer_d_1',
    'person_prefer_d_2',
    'person_prefer_d_3',
    'person_prefer_e',
    'person_prefer_h_1',
    'person_prefer_h_2',
    'person_prefer_h_3',
]

# No crossed-column pairs are used; the trailing comment is an example of the
# expected format (a list of column-name tuples).
crossed_cols = None # [("education", "occupation"), ("native_country", "occupation")]

# Embedding width shared by every categorical column of the deep component.
EMBED_DIM = 16

# The deep component embeds exactly the same columns as the wide component,
# so derive the (column, embedding_dim) pairs from wide_cols instead of
# maintaining a second hand-written copy that can drift out of sync.
cat_embed_cols = [(col, EMBED_DIM) for col in wide_cols]

# No continuous features are used; the trailing comment is an example of the
# expected format (a list of column names).
continuous_cols = None #["age", "hours_per_week"]

# Name of the label column in the training frame.
target_col = "target"

In [9]:
# Label vector: the target column of the training frame as an array.
target = train_df[target_col].values

# Build one preprocessor per model component from the column configuration.
wide_preprocessor = WidePreprocessor(crossed_cols=crossed_cols, wide_cols=wide_cols)
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, embed_cols=cat_embed_cols)

# Fit each preprocessor on the training frame and encode it in one step
# (wide first, then deeptabular).
X_wide = wide_preprocessor.fit_transform(train_df)
X_tab = tab_preprocessor.fit_transform(train_df)

In [12]:
# Inspect the encoded wide-component matrix returned by
# wide_preprocessor.fit_transform (NumPy truncates the repr of large arrays).
X_wide

array([[   1,    4,    8, ..., 6370, 6649, 6928],
       [   1,    4,    8, ..., 6371, 6650, 6929],
       [   2,    4,    9, ..., 6372, 6651, 6930],
       ...,
       [   2,    4,  172, ..., 6387, 6657, 6935],
       [   3,    4,  172, ..., 6410, 6660, 6933],
       [   1,    4,  172, ..., 6438, 6727, 6969]])

In [13]:
# Inspect the encoded deeptabular matrix returned by
# tab_preprocessor.fit_transform (NumPy truncates the repr of large arrays).
X_tab

array([[  1,   1,   1, ...,   1,   1,   1],
       [  1,   1,   1, ...,   2,   2,   2],
       [  2,   1,   2, ...,   3,   3,   3],
       ...,
       [  2,   1, 165, ...,  18,   9,   8],
       [  3,   1, 165, ...,  41,  12,   6],
       [  1,   1, 165, ...,  69,  79,  42]])

In [14]:
# Wide component, sized by the number of distinct encoded values in X_wide,
# with a single output unit.
wide = Wide(pred_dim=1, wide_dim=np.unique(X_wide).size)

# Deep component: an MLP with hidden layers of 64 and 32 units, configured
# from the column index and embedding setup recorded by tab_preprocessor.
deeptabular = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[64, 32],
)

# Combine both components into a single WideDeep model.
model = WideDeep(deeptabular=deeptabular, wide=wide)

In [15]:
# Display the assembled WideDeep module; its repr lists the wide and
# deeptabular submodules with their embedding layers.
model

WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(7207, 1, padding_idx=0)
  )
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_embed_and_cont): CatEmbeddingsAndCont(
        (embed_layers): ModuleDict(
          (emb_layer_contents_attribute_a): Embedding(4, 16, padding_idx=0)
          (emb_layer_contents_attribute_c): Embedding(5, 16, padding_idx=0)
          (emb_layer_contents_attribute_d): Embedding(1066, 16, padding_idx=0)
          (emb_layer_contents_attribute_e): Embedding(13, 16, padding_idx=0)
          (emb_layer_contents_attribute_h): Embedding(251, 16, padding_idx=0)
          (emb_layer_contents_attribute_i): Embedding(4, 16, padding_idx=0)
          (emb_layer_contents_attribute_j): Embedding(3, 16, padding_idx=0)
          (emb_layer_contents_attribute_j_1): Embedding(10, 16, padding_idx=0)
          (emb_layer_contents_attribute_k): Embedding(3, 16, padding_idx=0)
          (emb_layer_contents_attribute_l): Embedding(1753, 16, padding_idx=0)
          (em

In [19]:
# Trainer for a binary objective that reports F1, accuracy and precision.
# The metric classes are passed as-is (not instantiated), as in the original.
trainer = Trainer(
    model,
    objective="binary",
    metrics=[F1Score, Accuracy, Precision],
)


In [22]:
# Train on the encoded wide and deeptabular inputs for 5 epochs with batches
# of 64, using val_split=0.2 to hold out part of the data for validation.
# NOTE(review): the recorded output of this cell ends in
# "RuntimeError: CUDA error: no kernel image is available for execution on
# the device". That usually means the installed PyTorch build has no kernels
# for this GPU's architecture - verify the torch/CUDA installation before
# re-running.
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    val_split=0.2,
    batch_size=64,
    n_epochs=5,
)

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
epoch 1:   0%|          | 0/6275 [00:00<?, ?it/s]


RuntimeError: CUDA error: no kernel image is available for execution on the device