#### A PyTorch implementation of the deepFM model with hyperparameters search using Bayesian Optimization

In [2]:
!nvidia-smi

Wed Jul 28 17:07:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
import pandas as pd
import numpy as np
import random
import os
import time


In [4]:
## set device

In [5]:
device = 'cuda' if torch.cuda.is_available() == True else 'cpu'

In [6]:
device

'cuda'

In [7]:
torch.__version__

'1.7.0'

In [8]:
## load data

In [9]:
train_data = '../input/privacychallenge2021/X_train.csv'
y_train_data = '../input/privacychallenge2021/y_train.csv'


In [10]:
test_data = '../input/privacychallenge2021/X_test.csv'

In [11]:
## dataframes

In [12]:
df_train = pd.read_csv(train_data)
df_test = pd.read_csv(test_data)
df_y_train = pd.read_csv(y_train_data)

In [13]:
X_train = np.array(df_train.values.tolist())

In [14]:
y_train = np.array(df_y_train.click.values.tolist())

In [15]:
def set_seeds(seed=999):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
set_seeds()

In [16]:
# since the min-max feature values in both train and test sets belong to the [-499999 , 499999 ] the one-hot coding length will be 1M per feature (in the Dataloader transform)

In [17]:
class CleanDataset(Dataset):
    def __init__(self, data, target):
        self.data = data
        self.target = torch.FloatTensor(target)
        
    def __getitem__(self, index):
       
        x = self.data[index]      
        x_feat = np.zeros((1,19),dtype = np.int64)
        x_feat[0] = [500000+x_i for x_i in x]
        y = self.target[index]
        return  x_feat, y
    
    def __len__(self):
        return len(self.data)

In [18]:
# For this particular problem we are gonna follow the original deepFM model,
# the MLP in the deepFM model will have the structure of 3 layers of equal size, 
# and the number of hidden units will be found by hyperparam opt

In [19]:
class DeepFM(nn.Module):
  def __init__(self, field_size, emb_dim,
               hidden_units, BATCH, drop_rate):
    super(DeepFM, self).__init__()
    self.embedding = nn.Embedding(field_size, emb_dim)
    self.linear_layer = nn.Linear(19,1)
    self.sigmoid = nn.Sigmoid()
    self.batch = BATCH
    input_dim = self.embed_output_dim = 19 * emb_dim # 
    self.mlp = nn.Sequential(
                nn.Linear(input_dim, hidden_units),
                nn.ReLU(),
                nn.Dropout(drop_rate),
                nn.Linear(hidden_units, hidden_units),
                nn.ReLU(),
                nn.Dropout(drop_rate),
                nn.Linear(hidden_units, hidden_units),
                nn.ReLU(),
                nn.Dropout(drop_rate),
                nn.Linear(hidden_units, 1),
                nn.Dropout(drop_rate)
              )
    


  def forward(self, x):
    embed_x = self.embedding(x)
    square_of_sum = (embed_x.sum(axis=2)) ** 2
    sum_of_square = (embed_x ** 2).sum(axis=2)

    inputs = embed_x.reshape(self.batch,1,self.embed_output_dim)   
    x = x.float()
    linear_term = self.linear_layer(x)
    int_term = 0.5 * (square_of_sum - sum_of_square).sum(2, keepdims=True) # interaction term
    deep_term = self.mlp(inputs)
    out = linear_term + int_term + deep_term 
    deepFM_out = self.sigmoid(out)

    return deepFM_out

In [20]:
criterion = nn.BCELoss()

In [21]:
# hyperparams:

# learning rate
# dropout
# hidden units
# embedding units
# batch size

In [22]:
SPACE = {
    'hidden_units': hp.quniform('hidden_units', low=200, high=900, q=100),
    'embedding_units': hp.quniform('embedding_units', low=4, high=64, q=4),
    'batch_size': hp.quniform('batch_size', low=2, high=18, q=2),
    'dropout': hp.quniform('dropout', low=0.1, high=0.4, q=0.05),
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.1)
}

In [23]:
FOLDS = 5
skf = StratifiedKFold(n_splits = FOLDS, random_state=44, shuffle=True)

In [24]:
def training(params):
  # cv-5 fold training
  # return : mean BCELoss
  print('point: ',params)
  mean_loss = 0.0
  hidden_units = int(params['hidden_units'])
  embedding_units = int(params['embedding_units'])
  batch_size = int(params['batch_size'])
  dropout = params['dropout']
  l_r  = params['learning_rate']
  field_size = int(1e6)
  
  for train_index, test_index in skf.split(X_train, y_train):
      X_train_, X_test = X_train[train_index], X_train[test_index]
      y_train_, y_test = y_train[train_index], y_train[test_index]
      dataset = CleanDataset(X_train_, y_train_)
      test_dataset = CleanDataset(X_test, y_test)
      dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                          num_workers=0)
      test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=0)
      fm_model = DeepFM(field_size,embedding_units,hidden_units,batch_size,dropout).to(device)
      optimizerFM = optim.Adagrad(fm_model.parameters(), lr=l_r)
      n_epochs = 3
    
      for epoch in range(n_epochs):  # 
        start = time.time()
        running_loss = 0.0
        for i, batch in enumerate(dataloader, 0):
        
            x, clicks = batch
            x = x.to(device)
            clicks = clicks.to(device)

            optimizerFM.zero_grad()
            #remove last batch
            if x.shape[0] == batch_size:
                outputs = fm_model(x)
                loss = criterion(outputs.squeeze(), clicks.squeeze())
                loss.backward()
                optimizerFM.step()

                running_loss += loss.item()

                if i % 2000 == 1999:    # print every 10000 mini-batches
                  print('[%d, %5d] loss: %.3f' %
                        (epoch + 1, i + 1, running_loss / 2000))
                  running_loss = 0.0
        print('Time after this epoch training: ', time.time() - start)
      test_loss = get_test_score(fm_model,batch_size,test_dataloader)
      print('Test loss: ',test_loss)
      mean_loss += test_loss
  mean_loss /= FOLDS
  print('mean_loss',mean_loss)
  return {'loss': mean_loss.unsqueeze(0),
           'status': STATUS_OK,
            'model': fm_model, 
            'params': params}


In [25]:
def get_test_score(fm_model,BATCH,testloader):
  test_loss = 0.0
  criterion = nn.BCELoss()
  'starting evaluation'
  with torch.no_grad():
    for k, batch_test in enumerate(testloader, 0):
      x_clean_test, clicks_clean_test = batch_test
      x_clean_test = x_clean_test.to(device)
      clicks_clean_test = clicks_clean_test.to(device)
      if x_clean_test.shape[0] == BATCH:
        outputs = fm_model(x_clean_test)

        test_loss += criterion(outputs.squeeze(), clicks_clean_test.squeeze())

  test_loss = test_loss / len(testloader)
  return test_loss

In [26]:
trials = Trials()
fmin(training,SPACE,algo=tpe.suggest, max_evals=20, rstate=np.random.RandomState(789),trials=trials)

point:                                                
{'batch_size': 10.0, 'dropout': 0.15000000000000002, 'embedding_units': 60.0, 'hidden_units': 700.0, 'learning_rate': 0.0775598873064874}
[1,  2000] loss: 90.375                               
[1,  4000] loss: 89.785                               
[1,  6000] loss: 89.980                               
[1,  8000] loss: 89.975                               
Time after this epoch training:                       
80.08348369598389                                     
[2,  2000] loss: 89.845                               
[2,  4000] loss: 90.040                               
[2,  6000] loss: 90.145                               
[2,  8000] loss: 90.230                               
Time after this epoch training:                       
79.64785385131836                                     
[3,  2000] loss: 90.025                               
[3,  4000] loss: 90.260                               
[3,  6000] loss: 90.135              

{'batch_size': 8.0,
 'dropout': 0.4,
 'embedding_units': 52.0,
 'hidden_units': 500.0,
 'learning_rate': 0.06293870619889874}

In [27]:
best_model = trials.results[np.argmin([r['loss'] for r in 
    trials.results])]['model']
best_params = trials.results[np.argmin([r['loss'] for r in 
    trials.results])]['params']

In [28]:
print(best_model)
print(best_params)  

DeepFM(
  (embedding): Embedding(1000000, 52)
  (linear_layer): Linear(in_features=19, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (mlp): Sequential(
    (0): Linear(in_features=988, out_features=500, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=500, out_features=500, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=500, out_features=500, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=500, out_features=1, bias=True)
    (10): Dropout(p=0.4, inplace=False)
  )
)
{'batch_size': 8.0, 'dropout': 0.4, 'embedding_units': 52.0, 'hidden_units': 500.0, 'learning_rate': 0.06293870619889874}
