In [None]:
#!pip install -U deepctr-torch

In [None]:
#!git clone https://github.com/shenweichen/DeepCTR-Torch.git

Cloning into 'DeepCTR-Torch'...
remote: Enumerating objects: 1347, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 1347 (delta 148), reused 172 (delta 109), pack-reused 1097[K
Receiving objects: 100% (1347/1347), 4.97 MiB | 32.41 MiB/s, done.
Resolving deltas: 100% (857/857), done.


## **1. Mount the drive and import all the libraries.**

In [1]:
# Mount Google Drive at /content/drive. Every dataset, feature-column and
# results path used later in this notebook lives under that mount point.
from google.colab import  drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from numpy import loadtxt
import matplotlib.pyplot as plt
import torch
import pickle
import json
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from drive.MyDrive.RCSYS_finalproject.DeepCTRTorch.deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from drive.MyDrive.RCSYS_finalproject.DeepCTRTorch.deepctr_torch.models import *
from drive.MyDrive.RCSYS_finalproject.DeepCTRTorch.deepctr_torch.models.SExdeepfm import *
from drive.MyDrive.RCSYS_finalproject.DeepCTRTorch.deepctr_torch.inputs import VarLenSparseFeat

## **2. Prepare the Criteo Dataset**

In [None]:
# Load the pickled Criteo feature definitions from Drive:
#   dnn_feature_columns / linear_feature_columns -> passed to the model constructors,
#   feature_names -> the column names used to build the model input dicts.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open('/content/drive/MyDrive/RCSYS_finalproject/criteo_original/dnn_feature_columns_criteo.p', 'rb') as fp:
    dnn_feature_columns = pickle.load(fp)

with open('/content/drive/MyDrive/RCSYS_finalproject/criteo_original/linear_feature_columns_criteo.p', 'rb') as fp:
    linear_feature_columns = pickle.load(fp)

with open('/content/drive/MyDrive/RCSYS_finalproject/criteo_original/feature_names_criteo.p', 'rb') as fp:
    feature_names = pickle.load(fp)

In [None]:
# Load the full Criteo train/test splits from Drive.
# Run either this Criteo section or the Avazu section below, not both: each one
# assigns the same `train` / `test` variables, so the last one executed wins.
train = pd.read_pickle('/content/drive/MyDrive/RCSYS_finalproject/criteo_original/criteo_train.pkl')
test = pd.read_pickle('/content/drive/MyDrive/RCSYS_finalproject/criteo_original/criteo_test.pkl')

## **3. Or prepare the Avazu Dataset**

In [3]:
# Load the full Avazu train/test splits from Drive.
# This overwrites `train` / `test` if the Criteo section above was run first.
train = pd.read_pickle('/content/drive/MyDrive/RCSYS_finalproject/avazu/avazu_train.pkl')
test = pd.read_pickle('/content/drive/MyDrive/RCSYS_finalproject/avazu/avazu_test.pkl')

In [4]:
# Load the pickled Avazu feature definitions from Drive:
#   dnn_feature_columns / linear_feature_columns -> passed to the model constructors,
#   feature_names -> the column names used to build the model input dicts.
# These overwrite the Criteo definitions if that section was run first.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open('/content/drive/MyDrive/RCSYS_finalproject/avazu/dnn_feature_columns_avazu.p', 'rb') as fp:
    dnn_feature_columns = pickle.load(fp)

with open('/content/drive/MyDrive/RCSYS_finalproject/avazu/linear_feature_columns_avazu.p', 'rb') as fp:
    linear_feature_columns = pickle.load(fp)

with open('/content/drive/MyDrive/RCSYS_finalproject/avazu/feature_names_avazu.p', 'rb') as fp:
    feature_names = pickle.load(fp)

## **4. Start training**

In [5]:
# Hyper-parameter grid for the sweep: the training loop further down runs one
# training job for every combination of the values in these lists.
#
# Alternative, wider grid (disabled):
# lr_list = [0.01, 0.001, 0.0001]
# batch_size_list = [256,512,1024,4096]
# lr_reg_list = [0.01, 0.001, 0.0001]
# epochs_list = [2, 3]
# model_name_list = ['DeepFM_avazu_original']

# Learning rates.
lr_list = [0.01, 0.001, 0.0001]
batch_size_list = [4096]
# Despite the name, these values are passed to train_model as `l2_reg_dnn`,
# not as a learning rate.
lr_reg_list = [0.01, 0.001, 0.0001]
epochs_list = [3]
# Run labels; used for the 'Model_name' column and the results file name.
model_name_list = ['xDeepFM_avazu_original10']

# Name of the label column selected from `train` / `test`.
target = ['label']

In [6]:
def train_model(model_name, batch_size, epochs,lr, l2_reg_dnn, model_type_name='xdeepFM',attention_channels=21):
  """Train one model configuration and score it on the held-out test set.

  Relies on the notebook-level globals `train`, `test`, `feature_names`,
  `target`, `linear_feature_columns` and `dnn_feature_columns`.

  Args:
    model_name: Label of the run. Unused here; kept so callers need not change.
    batch_size: Batch size for fitting and prediction (ignored for 'DIN').
    epochs: Number of training epochs (ignored for 'DIN').
    lr: Learning rate handed to `model.compile` (ignored for 'DIN').
    l2_reg_dnn: Passed to the model constructor as `l2_reg_dnn` (ignored for 'DIN').
    model_type_name: One of 'deepFM', 'xdeepFM', 'SExdeepFM' or 'DIN'.
    attention_channels: Passed to `SExDeepFM` only.

  Returns:
    For 'deepFM' / 'xdeepFM' / 'SExdeepFM': a 4-tuple
      (train loss history, validation binary_crossentropy history,
       test LogLoss, test AUC).
    For 'DIN': a 3-tuple (train loss history, test LogLoss, test AUC).
    Both test metrics are rounded to 4 decimals.

  Raises:
    ValueError: If `model_type_name` is not one of the supported names.
  """
  ctr_model_types = ('deepFM', 'xdeepFM', 'SExdeepFM')
  if model_type_name not in ctr_model_types and model_type_name != 'DIN':
    # Fail fast with a clear message instead of hitting an unbound `model` later.
    raise ValueError(
        "Unsupported model_type_name {!r}; expected one of {}".format(
            model_type_name, list(ctr_model_types) + ['DIN']))

  # Use the first GPU when one is available, otherwise stay on the CPU.
  device = 'cpu'
  if torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

  if model_type_name == 'DIN':
    # NOTE: this path uses its own data helpers and fixed settings
    # (batch size 128, 100 epochs, default optimizer settings); the batch_size,
    # epochs, lr and l2_reg_dnn arguments are not used.
    # get_xy_fd_train / get_xy_fd_test are not defined in the visible part of
    # this notebook and must be provided elsewhere before using 'DIN'.
    x, y, feature_columns, behavior_feature_list = get_xy_fd_train()
    model = DIN(feature_columns, behavior_feature_list, device=device, att_weight_normalization=True)
    model.compile('adagrad', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    history = model.fit(x, y, batch_size=128, epochs=100, verbose=2, validation_split=0.15)
    x, y, feature_columns, behavior_feature_list = get_xy_fd_test()
    pred_ans = model.predict(x, 128)
    test_LogLoss, test_AUC = _evaluate_predictions(y, pred_ans)
    return history.history['loss'], test_LogLoss, test_AUC

  # Model inputs are dicts mapping each feature name to its column.
  train_model_input = {name: train[name] for name in feature_names}
  test_model_input = {name: test[name] for name in feature_names}

  # Constructor arguments shared by all three CTR models. Every model is placed
  # on the selected `device`, so DeepFM also works on CPU-only runtimes.
  shared_kwargs = dict(linear_feature_columns=linear_feature_columns,
                       dnn_feature_columns=dnn_feature_columns,
                       task='binary',
                       l2_reg_embedding=1e-5,
                       l2_reg_dnn=l2_reg_dnn,
                       device=device)
  if model_type_name == 'xdeepFM':
    model = xDeepFM(dnn_dropout=0.5, **shared_kwargs)
  elif model_type_name == 'SExdeepFM':
    model = SExDeepFM(attention_channels=attention_channels, **shared_kwargs)
  else:
    model = DeepFM(**shared_kwargs)

  # NOTE(review): assumes the project's DeepCTRTorch copy accepts `lr` in compile().
  model.compile("adagrad", "binary_crossentropy",
                metrics=["binary_crossentropy"], lr=lr,)
  history = model.fit(train_model_input,train[target].values, batch_size=batch_size, epochs=epochs,verbose=2,validation_split=0.15)
  pred_ans = model.predict(test_model_input, batch_size)
  test_LogLoss, test_AUC = _evaluate_predictions(test[target].values, pred_ans)

  # The loss curves are returned to the caller rather than saved here.
  return history.history['loss'] , history.history['val_binary_crossentropy'], test_LogLoss, test_AUC


def _evaluate_predictions(y_true, pred_ans):
  """Print and return (LogLoss, AUC) for the predictions, each rounded to 4 decimals."""
  print("")
  test_LogLoss = round(log_loss(y_true, pred_ans), 4)
  test_AUC = round(roc_auc_score(y_true, pred_ans), 4)
  print("test LogLoss", test_LogLoss)
  print("test AUC", test_AUC)
  return test_LogLoss, test_AUC


In [None]:
# Hyper-parameter sweep: train one model per grid point and checkpoint the
# accumulated results table to Drive after every run, so a disconnect during a
# long run does not lose the runs that already finished.
result_columns = ['Model_name','Iterations','Batch_Size','Learning_Rate','DNN_Reg','test_AUC','test_LogLoss','Train_History','Val_History']
results = pd.DataFrame(columns=result_columns)
model_type_name='xdeepFM'

# One dict per finished run. The DataFrame is rebuilt from this list because
# DataFrame.append was removed in pandas 2.0.
result_records = []

# Same iteration order as nesting the loops: model, epochs, batch size, lr, reg.
hyperparameter_grid = [
    (model_name, epochs, batch_size, lr, l2_reg_dnn)
    for model_name in model_name_list
    for epochs in epochs_list
    for batch_size in batch_size_list
    for lr in lr_list
    for l2_reg_dnn in lr_reg_list
]

for model_name, epochs, batch_size, lr, l2_reg_dnn in hyperparameter_grid:
  dic = {'Model_name':model_name,'Iterations':epochs,'Batch_Size':batch_size,'Learning_Rate':lr,'DNN_Reg':l2_reg_dnn}
  outcome = train_model(model_name,batch_size,epochs,lr,l2_reg_dnn,model_type_name=model_type_name)
  if model_type_name == 'DIN':
    # The DIN path returns no validation history; 'Val_History' stays empty.
    train_loss_history, test_LogLoss, test_AUC = outcome
  else:
    train_loss_history, val_loss_history, test_LogLoss, test_AUC = outcome
    dic['Val_History'] = val_loss_history
  dic['test_AUC'] = test_AUC
  dic['test_LogLoss'] = test_LogLoss
  dic['Train_History'] = train_loss_history

  result_records.append(dic)
  results = pd.DataFrame(result_records, columns=result_columns)
  results.to_pickle('/content/drive/MyDrive/RCSYS_finalproject/results/results_'+ model_name + '.pkl')

cuda ready...
check
check
cuda:0
Train on 21819182 samples, validate on 3850445 samples, 5327 steps per epoch
Epoch 1/3
2049s - loss:  0.3866 - binary_crossentropy:  0.3866 - val_binary_crossentropy:  0.3789
Epoch 2/3
2057s - loss:  0.3191 - binary_crossentropy:  0.3190 - val_binary_crossentropy:  0.4007
Epoch 3/3
2043s - loss:  0.2859 - binary_crossentropy:  0.2859 - val_binary_crossentropy:  0.4197

test LogLoss 0.42
test AUC 0.7563
cuda ready...
check
check
cuda:0
Train on 21819182 samples, validate on 3850445 samples, 5327 steps per epoch
Epoch 1/3
2046s - loss:  0.3866 - binary_crossentropy:  0.3866 - val_binary_crossentropy:  0.3789
Epoch 2/3
2051s - loss:  0.3205 - binary_crossentropy:  0.3204 - val_binary_crossentropy:  0.4003


In [None]:
# Reload a previously saved results table from Drive and display it.
# NOTE(review): this reads results_SExDeepFM_avazu.pkl, whereas the sweep above
# writes results_<model_name>.pkl (model_name comes from model_name_list);
# adjust the file name to the run you want to inspect.
results = pd.read_pickle('/content/drive/MyDrive/RCSYS_finalproject/results/results_SExDeepFM_avazu.pkl')
results

In [None]:
def save_loss_history_and_figure(model_name,batch_size,epochs,lr,l2_reg_dnn,history=None):
  """Pickle the train/validation loss curves of a run and save their plot.

  Args:
    model_name, batch_size, epochs, lr, l2_reg_dnn: Run settings; they are only
      used to build the output file names.
    history: Object returned by `model.fit`, exposing a `.history` dict with the
      keys 'loss' and 'val_binary_crossentropy'. When omitted, a notebook-level
      global named `history` is used if one exists.

  Raises:
    ValueError: If no history is passed and no global `history` is defined.
  """
  if history is None:
    # Backward compatibility: earlier versions read `history` implicitly.
    history = globals().get('history')
  if history is None:
    raise ValueError("No training history available: pass the object returned by model.fit as `history`.")

  train_loss = history.history['loss']
  val_loss = history.history['val_binary_crossentropy']

  # str(x)[2:] drops the leading "0." of lr / l2_reg_dnn (0.001 -> "001").
  # Values that print in scientific notation (e.g. 1e-05) give a less readable tag.
  run_tag = model_name +'_bs_' + str(batch_size) + '_epochs_' + str(epochs) +'_lr_'+ str(lr)[2:] + '_reg_'+ str(l2_reg_dnn)[2:]
  train_loss_file_name = "/content/drive/MyDrive/RCSYS_finalproject/results/" + "train_loss_history_model_" + run_tag
  val_loss_file_name = "/content/drive/MyDrive/RCSYS_finalproject/results/" + "val_loss_history_model_" + run_tag

  # The histories are written as pickle files without a file extension.
  with open(train_loss_file_name, "wb") as fp:
    pickle.dump(train_loss, fp)
  with open(val_loss_file_name, "wb") as fp:
    pickle.dump(val_loss, fp)

  save_loss_figure(train_loss,val_loss,model_name,batch_size,epochs,lr,l2_reg_dnn)

In [None]:
def save_loss_figure(train_loss,val_loss,model_name,batch_size,epochs,lr,l2_reg_dnn):
  """Plot the train and validation loss curves, save the figure as PNG and show it.

  Args:
    train_loss: Sequence of training loss values, one per plotted point.
    val_loss: Sequence of validation loss values, one per plotted point.
    model_name, batch_size, epochs, lr, l2_reg_dnn: Run settings; they are only
      used to build the output file name.
  """
  plt.figure(figsize=(12,5))
  plt.plot(train_loss)
  plt.plot(val_loss)
  # Integer ticks 0..len(train_loss). The built-in range is used because numpy
  # is not imported as `np` in this notebook.
  plt.xticks(list(range(len(train_loss) + 1)))
  plt.title('')
  plt.xlabel('Iterations')
  plt.ylabel('binary_crossentropy')
  plt.legend(['Train Loss', 'Validation Loss'])
  # str(x)[2:] drops the leading "0." of lr / l2_reg_dnn (0.001 -> "001").
  figure_name = "/content/drive/MyDrive/RCSYS_finalproject/results/" + "figure_loss_" + model_name +'_bs_' + str(batch_size) + '_epochs_' + str(epochs) +'_lr_'+ str(lr)[2:] + '_reg_' + str(l2_reg_dnn)[2:] + '.png'
  # Save before show(): showing the figure may release it, leaving nothing to save.
  plt.savefig(figure_name)
  plt.show()