In [None]:
# default_exp models.deepfm

# DeepFM
> A pytorch implementation of DeepFM.

DeepFM consists of an FM component and a deep component, which are integrated in a parallel structure. The FM component is the same as the 2-way factorization machine, which is used to model low-order feature interactions. The deep component is a multi-layer perceptron that is used to capture high-order feature interactions and nonlinearities. These two components share the same inputs/embeddings, and their outputs are summed to form the final prediction. It is worth pointing out that the spirit of DeepFM resembles that of the Wide & Deep architecture, which can capture both memorization and generalization. The advantage of DeepFM over the Wide & Deep model is that it reduces the effort of hand-crafted feature engineering by identifying feature combinations automatically.

![https://github.com/RecoHut-Stanzas/S021355/raw/main/images/img12.png](https://github.com/RecoHut-Stanzas/S021355/raw/main/images/img12.png)

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

## v1

In [None]:
#export
import torch
from torch import nn

from recohut.models.layers.embedding import EmbeddingLayer
from recohut.models.layers.common import MLP_Layer, LR_Layer, FM_Layer

from recohut.models.bases.ctr import CTRModel

In [None]:
#export
class DeepFM(CTRModel):
    """DeepFM: an FM branch and an MLP branch that share one embedding layer.

    The logits of both branches are summed and, when the task defines one,
    passed through a final activation.

    Args:
        feature_map: feature description forwarded to the base class, the
            embedding layer and the FM layer; its ``num_fields`` attribute
            sizes the MLP input.
        model_id: identifier forwarded to the base class.
        task: passed to ``get_final_activation`` to pick the output activation.
        learning_rate: accepted for interface compatibility; it is not used
            directly inside this class.
        embedding_initializer: initializer spec handed to ``init_weights``.
        embedding_dim: size of each field embedding.
        hidden_units: widths of the MLP hidden layers.
        hidden_activations: activation spec for the MLP hidden layers.
        net_dropout: dropout rate(s) for the MLP.
        batch_norm: whether the MLP uses batch normalization.
        **kwargs: forwarded untouched to the base class constructor.
    """

    def __init__(self,
                 feature_map,
                 model_id="DeepFM",
                 task="binary_classification",
                 learning_rate=1e-3,
                 embedding_initializer="torch.nn.init.normal_(std=1e-4)",
                 embedding_dim=10,
                 hidden_units=[64, 64, 64],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 **kwargs):
        super().__init__(feature_map, model_id=model_id, **kwargs)

        # Shared embeddings feed both the FM branch and the deep branch.
        self.embedding_layer = EmbeddingLayer(feature_map, embedding_dim)
        self.fm_layer = FM_Layer(feature_map, output_activation=None, use_bias=False)

        # The deep branch consumes all field embeddings concatenated into one vector.
        deep_input_dim = embedding_dim * feature_map.num_fields
        self.dnn = MLP_Layer(
            input_dim=deep_input_dim,
            output_dim=1,
            hidden_units=hidden_units,
            hidden_activations=hidden_activations,
            output_activation=None,
            dropout_rates=net_dropout,
            batch_norm=batch_norm,
            use_bias=True,
        )

        self.output_activation = self.get_final_activation(task)
        self.init_weights(embedding_initializer=embedding_initializer)

    def forward(self, inputs):
        """Return the prediction for ``inputs`` (FM logit + deep logit, then activation)."""
        embedded = self.embedding_layer(inputs)
        logit = self.fm_layer(inputs, embedded)
        # In-place accumulation of the deep branch onto the FM logit.
        logit += self.dnn(embedded.flatten(start_dim=1))
        if self.output_activation is None:
            return logit
        return self.output_activation(logit)

Example

In [None]:
# Hyper-parameters and run settings for the DeepFM (v1) example.
params = {
    # experiment identity and locations
    "model_id": "DeepFM",
    "data_dir": "/content/data",
    "model_root": "./checkpoints/",
    # optimisation
    "learning_rate": 1e-3,
    "optimizer": "adamw",
    "task": "binary_classification",
    "loss": "binary_crossentropy",
    "metrics": ["logloss", "AUC"],
    # network architecture
    "embedding_dim": 10,
    "hidden_units": [300, 300, 300],
    "hidden_activations": "relu",
    # regularisation
    "net_regularizer": 0,
    "embedding_regularizer": 0,
    "batch_norm": False,
    "net_dropout": 0,
    # training loop / data loading
    "batch_size": 64,
    "epochs": 3,
    "shuffle": True,
    "seed": 2019,
    "use_hdf5": True,
    "workers": 1,
    "verbose": 0,
}

In [None]:
# NOTE(review): `ds` is not assigned in any earlier cell of this notebook (its first
# assignment is in the v3 training cell further down) — confirm where the dataset
# object exposing `.dataset.feature_map` is meant to come from.
model = DeepFM(ds.dataset.feature_map, **params)

In [None]:
# NOTE(review): `pl_trainer` is not imported in the visible cells, and max_epochs=5
# differs from params['epochs'] (3) — confirm which epoch count is intended.
pl_trainer(model, ds, max_epochs=5)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name              | Type           | Params
-----------------------------------------------------
0 | embedding_layer   | EmbeddingLayer | 4.8 K 
1 | fm_layer          | FM_Layer       | 854   
2 | dnn               | MLP_Layer      | 223 K 
3 | output_activation | Sigmoid        | 0     
-----------------------------------------------------
228 K     Trainable params
378       Non-trainable params
228 K     Total params
0.915     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test Metrics': {'AUC': tensor(1.), 'logloss': tensor(0.1845)}}
--------------------------------------------------------------------------------


[{'Test Metrics': {'AUC': tensor(1.), 'logloss': tensor(0.1845)}}]

## v2

> **References:-**
- H Guo, et al. DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 2017.
- https://github.com/rixwew/pytorch-fm/blob/master/torchfm/model/dfm.py

In [None]:
#export
import torch

from recohut.models.layers.common import FeaturesEmbedding, FeaturesLinear, MultiLayerPerceptron

In [None]:
#export
class FactorizationMachine(torch.nn.Module):
    """Pairwise (second-order) interaction term of a factorization machine.

    Uses the identity ``0.5 * ((sum_i v_i)^2 - sum_i v_i^2)`` evaluated over the
    field axis, so no explicit loop over field pairs is needed.

    :param reduce_sum: when true, the embedding axis is summed as well and the
        result has shape ``(batch_size, 1)``; otherwise it keeps shape
        ``(batch_size, embed_dim)``.
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        summed_fields = x.sum(dim=1)
        interactions = summed_fields.pow(2) - x.pow(2).sum(dim=1)
        if self.reduce_sum:
            interactions = interactions.sum(dim=1, keepdim=True)
        return 0.5 * interactions

class DeepFM_v2(torch.nn.Module):
    """
    A pytorch implementation of DeepFM.

    The prediction is the sigmoid of the sum of three terms computed from the
    same input: a first-order linear term, a second-order FM term over the
    field embeddings, and an MLP over the flattened embeddings.

    Reference:
        H Guo, et al. DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 2017.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, dropout):
        super().__init__()
        # Width of the flattened embedding vector that feeds the MLP.
        self.embed_output_dim = len(field_dims) * embed_dim
        self.linear = FeaturesLinear(field_dims)
        self.fm = FactorizationMachine(reduce_sum=True)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        embedded = self.embedding(x)
        flattened = embedded.view(-1, self.embed_output_dim)
        logit = self.linear(x) + self.fm(embedded) + self.mlp(flattened)
        return torch.sigmoid(logit.squeeze(1))

## v3

> **References:-**
- https://github.com/huangjunheng/recommendation_model/tree/master/deepFM

In [None]:
#export
from collections import namedtuple, defaultdict

import torch
from torch import nn as nn

In [None]:
#exporti
class FM(nn.Module):
    """Factorization machine over a dense input vector.

    Computes ``linear(x) + 0.5 * sum_k((x @ v)_k^2 - (x^2 @ v^2)_k)``, i.e. a
    first-order linear term plus the second-order pairwise interaction term.

    Args:
        p: number of input features (width of ``x``).
        k: number of latent factors per feature.
        dropout: dropout probability applied to the interaction term during
            training. Defaults to 0.3, the value that was previously hard-coded.
    """

    def __init__(self, p, k, dropout=0.3):
        super(FM, self).__init__()
        self.p = p
        self.k = k
        self.linear = nn.Linear(self.p, 1, bias=True)
        # Latent factor matrix, initialised uniformly in [-0.01, 0.01].
        self.v = nn.Parameter(torch.empty(self.p, self.k).uniform_(-0.01, 0.01))
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return the FM logit for ``x``.

        Args:
            x: float tensor of shape ``(batch_size, p)``.

        Returns:
            Tensor of shape ``(batch_size, 1)``.
        """
        linear_part = self.linear(x)  # (batch_size, 1)
        square_of_sum = torch.mm(x, self.v).pow(2)
        sum_of_square = torch.mm(x.pow(2), self.v.pow(2))
        # keepdim=True keeps the result at (batch_size, 1) so it adds directly
        # onto the linear term without any transpose/view shuffling.
        pair_interactions = torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        # nn.Dropout is not in-place: its return value must be used, otherwise
        # the dropout has no effect at all.
        pair_interactions = self.drop(pair_interactions)
        return linear_part + 0.5 * pair_interactions

In [None]:
#export
class DeepFM_v3(nn.Module):
    """DeepFM over a flat feature matrix with dense and sparse (categorical) columns.

    Sparse columns are embedded, concatenated with the raw dense columns, and the
    resulting vector is fed to both an FM branch and an MLP branch. The two
    logits plus a global bias go through a sigmoid.

    Args:
        feat_sizes: mapping ``feature name -> vocabulary size``. Its iteration
            order defines which column of the input tensor each feature is
            read from.
        sparse_feature_columns: names of the categorical features to embed.
        dense_feature_columns: names of the numeric features used as-is.
        dnn_hidden_units: widths of the MLP hidden layers (may be empty).
        dnn_dropout: dropout probability applied after each hidden layer.
        ebedding_size: embedding dimension per sparse feature (parameter name
            kept as-is for backward compatibility).
        l2_reg_linear: stored on the instance; not otherwise used in this class.
        l2_reg_embedding: accepted but not used in this class.
        l2_reg_dnn: accepted but not used in this class.
        init_std: standard deviation for the normal initialisation of the
            embeddings and the hidden-layer weights.
        seed: accepted but not used in this class.
        device: device the whole module is moved to at the end of construction.
    """

    def __init__(self, feat_sizes, sparse_feature_columns, dense_feature_columns,dnn_hidden_units=[400, 400,400], dnn_dropout=0.0, ebedding_size=4,
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 device='cpu'):
        super(DeepFM_v3, self).__init__()
        self.feat_sizes = feat_sizes
        self.device = device
        self.dense_feature_columns = dense_feature_columns
        self.sparse_feature_columns = sparse_feature_columns
        self.embedding_size = ebedding_size
        self.l2_reg_linear = l2_reg_linear

        self.bias = nn.Parameter(torch.zeros((1, )))
        self.init_std = init_std
        self.dnn_dropout = dnn_dropout

        # One embedding table per sparse feature, re-initialised with N(0, init_std).
        self.embedding_dic = nn.ModuleDict({
            feat: nn.Embedding(self.feat_sizes[feat], self.embedding_size, sparse=False)
            for feat in self.sparse_feature_columns
        })
        for embedding in self.embedding_dic.values():
            nn.init.normal_(embedding.weight, mean=0, std=self.init_std)

        # Column position of every feature in the input tensor, taken from the
        # iteration order of feat_sizes.
        self.feature_index = defaultdict(int)
        for feat in self.feat_sizes:
            if feat not in self.feature_index:
                self.feature_index[feat] = len(self.feature_index)

        self.input_size = (self.embedding_size * len(self.sparse_feature_columns)
                           + len(self.dense_feature_columns))

        # FM branch (latent dimension fixed at 10).
        self.fm = FM(self.input_size, 10)

        # DNN branch.
        self.dropout = nn.Dropout(self.dnn_dropout)
        self.hidden_units = [self.input_size] + list(dnn_hidden_units)
        self.Linears = nn.ModuleList([
            nn.Linear(in_dim, out_dim)
            for in_dim, out_dim in zip(self.hidden_units[:-1], self.hidden_units[1:])
        ])
        self.relus = nn.ModuleList([nn.ReLU() for _ in self.Linears])
        for name, tensor in self.Linears.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=self.init_std)
        # Sized from hidden_units[-1] so an empty dnn_hidden_units falls back to
        # the raw input width instead of raising IndexError.
        self.dnn_outlayer = nn.Linear(self.hidden_units[-1], 1, bias=False)

        # Move every sub-module and parameter (not just some of them) to the
        # requested device so the model is usable on non-CPU devices.
        self.to(self.device)

    def forward(self, x):
        """Return click probabilities for a batch.

        Args:
            x: float tensor of shape ``(batch_size, num_features)`` whose columns
                are ordered like ``feat_sizes``; sparse columns hold category
                indices and are cast to ``long`` before the embedding lookup.

        Returns:
            Tensor of shape ``(batch_size, 1)`` with values in ``(0, 1)``.
        """
        parts = []

        if len(self.dense_feature_columns) > 0:
            dense_columns = [x[:, self.feature_index[feat]] for feat in self.dense_feature_columns]
            # (batch_size, num_dense)
            parts.append(torch.stack(dense_columns, dim=1))

        if len(self.sparse_feature_columns) > 0:
            sparse_embeddings = [
                self.embedding_dic[feat](x[:, self.feature_index[feat]].long())
                for feat in self.sparse_feature_columns
            ]
            # (batch_size, num_sparse * embedding_size)
            parts.append(torch.cat(sparse_embeddings, dim=-1))

        # Dense values first, then the sparse embeddings.
        input_x = torch.cat(parts, dim=1)

        fm_logit = self.fm(input_x)

        hidden = input_x
        for linear, relu in zip(self.Linears, self.relus):
            hidden = self.dropout(relu(linear(hidden)))
        dnn_logit = self.dnn_outlayer(hidden)

        return torch.sigmoid(fm_logit + dnn_logit + self.bias)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss, roc_auc_score

from recohut.datasets.criteo import CriteoSampleDataset


def get_auc(loader, model, device=None):
    """Compute ROC AUC of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` again.

    Args:
        loader: iterable yielding ``(x, y)`` tensor pairs.
        model: ``torch.nn.Module`` returning one score per sample.
        device: device to run the batches on. When omitted, the device of the
            model's first parameter is used (CPU if the model has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        The ROC AUC as computed by ``roc_auc_score``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    scores, labels = [], []
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device).float(), y.to(device).float()
            # Flatten so (batch, 1) and (batch,) outputs are handled alike.
            scores.append(model(x).reshape(-1).cpu())
            labels.append(y.reshape(-1).cpu())
    return roc_auc_score(torch.cat(labels).numpy(), torch.cat(scores).numpy())


# Run configuration.
root = '/content/data'
batch_size = 1024
epochs = 10
seed = 1024
lr = 0.00005
wd = 0.00001
device = 'cpu'

# Seed the RNGs before any shuffling or weight initialisation so that the run
# is reproducible; previously `seed` was defined but never applied.
np.random.seed(seed)
torch.manual_seed(seed)

ds = CriteoSampleDataset(root=root)
train_tensor_data, test_tensor_data = ds.load()
train_loader = DataLoader(train_tensor_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_tensor_data, batch_size=batch_size)

sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

# NOTE(review): dnn_dropout is passed straight to nn.Dropout, whose argument is the
# probability of zeroing an activation, so 0.9 drops 90% of them — confirm intended.
model = DeepFM_v3(ds.feat_sizes, sparse_feature_columns=sparse_features, dense_feature_columns=dense_features,
                dnn_hidden_units=[1000, 500, 250], dnn_dropout=0.9, ebedding_size=16,
                l2_reg_linear=1e-3, device=device)
loss_func = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

for epoch in range(epochs):
    total_loss_epoch = 0.0
    num_batches = 0
    # get_auc() leaves the model in eval mode, so re-enable training each epoch.
    model.train()
    for x, y in train_loader:
        x, y = x.to(device).float(), y.to(device).float()
        y_hat = model(x)

        optimizer.zero_grad()
        loss = loss_func(y_hat, y)
        loss.backward()
        optimizer.step()
        total_loss_epoch += loss.item()
        num_batches += 1
    auc = get_auc(test_loader, model)
    print('epoch/epoches: {}/{}, train loss: {:.3f}, test auc: {:.3f}'.format(epoch, epochs, total_loss_epoch / num_batches, auc))

epoch/epoches: 0/10, train loss: 0.570, test auc: 0.684
epoch/epoches: 1/10, train loss: 0.534, test auc: 0.714
epoch/epoches: 2/10, train loss: 0.511, test auc: 0.725
epoch/epoches: 3/10, train loss: 0.486, test auc: 0.732
epoch/epoches: 4/10, train loss: 0.459, test auc: 0.738
epoch/epoches: 5/10, train loss: 0.431, test auc: 0.743
epoch/epoches: 6/10, train loss: 0.401, test auc: 0.743
epoch/epoches: 7/10, train loss: 0.368, test auc: 0.740
epoch/epoches: 8/10, train loss: 0.337, test auc: 0.735
epoch/epoches: 9/10, train loss: 0.312, test auc: 0.729


In [None]:
# class DeepFM(PointModel):

#     def __init__(self, n_users, n_items, embedding_dim, batch_norm=True, dropout=0.1, num_layers=3, act_function='relu'):
#         """
#         Args:
#             n_users : int, the number of users
#             n_items : int, the number of items
#             embedding_dim : int, the number of latent factors
#             act_function : str, activation function for hidden layer
#             num_layers : int, number of hidden layers
#             batch_norm : bool, whether to normalize a batch of data
#             dropout : float, dropout rate
#         """
#         super().__init__()

#         self.num_layers = num_layers

#         self.user_embedding = nn.Embedding(
#             num_embeddings=n_users, embedding_dim=embedding_dim
#         )
#         self.item_embedding = nn.Embedding(
#             num_embeddings=n_items, embedding_dim=embedding_dim
#         )
#         self.user_bias = nn.Embedding(n_users, 1)
#         self.item_bias = nn.Embedding(n_items, 1)
#         self.bias_ = nn.Parameter(torch.tensor([0.0]))

#         fm_modules = []
#         if batch_norm:
#             fm_modules.append(nn.BatchNorm1d(embedding_dim))
#         fm_modules.append(nn.Dropout(dropout))
#         self.fm_layers = nn.Sequential(*fm_modules)

#         deep_modules = []
#         in_dim = embedding_dim * 2   # user & item
#         for _ in range(num_layers):  # _ is dim if layers is list
#             out_dim = in_dim
#             deep_modules.append(nn.Linear(in_dim, out_dim))
#             in_dim = out_dim
#             if batch_norm:
#                 deep_modules.append(nn.BatchNorm1d(out_dim))
#             if act_function == 'relu':
#                 deep_modules.append(nn.ReLU())
#             elif act_function == 'sigmoid':
#                 deep_modules.append(nn.Sigmoid())
#             elif act_function == 'tanh':
#                 deep_modules.append(nn.Tanh())
#             deep_modules.append(nn.Dropout(dropout))

#         self.deep_layers = nn.Sequential(*deep_modules)
#         self.deep_out = nn.Linear(in_dim, 1, bias=False)

#         self._init_weights()

#     def _init_weights(self):
#         nn.init.normal_(self.item_embedding.weight, std=0.01)
#         nn.init.normal_(self.user_embedding.weight, std=0.01)
#         nn.init.constant_(self.user_bias.weight, 0.0)
#         nn.init.constant_(self.item_bias.weight, 0.0)

#         # for deep layers
#         for m in self.deep_layers:
#             if isinstance(m, nn.Linear):
#                 nn.init.xavier_normal_(m.weight)
#         nn.init.xavier_normal_(self.deep_out.weight)

#     def forward(self, users, items):
#         embed_user = self.user_embedding(users)
#         embed_item = self.item_embedding(items)

#         fm = embed_user * embed_item
#         fm = self.fm_layers(fm)
#         y_fm = fm.sum(dim=-1)

#         y_fm = y_fm + self.user_bias(users) + self.item_bias(items) + self.bias_

#         if self.num_layers:
#             fm = self.deep_layers(fm)

#         y_deep = torch.cat((embed_user, embed_item), dim=-1)
#         y_deep = self.deep_layers(y_deep)

#         # NOTE: raw logits are returned without a sigmoid; this is only correct with a loss
#         # that applies the sigmoid itself (e.g. nn.BCEWithLogitsLoss) — nn.BCELoss does not.
#         pred = y_fm + y_deep

#         return pred.view(-1)

In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-08 05:31:12

recohut: 0.0.9

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas    : 1.1.5
numpy     : 1.19.5
IPython   : 5.5.0
PIL       : 7.1.2
matplotlib: 3.2.2
torch     : 1.10.0+cu111

