In [None]:
# default_exp models.afm

# AFM
> A PyTorch implementation of Attentional Factorization Machines (AFM).

AFM improves on FM by discriminating the importance of different feature interactions: it learns the importance of each feature interaction from data via a neural attention network. Empirically, on regression tasks AFM outperforms FM with an 8.6% relative improvement, and it consistently outperforms the state-of-the-art deep learning methods Wide&Deep and DeepCross with a much simpler structure and fewer model parameters.

![Untitled](https://github.com/RecoHut-Stanzas/S021355/raw/main/images/img11.png)

Formally, the AFM model can be defined as:

$$\hat{y}_{AFM} (x) = w_0 + \sum_{i=1}^nw_ix_i + p^T\sum_{i=1}^n\sum_{j=i+1}^na_{ij}(v_i\odot v_j)x_ix_j$$

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

## V1

In [None]:
#export
from typing import Any, Iterable, List, Optional, Tuple, Union, Callable

import torch
from torch import nn
import torch.nn.functional as F

from recohut.models.bases.common import PointModel

In [None]:
#export
class AFM(PointModel):
    """Attentional Factorization Machine over one user-item interaction.

    Only two features (user and item) are modelled, so there is a single
    pairwise interaction: the element-wise product of the two embeddings.
    That product is optionally batch-normalised, passed through dropout,
    scaled by a learned attention score and projected to one value per row.

    :param n_users: number of rows in the user embedding / bias tables
    :param n_items: number of rows in the item embedding / bias tables
    :param embedding_dim: width of the user and item embeddings
    :param batch_norm: when true, a ``BatchNorm1d`` precedes the dropout layer
    :param dropout: drop probability handed to ``nn.Dropout``
    """

    def __init__(self, n_users, n_items, embedding_dim, batch_norm=True, dropout=0.1):
        super().__init__()

        # Latent factors for both sides of the interaction.
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)

        # One scalar bias per user, one per item, and a single global offset.
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.bias_ = nn.Parameter(torch.zeros(1))

        # Layers applied to the raw interaction vector.
        interaction_layers = [nn.BatchNorm1d(embedding_dim)] if batch_norm else []
        interaction_layers.append(nn.Dropout(dropout))
        self.fm_layers = nn.Sequential(*interaction_layers)

        # The attention layer is sized by the number of features; only user and
        # item are considered here, hence 2.
        num_features = 2
        self.lin = nn.Linear(embedding_dim, num_features)
        self.h = nn.Parameter(torch.rand(num_features, 1))

        # Bias-free projection that reduces the interaction vector to a scalar.
        self.pred = nn.Linear(embedding_dim, 1, bias=False)

        self._init_weights()

    def _init_weights(self):
        """Re-initialise embeddings, attention and projection weights; zero the bias tables."""
        for table in (self.item_embedding, self.user_embedding):
            nn.init.normal_(table.weight, std=0.01)
        nn.init.normal_(self.lin.weight, std=0.01)
        nn.init.xavier_normal_(self.pred.weight)

        for bias_table in (self.user_bias, self.item_bias):
            nn.init.constant_(bias_table.weight, 0.0)

    def forward(self, users, items):
        """Score each (user, item) pair.

        :param users: index tensor of user ids; expected to be 1-D, because the
            ``mm`` call below needs a 2-D interaction matrix
        :param items: index tensor of item ids, aligned with ``users``
        :return: 1-D tensor with one raw (un-squashed) score per pair
        """
        interaction = self.fm_layers(
            self.user_embedding(users) * self.item_embedding(items)
        )

        # Attention: one scalar per row, broadcast over the embedding axis.
        hidden = F.relu(self.lin(interaction))
        attention = torch.mm(hidden, self.h)
        weighted = interaction * attention

        # The scalar biases broadcast across every embedding dimension and are
        # therefore also passed through the final projection.
        biased = weighted + self.user_bias(users) + self.item_bias(items) + self.bias_
        return self.pred(biased).view(-1)

Example

In [None]:
from recohut.datasets.movielens import ML1mDataModule
from recohut.trainers.pl_trainer import pl_trainer

In [None]:
class Args:
    """Settings for the example run in this notebook.

    Every setting is stored as a plain instance attribute, so ``args.__dict__``
    contains exactly the names assigned below and can be unpacked as keyword
    arguments.

    :param overrides: optional replacements for any of the settings below,
        e.g. ``Args(batch_size=64, max_epochs=1)``; with no arguments the
        defaults are used unchanged
    :raises TypeError: if an override does not name an existing setting, so a
        misspelled option fails here instead of being forwarded silently
    """

    def __init__(self, **overrides):
        self.data_dir = '/content/data'
        self.min_rating = 4
        self.num_negative_samples = 99
        self.min_uc = 5
        self.min_sc = 5

        self.log_dir = '/content/logs'
        self.model_dir = '/content/models'

        self.val_p = 0.2
        self.test_p = 0.2
        self.num_workers = 2
        self.normalize = False
        self.batch_size = 32
        self.seed = 42
        self.shuffle = True
        self.pin_memory = True
        self.drop_last = False
        self.split_type = 'stratified'

        self.embedding_dim = 20
        self.max_epochs = 5

        # Overrides are applied last and may only replace names defined above,
        # which keeps the key set of ``__dict__`` fixed.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f"Args got unexpected option(s): {', '.join(unknown)}")
        vars(self).update(overrides)

args = Args()

In [None]:
# Every attribute set on `args` is forwarded to the data module as a keyword argument.
ds = ML1mDataModule(**vars(args))
ds.prepare_data()

In [None]:
# Size the embedding tables from the prepared dataset.
model = AFM(
    n_items=ds.data.num_items,
    n_users=ds.data.num_users,
    embedding_dim=args.embedding_dim,
)

# Left as the cell's last expression so the trainer's return value is displayed.
pl_trainer(model, ds, max_epochs=args.max_epochs)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type       | Params
----------------------------------------------
0 | user_embedding | Embedding  | 120 K 
1 | item_embedding | Embedding  | 62.5 K
2 | user_bias      | Embedding  | 6.0 K 
3 | item_bias      | Embedding  | 3.1 K 
4 | fm_layers      | Sequential | 40    
5 | lin            | Linear     | 42    
6 | pred           | Linear     | 20    
----------------------------------------------
192 K     Trainable params
0         Non-trainable params
192 K     Total params
0.770     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test Metrics': {'apak': tensor(0.1085),
                  'hr': tensor(0.2565),
                  'loss': tensor(214.0854),
                  'ncdg': tensor(0.1429)}}
--------------------------------------------------------------------------------


[{'Test Metrics': {'apak': tensor(0.1085),
   'hr': tensor(0.2565),
   'loss': tensor(214.0854),
   'ncdg': tensor(0.1429)}}]

## V2

> **References:-**
- J Xiao, et al. Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks, 2017.
- https://github.com/rixwew/pytorch-fm/blob/master/torchfm/model/afm.py

In [None]:
#export
import torch

from recohut.layers.common import FeaturesEmbedding, FeaturesLinear, MultiLayerPerceptron

In [None]:
#exporti
class AttentionalFactorizationMachine(torch.nn.Module):
    """
    Attention-weighted pooling of all pairwise field interactions, followed by
    a linear read-out to a single value per sample.

    :param embed_dim: size of each field embedding
    :param attn_size: hidden width of the attention network
    :param dropouts: indexable pair of drop probabilities; ``dropouts[0]`` is
        applied to the attention weights, ``dropouts[1]`` to the pooled vector
    """

    def __init__(self, embed_dim, attn_size, dropouts):
        super().__init__()
        self.attention = torch.nn.Linear(embed_dim, attn_size)
        self.projection = torch.nn.Linear(attn_size, 1)
        self.fc = torch.nn.Linear(embed_dim, 1)
        self.dropouts = dropouts

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: Float tensor of size ``(batch_size, 1)``
        """
        num_fields = x.shape[1]
        # Indices (i, j) with i < j, enumerated row by row.
        left, right = torch.triu_indices(num_fields, num_fields, offset=1, device=x.device)
        pairwise = x[:, left] * x[:, right]

        # One score per pair, normalised across the pair axis.
        hidden = F.relu(self.attention(pairwise))
        weights = F.softmax(self.projection(hidden), dim=1)
        weights = F.dropout(weights, p=self.dropouts[0], training=self.training)

        pooled = (weights * pairwise).sum(dim=1)
        pooled = F.dropout(pooled, p=self.dropouts[1], training=self.training)
        return self.fc(pooled)

In [None]:
#export
class AFMv2(torch.nn.Module):
    """
    Attentional Factorization Machine over an arbitrary number of fields.

    The output combines a ``FeaturesLinear`` term with the attention-pooled
    pairwise interactions of the field embeddings.

    Reference:
        J Xiao, et al. Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks, 2017.

    :param field_dims: per-field sizes, handed to ``FeaturesEmbedding`` and ``FeaturesLinear``
    :param embed_dim: size of each field embedding
    :param attn_size: hidden width of the attention network
    :param dropouts: pair of drop probabilities forwarded to the attention module
    """

    def __init__(self, field_dims, embed_dim, attn_size, dropouts):
        super().__init__()
        self.num_fields = len(field_dims)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.linear = FeaturesLinear(field_dims)
        self.afm = AttentionalFactorizationMachine(embed_dim, attn_size, dropouts)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: sigmoid of the summed terms, with dimension 1 squeezed away
        """
        first_order = self.linear(x)
        attended = self.afm(self.embedding(x))
        logits = first_order + attended
        return torch.sigmoid(logits.squeeze(1))

In [None]:
#hide
# Stamp the notebook with author and environment details via the `watermark`
# IPython extension (flag meanings are defined in the watermark documentation).
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut