# https://github.com/rixwew/pytorch-fm/tree/master

In [1]:
import numpy as np
import random
import os
import gc

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices). PyTorch must be seeded as well because
    ``random_split``, ``DataLoader(shuffle=True)`` and weight initialisation
    all draw from torch's generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: torch is only imported by a later cell of the notebook.
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe no-op when CUDA is unavailable


seed_everything(113)  # fix the seed

In [4]:
import pandas as pd

# Raw competition files, read from the current working directory.
# NOTE(review): the names `train` and `test` are rebound to functions by
# later cells, so these two frames are not reachable after those cells run.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# The first CSV column is used as the index of the submission template.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

In [5]:
import pandas as pd

# Pre-processed train/test tables (presumably label-encoded, given the `_lb`
# suffix — TODO confirm how these files were produced).
train_lb, test_lb = map(pd.read_csv, ('train_lb.csv', 'test_lb.csv'))

In [6]:
# Columns that are dropped from both tables before modelling.
_non_feature_cols = ['Unnamed: 0', 'ID', 'Age', 'Year-Of-Publication',
                     'User_count', 'Rating_count', 'Author_count']
_target_col = 'Book-Rating'

# Target vector and model inputs; the target column only exists in the
# training table, so it is dropped there in addition to the shared columns.
y_train = train_lb[_target_col]
X_train = train_lb.drop(columns=_non_feature_cols + [_target_col])
x_test = test_lb.drop(columns=_non_feature_cols)

In [9]:
!pip install torchfm
!pip install lmdb

Collecting lmdb
  Downloading lmdb-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.2/299.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lmdb
Successfully installed lmdb-1.4.1


In [11]:
import torch
import tqdm
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader

from torchfm.dataset.avazu import AvazuDataset
from torchfm.dataset.criteo import CriteoDataset
from torchfm.dataset.movielens import MovieLens1MDataset, MovieLens20MDataset
from torchfm.model.afi import AutomaticFeatureInteractionModel
from torchfm.model.afm import AttentionalFactorizationMachineModel
from torchfm.model.dcn import DeepCrossNetworkModel
from torchfm.model.dfm import DeepFactorizationMachineModel
from torchfm.model.ffm import FieldAwareFactorizationMachineModel
from torchfm.model.fm import FactorizationMachineModel
from torchfm.model.fnfm import FieldAwareNeuralFactorizationMachineModel
from torchfm.model.fnn import FactorizationSupportedNeuralNetworkModel
#from torchfm.model.hofm import HighOrderFactorizationMachineModel
from torchfm.model.lr import LogisticRegressionModel
from torchfm.model.ncf import NeuralCollaborativeFiltering
from torchfm.model.nfm import NeuralFactorizationMachineModel
from torchfm.model.pnn import ProductNeuralNetworkModel
from torchfm.model.wd import WideAndDeepModel
from torchfm.model.xdfm import ExtremeDeepFactorizationMachineModel
from torchfm.model.afn import AdaptiveFactorizationNetwork

In [31]:
# Re-index every feature column to contiguous integers 0..n_unique-1 (in order
# of first appearance) and record the number of distinct values per column.
# X_train is modified in place.
features = X_train.columns
idx = dict.fromkeys(features)
for feature in features:
    feature2idx = {value: position
                   for position, value in enumerate(X_train[feature].unique())}
    X_train[feature] = X_train[feature].map(feature2idx)
    idx[feature] = len(feature2idx)

# Per-field vocabulary sizes, in column order; read by get_model().
field_dims = np.array([idx[feature] for feature in features], dtype=np.uint32)
print(field_dims)

[ 83256 217829  92635  15505     12     11  13820   1810    348]


In [39]:
def get_model(name, dataset):
    """Build the torchfm model registered under ``name``.

    Hyperparameters are empirically determined, not optimized.

    The per-field vocabulary sizes are read from the notebook-level
    ``field_dims`` array, not from ``dataset``.

    Args:
        name: Short model key (``'lr'``, ``'fm'``, ``'hofm'``, ``'ffm'``,
            ``'fnn'``, ``'wd'``, ``'ipnn'``, ``'opnn'``, ``'dcn'``, ``'nfm'``,
            ``'ncf'``, ``'fnfm'``, ``'dfm'``, ``'xdfm'``, ``'afm'``, ``'afi'``,
            ``'afn'``).
        dataset: Only inspected for ``'ncf'``, which reads its
            ``user_field_idx`` / ``item_field_idx`` attributes.

    Returns:
        A freshly constructed, untrained model instance.

    Raises:
        ValueError: If ``name`` is not a known key.
        ImportError: If ``'hofm'`` is requested but the installed torchfm does
            not provide ``torchfm.model.hofm``.
        AssertionError: If ``'ncf'`` is requested with a non-MovieLens dataset.
    """
    if name == 'lr':
        return LogisticRegressionModel(field_dims)
    elif name == 'fm':
        return FactorizationMachineModel(field_dims, embed_dim=16)
    elif name == 'hofm':
        # The notebook-level import of this class is commented out, so it is
        # imported lazily here; otherwise this branch dies with a NameError.
        try:
            from torchfm.model.hofm import HighOrderFactorizationMachineModel
        except ImportError as exc:
            raise ImportError(
                "model 'hofm' requires torchfm.model.hofm, which could not be imported"
            ) from exc
        return HighOrderFactorizationMachineModel(field_dims, order=3, embed_dim=16)
    elif name == 'ffm':
        return FieldAwareFactorizationMachineModel(field_dims, embed_dim=4)
    elif name == 'fnn':
        return FactorizationSupportedNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'wd':
        return WideAndDeepModel(field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'ipnn':
        return ProductNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16,), method='inner', dropout=0.2)
    elif name == 'opnn':
        return ProductNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16,), method='outer', dropout=0.2)
    elif name == 'dcn':
        return DeepCrossNetworkModel(field_dims, embed_dim=16, num_layers=3, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'nfm':
        return NeuralFactorizationMachineModel(field_dims, embed_dim=64, mlp_dims=(64,), dropouts=(0.2, 0.2))
    elif name == 'ncf':
        # Only supports the MovieLens datasets because for other datasets the
        # user/item columns are indistinguishable.
        assert isinstance(dataset, MovieLens20MDataset) or isinstance(dataset, MovieLens1MDataset)
        return NeuralCollaborativeFiltering(field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2,
                                            user_field_idx=dataset.user_field_idx,
                                            item_field_idx=dataset.item_field_idx)
    elif name == 'fnfm':
        return FieldAwareNeuralFactorizationMachineModel(field_dims, embed_dim=4, mlp_dims=(64,), dropouts=(0.2, 0.2))
    elif name == 'dfm':
        return DeepFactorizationMachineModel(field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'xdfm':
        return ExtremeDeepFactorizationMachineModel(
            field_dims, embed_dim=16, cross_layer_sizes=(16, 16), split_half=False, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'afm':
        return AttentionalFactorizationMachineModel(field_dims, embed_dim=16, attn_size=16, dropouts=(0.2, 0.2))
    elif name == 'afi':
        return AutomaticFeatureInteractionModel(
             field_dims, embed_dim=16, atten_embed_dim=64, num_heads=2, num_layers=3, mlp_dims=(400, 400), dropouts=(0, 0, 0))
    elif name == 'afn':
        print("Model:AFN")
        return AdaptiveFactorizationNetwork(
            field_dims, embed_dim=16, LNN_dim=1500, mlp_dims=(400, 400, 400), dropouts=(0, 0, 0))
    else:
        raise ValueError('unknown model name: ' + name)


In [40]:
class EarlyStopper(object):
    """Track the best validation score, checkpoint on improvement, and signal when to stop.

    Args:
        num_trials: Number of consecutive non-improving evaluations tolerated
            before ``is_continuable`` returns ``False``.
        save_path: File the model is written to (via ``torch.save``) each time
            the score improves.
        mode: ``'max'`` (default, original behaviour) treats larger scores as
            better, e.g. accuracy/AUC; ``'min'`` treats smaller scores as
            better, e.g. RMSE or any loss.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'min'``.
    """

    def __init__(self, num_trials, save_path, mode='max'):
        if mode not in ('max', 'min'):
            raise ValueError("mode must be 'max' or 'min', got: " + repr(mode))
        self.num_trials = num_trials
        self.trial_counter = 0
        self.mode = mode
        # 'max' keeps the original starting point of 0; 'min' starts at +inf so
        # the first evaluation always counts as an improvement.
        self.best_accuracy = 0 if mode == 'max' else float('inf')
        self.save_path = save_path

    def is_continuable(self, model, accuracy):
        """Record ``accuracy`` and report whether training should go on.

        Saves ``model`` to ``save_path`` and resets the patience counter when
        the score improves on the best seen so far.

        Returns:
            ``True`` to keep training, ``False`` once ``num_trials``
            consecutive evaluations failed to improve.
        """
        if self.mode == 'max':
            improved = accuracy > self.best_accuracy
        else:
            improved = accuracy < self.best_accuracy

        if improved:
            self.best_accuracy = accuracy
            self.trial_counter = 0
            # Pickles the whole model object, not just its state_dict.
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False


In [41]:
import torch.nn as nn

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(x, y) + eps)``.

    ``eps`` (1e-6) keeps the square root differentiable when the MSE is
    exactly zero, where the gradient of ``sqrt`` would otherwise be infinite.
    """

    def __init__(self):
        super(RMSELoss, self).__init__()
        self.eps = 1e-6
        # Built once and referenced through `nn`; the bare name `MSELoss` is
        # never imported in this notebook.
        self.mse = nn.MSELoss()

    def forward(self, x, y):
        """Return the scalar RMSE between predictions ``x`` and targets ``y``."""
        return torch.sqrt(self.mse(x, y) + self.eps)

In [89]:
def train(model, optimizer, data_loader, criterion, device, log_interval=100):
    """Run one training pass over ``data_loader``.

    Each batch is a ``(fields, target)`` pair. Both are moved to ``device``,
    and the target is cast to float before being passed to ``criterion``.
    The progress bar shows the mean loss of the last ``log_interval`` batches.

    Args:
        model: Module being optimised; switched to training mode.
        optimizer: Optimiser stepped once per batch.
        data_loader: Iterable of ``(fields, target)`` batches.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        device: Device the batches are moved to.
        log_interval: Number of batches between progress-bar updates.
    """
    model.train()
    running_loss = 0
    progress = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0, position=0, leave=True)
    for step, (fields, target) in enumerate(progress, start=1):
        fields = fields.to(device)
        target = target.to(device)
        batch_loss = criterion(model(fields), target.float())

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        if step % log_interval:
            continue
        # Every `log_interval` batches: publish the window mean and start over.
        progress.set_postfix(loss=running_loss / log_interval)
        running_loss = 0

In [99]:
def test(model, data_loader, device):
    """Return the RMSE of ``model`` over every sample yielded by ``data_loader``.

    Squared errors are summed across all batches and divided by the total
    number of samples before taking the square root. This gives the dataset
    RMSE rather than a mean of per-batch RMSEs, which would over-weight a
    short final batch. The metric is computed directly with torch ops, so it
    needs no loss-module instance.

    Args:
        model: Module to evaluate; switched to eval mode.
        data_loader: Iterable of ``(fields, target)`` batches.
        device: Device the batches are moved to.

    Returns:
        The RMSE as a Python float, or ``nan`` if the loader yields no samples.
    """
    model.eval()
    squared_error_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for fields, target in tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0, position=0, leave=True):
            fields, target = fields.to(device), target.to(device)
            target = target.float()
            # reshape() fails loudly on a size mismatch instead of letting a
            # (batch, 1) prediction broadcast against a (batch,) target.
            y = model(fields).float().reshape(target.shape)
            squared_error_sum += torch.sum((y - target) ** 2).item()
            sample_count += target.numel()
    if sample_count == 0:
        return float('nan')
    return (squared_error_sum / sample_count) ** 0.5

In [103]:
# Training configuration shared by the cells below.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

batch_size, learning_rate, weight_decay = 256, 0.001, 0
epoch = 30  # upper bound on the number of training epochs

In [79]:
# NOTE(review): `valid_length` is first assigned in the training cell further
# down, so this inspection cell raises NameError on Restart & Run All — move it
# after that cell or remove it.
valid_length

87139

In [111]:
from torch.utils.data import TensorDataset, DataLoader

# Wrap the re-indexed features and the integer ratings as index tensors.
device = torch.device(device)
dataset = TensorDataset(torch.LongTensor(X_train.to_numpy()),
                        torch.LongTensor(y_train.to_numpy()))

# 80 / 10 / 10 split; the test part takes whatever is left after rounding.
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length

# A dedicated, seeded generator makes the split identical on every run,
# independent of how much of torch's global RNG earlier cells consumed.
split_generator = torch.Generator().manual_seed(113)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_length, valid_length, test_length], generator=split_generator)

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# The loss must be an *instance*: passing the class makes
# `criterion(y, target)` call the constructor instead of computing a loss.
criterion = RMSELoss()
# NOTE(review): torchfm models appear to end in a sigmoid (outputs in (0, 1));
# confirm — that would cap predictions far below the rating scale and would
# explain a validation RMSE that stays flat at ~4.2.
model = get_model('nfm', dataset).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# EarlyStopper keeps the *largest* score it has seen, but RMSE is
# lower-is-better. Feed it the negated RMSE and start from -inf so the first
# epoch always counts as an improvement.
early_stopper = EarlyStopper(num_trials=2, save_path='nfm.pt')
early_stopper.best_accuracy = float('-inf')
for epoch_i in range(epoch):
    print(f"Epoch {epoch_i+1}")
    train(model, optimizer, train_data_loader, criterion, device)
    RMSE = test(model, valid_data_loader, device)
    print('epoch:', epoch_i, 'validation: RMSE:', RMSE)
    if not early_stopper.is_continuable(model, -RMSE):
        print(f'validation: best RMSE: {-early_stopper.best_accuracy}')
        break

# Held-out evaluation of the model as it stands after the last epoch run.
RMSE = test(model, test_data_loader, device)
print('test: RMSE:', RMSE)

Epoch 1


100%|███████████████████████████████████████████████████████████████████| 2724/2724 [00:24<00:00, 113.28it/s, loss=4.25]
100%|████████████████████████████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 640.18it/s]


epoch: 0 validation: RMSE: 4.235100746154785
Epoch 2


100%|███████████████████████████████████████████████████████████████████| 2724/2724 [00:23<00:00, 114.89it/s, loss=4.22]
100%|████████████████████████████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 645.80it/s]


epoch: 1 validation: RMSE: 4.234694480895996
Epoch 3


 31%|████████████████████▋                                               | 831/2724 [00:07<00:16, 115.25it/s, loss=4.25]


KeyboardInterrupt: 