# GMF on ML-100k

In [None]:
import os
import random
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [None]:
!wget -q --show-progress https://github.com/sparsh-ai/stanza/raw/S629908/rec/CDL/data/ml_100k_train.npy
!wget -q --show-progress https://github.com/sparsh-ai/stanza/raw/S629908/rec/CDL/data/ml_100k_test.npy

In [None]:
# Read the two matrices saved by the download cell (train first, then test).
train, test = (np.load(name) for name in ('ml_100k_train.npy', 'ml_100k_test.npy'))

# Binarise: every entry greater than zero becomes 1.0, everything else 0.0.
train = np.greater(train, 0).astype(float)
test = np.greater(test, 0).astype(float)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook draws from.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic kernel selection.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not just the current device; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [None]:
class Config:
    """Hyper-parameters and runtime settings shared by the notebook cells."""

    # --- reproducibility / hardware ---
    seed = 1995
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    # --- model ---
    embed_dim = 50

    # --- optimisation ---
    epochs = 20
    batch_size = 128
    learning_rate = 1e-3
    weight_decay = 1e-2
    early_stopping_round = 0  # not read by any of the visible cells


config = Config()

In [None]:
class TrainDataset(Dataset):
    """Training samples: every positive entry plus randomly sampled negatives.

    Matrix cells are addressed through flat row-major indices
    (``flat = row * n_cols + col``). Every cell equal to 1 is kept, and for
    each of them ``neg_data_per_pos_data`` cells equal to 0 are drawn without
    replacement using NumPy's global random generator.

    Args:
        data: 2-D array of 0/1 values; the row index is returned as the user
            index and the column index as the item index.
        neg_data_per_pos_data: number of zero cells to sample per positive
            cell. If fewer zero cells exist than requested, all of them
            are used.
    """

    def __init__(self, data, neg_data_per_pos_data):
        super().__init__()
        self.M = data.shape[0]
        self.N = data.shape[1]
        self.data = data
        idx_mat = np.arange(self.M * self.N).reshape(self.M, self.N)

        neg_idx = idx_mat[data == 0]
        pos_idx = idx_mat[data == 1]

        # Count positives from the index array: summing the matrix into an
        # int16 accumulator wraps around once there are more than 32767.
        pos_n = pos_idx.size
        # Sampling without replacement cannot return more cells than exist.
        neg_n = min(pos_n * neg_data_per_pos_data, neg_idx.size)

        neg_sampled_idx = np.random.choice(neg_idx, neg_n, replace=False)
        # union1d already returns a sorted array of unique values.
        self.total_rate = np.union1d(pos_idx, neg_sampled_idx)

    def __len__(self):
        """Return the number of (positive + sampled negative) cells."""
        return len(self.total_rate)

    def __getitem__(self, i):
        """Return ``(row, col, value)`` for the ``i``-th retained cell."""
        # Invert flat = row * N + col; the column is the remainder modulo the
        # number of columns (N), not the number of rows.
        u, item = divmod(int(self.total_rate[i]), self.N)
        r = self.data[u, item]

        return (u, item, r)

class TestDataset(Dataset):
    """Expose every cell of a 2-D matrix as a ``(row, col, value)`` sample.

    Samples are enumerated in row-major order, so sample ``idx`` refers to
    cell ``(idx // n_cols, idx % n_cols)``.

    Args:
        data: 2-D array; the row index is returned as the user index and the
            column index as the item index.
    """

    def __init__(self, data):
        super().__init__()
        self.M = data.shape[0]
        self.N = data.shape[1]
        self.data = data

    def __len__(self):
        """Return the total number of cells in the matrix."""
        return self.M * self.N

    def __getitem__(self, idx):
        """Return ``(row, col, value)`` for flat row-major index ``idx``."""
        # The column is the remainder modulo the number of columns (N);
        # using the number of rows only works for square matrices.
        u, i = divmod(int(idx), self.N)
        r = self.data[u, i]

        return (u, i, r)

In [None]:
class GMF(nn.Module):
    """Generalized matrix factorisation scorer.

    Looks up one embedding per user index and one per item index, multiplies
    them element-wise, projects the product to a single value with a linear
    layer and squashes it through a sigmoid.

    Args:
        user_dim: number of rows in the user embedding table.
        item_dim: number of rows in the item embedding table.
        embed_dim: width of both embedding tables.
    """

    def __init__(self, user_dim, item_dim, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim

        # Sub-modules are created in a fixed order so that seeded
        # initialisation and state_dict keys stay stable.
        self.user_embed_gmf = nn.Embedding(user_dim, embed_dim)
        self.item_embed_gmf = nn.Embedding(item_dim, embed_dim)
        self.out = nn.Linear(embed_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, user_idx, item_idx):
        """Return the sigmoid score for each (user, item) index pair.

        The output has the shape of the index tensors plus a trailing
        dimension of size 1.
        """
        user_vec = self.user_embed_gmf(user_idx)
        item_vec = self.item_embed_gmf(item_idx)

        # Element-wise interaction needs identically shaped lookups.
        assert user_vec.shape == item_vec.shape

        interaction = user_vec * item_vec
        return self.sig(self.out(interaction))

In [None]:
# Train GMF on the sampled training set, score it on the full test matrix
# after every epoch, and checkpoint the weights whenever the test loss improves.
seed_everything(config.seed)

train_data = TrainDataset(train, neg_data_per_pos_data=4)
test_data = TestDataset(test)

train_loader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=config.batch_size*100, shuffle=False)

model = GMF(train.shape[0], train.shape[1], config.embed_dim)
model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
# GMF.forward already ends with a sigmoid, so the loss must consume
# probabilities. BCEWithLogitsLoss would apply a second sigmoid on top.
loss_fn = nn.BCELoss()

# torch.save does not create missing parent directories.
checkpoint_path = './artifacts/models/pretrained/pytorch/GMF.pth'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

start = datetime.now()
history = defaultdict(list)
history['best_loss'] = np.inf

for epoch in range(config.epochs):
    # ---- training pass ----
    model.train()
    losses = 0
    for batch_data in train_loader:
        user = batch_data[0].to(config.device, dtype=torch.long)
        item = batch_data[1].to(config.device, dtype=torch.long)
        rate = batch_data[2].to(config.device, dtype=torch.float)

        optimizer.zero_grad()

        pred = model(user, item)
        # pred is (batch, 1); give the target the same trailing dimension.
        loss = loss_fn(pred, rate.unsqueeze(-1))
        loss.backward()
        optimizer.step()

        losses += loss.item()
    losses /= len(train_loader)
    history['train_losses'].append(losses)

    # ---- validation pass ----
    model.eval()
    losses_val = 0
    with torch.no_grad():
        # The loop variable must be the one read below; a misspelt name here
        # silently re-scores the last training batch instead of the test set.
        for batch_data in test_loader:
            user = batch_data[0].to(config.device, dtype=torch.long)
            item = batch_data[1].to(config.device, dtype=torch.long)
            rate = batch_data[2].to(config.device, dtype=torch.float)

            pred = model(user, item)
            loss = loss_fn(pred, rate.unsqueeze(-1))
            losses_val += loss.item()

    losses_val /= len(test_loader)

    print(f'EPOCH {epoch+1} TRAIN LogLoss : {losses:.6f}, TEST LogLoss : {losses_val:.6f}')

    history['val_losses'].append(losses_val)
    # Keep only the weights with the lowest test loss seen so far.
    if history['best_loss'] > losses_val:
        history['best_loss'] = losses_val
        torch.save(model.state_dict(), checkpoint_path)
        print('The Model Saving...')

end = datetime.now()
print(f'Training takes time {end-start}')

EPOCH 1 TRAIN LogLoss : 0.921107, TEST LogLoss : 0.848807
The Model Saving...
EPOCH 2 TRAIN LogLoss : 0.833160, TEST LogLoss : 0.809659
The Model Saving...
EPOCH 3 TRAIN LogLoss : 0.781696, TEST LogLoss : 0.765037
The Model Saving...
EPOCH 4 TRAIN LogLoss : 0.753273, TEST LogLoss : 0.744066
The Model Saving...
EPOCH 5 TRAIN LogLoss : 0.737446, TEST LogLoss : 0.732368
The Model Saving...
EPOCH 6 TRAIN LogLoss : 0.728723, TEST LogLoss : 0.723949
The Model Saving...
EPOCH 7 TRAIN LogLoss : 0.724300, TEST LogLoss : 0.725161
EPOCH 8 TRAIN LogLoss : 0.722460, TEST LogLoss : 0.724106
EPOCH 9 TRAIN LogLoss : 0.721952, TEST LogLoss : 0.723895
The Model Saving...
EPOCH 10 TRAIN LogLoss : 0.721864, TEST LogLoss : 0.718208
The Model Saving...
EPOCH 11 TRAIN LogLoss : 0.721874, TEST LogLoss : 0.723880
EPOCH 12 TRAIN LogLoss : 0.721866, TEST LogLoss : 0.723900
EPOCH 13 TRAIN LogLoss : 0.721868, TEST LogLoss : 0.720035
EPOCH 14 TRAIN LogLoss : 0.721868, TEST LogLoss : 0.723905
EPOCH 15 TRAIN LogLoss 

In [None]:
# Show the lowest per-epoch test loss recorded by the training loop.
history['best_loss']

0.7182083129882812

---

In [None]:
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-11-28 16:13:54

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torch     : 1.10.0+cu111
IPython   : 5.5.0
sys       : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
numpy     : 1.19.5
pandas    : 1.1.5
matplotlib: 3.2.2



---

**END**