In [40]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.functional as F

In [142]:
class MF(nn.Module):

    def __init__(self, n_users, n_items, embd_dim):
        super().__init__()
        self.user_embd = nn.Embedding(n_users, embd_dim)
        self.item_embd = nn.Embedding(n_items, embd_dim)

        self._init_weights()

    def forward(self, u, v):
        u = self.user_embd(u)
        v = self.item_embd(v)

        return (u * v).sum(dim=1)

    def _init_weights(self) -> None:
        self.user_embd.weight.data.uniform_(0, .5)
        self.item_embd.weight.data.uniform_(0, .5)


In [276]:
class MovieLensDataset(torch.utils.data.Dataset):
    def __init__(self, path: str):
        df = self._load_data(path)
        self.user_ids = self._arange_movie_ids(df.iloc[:, 0])
        self.movie_ids = self._arange_movie_ids(df.iloc[:, 1])
        self.ratings = torch.FloatTensor(df.iloc[:, 2].values)

    def _load_data(self, path: str):
        return pd.read_csv(path, sep="::")

    def _arange_movie_ids(self, ids):
        map_id = dict(zip(ids.unique(), np.arange(len(ids.unique()))))
        return torch.LongTensor(list(map(lambda x: map_id[x], ids)))

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        values = []
        for arr in [self.user_ids, self.movie_ids, self.ratings]:
            values.append(arr[idx])

        keys = ["user_id", "movie_id", "rating"]
        return dict(zip(keys, values))

In [277]:
dataset = MovieLensDataset("../../data/ml-1m/ratings.dat")

  return pd.read_csv(path, sep="::")


In [278]:
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [279]:
next(iter(data_loader))

{'user_id': tensor([1604, 2637, 3964, 1679,  426, 2736, 3080,  854, 4563, 5171, 4317, 1465,
         5014, 3360, 1419, 4620]),
 'movie_id': tensor([1773, 1068,  322, 1083,   59, 2672, 1045,  197,  797,  535,  147, 1209,
         1963,  217, 1285,  123]),
 'rating': tensor([4., 3., 3., 3., 5., 3., 3., 2., 5., 3., 5., 3., 4., 5., 4., 5.])}

In [280]:
mf_model = MF(len(dataset.user_ids.unique()), len(dataset.movie_ids.unique()), 10)

In [281]:
loss_fn = nn.MSELoss()
optim = torch.optim.Adam(mf_model.parameters())

In [282]:
batch["user_id"]

tensor([4807, 1939, 2335,  757, 5463, 5820, 1647, 3424, 2700, 6040, 3939, 5350,
        5550,  580, 4169, 1193])

In [283]:
EPOCHS = 2

for epoch in range(EPOCHS):
    for batch in data_loader:
        preds = mf_model(batch["user_id"], batch["movie_id"])
        loss = loss_fn(batch["rating"], preds)
        optim.zero_grad()  # reset gradient
        loss.backward()
        optim.step()
        print(loss.item())

9.086992263793945
12.47150707244873
10.673502922058105
10.066643714904785
10.090364456176758
9.73542308807373
8.010313987731934
10.915080070495605
12.044336318969727
8.798759460449219
11.457185745239258
12.342007637023926
10.932440757751465
9.337244033813477
11.891122817993164
9.812763214111328
10.690451622009277
8.656230926513672
9.220598220825195
8.927656173706055
8.491456031799316
11.60592269897461
10.942052841186523
10.26839542388916
11.458390235900879
11.05502986907959
11.15677547454834
9.105960845947266
9.473462104797363
10.789730072021484
10.931889533996582
11.473836898803711
10.142559051513672
10.625954627990723
10.474478721618652
8.020242691040039
13.237239837646484
9.597134590148926
8.855364799499512
10.629743576049805
9.21977424621582
9.806138038635254
11.4168701171875
12.07112979888916
10.879055976867676
8.999835014343262
11.12590503692627
11.643111228942871
11.29271125793457
10.776169776916504
9.660694122314453
12.957046508789062
10.611970901489258
10.181575775146484
10.64

KeyboardInterrupt: 

In [284]:
preds

tensor([3.1880, 2.7237, 2.8774, 4.0196, 3.0923, 4.4104, 4.2246, 2.8025, 2.4873,
        2.3652, 3.8148, 3.9690, 3.0982, 3.6308, 4.0676, 4.2844],
       grad_fn=<SumBackward1>)

In [285]:
batch["rating"]

tensor([2., 3., 4., 4., 4., 5., 4., 4., 4., 3., 3., 4., 3., 3., 4., 4.])