In [1]:
import numpy as np
import pandas as pd
import torch

from pathlib import Path

In [3]:
path = Path("data/ml-1m")

In [4]:
list(path.iterdir())

[PosixPath('data/ml-1m/README'),
 PosixPath('data/ml-1m/users.dat'),
 PosixPath('data/ml-1m/movies.dat'),
 PosixPath('data/ml-1m/ratings.dat')]

In [5]:
str(path/"ratings.dat")

'data/ml-1m/ratings.dat'

In [6]:
ratings = pd.read_csv(str(path/"ratings.dat"), sep="::", header=None, names=["uid", "mid", "rating", "ts"])

  """Entry point for launching an IPython kernel.


In [7]:
ratings.dtypes

uid       int64
mid       int64
rating    int64
ts        int64
dtype: object

In [8]:
users = pd.read_csv(str(path/"users.dat"), sep="::", header=None, names=["uid", "gender", "age", "occupation", "zip_code"])

  """Entry point for launching an IPython kernel.


In [9]:
users.dtypes

uid            int64
gender        object
age            int64
occupation     int64
zip_code      object
dtype: object

In [10]:
movies = pd.read_csv(str(path/"movies.dat"), sep="::", header=None, names=["mid", "title", "genre"])

  """Entry point for launching an IPython kernel.


In [11]:
movies.dtypes

mid       int64
title    object
genre    object
dtype: object

In [12]:
uids = users.uid.unique()

In [13]:
mids = movies.mid.unique()

In [14]:
uid_idx = {uid:idx  for idx,uid in enumerate(uids)}

In [15]:
mid_idx = {mid:idx for idx,mid in enumerate(mids)}

In [169]:
int(0.5)

0

In [188]:
val_idxs = np.random.randint(low=0, high=len(ratings), size=int(0.2*len(ratings)))

In [189]:
ratings_idxs = np.array([False]*len(ratings))

In [190]:
ratings_idxs[val_idxs] = True

In [191]:
ratings_trn_arr = ratings.iloc[~ratings_idxs,:3].values

In [192]:
ratings_val_arr = ratings.iloc[ratings_idxs,:3].values

In [194]:
ratings.head()

Unnamed: 0,uid,mid,rating,ts
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [193]:
ratings_trn_arr.shape, ratings_val_arr.shape

((818967, 3), (181242, 3))

In [195]:
class MovieLensCfDataset():
    def __init__(self, uid_idx, mid_idx, ratings):
        self.uid_idx, self.mid_idx, self.ratings = uid_idx, mid_idx, ratings
        
    def __getitem__(self, index):
        r = self.ratings[index]
        uidx = self.uid_idx[r[0]]
        midx = self.mid_idx[r[1]]
        return np.array([uidx, midx, r[2]])
    
    def __len__(self):
        return self.ratings.shape[0]

In [196]:
mlcf_trn_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_trn_arr)

In [197]:
mlcf_trn_ds[5], len(mlcf_trn_ds)

(array([  0, 590,   4]), 818967)

In [198]:
class MovieLensCfDataLoader():
    def __init__(self, dataset, bs=128, shuffle=False):
        self.dataset, self.bs = dataset, bs
        if shuffle:
            self.sampler = torch.utils.data.RandomSampler(dataset)
        else:
            self.sampler = torch.utils.data.SequentialSampler(dataset)
        self.batch_sampler = torch.utils.data.BatchSampler(self.sampler, bs, False)
    
    def get_batch(self, idx_batch):
        batch = []
        for idx in idx_batch:
            batch.append(self.dataset[idx])
        batch = list(zip(*batch))
        return [np.array(i) for i in batch]
        
    def get_tensor(self, batch):
        t_batch = [torch.tensor(i).contiguous() for i in batch]
        t_batch[-1] = t_batch[-1].float().contiguous()
        return t_batch
        
    def __iter__(self):
        for idx_batch in self.batch_sampler:
            yield self.get_tensor(self.get_batch(idx_batch))
            
    def __len__(self):
        return len(self.batch_sampler)

In [199]:
mlcf_trn_dl = MovieLensCfDataLoader(mlcf_trn_ds, shuffle=True)

In [200]:
l = next(iter(mlcf_trn_dl))

In [201]:
l

[tensor([3891, 3829, 3560, 2465, 1640, 2859, 5602, 3595, 2109, 4028,  918, 2163,
         4468, 4807, 3591, 5537, 2119, 4489, 1338, 1775, 2348, 1613, 4993, 6034,
         5837, 3229, 1836, 1183, 4586, 1544, 1271, 2224,  423,  727, 6009, 5519,
         5666, 4421, 5830,  945, 5111,  111, 3617, 2254,  523, 5252, 5436, 5082,
         4982, 3568, 1242, 1883, 3303, 2345, 2070, 2909, 4693, 4419,  352, 3092,
          854, 4057, 4185, 1062, 3991, 5440, 3599, 4840, 2287, 2972, 1284, 4864,
         4587, 2594, 4225,  586, 5325, 4084, 5584, 3518, 5053, 2155, 2010, 3289,
         4897, 3841, 5884, 1135,  156, 2638, 3922, 1684, 3848, 3754, 1841, 4403,
         5557, 3840, 4880, 5165, 5107, 4783, 3756, 2278, 5131,   53,   12, 2096,
         1836, 2260, 4417, 4898, 1218,  604, 4261, 1888, 5043, 3525, 1501, 4746,
         2822, 1208, 1740, 2885, 4456, 2184, 1756, 3899]),
 tensor([ 102, 3686, 2078,  655,  314, 1238,  952, 1196, 1271, 3110, 1178, 1243,
         1896, 1957,  139, 3606, 3412,  585,  513,

In [202]:
len(mlcf_trn_dl)

6399

In [323]:
def get_emb(n_emb, emb_dim):
    emb = torch.nn.Embedding(n_emb, emb_dim)
    emb.weight.data.uniform_(-0.01, 0.01)
    return emb

class MovieLensCfModel(torch.nn.Module):
    def __init__(self, n_users, n_movies, n_features):
        super().__init__()
        self.uf = get_emb(n_users, n_features)
        self.mf = get_emb(n_movies, n_features)
        self.ub = get_emb(n_users, 1)
        self.mb = get_emb(n_movies, 1)
        
    def forward(self, u_idxs, m_idxs):
        return 5 * torch.sigmoid(torch.squeeze(torch.sum((self.uf(u_idxs) * self.mf(m_idxs)), 1, 
                                       keepdim=True) + self.ub(u_idxs) + self.mb(m_idxs)))

In [324]:
n_features = 50
n_epochs = 2

In [325]:
mlcf_model = MovieLensCfModel(len(uids), len(mids), n_features)
mlcf_trn_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_trn_arr)
mlcf_val_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_val_arr)
mlcf_trn_dl = MovieLensCfDataLoader(mlcf_trn_ds, shuffle=True, bs=64)
mlcf_val_dl = MovieLensCfDataLoader(mlcf_val_ds, bs=len(mlcf_val_ds))

In [326]:
optimizer = torch.optim.Adam(mlcf_model.parameters())

In [None]:
for epoch in range(n_epochs):
    #start of epoch
    iteration = 0
    for mini_batch in mlcf_trn_dl:
        #start of iteration
        optimizer.zero_grad()
        preds = mlcf_model(mini_batch[0], mini_batch[1])
        loss = torch.nn.functional.mse_loss(preds, mini_batch[2])
        loss.backward()
        optimizer.step()
        if (iteration % 500) == 0:
            print(f"epoch:{epoch}; iteration:{iteration}; trn_loss:{loss.item()}")
        iteration += 1
    val_batch = next(iter(mlcf_val_dl))
    val_preds = mlcf_model(val_batch[0], val_batch[1])
    val_loss = torch.nn.functional.mse_loss(val_preds, val_batch[2])
    print(f"epoch:{epoch}; val_loss:{val_loss}")

epoch:0; iteration:0; trn_loss:2.432405710220337
epoch:0; iteration:500; trn_loss:1.876611590385437
epoch:0; iteration:1000; trn_loss:1.7131539583206177
epoch:0; iteration:1500; trn_loss:1.33926260471344
epoch:0; iteration:2000; trn_loss:1.0421702861785889
epoch:0; iteration:2500; trn_loss:0.8736897110939026
epoch:0; iteration:3000; trn_loss:1.0669044256210327
epoch:0; iteration:3500; trn_loss:0.6119546294212341
epoch:0; iteration:4000; trn_loss:1.2707244157791138
epoch:0; iteration:4500; trn_loss:0.9104400277137756
epoch:0; iteration:5000; trn_loss:0.9401925206184387
epoch:0; iteration:5500; trn_loss:0.982681930065155
epoch:0; iteration:6000; trn_loss:0.8931989073753357
epoch:0; iteration:6500; trn_loss:0.8575478196144104
epoch:0; iteration:7000; trn_loss:0.6615099906921387
epoch:0; iteration:7500; trn_loss:0.6814995408058167
epoch:0; iteration:8000; trn_loss:0.8597233891487122
epoch:0; iteration:8500; trn_loss:0.8457373976707458
epoch:0; iteration:9000; trn_loss:0.8364303708076477
ep

In [322]:
pow(0.75696946144104,0.5)

0.8700399194525732