In [1]:
import numpy as np
import pandas as pd
import torch

from pathlib import Path

In [2]:
path = Path("data/ml-1m")

In [3]:
list(path.iterdir())

[PosixPath('data/ml-1m/README'),
 PosixPath('data/ml-1m/users.dat'),
 PosixPath('data/ml-1m/movies.dat'),
 PosixPath('data/ml-1m/ratings.dat')]

In [4]:
str(path/"ratings.dat")

'data/ml-1m/ratings.dat'

In [5]:
ratings = pd.read_csv(str(path/"ratings.dat"), sep="::", header=None, names=["uid", "mid", "rating", "ts"])

  """Entry point for launching an IPython kernel.


In [6]:
ratings.dtypes

uid       int64
mid       int64
rating    int64
ts        int64
dtype: object

In [7]:
users = pd.read_csv(str(path/"users.dat"), sep="::", header=None, names=["uid", "gender", "age", "occupation", "zip_code"])

  """Entry point for launching an IPython kernel.


In [8]:
users.dtypes

uid            int64
gender        object
age            int64
occupation     int64
zip_code      object
dtype: object

In [9]:
movies = pd.read_csv(str(path/"movies.dat"), sep="::", header=None, names=["mid", "title", "genre"])

  """Entry point for launching an IPython kernel.


In [10]:
movies.dtypes

mid       int64
title    object
genre    object
dtype: object

In [11]:
uids = users.uid.unique()

In [12]:
mids = movies.mid.unique()

In [13]:
uid_idx = {uid:idx  for idx,uid in enumerate(uids)}

In [14]:
mid_idx = {mid:idx for idx,mid in enumerate(mids)}

In [15]:
val_idxs = np.random.randint(low=0, high=len(ratings), size=int(0.2*len(ratings)))

In [16]:
ratings_idxs = np.array([False]*len(ratings))

In [17]:
ratings_idxs[val_idxs] = True

In [18]:
ratings_trn_arr = ratings.iloc[~ratings_idxs,:3].values

In [19]:
ratings_val_arr = ratings.iloc[ratings_idxs,:3].values

In [20]:
ratings.head()

Unnamed: 0,uid,mid,rating,ts
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [21]:
ratings_trn_arr.shape, ratings_val_arr.shape

((818857, 3), (181352, 3))

In [22]:
class MovieLensCfDataset():
    def __init__(self, uid_idx, mid_idx, ratings):
        self.uid_idx, self.mid_idx, self.ratings = uid_idx, mid_idx, ratings
        
    def __getitem__(self, index):
        r = self.ratings[index]
        uidx = self.uid_idx[r[0]]
        midx = self.mid_idx[r[1]]
        return np.array([uidx, midx, r[2]])
    
    def __len__(self):
        return self.ratings.shape[0]

In [23]:
mlcf_trn_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_trn_arr)

In [25]:
mlcf_trn_ds[5], len(mlcf_trn_ds)

(array([   0, 1267,    5]), 818857)

In [35]:
from torch.utils import data

In [38]:
class MovieLensCfDataLoader():
    def __init__(self, dataset, bs=128, shuffle=False):
        self.dataset, self.bs = dataset, bs
        if shuffle:
            self.sampler = torch.utils.data.RandomSampler(dataset)
        else:
            self.sampler = torch.utils.data.SequentialSampler(dataset)
        self.batch_sampler = torch.utils.data.BatchSampler(self.sampler, bs, False)
    
    def get_batch(self, idx_batch):
        batch = []
        for idx in idx_batch:
            batch.append(self.dataset[idx])
        batch = list(zip(*batch))
        return [np.array(i) for i in batch]
        
    def get_tensor(self, batch):
        t_batch = [torch.tensor(i).contiguous() for i in batch]
        t_batch[-1] = t_batch[-1].float().contiguous()
        return t_batch
        
    def __iter__(self):
        for idx_batch in self.batch_sampler:
            yield self.get_tensor(self.get_batch(idx_batch))
            
    def __len__(self):
        return len(self.batch_sampler)

In [39]:
mlcf_trn_dl = MovieLensCfDataLoader(mlcf_trn_ds, shuffle=True)

In [40]:
l = next(iter(mlcf_trn_dl))

In [41]:
l

[tensor([1956, 2933,  854, 3581, 4584,  307, 1305, 4956, 3239, 1087, 5023,  817,
          831,  795, 3413, 4280, 3994, 3945, 4334, 5705, 5746, 3412, 3409, 3907,
         5764, 3335, 3144, 4713,  829, 4522, 4447, 3577, 1241, 4814, 4063, 3468,
          647, 2175, 1337, 5947, 3413, 3930,  600, 4490, 4429, 4309, 1646, 5386,
         3863, 2107, 3525, 4247, 4222, 5547, 1243, 4012, 3388, 3127, 5635, 1979,
         1697, 4724, 2872, 5407, 5412, 3225,  523, 4057,   64, 3388, 2303, 2678,
         3332, 3440, 2565, 4708, 2865, 3335, 4737, 1979, 3291,  753, 4961, 1116,
         1450,  187, 4496, 3625, 3421, 4488, 2740,  174,  135, 5635,  301, 4146,
         2115, 3708, 4727, 3799, 2379, 1883, 2024, 3912, 3686, 2732, 1721, 3390,
          181, 1138, 1596, 1592, 1050, 1915,  267, 1126, 1446, 1968, 3260, 2418,
         4591, 1925, 3599, 5793, 1242, 3628, 1596, 5846]),
 tensor([2327, 2803, 3195, 1192, 1113, 2728, 2890,  139, 1683, 1332,  898,  159,
         3457,  959, 1375, 1207,  623,  535, 3200,

In [42]:
len(mlcf_trn_dl)

6398

In [43]:
def get_emb(n_emb, emb_dim):
    emb = torch.nn.Embedding(n_emb, emb_dim)
    emb.weight.data.uniform_(-0.01, 0.01)
    return emb

class MovieLensCfModel(torch.nn.Module):
    def __init__(self, n_users, n_movies, n_features):
        super().__init__()
        self.uf = get_emb(n_users, n_features)
        self.mf = get_emb(n_movies, n_features)
        self.ub = get_emb(n_users, 1)
        self.mb = get_emb(n_movies, 1)
        
    def forward(self, u_idxs, m_idxs):
        return 5 * torch.sigmoid(torch.squeeze(torch.sum((self.uf(u_idxs) * self.mf(m_idxs)), 1, 
                                       keepdim=True) + self.ub(u_idxs) + self.mb(m_idxs)))

In [44]:
n_features = 50
n_epochs = 2

In [45]:
mlcf_model = MovieLensCfModel(len(uids), len(mids), n_features)
mlcf_trn_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_trn_arr)
mlcf_val_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_val_arr)
mlcf_trn_dl = MovieLensCfDataLoader(mlcf_trn_ds, shuffle=True, bs=64)
mlcf_val_dl = MovieLensCfDataLoader(mlcf_val_ds, bs=len(mlcf_val_ds))

In [46]:
optimizer = torch.optim.Adam(mlcf_model.parameters())

In [47]:
for epoch in range(n_epochs):
    #start of epoch
    iteration = 0
    for mini_batch in mlcf_trn_dl:
        #start of iteration
        optimizer.zero_grad()
        preds = mlcf_model(mini_batch[0], mini_batch[1])
        loss = torch.nn.functional.mse_loss(preds, mini_batch[2])
        loss.backward()
        optimizer.step()
        if (iteration % 500) == 0:
            print(f"epoch:{epoch}; iteration:{iteration}; trn_loss:{loss.item()}")
        iteration += 1
    val_batch = next(iter(mlcf_val_dl))
    val_preds = mlcf_model(val_batch[0], val_batch[1])
    val_loss = torch.nn.functional.mse_loss(val_preds, val_batch[2])
    print(f"epoch:{epoch}; val_loss:{val_loss}")

epoch:0; iteration:0; trn_loss:1.9701387882232666
epoch:0; iteration:500; trn_loss:1.9901479482650757
epoch:0; iteration:1000; trn_loss:1.4574848413467407
epoch:0; iteration:1500; trn_loss:0.9649052619934082
epoch:0; iteration:2000; trn_loss:0.9627129435539246
epoch:0; iteration:2500; trn_loss:0.8875295519828796
epoch:0; iteration:3000; trn_loss:0.9062322378158569
epoch:0; iteration:3500; trn_loss:0.6941001415252686
epoch:0; iteration:4000; trn_loss:0.8348031640052795
epoch:0; iteration:4500; trn_loss:0.8412410616874695
epoch:0; iteration:5000; trn_loss:0.8494327068328857
epoch:0; iteration:5500; trn_loss:1.1773713827133179
epoch:0; iteration:6000; trn_loss:0.931889533996582
epoch:0; iteration:6500; trn_loss:0.9589399695396423
epoch:0; iteration:7000; trn_loss:0.6762319803237915
epoch:0; iteration:7500; trn_loss:0.8657174706459045
epoch:0; iteration:8000; trn_loss:0.8396139144897461
epoch:0; iteration:8500; trn_loss:0.6313912868499756
epoch:0; iteration:9000; trn_loss:1.122288584709167

In [48]:
#Test RMSE score. Should be comparable to MovieLens 1M Benchmarks: 
# https://www.researchgate.net/figure/Test-RMSE-of-different-models-on-MovieLens-1M_tbl1_303698729
pow(0.7428099513053894,0.5)

0.8618642302041485

Since the test rmse is comparable to benchmarks, the user and movie features that have been learned are good  and can be used for making recommendations

In [64]:
movies = movies.set_index("mid")

In [49]:
# for bias and against bias recommendations comparison
# Given user_id, make N "for bias" recommendations

In [52]:
np_mb = mlcf_model.mb.weight.data.numpy()
np_mf = mlcf_model.mf.weight.data.numpy()
np_uf = mlcf_model.uf.weight.data.numpy()

In [91]:
for m_idx in range(np_mb.shape[0]):
    movies.loc[mids[m_idx], "movie_quality"] = np_mb[m_idx][0]

In [254]:
def for_bias_recs(u_idx, m_idxs, num_recs):
    """ Returns movie idxs
    """
    uf = np_uf[u_idx]
    mf = np_mf[m_idxs]
    np_rats = np.sum((mf * uf), axis=1) + np_mb[m_idxs].flatten()
    return m_idxs[np.argsort(np_rats)[-num_recs:]]

In [296]:
def against_bias_recs(u_idx, m_idxs, num_recs, exclusive=True):
    """ Returns movie idxs
    """
    if exclusive:
        sel = np.array([False]*len(movies))
        sel[m_idxs] = True
        f_m_idxs = for_bias_recs(u_idx, m_idxs, num_recs)
        sel[f_m_idxs] = False
        m_idxs = np.argwhere(sel).flatten()
        
    uf = np_uf[u_idx]
    mf = np_mf[m_idxs]
    clip_val = -0.35 * (np_mb.max() - np_mb.min())
    np_rats = np_mb[m_idxs].flatten() -  np.clip(np.sum((mf * uf), axis=1), clip_val, -0.3*clip_val)
    return m_idxs[np.argsort(np_rats)[-num_recs:]]

In [297]:
all_m_idxs = np.arange(len(movies))

In [302]:
for_recs_idxs = for_bias_recs(777, all_m_idxs, 5)

In [303]:
movies.loc[mids[for_recs_idxs]]

Unnamed: 0_level_0,title,genre,movie_quality
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
527,Schindler's List (1993),Drama|War,0.919768
2028,Saving Private Ryan (1998),Action|Drama|War,0.881226
2905,Sanjuro (1962),Action|Adventure,0.442288
50,"Usual Suspects, The (1995)",Crime|Thriller,0.873108
318,"Shawshank Redemption, The (1994)",Drama,1.000027


In [304]:
against_recs_idxs = against_bias_recs(777, all_m_idxs, 5, exclusive=True)

In [305]:
movies.loc[mids[against_recs_idxs]]

Unnamed: 0_level_0,title,genre,movie_quality
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,0.836911
593,"Silence of the Lambs, The (1991)",Drama|Thriller,0.897762
1198,Raiders of the Lost Ark (1981),Action|Adventure,0.935052
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,0.962402
2762,"Sixth Sense, The (1999)",Thriller,1.004761


In [277]:
uids.shape

(6040,)