In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
from pathlib import Path

In [3]:
path = Path("data/ml-1m")

In [4]:
list(path.iterdir())

[PosixPath('data/ml-1m/README'),
 PosixPath('data/ml-1m/users.dat'),
 PosixPath('data/ml-1m/movies.dat'),
 PosixPath('data/ml-1m/ratings.dat')]

In [5]:
str(path/"ratings.dat")

'data/ml-1m/ratings.dat'

In [6]:
# "::" is a multi-character separator, which pandas interprets as a regular
# expression and can only parse with the Python engine. Requesting that engine
# explicitly avoids the ParserWarning raised when the default C engine falls back.
ratings = pd.read_csv(
    str(path/"ratings.dat"),
    sep="::",
    engine="python",
    header=None,
    names=["uid", "mid", "rating", "ts"],
)

  """Entry point for launching an IPython kernel.


In [7]:
ratings.dtypes

uid       int64
mid       int64
rating    int64
ts        int64
dtype: object

In [8]:
# "::" is a multi-character separator, which pandas interprets as a regular
# expression and can only parse with the Python engine. Requesting that engine
# explicitly avoids the ParserWarning raised when the default C engine falls back.
users = pd.read_csv(
    str(path/"users.dat"),
    sep="::",
    engine="python",
    header=None,
    names=["uid", "gender", "age", "occupation", "zip_code"],
)

  """Entry point for launching an IPython kernel.


In [9]:
users.dtypes

uid            int64
gender        object
age            int64
occupation     int64
zip_code      object
dtype: object

In [10]:
# "::" is a multi-character separator, which pandas interprets as a regular
# expression and can only parse with the Python engine. Requesting that engine
# explicitly avoids the ParserWarning raised when the default C engine falls back.
# NOTE(review): if this read ever fails with UnicodeDecodeError, the file is
# presumably not UTF-8 -- try encoding="latin-1" and confirm the titles look right.
movies = pd.read_csv(
    str(path/"movies.dat"),
    sep="::",
    engine="python",
    header=None,
    names=["mid", "title", "genre"],
)

  """Entry point for launching an IPython kernel.


In [11]:
movies.dtypes

mid       int64
title    object
genre    object
dtype: object

In [12]:
uids = users.uid.unique()

In [13]:
mids = movies.mid.unique()

In [14]:
# Map each raw user id to its position in `uids` (a dense 0-based index).
uid_idx = dict(zip(uids, range(len(uids))))

In [15]:
# Map each raw movie id to its position in `mids` (a dense 0-based index).
mid_idx = dict(zip(mids, range(len(mids))))

In [16]:
# Keep only the first three columns (uid, mid, rating) as a NumPy array;
# the trailing timestamp column is left out.
ratings_arr = ratings[["uid", "mid", "rating"]].values

In [17]:
ratings.head()

Unnamed: 0,uid,mid,rating,ts
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
ratings_arr[3]

array([   1, 3408,    4])

In [80]:
class MovieLensCfDataset():
    """Index-addressable view over a ratings array with ids remapped through lookups.

    Args:
        uid_idx: Mapping from the value in column 0 of a ratings row to an index.
        mid_idx: Mapping from the value in column 1 of a ratings row to an index.
        ratings: 2-D array-like exposing ``.shape``; each row is read as
            ``(column 0, column 1, column 2)``.
    """

    def __init__(self, uid_idx, mid_idx, ratings):
        self.uid_idx = uid_idx
        self.mid_idx = mid_idx
        self.ratings = ratings

    def __getitem__(self, index):
        """Return ``np.array([user index, movie index, rating])`` for row ``index``.

        Raises:
            KeyError: If the row's id is missing from the corresponding lookup.
        """
        row = self.ratings[index]
        # Translate the two id columns through the lookups; the third is passed through.
        return np.array([self.uid_idx[row[0]], self.mid_idx[row[1]], row[2]])

    def __len__(self):
        """Number of rows in the ratings array."""
        return self.ratings.shape[0]

In [81]:
mlcf_ds = MovieLensCfDataset(uid_idx, mid_idx, ratings_arr)

In [82]:
mlcf_ds[5], len(mlcf_ds)

(array([   0, 1179,    3]), 1000209)

In [83]:
z = zip(*[mlcf_ds[0], mlcf_ds[1]])
list(z)

[(0, 0), (1176, 655), (5, 3)]

In [84]:
t = torch.tensor(np.array([1,2,3]))

In [85]:
t.float()

tensor([1., 2., 3.])

In [86]:
t

tensor([1, 2, 3])

In [93]:
class MovieLensCfDataLoader():
    """Minimal batch iterator over an index-addressable dataset.

    Iterating yields one list of tensors per batch: the items of the batch are
    regrouped column-wise, so there is one tensor per item field. The last
    tensor is cast to float; the others keep the dtype inferred from the data.

    Args:
        dataset: Object supporting ``len()`` and integer indexing, where every
            item is a fixed-length sequence of numbers.
        bs: Number of items per batch.
        shuffle: If True, indices come from ``RandomSampler``; otherwise they
            are visited in order via ``SequentialSampler``.
        drop_last: If True, a trailing batch with fewer than ``bs`` items is
            dropped. Defaults to False, which keeps the short final batch.
    """

    def __init__(self, dataset, bs=128, shuffle=False, drop_last=False):
        self.dataset, self.bs, self.drop_last = dataset, bs, drop_last
        if shuffle:
            self.sampler = torch.utils.data.RandomSampler(dataset)
        else:
            self.sampler = torch.utils.data.SequentialSampler(dataset)
        self.batch_sampler = torch.utils.data.BatchSampler(
            self.sampler, batch_size=bs, drop_last=drop_last
        )

    def get_batch(self, idx_batch):
        """Fetch the items at ``idx_batch`` and regroup them column-wise.

        Returns:
            A list of NumPy arrays, one per item field, each of length
            ``len(idx_batch)``.
        """
        items = [self.dataset[idx] for idx in idx_batch]
        # zip(*items) transposes "list of items" into "one tuple per field".
        return [np.array(column) for column in zip(*items)]

    def get_tensor(self, batch):
        """Convert each array in ``batch`` to a tensor; the last one becomes float."""
        t_batch = [torch.tensor(column).contiguous() for column in batch]
        t_batch[-1] = t_batch[-1].float().contiguous()
        return t_batch

    def __iter__(self):
        """Yield one list of tensors per batch of sampled indices."""
        for idx_batch in self.batch_sampler:
            yield self.get_tensor(self.get_batch(idx_batch))

    def __len__(self):
        """Number of batches produced by one full iteration."""
        return len(self.batch_sampler)

In [95]:
mlcf_dl = MovieLensCfDataLoader(mlcf_ds, shuffle=True)

In [99]:
l = next(iter(mlcf_dl))

In [100]:
l

[tensor([3463, 2633, 3881, 3647, 2234, 1488, 4408, 1675, 2435, 2445, 2077, 3475,
         5538, 5097, 1824, 4311, 3523,  944, 3889, 4619, 2236, 3304,  122, 1684,
         3966, 2402, 5594, 3475, 4618, 4155,  829, 3684, 5616, 4915, 3400, 1328,
         5053, 1747, 1265, 5255, 3394, 5779, 1777, 2764, 5534,  332,  244, 5095,
         3777, 3170, 5325, 2917, 2682, 4790, 3031, 5110, 1050, 3674,   77, 1610,
         1181, 5683, 4542, 3332, 3390, 2076, 1190, 5045, 1306, 2394, 3037,  676,
          713, 1242, 3303, 2030, 5428, 5078,  194, 4138,  148, 3606, 1888,  385,
         4915, 4138, 4823, 3453,  969, 3598, 4507, 4270, 5906, 1009, 5778,  933,
          636, 4697, 3877, 3474, 4578, 5393, 5989, 1598, 5609, 4459, 3053, 4212,
         2165, 2546, 4860, 4367, 4095, 2016,  527, 2016, 5688, 4970, 1223,  751,
         4447, 3846, 2506, 3828, 5877, 4681, 5447, 1486]),
 tensor([2327, 3294, 1771,  582,  496, 1064, 1079, 3185, 1113, 2031, 1673, 1940,
         2618, 2091, 3827, 1301, 2433, 2757, 2473,

In [98]:
len(mlcf_dl)

7815

In [102]:
from torch import nn

In [104]:
nn.Module

torch.nn.modules.module.Module

In [None]:
class MovieLensCfModel(torch.nn.Module):
    def __init__(self):
        