In [1]:
!pip install -Uqq -r requirements.txt

In [2]:
# These star-imports are where the unqualified names used later in this notebook
# without their own import come from (CollabDataLoaders, Learner, Module, Embedding,
# MSELossFlat, one_hot, sigmoid_range, set_seed).
from fastai.collab import *
from fastai.tabular.all import *
# Seed with a fixed value up front, before any data loading, splitting or weight
# initialisation below. Which RNGs set_seed covers is defined by fastai — confirm
# against its docs if exact reproducibility matters.
set_seed(42)

In [3]:
from pathlib import Path
import pandas as pd
# Base directory for the notebook: later cells read from path/'data' and write to
# path/'models', so the kernel must be started in the project folder.
path  = Path.cwd()
path

Path('/notebooks/CollaborativeFiltering')

In [4]:
# Raw user/movie ratings table, read from the data/ folder next to the notebook.
ratings_file = path / 'data' / 'rating.csv'
ratings = pd.read_csv(ratings_file)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Movie metadata (id, title, genres), read from the same data/ folder.
movies_file = path / 'data' / 'movie.csv'
movies = pd.read_csv(movies_file)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title/genres to every rating. No `on=` is passed, so the join key is
# whatever column(s) the two frames have in common (movieId in the previews of the
# two raw frames). NOTE(review): this rebinds `ratings`, so the raw ratings frame is
# no longer available under that name after this cell.
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants perdus, La) (1995)",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [7]:
# (rows, columns) of the merged frame — a quick check of how much data will be trained on.
ratings.shape

(20000263, 6)

In [10]:
import torch

# Pick the compute device once: CUDA when a GPU is visible to PyTorch, CPU otherwise,
# and report which one was chosen.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
print("GPU is available. Using CUDA." if cuda_ok else "GPU not available. Using CPU.")

GPU is available. Using CUDA.


In [11]:
# Build the collaborative-filtering DataLoaders from the merged frame.
# The user and rating columns are named explicitly instead of being left to
# from_df's position-based defaults, so a change in column order (e.g. after a
# different merge or a CSV with an extra leading column) cannot silently swap them.
dls = CollabDataLoaders.from_df(
    ratings,
    user_name='userId',
    item_name='title',
    rating_name='rating',
    bs=64,
)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,76325,Kindergarten Cop (1990),3.5
1,99759,Kissing Jessica Stein (2001),4.0
2,71345,Amistad (1997),4.0
3,117774,Groundhog Day (1993),1.0
4,44042,Donnie Brasco (1997),4.0
5,85667,Bangkok Dangerous (2008),2.5
6,119167,"Lock, Stock & Two Smoking Barrels (1998)",3.0
7,67346,Cross Creek (1983),3.0
8,86066,Who Framed Roger Rabbit? (1988),4.0
9,82803,Forrest Gump (1994),5.0


In [12]:
# Table sizes are taken from the vocabularies held by the DataLoaders.
n_factors = 5
n_users = len(dls.classes['userId'])
n_movies = len(dls.classes['title'])

# Randomly initialised latent factors, one row per user and one per movie.
# The user table is drawn first and the movie table second, so the random stream
# is consumed in the same order as before.
user_factors, movie_factors = torch.randn(n_users, n_factors), torch.randn(n_movies, n_factors)

In [13]:
# One-hot vector over the n_users entries, selecting index 3; cast to float,
# presumably so it can be matrix-multiplied with the float factor table below.
one_hot_3 = one_hot(3, n_users).float()

In [14]:
# Multiplying the transposed factor table by the one-hot vector picks out the
# factors stored at index 3 — the same values plain indexing returns in the next cell.
user_factors.t() @ one_hot_3

tensor([-0.2104,  0.3708,  0.3753, -0.1506,  0.9477])

In [15]:
# Direct row lookup: gives the same vector as the one-hot matrix product above,
# without building the one-hot vector.
user_factors[3]

tensor([-0.2104,  0.3708,  0.3753, -0.1506,  0.9477])

In [16]:
class DotProduct(Module):
    """Score a (user, movie) pair as the dot product of their embedding vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
        n_factors: width of each embedding vector.
    """
    # NOTE(review): there is no explicit super().__init__() call; this relies on the
    # `Module` base class brought in by the fastai star-import — confirm.
    def __init__(self, n_users, n_movies, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        """Return one score per row of `x` (column 0: user index, column 1: movie index)."""
        user_ids, movie_ids = x[:, 0], x[:, 1]
        # Elementwise product summed over dim 1 is the per-row dot product.
        pairwise = self.user_factors(user_ids) * self.movie_factors(movie_ids)
        return pairwise.sum(dim=1)
        

In [17]:
# Grab a single batch to inspect what the model will receive: `x` holds the inputs,
# `y` the targets. The shape shown below is (batch size, 2) — the two columns that
# DotProduct.forward indexes as x[:,0] (user) and x[:,1] (movie).
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [18]:
# Re-derives the same CUDA-or-CPU choice as the earlier device cell, in one expression.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Disabled: first training setup (the DotProduct defined above, 50 factors),
# kept for reference only.
# model = DotProduct(n_users, n_movies, 50).to(device)
# learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())

In [19]:
# learn.fit_one_cycle(5, 5e-3)

In [20]:
# learn.export(path/'models'/'model_v1.pkl')

In [21]:
class DotProduct(Module):
    """Dot-product collaborative-filtering model with an optionally bounded output.

    NOTE: this definition replaces the earlier `DotProduct` in the notebook.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
        n_factors: width of each embedding vector.
        y_range: `(low, high)` bounds passed to `sigmoid_range` to squash the raw
            score, or `None` to return the raw dot product unchanged.
    """
    def __init__(self, n_users, n_movies, n_factors, y_range = (0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range

    def forward(self, x):
        """Return one score per row of `x` (column 0: user index, column 1: movie index)."""
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        # Elementwise product summed over dim 1 is the per-row dot product.
        scores = (users * movies).sum(dim = 1)
        # y_range=None means "no squashing"; previously this would fail when unpacking None.
        if self.y_range is None:
            return scores
        return sigmoid_range(scores, *self.y_range)

In [24]:
# The summary describes a CUDA device. The notebook also supports the CPU fallback,
# so only ask for it when a GPU is actually present.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
else:
    print("CUDA not available; no GPU memory summary to show.")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 129112 KiB | 183187 KiB | 576747 MiB | 576620 MiB |
|       from large pool | 129084 KiB | 183184 KiB | 576434 MiB | 576308 MiB |
|       from small pool |     28 KiB |     54 KiB |    312 MiB |    312 MiB |
|---------------------------------------------------------------------------|
| Active memory         | 129112 KiB | 183187 KiB | 576747 MiB | 576620 MiB |
|       from large pool | 129084 KiB | 183184 KiB | 576434 MiB | 576308 MiB |
|       from small pool |     28 KiB |     54 KiB |    312 MiB |    312 MiB |
|---------------------------------------------------------------

In [25]:
# Sigmoid-bounded model with 25 latent factors, placed on the device chosen earlier.
model = DotProduct(n_users, n_movies, 25)
model = model.to(device)
learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())
# Three epochs of one-cycle training (see the per-epoch table in the output);
# the second argument, 5e-3, is the learning-rate setting.
learn.fit_one_cycle(3, 5e-3)

epoch,train_loss,valid_loss,time
0,0.931149,0.950988,34:30
1,0.81504,0.825421,33:56
2,0.666385,0.717505,33:35


In [26]:
# Make sure models/ exists before exporting: training above takes well over an hour,
# so a missing directory must not be what makes this cell fail.
export_path = path/'models'/'model_v2_sigmoid.pkl'
export_path.parent.mkdir(parents=True, exist_ok=True)
learn.export(export_path)