In [1]:
# Install the pinned requirements with the %pip magic so the packages land in
# the environment of the running kernel; `!pip` runs whichever pip happens to
# be first on the shell PATH, which may be a different interpreter.
%pip install -Uqq -r requirements.txt

In [2]:
from fastai.collab import *
from fastai.tabular.all import *
# Fix the random seed once, before any data splitting or weight
# initialisation happens, so repeated runs are comparable.
set_seed(42)

In [3]:
from pathlib import Path
import pandas as pd
# Base directory for everything the notebook reads and writes: the data files
# are loaded from path/'data' and the trained model is exported to
# path/'models'. It is the directory the kernel was started in.
path  = Path.cwd()
path

Path('/notebooks/CollaborativeFiltering')

In [4]:
# Load the raw ratings table (one row per user/movie rating) and preview it.
ratings = pd.read_csv(path.joinpath('data', 'rating.csv'))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Load the movie metadata (movieId, title, genres) and preview it.
movies = pd.read_csv(path.joinpath('data', 'movie.csv'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach each movie's title and genres to its ratings. No `on=` is passed, so
# pandas joins on whatever columns the two frames have in common.
# NOTE: `ratings` is rebound here, so from this cell on it refers to the
# merged frame rather than the raw CSV contents.
ratings = pd.merge(ratings, movies)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants perdus, La) (1995)",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [7]:
# Sanity check: (rows, columns) of the merged ratings frame.
ratings.shape

(20000263, 6)

In [8]:
# Build the collaborative-filtering DataLoaders from the merged frame.
# The user, item and rating columns are all named explicitly so the result
# does not silently depend on the column order of `ratings` (when a name is
# omitted, fastai falls back to the first, second and third column).
dls = CollabDataLoaders.from_df(
    ratings,
    user_name='userId',
    item_name='title',
    rating_name='rating',
    bs=64,
)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,18255,"Blues Brothers, The (1980)",5.0
1,55691,Waterworld (1995),3.0
2,91931,Christmas Vacation (National Lampoon's Christmas Vacation) (1989),4.0
3,39497,Close Encounters of the Third Kind (1977),3.5
4,92257,"Matrix Revolutions, The (2003)",0.5
5,103076,Glory (1989),3.0
6,117049,Face/Off (1997),4.0
7,60247,Groundhog Day (1993),4.0
8,13703,Stripes (1981),3.5
9,74609,"Rock, The (1996)",3.0


In [9]:
# Vocabulary sizes as seen by the DataLoaders: one entry per class of the
# 'userId' and 'title' columns.
n_users, n_movies = (len(dls.classes[col]) for col in ('userId', 'title'))
n_factors = 5

# Randomly initialised latent-factor tables with one row per vocabulary entry
# and `n_factors` columns; used below to illustrate embedding lookups.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [10]:
# One-hot encoding of index 3 over the user vocabulary, cast to float so it
# can take part in the matrix product with `user_factors` in the next cell.
one_hot_3 = one_hot(3, n_users).float()

In [11]:
# Multiplying the transposed factor table by the one-hot vector picks out the
# factors stored for index 3.
torch.matmul(user_factors.T, one_hot_3)

tensor([-0.8439, -0.0255, -0.5745,  0.6977,  0.3907])

In [12]:
# Plain row indexing yields the same vector as the one-hot matrix product
# above, which is the lookup an embedding layer performs.
user_factors[3]

tensor([-0.8439, -0.0255, -0.5745,  0.6977,  0.3907])

In [13]:
class DotProduct(Module):
    """Score a (user, movie) pair as the dot product of their latent factors."""

    def __init__(self, n_users, n_movies, n_factors):
        # No explicit super().__init__() call here: the `Module` base class
        # used in this notebook is relied on to perform that setup.
        # One learnable vector of length `n_factors` per user / movie index.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        """Return one score per row of `x`.

        Column 0 of `x` indexes the user embedding and column 1 the movie
        embedding; the two looked-up vectors are multiplied element-wise and
        summed over the factor dimension.
        """
        user_idx, movie_idx = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        return torch.sum(user_vecs * movie_vecs, dim=1)
        

In [14]:
# Pull a single batch to inspect what the model's forward() will receive:
# `x` has one row per rating and two columns, which the models below read as
# x[:,0] (user index) and x[:,1] (movie index).
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [15]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Baseline experiment (disabled): plain dot-product model with 50 factors.
# model = DotProduct(n_users, n_movies, 50).to(device)
# learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())

In [16]:
# learn.fit_one_cycle(5, 5e-3)

In [17]:
# learn.export(path/'models'/'model_v1.pkl')

In [18]:
class DotProduct(Module):
    """Dot-product model whose output is squashed into `y_range`.

    This definition replaces the earlier `DotProduct` class of the same name
    in this notebook; the only difference is that the raw score is passed
    through `sigmoid_range` before being returned.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        # One learnable vector of length `n_factors` per user / movie index.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        # (low, high) bounds handed to `sigmoid_range` in forward().
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x` (col 0 = user, col 1 = movie)."""
        user_idx, movie_idx = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        raw_score = torch.sum(user_vecs * movie_vecs, dim=1)
        return sigmoid_range(raw_score, *self.y_range)

In [19]:
# print(torch.cuda.memory_summary())

In [20]:
# model = DotProduct(n_users, n_movies, 25).to(device)
# learn = Learner(dls, model, loss_func=MSELossFlat())
# learn.fit_one_cycle(3, 5e-3)

In [21]:
# learn.export(path/'models'/'model_v2_sigmoid.pkl')

In [22]:
import os
# NOTE(review): debugging aid. CUDA_LAUNCH_BLOCKING=1 is documented to make
# CUDA kernel launches synchronous, which gives accurate stack traces for
# device-side errors but slows training. It presumably has to be set before
# CUDA is first initialised to take effect — confirm whether it is still
# wanted for the long training run below.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [25]:
class DotProductBias(Module):
    """Dot-product model with an additional learnable bias per user and per movie.

    The score for a (user, movie) pair is the dot product of their latent
    factors plus the user's bias plus the movie's bias, squashed into
    `y_range` with `sigmoid_range`.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        # Latent factors and a single-column bias table for users...
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        # ...and the same pair of tables for movies.
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        # (low, high) bounds handed to `sigmoid_range` in forward().
        self.y_range = y_range

    def forward(self, x):
        """Return bounded scores for the rows of `x` (col 0 = user, col 1 = movie).

        The summed dimension is kept (`keepdim=True`) so the dot product lines
        up with the single-column bias lookups it is added to.
        """
        user_idx, movie_idx = x[:, 0], x[:, 1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        interaction = torch.sum(user_vecs * movie_vecs, dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(interaction + bias, *self.y_range)

In [26]:
# Train the bias-augmented model (25 factors) for three epochs with the
# one-cycle schedule at a peak learning rate of 5e-3.
# The model is placed on the `device` selected earlier in the notebook
# (GPU if available, else CPU) instead of a hard-coded 'cuda', so this cell
# no longer crashes on a machine without a GPU.
model = DotProductBias(n_users, n_movies, 25).to(device)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(3, 5e-3)

epoch,train_loss,valid_loss,time
0,0.808738,0.809009,48:37
1,0.71396,0.742475,44:40
2,0.662825,0.681147,44:07


In [27]:
# Create the target directory first so the export cannot fail on a missing
# folder after the long training run above has already completed.
model_dir = path/'models'
model_dir.mkdir(parents=True, exist_ok=True)
learn.export(model_dir/'model_v3_bias.pkl')