In [1]:
#hide
from utils import *

# Collaborative Filtering Deep Dive

In [2]:
from fastai2.collab import *
from fastai2.tabular.all import *
# `path` is the directory the cells below read `u.data` and `u.item` from.
# NOTE(review): presumably `untar_data` downloads/extracts the archive named by
# URLs.ML_100k and returns its local path — confirm against the fastai docs.
path = untar_data(URLs.ML_100k)

## Creating the DataLoaders

In [3]:
# Ratings file: tab-separated, no header row, one (user, movie, rating, timestamp) per line.
ratings = pd.read_csv(
    path/'u.data',
    sep='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)

# Item file: pipe-separated and latin-1 encoded; only the first two columns
# (movie id and title) are read.
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie', 'title'],
)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach the title to every rating row. No join key is given, so pandas joins
# on the column(s) the two frames have in common.
ratings = pd.merge(left=ratings, right=movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


## Set rating to 1 for all the seen movies

In [5]:
# Keep only (user, rating, title) and turn every observed rating into a
# positive label: a row existing means "this user has seen this movie".
# `errors='ignore'` keeps the cell idempotent — re-running it after the columns
# are already gone no longer raises a KeyError.
ratings = (
    ratings
    .drop(columns=['timestamp', 'movie'], errors='ignore')
    .assign(rating=1)
)
ratings

Unnamed: 0,user,rating,title
0,196,1,Kolya (1996)
1,63,1,Kolya (1996)
2,226,1,Kolya (1996)
3,154,1,Kolya (1996)
4,306,1,Kolya (1996)
...,...,...,...
99995,840,1,Mamma Roma (1962)
99996,655,1,"Eighth Day, The (1996)"
99997,655,1,Girls Town (1996)
99998,655,1,"Silence of the Palace, The (Saimt el Qusur) (1994)"


## Create a dataframe of not-seen movies (sampled per user) with rating set to 0

In [6]:
# Distinct user ids that appear in `ratings`.
user_ids = ratings.user.unique()
# Every distinct title listed in `movies`.
all_titles = set(movies['title'])

def seen_by(user):
    """Return, as a list, the title of every row in `ratings` belonging to `user`."""
    is_user = ratings['user'] == user
    return ratings.loc[is_user, 'title'].tolist()

def not_seen_by(user):
    """Return the set of titles in `all_titles` that `user` has no row for in `ratings`."""
    return all_titles.difference(seen_by(user))

In [7]:
# Peek at five titles user 42 has not seen. `random.sample` needs a sequence:
# passing a set is deprecated since Python 3.9 and a TypeError from 3.11.
# Sorting (rather than `list(...)`) also makes the draw repeatable under a
# fixed `random.seed`, since set iteration order for strings varies per process.
random.sample(sorted(not_seen_by(42)), 5)

['Vie est belle, La (Life is Rosey) (1987)',
 'Palmetto (1998)',
 'Faster Pussycat! Kill! Kill! (1965)',
 'Othello (1995)',
 'Grass Harp, The (1995)']

In [8]:
def negative_for(user):
    """Sample unseen titles for `user`, as many as the user has seen rows.

    Returns a list of titles drawn without replacement from `not_seen_by(user)`.
    The sample size equals `len(seen_by(user))`, capped at the number of unseen
    titles so a user who has seen more than half the catalogue does not trigger
    a ValueError from `random.sample`.
    """
    n_seen = len(seen_by(user))
    # `random.sample` requires a sequence (sets are rejected from Python 3.11);
    # sorting gives a stable population order so a fixed seed reproduces the draw.
    candidates = sorted(not_seen_by(user))
    return random.sample(candidates, min(n_seen, len(candidates)))

# Draw the negative samples for every user and build the frame in one shot.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# rows are collected as plain dicts and handed to the constructor instead.
negative_rows = [
    {'user': user, 'title': title, 'rating': 0}
    for user in user_ids
    for title in negative_for(user)
]
# Reuse the column order of `ratings` so both frames line up when combined.
not_seen = pd.DataFrame(negative_rows, columns=ratings.columns)
not_seen

Unnamed: 0,user,rating,title
0,196,0,"Bridges of Madison County, The (1995)"
1,196,0,Wings of Courage (1995)
2,196,0,Wings of Desire (1987)
3,196,0,Terminator 2: Judgment Day (1991)
4,196,0,Sprung (1997)
...,...,...,...
99995,873,0,So I Married an Axe Murderer (1993)
99996,873,0,"MatchMaker, The (1997)"
99997,873,0,Love! Valour! Compassion! (1997)
99998,873,0,Multiplicity (1996)


## Sanity check: the most-seen movies and the most frequently sampled not-seen movies

In [9]:
# Number of negative rows per title, most frequent first.
(
    not_seen
    .groupby('title')
    .count()
    .sort_values(by='user', ascending=False)
)

Unnamed: 0_level_0,user,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Mat' i syn (1997),92,92
Blue Chips (1994),85,85
Mighty Morphin Power Rangers: The Movie (1995),85,85
"Cement Garden, The (1993)",85,85
"New Age, The (1994)",85,85
...,...,...
"Terminator, The (1984)",19,19
Fargo (1996),19,19
Return of the Jedi (1983),17,17
"Rock, The (1996)",14,14


In [10]:
# Number of positive rows per title, most frequent first.
(
    ratings
    .groupby('title')
    .count()
    .sort_values(by='user', ascending=False)
)

Unnamed: 0_level_0,user,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),583,583
Contact (1997),509,509
Fargo (1996),508,508
Return of the Jedi (1983),507,507
Liar Liar (1997),485,485
...,...,...
"Great Day in Harlem, A (1994)",1,1
"Other Voices, Other Rooms (1997)",1,1
Good Morning (1971),1,1
Girls Town (1996),1,1


## Dataset with both seen and not-seen movies

In [11]:
# Stack the positive rows on top of the negative rows and renumber the index
# from zero so it is unique across the combined frame.
all_df = pd.concat(objs=[ratings, not_seen], axis=0, ignore_index=True)
all_df

Unnamed: 0,user,rating,title
0,196,1,Kolya (1996)
1,63,1,Kolya (1996)
2,226,1,Kolya (1996)
3,154,1,Kolya (1996)
4,306,1,Kolya (1996)
...,...,...,...
199995,873,0,So I Married an Axe Murderer (1993)
199996,873,0,"MatchMaker, The (1997)"
199997,873,0,Love! Valour! Compassion! (1997)
199998,873,0,Multiplicity (1996)


In [12]:
# Build the collaborative-filtering DataLoaders from the combined frame:
# items are identified by title and the 0/1 `rating` column is the target.
dls = CollabDataLoaders.from_df(
    all_df,
    item_name='title',
    rating_name='rating',
    bs=64,
)
dls.train.show_batch()

Unnamed: 0,user,title,rating
0,239,William Shakespeare's Romeo and Juliet (1996),1
1,932,"Great White Hype, The (1996)",0
2,844,Playing God (1997),0
3,880,Across the Sea of Time (1995),0
4,405,Once Upon a Time in the West (1969),1
5,526,Air Force One (1997),1
6,521,"Birdcage, The (1996)",1
7,540,Mr. Holland's Opus (1995),1
8,221,"Woman in Question, The (1950)",0
9,429,"Machine, The (1994)",0


In [13]:
# Number of entries `dls.classes` holds for each id column; these size the
# embedding tables of the model.
n_users, n_movies = (len(dls.classes[col]) for col in ('user', 'title'))



## Train the collaborative filter

In [14]:
class DotProductBias(Module):
    """Dot-product collaborative-filtering model with user and movie bias terms.

    Each user and each movie gets an `n_factors`-wide embedding plus a scalar
    bias embedding. The score for a (user, movie) pair is the dot product of
    the two factor vectors plus both biases, passed through `sigmoid_range`
    with the bounds given by `y_range`.
    """
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.y_range = y_range
        # Embeddings are created in a fixed order (user factors, user bias,
        # movie factors, movie bias) so initialisation and parameter order
        # stay the same from run to run.
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)

    def forward(self, x):
        """Score a batch; column 0 of `x` indexes users and column 1 indexes movies."""
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        # keepdim=True leaves a trailing axis of size 1 so the bias
        # embeddings (width 1) add element-wise.
        dot = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(dot + bias, *self.y_range)

In [15]:
def acc(inp, targ, thresh=0.5):
    """Binary accuracy of thresholded predictions against 0/1 targets.

    Args:
        inp: tensor of predicted scores, any shape.
        targ: tensor of 0/1 labels with the same number of elements as `inp`.
        thresh: scores strictly above this value count as a positive prediction.

    Returns:
        A scalar tensor holding the fraction of elements where the thresholded
        prediction equals the target.

    Both tensors are flattened before comparison. Without that, an `inp` of
    shape (bs, 1) compared with a `targ` of shape (bs,) would broadcast to
    (bs, bs) and silently report a meaningless number.
    """
    pred = (inp.reshape(-1) > thresh).float()
    return (pred == targ.reshape(-1)).float().mean()

In [16]:
# 50 latent factors; the output range is set slightly wider than the 0/1
# targets on both sides.
model = DotProductBias(n_users=n_users, n_movies=n_movies, n_factors=50, y_range=(-.1, 1.1))
# Train with mean-squared error on the 0/1 labels and report `acc` as the metric.
learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=[acc])

In [None]:
# Five epochs with a peak learning rate of 5e-3 and weight decay 0.1.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

epoch,train_loss,valid_loss,acc,time
0,0.156109,0.156562,0.77545,00:20
1,0.132507,0.134777,0.810175,00:19
2,0.119822,0.127135,0.820875,00:20
3,0.113858,0.123149,0.827625,00:20
