In [1]:
from fastbook import *

In [2]:
import pandas as pd
import numpy as np

In [7]:
from fastai.collab import *
from fastai.tabular.all import *
import os

# Directory holding the extracted ml-25m CSV files (ratings.csv, movies.csv).
# Set the ML25M_DIR environment variable to point somewhere else; when it is
# unset the original hard-coded location is used, so existing runs behave the same.
path = os.environ.get("ML25M_DIR", "C:/Users/shawn/.fastai/archive/ml-25m/ml-25m")

In [37]:
# Load the raw ratings table under our own column names. With `header=None`
# the file's first line is read as an ordinary row rather than as column labels.
ratings = pd.read_csv(
    f"{path}/ratings.csv",
    header=None,
    names=('user', 'movie', 'rating', 'timestamp'),
    low_memory=False,
)

In [39]:
# The CSV was read with `header=None`, so its own header line sits in row 0.
# Remove it only while it is still there (the first `user` value is not a
# number); re-running this cell then cannot silently delete a real rating.
if pd.to_numeric(ratings['user'].iloc[:1], errors='coerce').isna().any():
    ratings = ratings.iloc[1:].reset_index(drop=True)

In [40]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [41]:
# Load only the first two columns of movies.csv, under our own names. With
# `header=None` the file's first line is read as an ordinary row.
movies = pd.read_csv(
    f"{path}/movies.csv",
    header=None,
    usecols=(0, 1),
    names=('movie', 'title'),
    low_memory=False,
)

In [42]:
# The CSV was read with `header=None`, so its own header line sits in row 0.
# Remove it only while it is still there (the first `movie` value is not a
# number); re-running this cell then cannot silently delete a real movie.
if pd.to_numeric(movies['movie'].iloc[:1], errors='coerce').isna().any():
    movies = movies.iloc[1:].reset_index(drop=True)

In [43]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [44]:
# Attach each movie's title to its ratings.
# - The merge is skipped when `ratings` already has a `title` column, so
#   re-running this cell does not produce `title_x` / `title_y` duplicates.
# - `validate='many_to_one'` makes pandas raise a MergeError if a movie id
#   occurs more than once in `movies`, instead of silently duplicating ratings.
if 'title' not in ratings.columns:
    ratings = ratings.merge(movies, on='movie', how='inner', validate='many_to_one')
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,3,296,5.0,1439474476,Pulp Fiction (1994)
2,4,296,4.0,1573938898,Pulp Fiction (1994)
3,5,296,4.0,830786155,Pulp Fiction (1994)
4,7,296,4.0,835444730,Pulp Fiction (1994)


In [46]:
# Cast the id / rating / timestamp columns to numeric dtypes in one pass.
ratings[['user', 'movie', 'rating', 'timestamp']] = (
    ratings[['user', 'movie', 'rating', 'timestamp']].apply(pd.to_numeric)
)
# Same cast for the movie id in the lookup table.
movies['movie'] = pd.to_numeric(movies['movie'])

In [74]:
# Build the collaborative-filtering DataLoaders from the merged ratings frame.
# - The user / item / rating columns are named explicitly instead of relying
#   on column position, so reordering the DataFrame cannot change the inputs.
# - `seed` pins the random train/validation split so the losses reported by
#   training are comparable between runs.
dls = CollabDataLoaders.from_df(
    ratings,
    user_name="user",
    item_name="title",
    rating_name="rating",
    seed=42,
    bs=128,
)
dls.show_batch()

Unnamed: 0,user,title,rating
0,140577,In Fear (2013),3.0
1,75309,Remains (2011),5.0
2,30041,"Lord of the Rings: The Two Towers, The (2002)",4.0
3,106892,Little Shop of Horrors (1986),2.0
4,3623,Shine (1996),4.0
5,104640,Gone with the Wind (1939),4.5
6,37174,Toy Story (1995),3.0
7,80811,Willy Wonka & the Chocolate Factory (1971),3.0
8,89703,Apollo 13 (1995),4.5
9,140196,Apocalypse Now (1979),4.0


In [75]:
# Vocabulary sizes of the two categorical columns, as recorded by the DataLoaders.
n_users, n_movies = (len(dls.classes[col]) for col in ('user', 'title'))

In [76]:
# Display both vocabulary sizes as a tuple.
(n_users, n_movies)

(162542, 58959)

In [77]:
# Collaborative-filtering learner with 200 latent factors per user and per title.
# `y_range` is forwarded to the model (fastai uses it to bound predictions;
# see the fastai `collab_learner` docs).
learn = collab_learner(
    dls,
    n_factors=200,
    y_range=(0, 5.5),
)

In [78]:
# Train for 5 epochs with the one-cycle schedule: peak learning rate 5e-3, weight decay 0.1.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.943426,0.936087,1:03:12
1,0.967026,0.934187,1:01:08
2,0.890796,0.901198,1:00:38
3,0.829858,0.821131,1:00:48
4,0.748126,0.758728,1:00:42


In [79]:
# Show the architecture of the trained model (embedding and bias table sizes).
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(162542, 200)
  (i_weight): Embedding(58959, 200)
  (u_bias): Embedding(162542, 1)
  (i_bias): Embedding(58959, 1)
)

In [80]:
import pickle
from pathlib import Path

In [81]:
# NOTE(review): this rebinds `path`, which was a plain str when the CSVs were
# read, to a pathlib.Path for the same directory.
path = Path("C:/Users/shawn/.fastai/archive/ml-25m/ml-25m")

In [82]:
import os

# Directory the trained learner is exported to. Set the COLLAB_MODEL_DIR
# environment variable to use another location; when it is unset the original
# hard-coded directory is used, so existing runs behave the same.
save_path = Path(os.environ.get("COLLAB_MODEL_DIR", "C:/Users/shawn/.fastai/models"))


In [83]:
# Create the export directory, including any missing parents; does nothing if it already exists.
save_path.mkdir(exist_ok=True, parents=True)

In [85]:
# Write the trained learner to a file inside the export directory.
learn.export(fname=save_path / 'collab_learner(1).pkl')

In [96]:
def recommended_movie(learn, movie_title, top_n = 5, include_self = True):
    """Return the titles whose learned embeddings are closest to `movie_title`.

    Closeness is the cosine similarity between rows of the model's item
    embedding matrix (`learn.model.i_weight.weight`).

    Args:
        learn: Trained collaborative-filtering learner. Must expose
            `model.i_weight.weight` and `dls.classes['title']`, the latter
            with an `o2i` title -> index mapping.
        movie_title: Exact title string as stored in the vocabulary,
            e.g. "Finding Nemo (2003)".
        top_n: Maximum number of titles to return.
        include_self: When True (the default, matching the previous
            behaviour) the query title itself may appear in the result,
            normally in first place. Pass False to get only other titles.

    Returns:
        A list of at most `top_n` titles, most similar first.

    Raises:
        KeyError: If `movie_title` is not in the title vocabulary.
    """
    titles = learn.dls.classes['title']

    # Test membership explicitly instead of indexing straight away. If `o2i`
    # is a defaultdict (NOTE(review): fastai appears to use one when it adds
    # the '#na#' class; confirm for your version), an unknown title would
    # silently resolve to index 0 and yield meaningless neighbours.
    if movie_title not in titles.o2i:
        raise KeyError(f"Unknown movie title: {movie_title!r}")
    idx = titles.o2i[movie_title]

    # Inference only: detach so no autograd graph is built over the whole table.
    movie_factors = learn.model.i_weight.weight.detach()

    # Similarity of every title's embedding to the query embedding (higher = closer).
    similarities = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])

    # Plain Python ints, best match first.
    ranked = similarities.argsort(descending=True).tolist()
    if not include_self:
        ranked = [i for i in ranked if i != idx]

    return [titles[i] for i in ranked[:top_n]]

In [98]:
# Look up the five titles the model considers closest to one example film.
movie_title = "Finding Nemo (2003)"
recommended_movies = recommended_movie(learn, movie_title=movie_title, top_n=5)
print(recommended_movies)

['Finding Nemo (2003)', 'Monsters, Inc. (2001)', 'Ratatouille (2007)', 'Shrek (2001)', 'Incredibles, The (2004)']


