In [2]:
from fastai.collab import *
from fastai.tabular.all import *

In [3]:
# MovieLens 25M dataset README (file formats and column descriptions):
# https://files.grouplens.org/datasets/movielens/ml-25m-README.html

In [18]:
import sys
import zipfile

import requests
from tqdm import tqdm

# Download and unpack the MovieLens 25M archive unless the extracted folder
# is already present. The cell is safe to re-run.
url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
# Data folder under the current user's home directory, rather than an
# absolute path that only exists on one machine.
directory = Path.home()/'.fastai'/'data'

file = (directory/'ml-25m.zip')

if not (directory/'ml-25m').exists():
    directory.mkdir(parents=True, exist_ok=True)

    # Re-download when the archive is missing or is not a readable zip
    # (for example, a truncated file left behind by an interrupted run).
    if not (file.exists() and zipfile.is_zipfile(file)):
        # Stream into a temporary name so a partial download can never be
        # mistaken for the finished archive.
        partial = file.with_name(file.name + '.part')
        with requests.get(url, stream=True, timeout=60) as r:
            # Fail loudly on HTTP errors instead of saving an error page as the zip.
            r.raise_for_status()
            file_size = int(r.headers.get('Content-Length', 0))
            with open(partial, 'wb') as f, tqdm(
                unit='B',  # progress is counted in bytes
                unit_scale=True,
                unit_divisor=1024,  # scale units by 1024 (KiB, MiB, ...)
                total=file_size or None,  # unknown size -> open-ended progress bar
                file=sys.stdout,  # render inside the notebook output
                desc='ml-25m.zip'  # progress bar label
            ) as bar:
                for data in r.iter_content(chunk_size=1024 * 1024):
                    f.write(data)
                    bar.update(len(data))
        partial.replace(file)

    # The guard above tests for the extracted folder, so extraction has to
    # happen here; otherwise every run would download the archive again.
    with zipfile.ZipFile(file) as z:
        z.extractall(directory)

In [5]:
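# Manual extraction of the downloaded archive (disabled).
# The archive must be opened for reading ('r'): mode 'w' would truncate it.
# It is extracted into `directory`, because `path` is only defined in a later cell.
# import zipfile

# with zipfile.ZipFile(file, 'r') as z:
#     z.extractall(directory)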

In [6]:
# Folder that the dataset CSV files are read from in the following cells.
path = directory.joinpath('ml-25m')

In [7]:
# Load the raw ratings table (userId, movieId, rating, timestamp) and preview it.
ratings = pd.read_csv(path.joinpath('ratings.csv'), sep=',')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [8]:
# Load the movie catalogue and turn the pipe-separated `genres` string
# into a Python list per movie, then preview the first rows.
movies = (
    pd.read_csv(path.joinpath('movies.csv'), sep=',')
    .assign(genres=lambda frame: frame['genres'].str.split('|'))
)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [9]:
# Attach title and genres to every rating. With no explicit key the join
# uses the columns the two frames share, which here is `movieId`.
ratings = pd.merge(ratings, movies)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
1,3,296,5.0,1439474476,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
2,4,296,4.0,1573938898,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
3,5,296,4.0,830786155,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
4,7,296,4.0,835444730,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"


In [10]:
# Genre labels as they are spelled in the `genres` column of movies.csv.
# The children's genre appears as 'Children' in this dataset (see the
# movies.head() output above), so the former "Children's" entry could never
# match a row.
# NOTE(review): ML-25M is believed to also contain an 'IMAX' genre — confirm
# against movies.csv before treating this list as exhaustive.
genres_list = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    '(no genres listed)',
]

In [11]:
# The list-valued `genres` column is not needed for collaborative filtering.
# The membership check keeps the cell re-runnable: an unconditional `del`
# raises KeyError the second time the cell is executed.
if 'genres' in ratings.columns:
    del ratings['genres']

# Name the user and rating columns explicitly instead of relying on their
# position in the frame (first and third column respectively).
dls = CollabDataLoaders.from_df(
    ratings,
    user_name='userId',
    item_name='title',
    rating_name='rating',
    bs=64,
)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,107650,Videodrome (1983),3.5
1,140110,Hellraiser (1987),3.5
2,112006,Broken Flowers (2005),4.0
3,91630,Akira (1988),5.0
4,10237,Raising Arizona (1987),4.0
5,79324,Babe: Pig in the City (1998),2.0
6,38627,Copycat (1995),1.0
7,66541,Toy Story (1995),4.0
8,116896,When Harry Met Sally... (1989),4.5
9,115907,Wristcutters: A Love Story (2006),4.5


In [12]:
# Vocabulary sizes of the two categorical inputs; they size the embedding tables.
n_users, n_movies = (len(dls.classes[column]) for column in ('userId', 'title'))
n_users, n_movies

(162542, 58959)

In [24]:
class DotProduct(Module):
    """Score a (user, movie) pair as the dot product of their embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
        n_factors: length of every embedding vector.
    """
    def __init__(self, n_users, n_movies, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        """Return one score per row of `x`.

        Column 0 of `x` indexes the user table and column 1 the movie table.
        """
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        # Element-wise product summed over the factor axis == row-wise dot product.
        return (user_vecs * movie_vecs).sum(dim=1)

In [26]:
# Pull a single batch to inspect the input and target tensor shapes.
(x, y) = dls.one_batch()
tuple(tensor.shape for tensor in (x, y))

(torch.Size([64, 2]), torch.Size([64, 1]))

In [27]:
# 50 latent factors per user and per movie, trained with mean-squared error.
model = DotProduct(n_users, n_movies, n_factors=50)
learn = Learner(dls=dls, model=model, loss_func=MSELossFlat())

In [28]:
# Five epochs with the one-cycle schedule, peak learning rate 5e-3.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,2.450336,2.489846,41:36
1,2.490973,2.449388,40:50
2,1.892136,1.954166,40:52
3,1.373704,1.361567,40:52
4,1.132738,1.145224,40:50


In [31]:
class DotProduct(Module):
    """Dot-product model with per-user and per-movie bias and a bounded output.

    This redefines the plain `DotProduct` from the earlier cell under the same name.

    Args:
        n_users: number of rows in the user embedding and bias tables.
        n_movies: number of rows in the movie embedding and bias tables.
        n_factors: length of every embedding vector.
        y_range: (low, high) bounds passed to `sigmoid_range` for the prediction.
    """
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded prediction per row of `x`, keeping a trailing axis of size 1.

        Column 0 of `x` indexes the user tables and column 1 the movie tables.
        """
        user_idx, movie_idx = x[:,0], x[:,1]
        # keepdim=True keeps a trailing axis so the size-1 bias vectors add row by row.
        interaction = (self.user_factors(user_idx) * self.movie_factors(movie_idx)).sum(dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(interaction + bias, *self.y_range)

In [None]:
# Rows per optimizer step for this run (2**15 = 32768).
BATCH_SIZE = 2**15

# Assigning `learn.dls.batch_size` only attaches a new attribute to the
# `DataLoaders` wrapper; the underlying loaders keep the `bs` they were built
# with, so training would still run at the original batch size of 64.
# Instead, derive loaders with the larger batch size from the existing ones.
# `DataLoader.new` keeps the same datasets, split and category vocabularies,
# so `n_users` / `n_movies` remain valid.
dls_large = DataLoaders(
    *(dl.new(bs=BATCH_SIZE) for dl in dls.loaders),
    path=dls.path,
    device=dls.device,
)

model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls_large, model, loss_func=MSELossFlat())
learn.fit_one_cycle(1, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
