In [1]:
## requirements.txt
# fastai==2.7.12

# !pip install fastai==2.7.12

In [2]:
import pandas as pd

In [3]:
# 加载数据
from fastai.tabular.all import untar_data, URLs
path = untar_data(URLs.ML_100k)

# 数据预览
# 评分
ratings = pd.read_csv(path / 'u.data', delimiter='\t', header=None, names=['user','movie','rating','timestamp'])
ratings.head(3)

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [4]:
# 电影
movies = pd.read_csv(path / 'u.item', delimiter='|', encoding='latin-1', usecols=(0,1), names=('movie','title'), header=None)

# 合并
ratings = ratings.merge(movies)
ratings.head(3)

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)


In [5]:
# 构建 DataLoader
from fastai.collab import CollabDataLoaders

# CollabDataLoaders: 用于协同过滤数据的自定义 DataLoaders, 只需要三列数据: user, item, rating
dls = CollabDataLoaders.from_df(ratings, user_name='user', item_name='title', rating_name='rating', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,537,Apocalypse Now (1979),4
1,679,Dances with Wolves (1990),3
2,194,Raising Arizona (1987),5
3,840,Rebel Without a Cause (1955),5
4,276,2 Days in the Valley (1996),3
5,390,Trainspotting (1996),1
6,1,Quiz Show (1994),4
7,577,Immortal Beloved (1994),5
8,882,Forrest Gump (1994),5
9,151,Nobody's Fool (1994),4


In [21]:
import torch.nn as nn
# from fastai.torch_core import Module  # 相比 nn.Module, 不用显式调用 `super().__init__()`, 除此之外无区别
from fastai.layers import sigmoid_range

# 一个简单的 Embedding 模型
class DotProduct(nn.Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        super().__init__()

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.y_range = y_range
    
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        logits = (users * movies).sum(dim=1)
        return sigmoid_range(logits, *self.y_range)  # 调整值的范围

In [24]:
# train

from fastai.learner import Learner
from fastai.losses import MSELossFlat

n_users, n_movies = len(dls.classes['user']), len(dls.classes['title'])
n_factors = 50

model = DotProduct(n_users, n_movies, n_factors)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit(n_epoch=5, lr=5e-3)

epoch,train_loss,valid_loss,time
0,7.420341,7.449542,00:05
1,5.947965,6.962557,00:05
2,4.680787,6.17413,00:05
3,3.439287,5.052207,00:05
4,2.343935,3.843578,00:05
