In [2]:
# Notebook setup: draw matplotlib figures inline in the cell output, and load the
# autoreload extension in mode 2 so edited modules are re-imported before each cell runs.
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
from fastai.learner import *
from fastai.column_data import *

In [4]:
# Data directory prefix. The trailing slash matters: file names are appended to
# this string directly when paths are built.
PATH = 'data/movielens/ml-latest-small/'

In [36]:
# Read the ratings CSV found under PATH into a DataFrame.
ratings_file = f'{PATH}ratings.csv'
df = pd.read_csv(ratings_file)

In [38]:
# Re-index the raw user and movie ids to contiguous integers 0..n-1, numbered in
# order of first appearance, so they can be used as embedding row indices.
user_ids = df.userId.unique()
movie_ids = df.movieId.unique()
num_users = len(user_ids)
num_movies = len(movie_ids)

# One named lookup table (raw id -> contiguous index) per column, applied with
# Series.map rather than a per-row Python lambda.
user_idx = {id_: idx for idx, id_ in enumerate(user_ids)}
df.userId = df.userId.map(user_idx)

movie_idx = {id_: idx for idx, id_ in enumerate(movie_ids)}
df.movieId = df.movieId.map(movie_idx)

# Backward compatibility: this cell used to leave the movie lookup in `idx_dict`.
idx_dict = movie_idx

In [39]:
# Show only the first rows instead of rendering the whole ratings frame.
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205
5,0,5,2.0,1260759151
6,0,6,2.0,1260759187
7,0,7,2.0,1260759148
8,0,8,3.5,1260759125
9,0,9,2.0,1260759131


In [52]:
class CollabFilterModule(nn.Module):
    """Dot-product collaborative-filtering model.

    Keeps one embedding table per axis (`u` for rows, `v` for columns) and scores
    a (row, column) index pair as the dot product of the two looked-up vectors.
    """

    def __init__(self, num_row, num_col, embedding_dim):
        """Create both embedding tables and re-initialise their weights.

        Args:
            num_row: number of entries in the row embedding `u`.
            num_col: number of entries in the column embedding `v`.
            embedding_dim: length of every embedding vector.
        """
        super().__init__()
        self.u = nn.Embedding(num_row, embedding_dim)
        self.v = nn.Embedding(num_col, embedding_dim)
        # Replace the default initialisation with small values drawn from [0, 0.05).
        for table in (self.u, self.v):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, cats, conts):
        """Score each row of `cats`.

        Args:
            cats: integer index tensor; `cats[:, 0]` indexes `u` and `cats[:, 1]`
                indexes `v`.
            conts: not used by this model (presumably part of the calling
                convention of the training loop — confirm).

        Returns:
            The element-wise product of the two looked-up embeddings, summed
            over dimension 1.
        """
        row_emb = self.u(cats[:, 0])
        col_emb = self.v(cats[:, 1])
        return (row_emb * col_emb).sum(1)
        

In [53]:
# Model inputs: every column except the target and the timestamp.
x = df.drop(['rating', 'timestamp'], axis=1)
# Target: the rating column cast to 32-bit floats.
y = df['rating'].astype('float32')

In [54]:
# Row indices passed on as the validation split (get_cv_idxs comes from the fastai star-imports).
val_idxs = get_cv_idxs(df.shape[0])
data = ColumnarModelData.from_data_frame(
    PATH, val_idxs, x, y,
    ['userId', 'movieId'],  # columns the models read back as cats[:, 0] and cats[:, 1]
    64,                     # presumably the batch size — confirm against fastai's signature
)

In [55]:
# Dot-product model: one 50-dimensional embedding per user and per movie.
m = CollabFilterModule(num_row=num_users, num_col=num_movies, embedding_dim=50)

In [59]:
# SGD with momentum and a small weight decay over all of the model's parameters.
opt = optim.SGD(m.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-5)

In [60]:
# NOTE(review): set_lrs is a fastai helper; presumably it replaces the optimizer's
# learning rate with 1e-2. No `fit` call is visible between the optimizer's creation
# and this line, so the 1e-1 given to optim.SGD would never be used for training — confirm.
set_lrs(opt, 1e-2)

In [61]:
# Train for 3 epochs (matching the progress output below) with mean-squared-error loss.
fit(m, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      2.952969   3.08218   
    1      2.3693     2.516539                                 
    2      1.878491   2.167749                                 



[array([2.16775])]

In [73]:
# Observed rating bounds; EmbeddingNet.forward reads these module-level names
# to scale its sigmoid output.
min_rating = df.rating.min()
max_rating = df.rating.max()

In [80]:
class EmbeddingNet(nn.Module):
    """Feed-forward rating predictor on top of two embedding tables.

    The embeddings looked up from `u` and `m` are concatenated, passed through
    two ReLU hidden layers with dropout, and squashed by a sigmoid that is
    rescaled to slightly beyond the rating range.
    """

    def __init__(self, n_rows, n_cols, embedding_dim, y_range=None):
        """Build the embeddings and the three linear layers.

        Args:
            n_rows: number of entries in the `u` embedding.
            n_cols: number of entries in the `m` embedding.
            embedding_dim: length of each embedding vector.
            y_range: optional `(low, high)` pair of target bounds. When omitted,
                `forward` falls back to the module-level `min_rating` and
                `max_rating`, which is the original behaviour.
        """
        super().__init__()
        self.u = nn.Embedding(n_rows, embedding_dim)
        self.u.weight.data.uniform_(-0.01, 0.01)
        self.m = nn.Embedding(n_cols, embedding_dim)
        self.m.weight.data.uniform_(-0.01, 0.01)

        self.lin1 = nn.Linear(embedding_dim*2, 30)
        self.lin2 = nn.Linear(30, 10)
        self.lin3 = nn.Linear(10, 1)
        self.drop1 = nn.Dropout(0.05)
        self.drop2 = nn.Dropout(0.5)
        self.drop3 = nn.Dropout(0.5)

        # Explicit bounds make the model independent of notebook globals.
        self.y_range = y_range

    def forward(self, cats, conts):
        """Predict one value per row of `cats`.

        Args:
            cats: integer index tensor; `cats[:, 0]` indexes `u` and `cats[:, 1]`
                indexes `m`.
            conts: not used by this model.

        Returns:
            Tensor with a trailing dimension of size 1 whose values lie in the
            open interval (low - 0.5, high + 0.5).
        """
        u_vals = cats[:,0]
        m_vals = cats[:,1]
        x = self.drop1(torch.cat([self.u(u_vals), self.m(m_vals)], dim=1))
        x = self.drop2(F.relu(self.lin1(x)))
        x = self.drop3(F.relu(self.lin2(x)))
        if self.y_range is None:
            # Legacy path: read the bounds from module-level globals at call time.
            low, high = min_rating, max_rating
        else:
            low, high = self.y_range
        # Widen the range by 0.5 on each side so the sigmoid can reach the extreme ratings.
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(self.lin3(x)) * (high-low+1) + low-0.5

In [81]:
# Build the embedding network and train it with Adam for 3 epochs on MSE loss.
m = EmbeddingNet(n_rows=num_users, n_cols=num_movies, embedding_dim=50)
opt = optim.Adam(m.parameters(), lr=1e-3, weight_decay=1e-5)
fit(m, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.941118   0.825508  
    1      0.841465   0.804686                                 
    2      0.837594   0.80698                                  



[array([0.80698])]

In [84]:
# NOTE(review): presumably lowers the learning rate to 1e-4 (set_lrs is a fastai helper)
# before training for 3 more epochs. Execution counts jump from In [81] to In [84], so
# unrecorded runs may have happened in between — the training loss shown below starts
# well under where the previous cell's output ended. Confirm with Restart & Run All.
set_lrs(opt, 1e-4)
fit(m, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.723238   0.788927  
    1      0.717504   0.788853                                 
    2      0.704982   0.78919                                  



[array([0.78919])]