In [4]:
from pathlib import Path
import pandas as pd
import numpy as np

In [5]:
# Directory holding the MovieLens "ml-latest-small" CSV files.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless it is edited.
PATH = Path("/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small")
list(PATH.iterdir())


[PosixPath('/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small/ratings.csv'),
 PosixPath('/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small/tags.csv'),
 PosixPath('/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small/README.txt'),
 PosixPath('/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small/links.csv'),
 PosixPath('/home/uncomment/Educational/LiveCodeRelive/pandas_data_wrangling/data/ml-latest-small/movies.csv')]

In [6]:
# Peek at the header and first rows of the raw ratings file before parsing it.
! head $PATH/ratings.csv

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


In [7]:
# Load the ratings table (columns: userId, movieId, rating, timestamp).
data = pd.read_csv(PATH/"ratings.csv")

In [9]:
# NOTE: a cell only renders its last expression, so the `head()` result is
# discarded and just the `describe()` summary is displayed.
data.head()
data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [21]:
# np.random.seed(3)
# l = np.random.rand(10) < 0.8
# print(l)

[ True  True  True  True False False  True  True  True  True]


In [25]:
# Encoding categorical data.
# Split into training and validation sets *before* encoding, so the id
# mapping can later be derived from the training rows.
np.random.seed(3)  # fixed seed keeps the random split reproducible
msk = np.random.rand(len(data)) < 0.8  # True marks the training rows
train, val = data[msk].copy(), data[~msk].copy()
print(len(train), len(val), sep="\n")

80450
20386


In [32]:
def proc_col(col, train_col=None):
    """Encode a pandas column with contiguous integer ids.

    The id mapping is built from the unique values of `train_col` when it is
    given, otherwise from the unique values of `col` itself, in order of
    first appearance.

    Args:
        col: pandas Series whose values are to be encoded.
        train_col: optional pandas Series that defines the mapping.

    Returns:
        A tuple ``(name2idx, encoded, n_unique)`` where ``name2idx`` maps each
        original value to its id, ``encoded`` is an integer numpy array with
        one id per element of `col` (-1 for values missing from the mapping),
        and ``n_unique`` is the number of distinct values in the mapping.
    """
    source = col if train_col is None else train_col
    uniq = source.unique()
    name2idx = {o: i for i, o in enumerate(uniq)}
    # Force an integer dtype: without it an empty column would yield a float64
    # array, which is not usable as embedding indices.
    encoded = np.array([name2idx.get(x, -1) for x in col], dtype=np.int64)
    return name2idx, encoded, len(uniq)

In [33]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    If `train` is provided, `df` is encoded with the mapping derived from
    `train`, and rows whose userId or movieId does not occur in `train` are
    dropped. The input frame is not modified.

    Args:
        df: DataFrame with "userId" and "movieId" columns.
        train: optional DataFrame that defines the id mapping.

    Returns:
        A new DataFrame with both id columns replaced by their encoded ids.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = train[col_name] if train is not None else None
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # Unknown ids are encoded as -1; drop those rows. The explicit copy
        # makes the filtered frame independent, so the next column assignment
        # does not raise SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [39]:
# Encode the training split with contiguous user and movie ids.
df_train = encode_data(train)
# Only the final expression (`describe()`) is rendered; the `tail()` result is discarded.
df_train.tail()
df_train.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,80450.0,80450.0,80450.0,80450.0
mean,324.622511,1821.04082,3.500566,1206234000.0
std,182.739357,1912.601064,1.044756,216160300.0
min,0.0,0.0,0.5,828124600.0
25%,176.0,445.0,3.0,1019124000.0
50%,324.0,1082.0,3.5,1186161000.0
75%,476.0,2486.0,4.0,1435994000.0
max,609.0,8997.0,5.0,1537799000.0


In [48]:
# Encode the validation split using the id mapping derived from `train`.
df_val = encode_data(val,train)
# NOTE(review): the two lines below inspect the raw `val` frame, not the
# encoded `df_val` — confirm this is intended. Only `head(100)` is rendered.
val.describe()
val.head(100)

Unnamed: 0,userId,movieId,rating,timestamp
4,1,50,5.0,964982931
5,1,70,3.0,964982400
29,1,543,4.0,964981179
30,1,552,4.0,964982653
32,1,590,4.0,964982546
...,...,...,...,...
531,5,266,1.0,847435311
536,5,318,3.0,847434880
537,5,344,3.0,847434802
539,5,357,2.0,847435238


## Embedding Layer

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [53]:
# An Embedding module holding 10 embeddings (users or items), each of size 3.
# The embedding weights are initialized at random.
embed = nn.Embedding(10, 3)


torch.nn.modules.sparse.Embedding

In [54]:
# Given a tensor of ids we can "look up" the embedding corresponding to each id;
# repeated ids (here 1) return the same vector.
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[ 0.5764,  0.5392,  0.7485],
         [ 0.2402, -1.0398,  0.5133],
         [ 0.5313, -0.9130, -1.0165],
         [-0.9578,  0.2566,  0.2921],
         [-1.6686, -1.2927, -0.2764],
         [ 0.5764,  0.5392,  0.7485]]], grad_fn=<EmbeddingBackward>)

## Matrix Factorization model

In [60]:
class MF(nn.Module):
    """Matrix factorization model: a rating is the dot product of a user
    embedding and an item embedding."""

    def __init__(self, num_users, num_items, emb_size = 100):
        """Create one embedding table per entity type.

        Args:
            num_users: number of rows in the user embedding table.
            num_items: number of rows in the item embedding table.
            emb_size: length of every embedding vector.
        """
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Replace the default initialization with small non-negative weights.
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)

    def forward(self, u, v):
        """Return the dot product of the embeddings selected by ids `u` and `v`,
        summed over dimension 1."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        product = user_vecs * item_vecs
        return product.sum(dim=1)
    

## Training MF model 


In [61]:
# Size the embedding tables from the number of distinct encoded ids in the
# training frame.
num_users, num_items = (
    len(df_train[id_col].unique()) for id_col in ("userId", "movieId")
)
print(num_users, num_items)

610 8998


In [62]:
# Instantiate the matrix factorization model with embeddings of size 100.
model = MF(num_users, num_items, emb_size = 100)

In [68]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` on the whole of `df_train` with Adam, then report the
    validation loss.

    Each epoch is a single full-batch gradient step on the MSE between the
    model's predictions and the training ratings. The loss of every epoch is
    printed, and `test_loss` is called once after the last epoch.

    Args:
        model: module called as ``model(users, items)``.
        epochs: number of full-batch optimization steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, reshape the target ratings to ``(N, 1)`` before
            computing the loss (also forwarded to `test_loss`).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    # The training tensors are identical for every epoch, so build them once
    # instead of converting the DataFrame columns on each iteration.
    users = torch.LongTensor(df_train.userId.values)
    items = torch.LongTensor(df_train.movieId.values)
    ratings = torch.FloatTensor(df_train.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [65]:
# Shape check: `unsqueeze(1)` turns the 1-D ratings vector of shape (N,)
# into a column of shape (N, 1).
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [70]:
def test_loss(model,unsqueeze=False):
    """Print the MSE of the model's predictions on the validation frame `df_val`.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: module called as ``model(users, items)``.
        unsqueeze: if True, reshape the target ratings to ``(N, 1)`` before
            computing the loss.
    """
    model.eval()
    users = torch.LongTensor(df_val.userId.values)
    items = torch.LongTensor(df_val.movieId.values)
    ratings = torch.FloatTensor(df_val.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skipping the autograd graph saves memory
    # and time without changing the loss value.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("Test loss %.3f" %loss.item())
    

In [71]:
# First pass with a large learning rate; the printed training loss oscillates
# from epoch to epoch at this rate.
train_epocs(model, epochs=10, lr=0.1)

1.6424936056137085
5.716650009155273
4.122885704040527
1.0668009519577026
2.8487675189971924
2.4866318702697754
0.7549284100532532
1.2320798635482788
2.085761308670044
1.9961754083633423
Test loss 1.440


In [72]:
# Continue training the same model with a smaller learning rate.
train_epocs(model, epochs=15, lr=0.01)

1.2051769495010376
0.8402099609375
0.6925613284111023
0.6901534199714661
0.7074396014213562
0.6909264922142029
0.652334451675415
0.6184942722320557
0.6042883992195129
0.6068151593208313
0.6133261322975159
0.6125633716583252
0.6007717251777649
0.5813726782798767
0.5613786578178406
Test loss 0.780


In [73]:
# Another 15 epochs on the same model: the training loss keeps falling while
# the test loss does not improve (0.785 vs. 0.780 in the previous run).
train_epocs(model, epochs=15, lr=0.01)

0.5471259355545044
0.5480055809020996
0.5166163444519043
0.5024110674858093
0.4948512315750122
0.47992047667503357
0.46167251467704773
0.44554489850997925
0.4311355948448181
0.416013240814209
0.3999186158180237
0.38380730152130127
0.36806339025497437
0.35247865319252014
0.3369835615158081
Test loss 0.785
