In [6]:
%matplotlib inline
import pandas as pd
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score

In [7]:
df_train = pd.read_csv("data_hw2/train_books_ratings.csv")
df_valid = pd.read_csv("data_hw2/valid_books_ratings.csv")

In [8]:
df_train.head()

Unnamed: 0,user,item,rating,timestamp
0,A2IIIDRK3PRRZY,0000000116,0,1395619200
1,A9KTKY6BUR8U6,0000013714,0,1357516800
2,A35OP02LIXZ84E,0000477141,0,1399939200
3,A9WX8DK93SN5,000100039X,0,1385683200
4,A36JQ1WC5JQPFQ,000100039X,0,1391990400


In [9]:
# Encoding the training Users

In [10]:
len(df_train.user.unique())

1312778

In [11]:
train_user_ids = np.sort(df_train.user.unique())

In [12]:
user2idx = {val:idx for idx, val in enumerate(train_user_ids)}

In [13]:
df_train.user = df_train.user.apply(lambda x:user2idx[x])

In [14]:
df_valid.user = df_valid.user.apply(lambda x : user2idx.get(x,-1))

In [15]:
df_valid = df_valid[df_valid.user > -1].copy()

In [16]:
# Encoding Item

In [17]:
df_train.head(2)

Unnamed: 0,user,item,rating,timestamp
0,527409,116,0,1395619200
1,1059073,13714,0,1357516800


In [18]:
train_item_ids = np.sort(np.unique(df_train.item.values))

In [19]:
item2idx = {val:idx for idx, val in enumerate(train_item_ids)}
df_train.item = df_train.item.apply(lambda x: item2idx[x])

# Validation 
df_valid.item = df_valid.item.apply(lambda x: item2idx.get(x, -1))

In [20]:
df_valid = df_valid[df_valid.item > -1].copy()

In [21]:
df_valid.shape

(131657, 4)

# Dataloader

In [45]:
class matrix_factorization():
    def __init__(self, x1, x2, y):
        self.x1 = torch.LongTensor(x1) 
        self.x2 = torch.LongTensor(x2)
        self.y = torch.FloatTensor(y)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self,idx):
        return self.x1[idx], self.x2[idx], self.y[idx], 

In [57]:
train_ds = matrix_factorization(df_train.user.values,df_train.item.values, df_train.rating.values)
valid_ds = matrix_factorization(df_valid.user.values,df_valid.item.values, df_valid.rating.values)

In [58]:
df_train.shape[0]/10000

178.7557

In [59]:
train_dl = DataLoader(train_ds, batch_size = 1000, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size = 1000, shuffle = False)

# Embedding Layer

In [60]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size = 100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        #init
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u,v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) + b_u + b_v
        

# Training Loop

In [61]:
# here we are not using data loaders because our data fits well in memory
def train_epocs(model,train_dl, valid_dl, epochs, optimizer):
    losses = []
    for i in range(epochs):
        print(f"epoch no: {i}")
        model.train()
        for users, items, ratings in train_dl:
            y_hat = model(users, items)
            loss = F.binary_cross_entropy(torch.sigmoid(y_hat), ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            
        train_loss = np.mean(losses)
        valid_loss,valid_auc = val_metrics(model, valid_dl)
        print("train loss %.3f valid loss %.3f auc-roc %.3f" % (train_loss, valid_loss, valid_auc))

# Validation Error

In [65]:
def val_metrics(model, valid_dl):
    model.eval()
    losses = []
    y_hats = []
    ys = []
    for users, items, ratings in valid_dl:
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(torch.sigmoid(y_hat), ratings)
        y_hats.append(y_hat.detach().numpy())
        ys.append(ratings.numpy())
        losses.append(loss.item())
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    #print(y_hats.shape,ys.shape )
    return np.mean(losses), roc_auc_score(ys, y_hats)

In [66]:
num_users = len(train_user_ids)
num_users
num_items = len(train_item_ids)
num_items

659279

In [67]:
model = MF(num_users, num_items, emb_size=5)

In [70]:
learning_rate = 0.001
wd=1e-5
epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=wd)

In [71]:
train_epocs(model,train_dl, valid_dl, epochs, optimizer)

epoch no: 0
train loss 0.642 valid loss 0.625 auc-roc 0.815
epoch no: 1
train loss 0.638 valid loss 0.621 auc-roc 0.823
epoch no: 2
train loss 0.636 valid loss 0.619 auc-roc 0.825
epoch no: 3
train loss 0.634 valid loss 0.617 auc-roc 0.825
epoch no: 4
train loss 0.633 valid loss 0.617 auc-roc 0.825
epoch no: 5
train loss 0.633 valid loss 0.617 auc-roc 0.825
epoch no: 6
train loss 0.632 valid loss 0.617 auc-roc 0.825
epoch no: 7
train loss 0.632 valid loss 0.616 auc-roc 0.825
epoch no: 8
train loss 0.631 valid loss 0.616 auc-roc 0.825
epoch no: 9
train loss 0.631 valid loss 0.616 auc-roc 0.825
