# Book Recommendation System Using Neural Collaborative Filtering

## Load Data

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import copy
import itertools
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [17]:
plt.style.use('ggplot')

In [6]:
ratings_data = pd.read_csv('data/interactions_small.csv', index_col=0)
book_data = pd.read_csv('data/book_metadata.csv', index_col=0)

In [28]:
ratings_data.head()

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,1384,1bad0122cebb4aa9213f9fe1aa281f66,True,4,,Wed May 09 09:33:44 -0700 2007,Wed May 09 09:33:44 -0700 2007,,
1,8842281e1d1347389f2ab93d60773d4d,1376,eb6e502d0c04d57b43a5a02c21b64ab4,True,4,,Wed May 09 09:33:18 -0700 2007,Wed May 09 09:33:18 -0700 2007,,
2,8842281e1d1347389f2ab93d60773d4d,30119,787564bef16cb1f43e0f641ab59d25b7,True,5,,Sat Jan 13 13:44:20 -0800 2007,Wed Mar 22 11:45:08 -0700 2017,Tue Mar 01 00:00:00 -0800 1983,
3,72fb0d0087d28c832f15776b0d936598,24769928,8c80ee74743d4b3b123dd1a2e0c0bcac,False,0,,Wed Apr 27 11:05:51 -0700 2016,Wed Apr 27 11:05:52 -0700 2016,,
4,72fb0d0087d28c832f15776b0d936598,30119,2a83589fb597309934ec9b1db5876aaf,True,3,,Mon Jun 04 18:58:08 -0700 2012,Mon Jun 04 18:58:13 -0700 2012,,


In [29]:
# use only the data where is_read==True
ratings = ratings_data[ratings_data['is_read']==True]
ratings = ratings[[']]

(10241, 10)

In [14]:
# create a matrix where the rows are users and columns are books
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('user_id')['rating'].count()
    
    unique_users = ratings.user_id.unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = ratings.user_id.map(user_to_index)
    
    unique_books = ratings.book_id.unique()
    book_to_index = {old: new for new, old in enumerate(unique_books)}
    new_books = ratings.book_id.map(book_to_index)
    
    n_users = unique_users.shape[0]
    n_books = unique_books.shape[0]
    
    X = pd.DataFrame({'user_id': new_users, 'book_id': new_books})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_books), (X, y), (user_to_index, book_to_index)

In [16]:
(n, m), (X, y), _ = create_dataset(ratings)
print(f'Embeddings: {n} users, {m} books')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 1853 users, 4064 books
Dataset shape: (10241, 2)
Target shape: (10241,)


In [21]:
class ReviewsIterator:
    
    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
            
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_batches = int(math.ceil(X.shape[0] // batch_size))
        self._current = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        return self.next()
    
    def next(self):
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [22]:
def batches(X, y, bs=32, shuffle=True):
    for xb, yb in ReviewsIterator(X, y, bs, shuffle):
        xb = torch.LongTensor(xb)
        yb = torch.FloatTensor(yb)
        yield xb, yb.view(-1, 1) 

## Embeddings

In [37]:
class EmbeddingNet(nn.Module):
    """
    Creates a dense network with embedding layers.
    
    Args:
    
        n_users:            
            Number of unique users in the dataset.

        n_movies: 
            Number of unique movies in the dataset.

        n_factors: 
            Number of columns in the embeddings matrix.

        embedding_dropout: 
            Dropout rate to apply right after embeddings layer.

        hidden:
            A single integer or a list of integers defining the number of 
            units in hidden layer(s).

        dropouts: 
            A single integer or a list of integers defining the dropout 
            layers rates applyied right after each of hidden layers.
            
    """
    def __init__(self, n_users, n_movies,
                 n_factors=50, embedding_dropout=0.02, 
                 hidden=10, dropouts=0.2):
        super().__init__()
        hidden = get_list(hidden)
        dropouts = get_list(dropouts)
        n_last = hidden[-1]
        
        def gen_layers(n_in):
            """
            A generator that yields a sequence of hidden layers and 
            their activations/dropouts.
            
            Note that the function captures `hidden` and `dropouts` 
            values from the outer scope.
            """
            nonlocal hidden, dropouts
            assert len(dropouts) <= len(hidden)
            
            for n_out, rate in itertools.zip_longest(hidden, dropouts):
                yield nn.Linear(n_in, n_out)
                yield nn.ReLU()
                if rate is not None and rate > 0.:
                    yield nn.Dropout(rate)
                n_in = n_out
            
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        self.drop = nn.Dropout(embedding_dropout)
        self.hidden = nn.Sequential(*list(gen_layers(n_factors * 2)))
        self.fc = nn.Linear(n_last, 1)
        self._init()
        
    def forward(self, users, movies, minmax=None):
        features = torch.cat([self.u(users), self.m(movies)], dim=1)
        x = self.drop(features)
        x = self.hidden(x)
        out = torch.sigmoid(self.fc(x))
        if minmax is not None:
            min_rating, max_rating = minmax
            out = out*(max_rating - min_rating + 1) + min_rating - 0.5
        return out
    
    def _init(self):
        """
        Setup embeddings and hidden layers with reasonable initial values.
        """
        def init(m):
            if type(m) == nn.Linear:
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)
                
        self.u.weight.data.uniform_(-0.05, 0.05)
        self.m.weight.data.uniform_(-0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)
    
    
def get_list(n):
    if isinstance(n, (int, float)):
        return [n]
    elif hasattr(n, '__iter__'):
        return list(n)
    raise TypeError('layers configuraiton should be a single number or a list of numbers')    

In [38]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [39]:
minmax = ratings.rating.min(), ratings.rating.max()
minmax

(0, 5)

In [40]:
net = EmbeddingNet(
    n_users=n, n_movies=m, 
    n_factors=150, hidden=[500, 500, 500], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [35]:
print(n,m)

1853 4064


## Train

In [41]:
lr = 1e-3
wd = 1e-5
bs = 2000 
n_epochs = 30
# patience = 10
# no_improvements = 0
best_loss = np.inf
best_weights = None
history = []
lr_history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net.to(device)
criterion = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
iterations_per_epoch = int(math.ceil(dataset_sizes['train'] // bs))
# scheduler = CyclicLR(optimizer, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))

for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}
    
    for phase in ('train', 'val'):
        training = phase == 'train'
        running_loss = 0.0
        n_batches = 0
        batch_num = 0
        for batch in batches(*datasets[phase], shuffle=training, bs=bs):
            x_batch, y_batch = [b.to(device) for b in batch]
            optimizer.zero_grad()
            # compute gradients only during 'train' phase
            with torch.set_grad_enabled(training):
                outputs = net(x_batch[:, 0], x_batch[:, 1], minmax)
                loss = criterion(outputs, y_batch)
                
                # don't update weights and rates when in 'val' phase
                if training:
                    # scheduler.step()
                    loss.backward()
                    optimizer.step()
                    # lr_history.extend(scheduler.get_lr())
                    
            running_loss += loss.item()
            
        epoch_loss = running_loss / dataset_sizes[phase]
        stats[phase] = epoch_loss
        
        # early stopping: save weights of the best model so far
        # if phase == 'val':
        #     if epoch_loss < best_loss:
        #         print('loss improvement on epoch: %d' % (epoch + 1))
        #         best_loss = epoch_loss
        #         best_weights = copy.deepcopy(net.state_dict())
        #         no_improvements = 0
        #     else:
        #         no_improvements += 1
                
    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    # if no_improvements >= patience:
    #     print('early stopping after epoch {epoch:03d}'.format(**stats))
    #     break

[001/030] train: 2.2937 - val: 1.7459
[002/030] train: 1.8434 - val: 1.6205
[003/030] train: 1.6230 - val: 1.6450
[004/030] train: 1.5912 - val: 1.5565
[005/030] train: 1.4135 - val: 1.5030
[006/030] train: 1.3128 - val: 1.4836
[007/030] train: 1.1543 - val: 1.4513
[008/030] train: 1.0231 - val: 1.4518
[009/030] train: 0.9197 - val: 1.5015
[010/030] train: 0.8417 - val: 1.5517
[011/030] train: 0.7760 - val: 1.5771
[012/030] train: 0.7391 - val: 1.6222
[013/030] train: 0.7014 - val: 1.6508
[014/030] train: 0.6607 - val: 1.6804
[015/030] train: 0.6251 - val: 1.7082
[016/030] train: 0.5864 - val: 1.7243
[017/030] train: 0.5424 - val: 1.7874
[018/030] train: 0.5063 - val: 1.8390
[019/030] train: 0.4735 - val: 1.8597
[020/030] train: 0.4415 - val: 1.8606
[021/030] train: 0.4185 - val: 1.8435
[022/030] train: 0.3842 - val: 1.9116
[023/030] train: 0.3687 - val: 1.9735
[024/030] train: 0.3348 - val: 1.9646
[025/030] train: 0.3239 - val: 2.0400
[026/030] train: 0.3124 - val: 2.0401
[027/030] tr

## Model Evaluation

In [42]:
groud_truth, predictions = [], []

with torch.no_grad():
    for batch in batches(*datasets['val'], shuffle=False, bs=bs):
        x_batch, y_batch = [b.to(device) for b in batch]
        outputs = net(x_batch[:, 0], x_batch[:, 1], minmax)
        groud_truth.extend(y_batch.tolist())
        predictions.extend(outputs.tolist())

groud_truth = np.asarray(groud_truth).ravel()
predictions = np.asarray(predictions).ravel()

In [43]:
final_loss = np.sqrt(np.mean((np.array(predictions) - np.array(groud_truth))**2))
print(f'Final RMSE: {final_loss:.4f}')

Final RMSE: 1.4421


In [44]:
np.array(predictions)

array([3.75242329, 4.72158575, 3.67954445, ..., 4.11724758, 4.77862263,
       3.99162102])