<a href="https://colab.research.google.com/github/priyanu17/recommendation-system-deep-learning/blob/master/Recommendation_System_using_RBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
import numpy as np
import pandas as pd

In [0]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings_2.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [0]:
ratings = ratings.iloc[:, 0:3]

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
movies.shape

(9742, 3)

In [10]:
ratings.shape

(100836, 3)

In [0]:
from sklearn.model_selection import train_test_split
training_set , test_set = train_test_split(ratings, test_size = 0.2)

In [12]:
training_set.shape

(80668, 3)

In [13]:
test_set.shape

(20168, 3)

In [0]:
## Converting into numpy arrays
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')


In [0]:
# Getting the number of users and movies
max_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
max_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [16]:
print("Max users is : " + str(max_users))
print("Max movies is : " + str(max_movies))

Max users is : 610
Max movies is : 193609


In [0]:
# Converting the data into an array with users in lines and movies in columns

In [0]:
def convert(data):
  new_data = []
  for id_users in range(1, max_users + 1):
    id_movies = data[:,1][data[:,0] == id_users]
    id_ratings = data[:,2][data[:,0] == id_users]
    ratings = np.zeros(max_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data

In [0]:
training_set = convert(training_set)

In [0]:
test_set = convert(test_set)

In [0]:
## Importing pytorch libraries:
import torch

In [0]:
# Converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [0]:
# Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)

# (Ratings (1, 2) -> 0, (3, 4, 5) -> 1, (Not rated -> -1))
training_set[training_set == 0] = -1
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1

test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1

In [0]:
# Importing other pytorch libraries
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [0]:
# Creating the architecture of the Restricted Boltzmann machines

# Creating an RBM class

class RBM():

  # Initializing the weights and biases randomly
  # nh - number of hidden nodes
  # nv - number of visible nodes
  # a - bias of hidden nodes w.r.t visible nodes
  # b - bias of visible nodes w.r.t hidden nodes
  # W - weights in the network 
  def __init__(self, nv, nh):
    self.W = torch.randn(nh, nv)
    
    self.a = torch.randn(1, nh)
    self.b = torch.randn(1, nv)

  # Gibs Sampling on the hidden nodes in the network
  def sample_h(self, x):
    wx = torch.mm(x, self.W.t())
    activation = wx + self.a.expand_as(wx)
    p_h_given_v = torch.sigmoid(activation)
    return p_h_given_v, torch.bernoulli(p_h_given_v)

  # Gibs sampling on the visible nodes in the network
  def sample_v(self, y):
    wy = torch.mm(y, self.W)
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)

  # Training the dataset using Constrastive divergence
  # Using approximation of the Log-Likelihood Gradient
  def train(self, v0, vk, ph0, phk):
    self.W += (torch.mm(v0.t(),ph0) - torch.mm(vk.t(),phk)).t()
    self.b += torch.sum((v0 - vk), 0)
    self.a += torch.sum((ph0 - phk), 0)

  # Predict method for output 
  def predict( self, x): # x: visible nodes
    _, h = self.sample_h( x)
    _, v = self.sample_v( h)
    return v

In [0]:
nv = len(training_set[0])
nh = 100
batch_size = 100
rbm = RBM(nv, nh)

In [27]:
# Training the RBM
nb_epoch = 10
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  # Epoch Counter
  s = 0
  for id_user in range(0, max_users - batch_size, batch_size):

    vk = training_set[id_user:id_user+batch_size]
    v0 = training_set[id_user:id_user+batch_size]

    ph0,_ = rbm.sample_h(v0)

    for k in range(10):
      _,hk = rbm.sample_h(vk)
      _,vk = rbm.sample_v(hk)
      vk[v0<0] = v0[v0<0]

    phk,_ = rbm.sample_h(vk)

    rbm.train(v0, vk, ph0, phk)
    train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))

    s += 1
  print('No of Epoch : '+str(epoch)+' Total Loss : ' +str(train_loss/s)+ "\n")

No of Epoch : 1 Total Loss : tensor(0.3872)

No of Epoch : 2 Total Loss : tensor(0.2655)

No of Epoch : 3 Total Loss : tensor(0.2553)

No of Epoch : 4 Total Loss : tensor(0.2528)

No of Epoch : 5 Total Loss : tensor(0.2489)

No of Epoch : 6 Total Loss : tensor(0.2448)

No of Epoch : 7 Total Loss : tensor(0.2370)

No of Epoch : 8 Total Loss : tensor(0.2395)

No of Epoch : 9 Total Loss : tensor(0.2420)

No of Epoch : 10 Total Loss : tensor(0.2366)



In [28]:

# Testing the RBM
test_loss = 0
s = 0.

for id_user in range(max_users):

    v = training_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]

    if len(vt[vt>=0]) > 0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)

        test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0]))

        s += 1.
print('Total test loss: '+ str(test_loss/s))

Total test loss: tensor(0.2346)


In [0]:
# Test set loss 0.25 -> System is 75% correctly recommending 

In [30]:
## Testing on One output 
# User no 33

y_pred = rbm.predict(test_set[33:34])
print(y_pred)

tensor([[1., 1., 1.,  ..., 0., 0., 0.]])


In [31]:
y_pred.shape

torch.Size([1, 193609])

In [0]:
## Converting torch tensor into list
y_pred_list = y_pred.numpy().tolist()

In [0]:
## Extracting movie indexes from list

In [0]:
movie_index = []
for i in range(1, max_movies):
  if y_pred_list[0][i] == 1:
    movie_index.append(i)

In [0]:
user_33_watched = ratings[ratings["userId"] == 33]["movieId"].tolist()

In [0]:
## Removing indexes of movies already watched

In [0]:
for item in user_33_watched:
  if item in movie_index:
    movie_index.pop(item)

In [38]:
movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [40]:
## Recommendations for user 33

movies[movies['movieId'].isin(movie_index)]["title"].head(50)

0                                      Toy Story (1995)
4                    Father of the Bride Part II (1995)
6                                        Sabrina (1995)
8                                   Sudden Death (1995)
9                                      GoldenEye (1995)
11                   Dracula: Dead and Loving It (1995)
12                                         Balto (1995)
14                              Cutthroat Island (1995)
15                                        Casino (1995)
17                                    Four Rooms (1995)
19                                   Money Train (1995)
20                                    Get Shorty (1995)
23                                        Powder (1995)
24                             Leaving Las Vegas (1995)
25                                       Othello (1995)
26                                  Now and Then (1995)
27                                    Persuasion (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qi