****Importing the libraries****

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable


  from .autonotebook import tqdm as notebook_tqdm


****Importing the datasets****

In [2]:
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None,
                     engine='python', encoding='latin-1')
movies

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [3]:
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None,
                     engine='python', encoding='latin-1')
users
# columns are user_id, gender, age, codes correspond to user's job, zip code


Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None,
                     engine='python', encoding='latin-1')
ratings

# columns are user_id, movie_id, rating, timestamp

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


****Getting the test and train sets****

In [5]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter='\t')
training_set = np.array(training_set, dtype='int')
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [6]:
test_set = pd.read_csv('ml-100k/u1.test', delimiter='\t')
test_set = np.array(test_set, dtype='int')
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

****Getting total number of users and movies****

In [7]:
nb_users = len(set(np.concatenate((training_set[:, 0], test_set[:, 0]))))
nb_movies = len(set(np.concatenate((training_set[:, 1], test_set[:, 1]))))

print(nb_users, nb_movies)

943 1682


****Convert training and test set to a matrix where the rows are the users and the columns are the movies****

In [8]:
# what was shown in the lectures

def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:, 1][data[:, 0] == id_users]
        id_ratings = data[:, 2][data[:, 0] == id_users]
        ratings_arr = np.zeros(nb_movies)
        ratings_arr[id_movies - 1] = id_ratings
        new_data.append(list(ratings_arr))
    return new_data

# more elegant way to convert
# todo: fix this
# but the training set and test set won't be the same shape
def pivot(ds: np.ndarray):
    return pd.DataFrame(ds).pivot(index=0, columns=1, values=2).to_numpy()


In [9]:
training_set = convert(training_set)
test_set = convert(test_set)

In [10]:
print(len(training_set), len(training_set[0]))
print(len(test_set), len(test_set[0]))

943 1682
943 1682


****Convert test and train sets from 2d lists to torch tensors****

In [11]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [12]:
print(training_set)
print(test_set)

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


****Convert the ratings into binary ratings (1 - liked the movie, 0 - didn't like)****

In [13]:
training_set[training_set == 0] = -1
training_set[(0 < training_set) & (training_set <= 2)] = 0
training_set[training_set > 2] = 1

In [14]:
test_set[test_set == 0] = -1
test_set[(0 < test_set) & (test_set <= 2)] = 0
test_set[test_set > 2] = 1

In [15]:
print(training_set)
print(test_set)

tensor([[-1.,  1.,  1.,  ..., -1., -1., -1.],
        [ 1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        ...,
        [ 1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1.,  1., -1.,  ..., -1., -1., -1.]])
tensor([[-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        ...,
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.]])


****Creating the model architecture****

In [16]:
class RBM:

    def __init__(self, nv, nh):
        self.nv = nv
        self.nh = nh
        self.w = torch.randn(self.nh, self.nv)
        self.a = torch.randn(1, self.nh)
        self.b = torch.randn(1, self.nv)

    def sample_h(self, x):
        wx = torch.mm(x, self.w.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)

    def sample_v(self, y):
        wy = torch.mm(y, self.w)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)

    def train(self, v0, vk, ph0, phk):
        self.w += torch.mm(ph0.t(), v0) - torch.mm(phk.t(), vk)
        self.b += torch.sum((v0 - vk), 0) # same as v0 - vk but removes the outer dimension
        self.a += torch.sum((ph0 - phk), 0)










In [17]:
nv = len(training_set[0])
nh = 100
batch_size = 100

rbm = RBM(nv, nh)


In [25]:
nb_epoch = 10
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.

    for id_user in range(0, nb_users - batch_size, batch_size):
        vk = training_set[id_user:id_user + batch_size]
        v0 = training_set[id_user:id_user + batch_size]
        ph0, _ = rbm.sample_h(v0)

        for k in range(10):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)

            vk[v0 < 0] = v0[v0 < 0]

        phk, _ = rbm.sample_h(vk)

        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(vk[v0 >= 0] - v0[v0 >= 0]))
        s += 1.

    print(f'epoch {epoch}: loss {train_loss / s:.2f}')



epoch 1: loss 0.25
epoch 2: loss 0.25
epoch 3: loss 0.25
epoch 4: loss 0.25
epoch 5: loss 0.25
epoch 6: loss 0.25
epoch 7: loss 0.25
epoch 8: loss 0.24
epoch 9: loss 0.25
epoch 10: loss 0.25


In [26]:
# Testing the RBM

test_loss = 0
s = 0.
for id_user in range(nb_users):
    v = training_set[id_user:id_user+1] # the input which we'll be predicting on
    # the training set is the input of the RBM
    # we need to input the movies that's already activated to get the ratings of the
    # test points that's not yet seen

    vt = test_set[id_user:id_user+1] # the target

    if len(vt[vt >= 0]) > 0:
        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)

        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.
print(f"test loss: {test_loss / s}")


test loss: 0.24795620143413544
