In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable


In [11]:
#importing the dataset for movies, users and ratings.

movies = pd.read_csv("/content/drive/My Drive/BM_DATA/ml-1m/movies.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")

users = pd.read_csv("/content/drive/My Drive/BM_DATA/ml-1m/users.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")

ratings = pd.read_csv("/content/drive/My Drive/BM_DATA/ml-1m/ratings.dat", sep = "::", header = None, engine = "python", encoding = "latin-1")


In [12]:
training_data = pd.read_csv("/content/drive/My Drive/BM_DATA/ml-100k/u1.base", delimiter = "\t")
#convert train set to array to be used as tensors for pytorch 
train_data = np.array(training_data, dtype="int")

testing_data = pd.read_csv("/content/drive/My Drive/BM_DATA/ml-100k/u1.test", delimiter = "\t")
#convert test set to array to be used as tensors for pytorch 
test_data = np.array(testing_data, dtype="int")

print(train_data), print(test_data)

[[        1         2         3 876893171]
 [        1         3         4 878542960]
 [        1         4         3 876893119]
 ...
 [      943      1188         3 888640250]
 [      943      1228         3 888640275]
 [      943      1330         3 888692465]]
[[        1        10         3 875693118]
 [        1        12         5 878542960]
 [        1        14         5 874965706]
 ...
 [      459       934         3 879563639]
 [      460        10         3 882912371]
 [      462       682         5 886365231]]


(None, None)

In [13]:
#finding the total count of users
#The max id of users can be in either of test or train set
n_users = len( np.unique( np.concatenate([train_data[:,0], test_data[:, 0]]) ) )

#finding the total count of movies
#The max id of movies can be in either of test or train set
n_movies = len(np.unique(np.concatenate([train_data[:,1], test_data[:, 1]])))

In [14]:
# A function to map these values in horizontal lists
def data_new(data):
  """
  This function creates data structure according to the format a pytorch takes input.
  The data will be of the form such that it will be list of lists. Each list will contain
  movie rating for all the movies by the users.
  Each row will correspond to a particular user with a list which contains all the movie ratings
  given by the user to the movies.
  In case the user hasn't rated or watched the movie a value of zero will be assigned
  returns list of lists
  output format:- user1:[rating1,rating2,......,rating for nth movie]
  """
 #n_users and n_movies are global variables
  list1 = []
  for i in range(1, n_users+1):
    movie_id = data[:,1][data[:,0] == i]
    movie_rating = data[:,2][data[:,0] == i]
    zero_rating = np.zeros(n_movies)# creating a list of zeros to overwrite if movie rating is present
    zero_rating[movie_id-1] = movie_rating
    list1.append(list(zero_rating))

  new_data = list1
  return new_data 







In [15]:

train_data_new = data_new(train_data)
test_data_new = data_new(test_data)

In [16]:
#converting data to torch tensors
training_set = torch.FloatTensor(train_data_new)
test_set = torch.FloatTensor(test_data_new)


In [17]:
#Creating the architecture for neural network
class AutoEncoders(nn.Module):
  def __init__(self):
    super(AutoEncoders, self).__init__()
    self.fc1 = nn.Linear(n_movies, 20)
    self.fc2 = nn.Linear(20,10)
    self.fc3 = nn.Linear(10,20)
    self.fc4 = nn.Linear(20, n_movies)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x

sae = AutoEncoders()
criterion = nn.MSELoss()
optimizer = optim.Adam(sae.parameters(), lr = 0.003, weight_decay = 0.5)






In [22]:
#training the autoencoder
nb_epochs = 200

for epochs in range(1,nb_epochs+1):
  train_loss = 0
  s = 0.0
  for userid in range(n_users):
    input = Variable(training_set[userid]).unsqueeze(0)
    target = input.clone()
    if torch.sum(target.data > 0) > 0:
      output = sae(input)
      target.required_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = n_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data*mean_corrector)
      s += 1.
      optimizer.step()
  print('epoch: '+str(epochs)+'loss: '+ str(train_loss/s))


epoch: 1loss: tensor(1.0930)
epoch: 2loss: tensor(1.0492)
epoch: 3loss: tensor(1.0341)
epoch: 4loss: tensor(1.0267)
epoch: 5loss: tensor(1.0224)
epoch: 6loss: tensor(1.0196)
epoch: 7loss: tensor(1.0176)
epoch: 8loss: tensor(1.0162)
epoch: 9loss: tensor(1.0152)
epoch: 10loss: tensor(1.0144)
epoch: 11loss: tensor(1.0137)
epoch: 12loss: tensor(1.0132)
epoch: 13loss: tensor(1.0129)
epoch: 14loss: tensor(1.0125)
epoch: 15loss: tensor(1.0123)
epoch: 16loss: tensor(1.0120)
epoch: 17loss: tensor(1.0118)
epoch: 18loss: tensor(1.0117)
epoch: 19loss: tensor(1.0116)
epoch: 20loss: tensor(1.0114)
epoch: 21loss: tensor(1.0113)
epoch: 22loss: tensor(1.0112)
epoch: 23loss: tensor(1.0111)
epoch: 24loss: tensor(1.0111)
epoch: 25loss: tensor(1.0110)
epoch: 26loss: tensor(1.0109)
epoch: 27loss: tensor(1.0108)
epoch: 28loss: tensor(1.0105)
epoch: 29loss: tensor(1.0093)
epoch: 30loss: tensor(1.0075)
epoch: 31loss: tensor(1.0058)
epoch: 32loss: tensor(1.0028)
epoch: 33loss: tensor(0.9987)
epoch: 34loss: tens

In [24]:
test_loss = 0
s = 0.
for id_user in range(n_users):
  input = Variable(training_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = n_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.9421)
