>#### dataset: https://grouplens.org/datasets/movielens/

>#### latest-small[1MB-->3MB]: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#### Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

>#### latest[335MB-->1.5GB]: https://grouplens.org/datasets/movielens/latest/
#### Full: approximately 33,000,000 ratings and 2,000,000 tag applications applied to 86,000 movies by 330,975 users. 
#### Includes tag genome data with 14 million relevance scores across 1,100 tags. Last updated 9/2018.

>#### ml-25m[250MB-->1.2GB]: https://grouplens.org/datasets/movielens/25m/ 
 #### MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies 
 #### by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags. Released 12/2019 

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable


In [2]:
#ENABLING CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
df = pd.read_csv('datasets/ml-25m/ratings.csv', nrows=100000)


In [4]:
df.drop(['timestamp'], axis=1, inplace=True)

In [17]:
df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
99995,757,2115,3.0
99996,757,2117,3.0
99997,757,2118,4.0
99998,757,2124,3.5


In [5]:
X = np.array(df, dtype = 'int')
X.shape

(100000, 3)

In [19]:
df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test = train_test_split(X,test_size=0.2, random_state=42)

In [7]:
X_train.shape

(80000, 3)

In [8]:
X_test.shape

(20000, 3)

In [11]:
import gc

In [12]:
# del df
# gc.collect()

In [13]:
# del X
# gc.collect()

In [9]:
nb_users = int(max(max(X_train[:, 0], ), max(X_test[:, 0])))
nb_movies = int(max(max(X_train[:, 1], ), max(X_test[:, 1])))

In [24]:
nb_movies

206272

In [25]:
nb_users

757

In [10]:
def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
X_train = convert(X_train)
X_test = convert(X_test)



In [11]:
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test) 

In [12]:
# class SAE(nn.Module):
#     def __init__(self, ):
#         super(SAE, self).__init__()
#         self.encoder = nn.Sequential(
#             nn.Linear(nb_movies, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 32),
#             nn.ReLU(),
#             # nn.Linear(32, 16),
#             # nn.ReLU()
#         )
#         self.decoder = nn.Sequential(
#             # nn.Linear(16, 32),
#             # nn.ReLU(),
#             nn.Linear(32, 64),
#             nn.ReLU(),
#             nn.Linear(64, 128),
#             nn.ReLU(),
#             nn.Linear(128, nb_movies),
#             nn.Sigmoid()
#         )

#     def forward(self, x):
#         x = self.encoder(x)
#         x = self.decoder(x)
#         return x
    
# sae = SAE()
# criterion = nn.MSELoss()
# optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)



class SAE(nn.Module):
    def __init__(self, ):  
        super(SAE, self).__init__() 
        self.fc1 = nn.Linear(nb_movies, 720)               
        self.fc2 = nn.Linear(720,360)
        self.fc3 = nn.Linear(360,180)    
        self.fc4 = nn.Linear(180,360)                           
        self.fc5 = nn.Linear(360,720)    
        self.fc6 = nn.Linear(720,nb_movies)             
        self.activation = nn.Sigmoid()

    def forward(self, x):           
        x = self.activation(self.fc1(x))  
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x)) 
        x = self.activation(self.fc4(x))   
        x = self.activation(self.fc5(x))                               
        x = self.fc6(x)                    
                                           
        return x
sae = SAE().to(device)
# sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr= 0.01, weight_decay= 0.5) 
#train loss of 2.0 ....

In [14]:
class SAE(nn.Module):
    def __init__(self, ):  
        super(SAE, self).__init__() 
        self.fc1 = nn.Linear(nb_movies, 128)               
        self.fc2 = nn.Linear(128,64)
        self.fc3 = nn.Linear(64,32)    
        self.fc4 = nn.Linear(32,16)                           
        self.fc5 = nn.Linear(16,32)   
        self.fc6 = nn.Linear(32,64)    
        self.fc7 = nn.Linear(64,128)                
        self.fc8 = nn.Linear(128,nb_movies)               
        self.activation = nn.Sigmoid()

    def forward(self, x):           
        x = self.activation(self.fc1(x))  
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x)) 
        x = self.activation(self.fc4(x)) 
        x = self.activation(self.fc5(x)) 
        x = self.activation(self.fc6(x)) 
        x = self.activation(self.fc7(x))                                  
        x = self.fc8(x)                    
                                           
        return x
sae = SAE().to(device)
# sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr= 0.01, weight_decay= 0.5) 

In [19]:
nb_epoch = 20 #30+10+20+20+20 =100 epochs
for epoch in range(1, nb_epoch + 1):
  train_loss = 0
  s = 0.
  for id_user in range(nb_users):
    input = Variable(X_train[id_user]).unsqueeze(0).to(device)
    target = input.clone().to(device)
    if torch.sum(target.data > 0) > 0:
      output = sae(input)
      target.require_grad = False
      output[target == 0] = 0
      loss = criterion(output, target)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += torch.sqrt(loss.data*mean_corrector)
      s += 1.
      optimizer.step()
  print('epoch: '+str(epoch)+' loss: '+ str(train_loss/s))

epoch: 1 loss: tensor(1.2561, device='cuda:0')
epoch: 2 loss: tensor(1.1799, device='cuda:0')
epoch: 3 loss: tensor(1.2507, device='cuda:0')
epoch: 4 loss: tensor(1.1752, device='cuda:0')
epoch: 5 loss: tensor(1.2454, device='cuda:0')
epoch: 6 loss: tensor(1.1706, device='cuda:0')
epoch: 7 loss: tensor(1.2404, device='cuda:0')
epoch: 8 loss: tensor(1.1662, device='cuda:0')
epoch: 9 loss: tensor(1.2356, device='cuda:0')
epoch: 10 loss: tensor(1.1620, device='cuda:0')
epoch: 11 loss: tensor(1.2309, device='cuda:0')
epoch: 12 loss: tensor(1.1580, device='cuda:0')
epoch: 13 loss: tensor(1.2265, device='cuda:0')
epoch: 14 loss: tensor(1.1542, device='cuda:0')
epoch: 15 loss: tensor(1.2221, device='cuda:0')
epoch: 16 loss: tensor(1.1505, device='cuda:0')
epoch: 17 loss: tensor(1.2180, device='cuda:0')
epoch: 18 loss: tensor(1.1469, device='cuda:0')
epoch: 19 loss: tensor(1.2140, device='cuda:0')
epoch: 20 loss: tensor(1.1435, device='cuda:0')


## testing the SAE


In [20]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(X_train[id_user]).unsqueeze(0).to(device)
  target = Variable(X_test[id_user]).unsqueeze(0).to(device)
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += torch.sqrt(loss.data*mean_corrector)
    s += 1.
print('test loss: '+str(test_loss/s))


test loss: tensor(1.2449, device='cuda:0')


## saving the model


In [None]:
torch.save(sae.state_dict(), 'sae_model_state_dicttest.pth')

In [26]:
torch.save(sae, 'sae_model_raw.pth')

## loading in the model


In [29]:
inference_sae = SAE()
inference_sae.load_state_dict(torch.load('sae_model_state_dict.pth'))
inference_sae.eval()

SAE(
  (fc1): Linear(in_features=206272, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=16, bias=True)
  (fc5): Linear(in_features=16, out_features=32, bias=True)
  (fc6): Linear(in_features=32, out_features=64, bias=True)
  (fc7): Linear(in_features=64, out_features=128, bias=True)
  (fc8): Linear(in_features=128, out_features=206272, bias=True)
  (activation): Sigmoid()
)

## making inferences

In [30]:
with torch.inference_mode():
    inference_sae_preds = inference_sae(X_test)
inference_sae_preds

tensor([[4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756],
        [4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756],
        [4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756],
        ...,
        [4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756],
        [4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756],
        [4.0497, 3.4818, 2.7945,  ..., 0.0048, 0.0050, 0.4756]])

In [31]:
#testing if same
sae.to('cpu')
sae.eval()
with torch.inference_mode():
    y_pred = sae(X_test)

In [25]:
inference_sae_preds == y_pred

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])

# ------tests--------

## testing 

In [8]:
customer = test_set[159,:]
customer

tensor([4., 0., 3.,  ..., 0., 0., 0.])

In [10]:
customer_input = Variable(customer).unsqueeze(0)

In [44]:
output = sae(customer_input)
# recomovo = torch.tensor(output[:][output[:]>4.5]).unsqueeze(0)
torch.where(output>4.5)[1]
indexes = torch.where(output>4.5)[1]
test_index = indexes.squeeze()


In [55]:
numpy_indexes = indexes.numpy()
print(numpy_indexes)
numpy_indexes.shape


[  63  126  168  356  656 1292 1366 1448]


(8,)

In [46]:
#printing out the recommended values 
print(output[:][output[:]>4.5])

tensor([4.5199, 4.5651, 4.8176, 4.5691, 4.5005, 4.6465, 4.5703, 4.6539],
       grad_fn=<IndexBackward0>)


In [None]:

def convert(data,cust_id):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == cust_id]
    id_ratings = data[:, 2] [data[:, 0] == cust_id]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
customer = convert(customer,nb_users)
customer = torch.FloatTensor(customer)

#for one customer ig
customer_input = Variable(customer).unsqueeze(0)
output = sae(input)

In [None]:
#original convert 
def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
training_set = convert(training_set)
test_set = convert(test_set)

training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set) 

In [None]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
  input = Variable(training_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user]).unsqueeze(0)
  if torch.sum(target.data > 0) > 0:
    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data*mean_corrector)
    s += 1.
print('test loss: '+str(test_loss/s))


In [None]:
customer = np.zeros(nb_movies)
customer

In [None]:

def convert(data,cust_id):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == cust_id]
    id_ratings = data[:, 2] [data[:, 0] == cust_id]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
customer = convert(customer,nb_users)
customer = torch.FloatTensor(customer)

#for one customer ig
input = Variable(customer).unsqueeze(0)
output = sae(input)




In [None]:

def convert(data):
  new_data = []
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data
training_set = convert(training_set)
test_set = convert(test_set)

training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set) 