In [None]:
import json
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch
# Seed PyTorch's global RNG. Only torch is seeded here; no NumPy / pandas
# seed is set in this cell.
torch.manual_seed(4)

<torch._C.Generator at 0x7f431cd44870>

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
#!wget https://files.grouplens.org/datasets/movielens/ml-10m.zip

In [None]:
#!unzip ml-10m.zip

In [None]:
#movies = pd.read_csv('./ml-10M100K/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
#users = pd.read_csv('./ml-10M100K/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
# ratings.dat has no header row and separates fields with '::', so the column
# names are supplied here and the python parsing engine is requested.
ratings_full = pd.read_csv(
    './ml-10M100K/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
    encoding='latin-1',
)

In [None]:
# (rows, columns) of the raw ratings table.
ratings_full.shape

(10000054, 4)

In [None]:
# Peek at the first rows to check the parsed columns.
ratings_full.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [None]:
# Number of ratings kept per user.
n_samples = 10
# Draw `n_samples` ratings from every user. `random_state` makes the subsample
# (and therefore every number reported further down) reproducible; seeding torch
# alone does not affect pandas sampling. GroupBy.sample keeps the original index
# and all columns, and raises if a user has fewer than `n_samples` ratings.
ratings = ratings_full.groupby('user_id').sample(n=n_samples, random_state=4)

In [None]:
# Rebind the name to an empty frame — presumably so the full ratings table can
# be garbage-collected; the following cells work on the `ratings` sample.
ratings_full = pd.DataFrame()

In [None]:
# movies = pd.read_csv('processedMovids (1).csv', sep = ',', header = 0, encoding = 'latin-1')
# users = pd.read_csv('./ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
#ratings = pd.read_csv('sample-0.csv', sep = ',', header = None,skiprows=1, encoding = 'latin-1')

In [None]:
# Size of the per-user subsample.
print(ratings.shape)

(698780, 4)


In [None]:
# First rows of the subsample (original row index is preserved).
print(ratings.head())

    user_id  movie_id  rating  timestamp
9         1       364     5.0  838983707
1         1       185     5.0  838983525
20        1       594     5.0  838984679
2         1       231     5.0  838983392
18        1       588     5.0  838983339


In [None]:
# Assign each raw id a dense 0-based index in order of first appearance, so the
# ids can be used directly as row / column positions later on.
unique_users = ratings['user_id'].unique()
unique_movies = ratings['movie_id'].unique()
user_mapping = dict(zip(unique_users, range(len(unique_users))))
movie_mapping = dict(zip(unique_movies, range(len(unique_movies))))

In [None]:
# Attach the dense indices as new columns next to the raw ids.
ratings['user_id_mapped'] = ratings['user_id'].map(user_mapping)
ratings['movie_id_mapped'] = ratings['movie_id'].map(movie_mapping)

In [None]:
# Same preview, now including the two mapped id columns.
print(ratings.head())

    user_id  movie_id  rating  timestamp  user_id_mapped  movie_id_mapped
9         1       364     5.0  838983707               0                0
1         1       185     5.0  838983525               0                1
20        1       594     5.0  838984679               0                2
2         1       231     5.0  838983392               0                3
18        1       588     5.0  838983339               0                4


In [None]:
# Number of distinct movies present in the subsample.
print(ratings['movie_id_mapped'].nunique())

8378


In [None]:
# Order ratings oldest-first so the unshuffled splits below are chronological.
ratings = ratings.sort_values(by='timestamp')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# With shuffle disabled the rows keep their current order, so the last 20% of
# rows become the test set and the last 20% of the remainder the validation set.
train_val_set, test_set = train_test_split(ratings, test_size=0.2, shuffle=False)
training_set, validation_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [None]:
# Row/column counts of the three splits.
print(training_set.shape)
print(validation_set.shape)
print(test_set.shape)

(447219, 6)
(111805, 6)
(139756, 6)


In [None]:
##convert it to array
#training_set = np.array(training_set, dtype = 'int')
#test_set = np.array(test_set, dtype = 'int')

In [None]:
# Repeats the shape check for the training and test splits.
print(training_set.shape)
print(test_set.shape)

(447219, 6)
(139756, 6)


In [None]:
# Sizes of the user and movie axes of the rating matrix, counted over the whole
# subsample (all splits together).
nb_users = ratings['user_id_mapped'].nunique()
nb_movies = ratings['movie_id_mapped'].nunique()

In [None]:
# Report both matrix dimensions.
print('Num of users: ', nb_users, '\nNum of movies: ', nb_movies)

Num of users:  69878 
Num of movies:  8378


In [None]:
def convert(data, n_users=None, n_movies=None):
    """Yield one dense rating vector per user, in user-index order.

    Args:
        data: DataFrame with integer columns 'user_id_mapped' and
            'movie_id_mapped' and a numeric 'rating' column.
        n_users: number of vectors to yield (users 0 .. n_users - 1).
            Defaults to the module-level `nb_users`.
        n_movies: length of every vector. Defaults to the module-level
            `nb_movies`.

    Yields:
        A float NumPy array of length `n_movies` holding the user's ratings at
        the movies' mapped positions and 0 everywhere else. Users without any
        row in `data` yield an all-zero vector.
    """
    if n_users is None:
        n_users = nb_users
    if n_movies is None:
        n_movies = nb_movies

    # Group once instead of re-scanning the whole frame for every user:
    # `.indices` maps each user id to the positional row numbers of its ratings.
    rows_by_user = data.groupby('user_id_mapped', sort=False).indices
    movie_positions = data['movie_id_mapped'].to_numpy()
    rating_values = data['rating'].to_numpy()
    no_rows = np.empty(0, dtype=np.intp)

    for user_id in range(n_users):
        rows = rows_by_user.get(user_id, no_rows)
        ratings = np.zeros(n_movies)
        ratings[movie_positions[rows]] = rating_values[rows]
        yield ratings

In [None]:
# Persist the training split as JSON lines: one rating vector (JSON array) per user.
with open('./training_pivot.jsonl', 'w') as pivot_file:
  pivot_file.writelines(json.dumps(list(row)) + '\n' for row in convert(training_set))

In [None]:
# Persist the validation split as JSON lines: one rating vector (JSON array) per user.
with open('./validation_pivot.jsonl', 'w') as pivot_file:
  pivot_file.writelines(json.dumps(list(row)) + '\n' for row in convert(validation_set))

In [None]:
# Persist the test split as JSON lines: one rating vector (JSON array) per user.
with open('./testing_pivot.jsonl', 'w') as pivot_file:
  pivot_file.writelines(json.dumps(list(row)) + '\n' for row in convert(test_set))

In [None]:
from torch.utils.data import DataLoader, IterableDataset
class Dataset(IterableDataset):
  """Stream rating vectors from a JSON-lines file, one JSON array per line.

  Each item is a dict with keys 'input' and 'target'; both refer to the same
  NumPy array built from the line, i.e. the model is asked to reconstruct
  exactly the vector it is fed.
  """

  def __init__(self, path):
    """Remember the path of the JSON-lines file; the file is opened lazily."""
    self._path = path

  def __iter__(self):
    # `with` closes the handle when iteration finishes or the generator is
    # discarded early, instead of leaving it to the garbage collector.
    with open(self._path) as lines:
      for line in lines:
        if not line.strip():
          # Tolerate blank lines (e.g. a trailing newline) rather than
          # crashing in json.loads.
          continue
        row = np.array(json.loads(line))
        yield {'input': row, 'target': row}

In [None]:
def generate_batches(dataset, batch_size, shuffle=False, drop_last=True, device='cpu'):
  """Wrap `dataset` in a DataLoader and yield its batches moved to `device`.

  Every batch is a dict; each value is transferred with `.to(device)` and
  returned under the same key.
  """
  loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
  for batch in loader:
    yield {key: value.to(device) for key, value in batch.items()}

In [None]:
# One streaming dataset per pivot file written above.
train_dataset, validation_dataset, test_dataset = (
    Dataset(pivot_path)
    for pivot_path in ('./training_pivot.jsonl', './validation_pivot.jsonl', './testing_pivot.jsonl')
)

In [None]:
class SAE(nn.Module):
    """Fully connected autoencoder over per-user rating vectors.

    Architecture: n_movies -> hidden_dim -> latent_dim -> hidden_dim -> n_movies,
    with ReLU after every layer except the last, which is linear.

    Args:
        n_movies: length of the input/output vector. Defaults to the
            module-level `nb_movies`, so `SAE()` behaves as before.
        hidden_dim: width of the outer hidden layers (default 256).
        latent_dim: width of the bottleneck layer (default 128).
    """

    def __init__(self, n_movies=None, hidden_dim=256, latent_dim=128):
        super(SAE, self).__init__()
        if n_movies is None:
            n_movies = nb_movies
        # Encoding (attribute names are kept so existing state dicts still load)
        self.fc1 = nn.Linear(n_movies, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, latent_dim)
        # Decoding
        self.fc4 = nn.Linear(latent_dim, hidden_dim)
        self.fc6 = nn.Linear(hidden_dim, n_movies)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Encode then decode `x`; the output has the same trailing size as the input."""
        # encoding
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc3(x))
        # decoding
        x = self.activation(self.fc4(x))
        # No activation on the output layer: predictions are unbounded scores.
        x = self.fc6(x)
        return x

In [None]:
def count_parameters(model):
    """Return the total number of scalar values in the trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# Build the autoencoder and move its parameters to the selected device.
sae = SAE().to(device)
# Reconstruction error between predicted and given ratings.
criterion = nn.MSELoss()
# Adam over every layer of the autoencoder, with L2 weight decay.
optimizer = optim.Adam(sae.parameters(), lr=1e-4, weight_decay=0.5)
# Show the number of trainable parameters.
count_parameters(sae)

4364090

In [None]:
# Training schedule: passes over the training file and users per batch.
num_epochs = 2
batch_size = 512

In [None]:
def _masked_rmse(model, batch):
  """Run `model` on one batch and score it on the observed ratings only.

  Returns `(loss, rmse)`: `loss` is the MSELoss tensor to back-propagate and
  `rmse` is a Python float, the root-mean-squared error over the entries of
  the target that are > 0.
  """
  target = batch['target'].float()
  output = model(batch['input'].float())
  # Unrated entries (0) must not contribute to the error.
  output[target == 0] = 0
  loss = criterion(output, target)
  # MSELoss averages over every element of the batch (batch_size * nb_movies),
  # so rescale by total elements / observed elements to get the mean over the
  # observed ratings. Using nb_movies alone under-reports by sqrt(batch_size).
  n_observed = float(torch.sum(target > 0))
  mean_corrector = target.numel() / (n_observed + 1e-10)
  return loss, float(np.sqrt(loss.item() * mean_corrector))


for epoch in range(num_epochs):
  # Use the device selected at the top of the notebook so the loop also runs
  # on CPU-only machines.
  batch_generator = generate_batches(train_dataset, batch_size=batch_size, device=device)
  sae.train()
  running_loss = 0.0
  for batch_index, batch_dict in enumerate(batch_generator):
    optimizer.zero_grad()
    loss, batch_rmse = _masked_rmse(sae, batch_dict)
    # Incremental mean of the per-batch RMSE.
    running_loss += (batch_rmse - running_loss) / (batch_index + 1)
    loss.backward()
    optimizer.step()

  print('epoch: '+str(epoch) + ' Train loss: ' + str(running_loss))

  batch_generator = generate_batches(validation_dataset, batch_size=batch_size, device=device)
  sae.eval()
  running_loss = 0.0
  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
      _, batch_rmse = _masked_rmse(sae, batch_dict)
      running_loss += (batch_rmse - running_loss) / (batch_index + 1)
  print('epoch: '+str(epoch) + ' Validation loss: ' + str(running_loss))

epoch: 0Train loss: tensor(0.1676)
epoch: 0Validation loss: tensor(0.1630)
epoch: 1Train loss: tensor(0.1676)
epoch: 1Validation loss: tensor(0.1630)


In [None]:
# NOTE(review): this cell looks stale relative to the cells above — confirm
# before running. `movies` is only assigned in commented-out lines of the
# visible notebook, and `training_set` / `test_set` are produced by
# train_test_split on a DataFrame, so the tensor-style `.data.numpy()` access
# below presumably dates from an earlier version that held torch tensors.
user_id = 0
# Column at position 1 of `movies` for the first nb_movies rows (presumably titles).
movie_title = movies.iloc[:nb_movies, 1:2]
# Row `user_id` of each matrix, reshaped to a single column.
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)

In [None]:
# Fresh generator over the training data for manual inspection. Uses the
# notebook-wide `device` and `batch_size` instead of hard-coded 'cuda' / 512 so
# the cell also works on CPU-only machines.
batch_generator = generate_batches(train_dataset, batch_size=batch_size, device=device)

In [None]:
# Pull a single batch (dict with 'input' and 'target') for inspection.
x = next(batch_generator)

In [None]:
# Sum of all ratings in the batch input.
torch.sum(x['input'])

tensor(12716., device='cuda:0', dtype=torch.float64)

In [None]:
# Sum of all ratings in the batch target; Dataset yields the same array for
# 'input' and 'target', so this matches the input sum.
torch.sum(x['target'])

tensor(12716., device='cuda:0', dtype=torch.float64)

In [None]:
# Evaluate on the test pivot file. NOTE: Dataset yields the same vector as
# 'input' and 'target', so this measures reconstruction of the test ratings.
batch_generator = generate_batches(test_dataset, batch_size=batch_size, device=device)
sae.eval()
running_loss = 0.0
# Evaluation needs no gradients.
with torch.no_grad():
  for batch_index, batch_dict in enumerate(batch_generator):
    target = batch_dict['target'].float()
    output = sae(batch_dict['input'].float())
    # Ignore unrated entries.
    output[target == 0] = 0
    loss = criterion(output, target)
    # MSELoss averages over every element of the batch, so rescale by
    # total elements / observed elements (not nb_movies / observed).
    mean_corrector = target.numel() / (float(torch.sum(target > 0)) + 1e-10)
    batch_rmse = float(np.sqrt(loss.item() * mean_corrector))
    # Incremental mean of the per-batch RMSE.
    running_loss += (batch_rmse - running_loss) / (batch_index + 1)
# Label fixed: this is the test loss, and it is not tied to a training epoch.
print('Test loss: ' + str(running_loss))

KeyboardInterrupt: ignored

In [None]:
# Per-user evaluation: feed each user's TRAINING ratings to the model and score
# the prediction on that user's TEST ratings. The two pivot files list users in
# the same order (row i = mapped user i), so they can be read in lockstep.
# The previous version indexed `training_set[id_user]`, which is a DataFrame
# column lookup here, not a per-user tensor.
test_loss = 0
s = 0.
sae.eval()

# Targets and predictions for the observed (rated) test entries only; keeping
# the full nb_movies-wide vectors for every user would not fit in memory and
# the later filtering step discards the zeros anyway.
y_pred = []
y_true = []

with torch.no_grad():
    for train_row, test_row in zip(train_dataset, test_dataset):
        target = torch.from_numpy(test_row['target']).float().unsqueeze(0).to(device)
        observed = target > 0
        n_observed = int(observed.sum())
        if n_observed == 0:
            # User has no rating in the test split: nothing to score.
            continue
        model_input = torch.from_numpy(train_row['input']).float().unsqueeze(0).to(device)
        output = sae(model_input)
        output[target == 0] = 0
        loss = criterion(output, target)
        # Rescale the all-element mean to a mean over the observed ratings.
        mean_corrector = target.numel() / float(n_observed + 1e-10)
        test_loss += float(np.sqrt(loss.item() * mean_corrector))
        s += 1.
        y_pred.extend(output[observed].cpu().tolist())
        y_true.extend(target[observed].cpu().tolist())

if s:
    print('test loss: '+str(test_loss/s))
else:
    print('test loss: n/a (no user has test ratings)')

test loss: tensor(2.3772)


In [None]:
# Fetch the training rating vector of `user_id` from the pivot file (row i of
# the file belongs to mapped user i). `training_set` is a DataFrame, so it
# cannot be indexed per user like a tensor.
for row_index, row in enumerate(train_dataset):
    if row_index == user_id:
        break
else:
    raise IndexError('user_id ' + str(user_id) + ' not found in training pivot')

# Kept on the CPU with a leading batch dimension of 1.
user_input = torch.from_numpy(row['input']).float().unsqueeze(0)
with torch.no_grad():
    predicted = sae(user_input.to(device))
# Column vector of predictions rounded to two decimals; move to the CPU first
# because NumPy cannot read CUDA tensors.
predicted = np.round(predicted.cpu().numpy().reshape(-1,1), 2)
# print('predicted: \n', len(predicted), predicted)

In [None]:
# NOTE(review): `movies` is only assigned in commented-out lines of the visible
# notebook — confirm where it is loaded before relying on this cell.
movies.shape

(45132, 4)

In [None]:
# Turn the model input into a column vector so it can sit next to the others.
user_input = user_input.data.numpy().reshape(-1,1)
# Side-by-side columns: title, given rating, held-out rating, prediction.
# np.hstack requires all four pieces to have the same number of rows.
# NOTE(review): `movie_title` and `user_target` come from an earlier cell that
# depends on `movies`; verify their lengths match nb_movies.
result_array = np.hstack([movie_title, user_input, user_target, predicted])
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'User input', 'Target Rating', 'Predicted'])

ValueError: ignored

In [None]:
# Keep only the movies that have a held-out (target) rating and display them.
results = result_df[result_df['Target Rating'] > 0]
results

Unnamed: 0,Movie,User input,Target Rating,Predicted
116,Casper,0,2,0
120,Desperado,0,3,0
291,Fall Time,0,2,-0
328,Heaven & Earth,0,3,2.77
355,Orlando,0,3,3.04
...,...,...,...,...
21211,"Turn Left, Turn Right",0,2,0
21294,Suite 16,0,2,-0
21468,Plastic Planet,0,2,-0
21480,Yardbird,0,1,-0


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error



In [None]:
# Drop every (target, prediction) pair whose target is 0, i.e. unrated entries.
observed_pairs = [(truth, pred) for truth, pred in zip(y_true, y_pred) if truth != 0]
y_filter_true = [truth for truth, _ in observed_pairs]
y_filter_pred = [pred for _, pred in observed_pairs]

In [None]:
# Number of collected target entries (before the zero filter above).
len(y_true)

4106970

In [None]:
# Mean absolute error over the observed test ratings.
print(mean_absolute_error(y_filter_true, y_filter_pred))

2.2302346


In [None]:
# Root-mean-squared error over the observed test ratings. The square root is
# taken explicitly because the `squared=False` argument is deprecated and
# removed in recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_filter_true, y_filter_pred)))

2.5587835


In [None]:
# Round every prediction to the nearest whole number.
y_filter_pred_1 = list(map(np.round, y_filter_pred))

In [None]:
from sklearn.metrics import accuracy_score
# Share of rounded predictions that equal the true rating exactly.
# NOTE(review): predictions are rounded to whole numbers; if the true ratings
# include half-star values those can never match — confirm the rating scale.
print(accuracy_score(y_filter_true, y_filter_pred_1))


NameError: ignored

In [None]:
# Scratch check of np.round on a single float.
np.round(3.2)

3.0