In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

from sklearn.model_selection import train_test_split

In [23]:
MODEL_DIR = "Model"
MODEL_VER = "1"
MODEL_FILE = "model.pt"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_VER)

OUTPUT_PATH = "Output"
OUTPUT_FILE = f"output{MODEL_VER}.csv"
OUTPUT_R_FILE = f"output_rating{MODEL_VER}.csv"

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

# Model

In [3]:
lr = 0.01
dropout = 0.1
epochs = 200
activation = "sigmoid"

#### Reference: https://medium.com/@haoyunlai/pytorch-implementation-of-autoencoder-based-recommender-system-9aff6c3d1b02

In [4]:
class Encoder(nn.Module):
    """
    Class for Symmetrical AutoEncoder Network
    :param L: List of int, contains sizes of encoding layers and starts with feature size
    For example: [500, 20, 10] will result in:
      - encoder 2 layers: 500x20 and 20x10. Representation layer (z) will be 10
      - decoder 2 layers: 10x20 and 20x500, output size is 500, reconstructed.
    :param activation_fn: (default 'sigmoid') Type of activation function
    :param drop_prob: (default: 0.0) Dropout probability
    """
    def __init__(self, L, activation_fn='sigmoid', drop_prob=0.0):
        super(Encoder, self).__init__()
        layers = self.create_nn_structure(L)
        self.num_layers = len(L)
        # create controller for activation function
        self.activation_fn_nm = activation_fn
        # create dropout module
        self._drop_prob = drop_prob
        if drop_prob > 0.0:
            self.dropout = nn.Dropout(drop_prob)
        #initialize with empty list to store layers
        self.linears = nn.ModuleList([])
        self.linears.extend([nn.Linear(i[0], i[1]) for i in layers])
        
    def get_activation_fn(self):
        # user selected activation function at layers except for last layer
        if self.activation_fn_nm == 'relu':
            return nn.ReLU()
        elif self.activation_fn_nm == 'lrelu':
            return nn.LeakyReLU()
        elif self.activation_fn_nm == 'sigmoid':
            return nn.Sigmoid()
        else:
            raise ValueError('Activation function type not defined')
    
    def forward(self, x):
        for i,layer in enumerate(self.linears):
            if i <= self.num_layers-1:
                # create instance of activation function
                act_fn = self.get_activation_fn()
                # pass in the input
                x = act_fn(self.linears[i](x))
                if self._drop_prob > 0.0 and i <= int(self.num_layers/2): 
                    # apply dropout only on encode layer by control of i
                    x = self.dropout(x)
        # No activation on the last decoding layer
        x = self.linears[-1](x)
        return x

    def create_nn_structure(self, L):
        max_ind = len(L)-1
        layers = []
        for i,v in enumerate(L):
            if i < max_ind:
                #still have i+1 available, create layer tuple
                layer = [v,L[i+1]]
                layers.append(layer)
        #then inverse the layers for decoder size
        encoder_layers = layers[:]
        for l in encoder_layers[::-1]:
            decoder_layer = l[::-1]
            layers.append(decoder_layer)
        return layers

### Data Preprocessing methods

In [5]:
def create_index_mapping(L):
    '''
    return reindexed dict on user and items
    encoded indices starts from 1
    input: 
    * L: list of str
    outputs:
    * ind_2_item,item_2_ind: tuple of dictionary
    '''
    L = set(L)
    ind_2_item = {}
    
    for i,v in enumerate(L):
        #index start from 1
        ind_2_item[i+1] = v
        
    #invert the map
    item_2_ind = {v: k for k, v in ind_2_item.items()}
    return ind_2_item,item_2_ind

def reindexer(ratings_df,user_col,item_col,rating_col):
    '''
    inputs:
    * ratings_df: pandas df containing ratings/affinity for user-item pairs
    * user_col: actual col name for users
    * item_col: actual col name for items
    * rating_col: actual col name for ratings
    output:
    * ratings_df: reindexed user and item column, pandas df
    '''
    users_list = ratings_df[user_col].tolist()
    item_list = ratings_df[item_col].tolist()
    
    ind_2_user,user_2_ind = create_index_mapping(users_list)
    ind_2_item,item_2_ind = create_index_mapping(item_list)
    
    #rename ratings df columns
    ratings_df = ratings_df.rename(columns={user_col:'user_col',
                                            item_col:'item_col',
                                            rating_col:'rating_col'})
    
    #encode df using the 2 mappings
    ratings_df['encoded_users'] = ratings_df['user_col'].apply(lambda x:user_2_ind[x])
    ratings_df['encoded_items'] = ratings_df['item_col'].apply(lambda x:item_2_ind[x])
    
    return ratings_df[['encoded_users','encoded_items','rating_col']], ind_2_user,user_2_ind, ind_2_item,item_2_ind

def reindex_movies(movies, item_2_ind):
    movies['movieid'] = movies['movieid'].apply(lambda x:item_2_ind[x] if x in item_2_ind else -1)
    id_2_movie = pd.Series(movies.movie.values,index = movies.movieid).to_dict()

    return id_2_movie

def convert(data):
    new_data = []
    for id_users in range(nb_users+1):
        id_items = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_items-1] = id_ratings
        new_data.append(list(ratings))
    return new_data

# Data loading and preprocessing

In [6]:
ratings = pd.read_csv('MovieLens1M/ratings.dat', delimiter = '::',header=None, engine='python')
movies = pd.read_csv('MovieLens1M/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

ratings.columns = ['userid','movieid','rating','timestamp']
ratings_reindex, ind_2_user,user_2_ind , ind_2_item,item_2_ind = reindexer(ratings,'userid','movieid','rating')

movies.columns = ['movieid', 'movie', 'genre']

id_2_movie = reindex_movies(movies, item_2_ind)



train, test = train_test_split(ratings_reindex,
                               stratify=ratings_reindex['encoded_users'],
                               test_size=0.1,
                               random_state=42)

training_set = np.array(train, dtype='int')
test_set = np.array(test, dtype='int')

nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
  
training_set = convert(training_set)
test_set = convert(test_set)

training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

print(f"Number of users: {nb_users}, Number of Movies: {nb_movies}")
print(f"Training set size: {training_set.shape}, Validation set size: {test_set.shape}")

Number of users: 6040, Number of Movies: 3706
Training set size: torch.Size([6041, 3706]), Validation set size: torch.Size([6041, 3706])


# Training and Validation

In [7]:
autoencoder_network = Encoder([nb_movies, 20,10], activation, dropout)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(autoencoder_network.parameters(), lr = lr, weight_decay = 0.1)

validationLoss = float("inf")
minEpoch = epochs
nb_epoch = epochs


for epoch in range(1, nb_epoch + 1):
    autoencoder_network.train()
    train_loss = 0
    s = 0.
    # s is the number of users who rated at least 1 movies
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = autoencoder_network(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) #making this anyway not equal to 0, as this will be a denominator
            #mean_corrector is the avg of the error, only considering the movies having ratings (non-zero ratings) for computing mean of error
            loss.backward() # decide the direction the increment of weights
            #this call will just computing all the gradients required
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step() # decide the amount to update the weights
    
    # Validation
    autoencoder_network.eval()
    test_loss = 0
    s = 0.

    res = []
    targets = []

    # averaged difference between real rating and predicted rating
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0) # should keep the training set
        target = Variable(test_set[id_user]).unsqueeze(0) # to predict the other movies user not seen yet

        if torch.sum(target.data > 0) > 0:
            # make predictions
            with torch.no_grad():
                output = autoencoder_network(input)
                targets.append(target.detach().numpy())
                res.append(output.detach().numpy()) 
                target.require_grad = False
                output[target == 0] = 0 # dont want to measure the loss on the movies didnt get the actual rating from user 
                # force to 0 and difference / loss will be 0 for those entries
                loss = criterion(output, target)

                mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) 
                # only consider the movies that are rated in the test set, to be included in the loss
                test_loss += np.sqrt(loss.data*mean_corrector)
                s += 1.
    print(f"epoch: {epoch} train loss: {train_loss/s}|| test loss: {test_loss/s}")
    
    # Saving Model
    if validationLoss > (test_loss/s):
        validationLoss = (test_loss/s)
        minEpoch = epoch
        torch.save(autoencoder_network.state_dict(), os.path.join(MODEL_PATH,MODEL_FILE))

epoch: 1 train loss: 1.0720396041870117|| test loss: 0.9771516919136047
epoch: 2 train loss: 0.9990372061729431|| test loss: 0.9748116731643677
epoch: 3 train loss: 0.9975261092185974|| test loss: 0.9739258885383606
epoch: 4 train loss: 0.9960575103759766|| test loss: 0.9701754450798035
epoch: 5 train loss: 0.9949276447296143|| test loss: 0.9705732464790344
epoch: 6 train loss: 0.9924980401992798|| test loss: 0.9683036804199219
epoch: 7 train loss: 0.9898546934127808|| test loss: 0.966316282749176
epoch: 8 train loss: 0.9862893223762512|| test loss: 0.9657771587371826
epoch: 9 train loss: 0.9836224913597107|| test loss: 0.9651578664779663
epoch: 10 train loss: 0.9823111891746521|| test loss: 0.9632856249809265
epoch: 11 train loss: 0.9805281758308411|| test loss: 0.9627885818481445
epoch: 12 train loss: 0.976828932762146|| test loss: 0.9624950289726257
epoch: 13 train loss: 0.97258460521698|| test loss: 0.9556384682655334
epoch: 14 train loss: 0.9695627689361572|| test loss: 0.94905495

epoch: 114 train loss: 0.9362266063690186|| test loss: 0.9204217791557312
epoch: 115 train loss: 0.9347079992294312|| test loss: 0.9218929409980774
epoch: 116 train loss: 0.9349929690361023|| test loss: 0.9214138984680176
epoch: 117 train loss: 0.9343478083610535|| test loss: 0.9205283522605896
epoch: 118 train loss: 0.9348019361495972|| test loss: 0.9215470552444458
epoch: 119 train loss: 0.9342434406280518|| test loss: 0.9207889437675476
epoch: 120 train loss: 0.9341545701026917|| test loss: 0.9206734299659729
epoch: 121 train loss: 0.9336395263671875|| test loss: 0.9215880632400513
epoch: 122 train loss: 0.9326661229133606|| test loss: 0.9197896718978882
epoch: 123 train loss: 0.9321725964546204|| test loss: 0.9216663837432861
epoch: 124 train loss: 0.93332839012146|| test loss: 0.9210762977600098
epoch: 125 train loss: 0.9328573346138|| test loss: 0.918907642364502
epoch: 126 train loss: 0.9350405335426331|| test loss: 0.9221632480621338
epoch: 127 train loss: 0.9332910180091858|| 

In [8]:
print(f"RMSE obtained: {validationLoss.data} after {minEpoch} epochs")

RMSE obtained: 0.9161987900733948 after 158 epochs


# Making top k recommendation

In [9]:
# Loading model
autoencoder_network = Encoder([nb_movies, 20,10], activation, dropout)
autoencoder_network.load_state_dict(torch.load(os.path.join(MODEL_PATH,MODEL_FILE)))
autoencoder_network.eval()

evidence = np.array(ratings_reindex, dtype='int')
evidence = convert(evidence)
evidence = torch.FloatTensor(evidence)

In [10]:
def make_top_k_recommendations(encoder,evidence,k,filter_seen=True):
    '''
    :param encoder: autoencoder instance
    :param evidence: full set of seen ratings from all users
    :param k: top k items (by output score)
    :param filter_seen: (default True) filter controller to remove seen items from top k list
    '''     
    res = []
    nb_users = evidence.shape[0]
    # to find top scored items for each user
    for id_user in range(nb_users):
        encoder_input = Variable(evidence[id_user]).unsqueeze(0) # should keep the training set 
        encoder_output = encoder(encoder_input)
        
        target = Variable(evidence[id_user]).unsqueeze(0) # mask to find items not seen yet
        if filter_seen:
            encoder_output[target != 0] = 0 # force seen items scores to 0, will never get recommended
        res.append(encoder_output.detach().numpy())
        
    res = [a[0] for a in res]
    final_itemsets = []    
    for each in res:
        full_ratings_predicted = list(each)
        full_ratings_indexed = list(enumerate(full_ratings_predicted))
        final_itemsets.append(sorted(full_ratings_indexed,key=lambda x:x[1],reverse =True)[:k])
        
    return final_itemsets



In [11]:
recommendations = make_top_k_recommendations(autoencoder_network, training_set, 10)

In [12]:
recommendation_movies = [[r[0] for r in recommendation] for recommendation in recommendations]
movies_ratings = [[r[1] for r in recommendation] for recommendation in recommendations]

In [20]:
user_recommendation = pd.DataFrame(recommendation_movies)
user_recommendation = user_recommendation.applymap(lambda x:id_2_movie[x])

recommendation_ratings = pd.DataFrame(movies_ratings)

In [26]:
user_recommendation.to_csv(os.path.join(OUTPUT_PATH, OUTPUT_FILE))
recommendation_ratings.to_csv(os.path.join(OUTPUT_PATH, OUTPUT_R_FILE))
print(f"Successfully saved user recommendations to file: {os.path.join(OUTPUT_PATH, OUTPUT_R_FILE)}")

Successfully saved user recommendations to file: Output/output_rating1.csv
