In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

from sklearn.model_selection import train_test_split

In [None]:
# Trained weights are written to <MODEL_DIR>/<MODEL_VER>/<MODEL_FILE>.
MODEL_DIR = "Model"
MODEL_VER = "2"
MODEL_FILE = "model.pt"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_VER)

# Recommendation CSVs are written into OUTPUT_PATH; file names carry the model version.
OUTPUT_PATH = "Output"
OUTPUT_FILE = f"output{MODEL_VER}.csv"
OUTPUT_R_FILE = f"output_rating{MODEL_VER}.csv"

# exist_ok=True creates the folders when missing and is a no-op otherwise,
# without the race of a separate "exists?" check followed by a create.
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Model

In [None]:
# Learning rate handed to the RMSprop optimiser in the training cell.
lr = 0.01
# Dropout probability passed to Encoder (its drop_prob argument).
dropout = 0.1
# Number of training epochs; the training cell loops from 1 to this value.
epochs = 200
# Activation name passed to Encoder; it accepts 'relu', 'lrelu' or 'sigmoid'.
activation = "sigmoid"

#### Reference: https://medium.com/@haoyunlai/pytorch-implementation-of-autoencoder-based-recommender-system-9aff6c3d1b02

In [None]:
class Encoder(nn.Module):
    """
    Symmetrical fully connected autoencoder.

    :param L: List of int, sizes of the encoding layers, starting with the feature size.
    For example: [500, 20, 10] will result in:
      - encoder 2 layers: 500x20 and 20x10. Representation layer (z) will be 10
      - decoder 2 layers: 10x20 and 20x500, output size is 500, reconstructed.
    Any length >= 2 is supported.
    :param activation_fn: (default 'sigmoid') Type of activation function,
      one of 'relu', 'lrelu' or 'sigmoid'
    :param drop_prob: (default: 0.0) Dropout probability, applied after each
      encoder layer only
    :raises ValueError: if L has fewer than two entries, or (on forward) if the
      activation name is unknown
    """
    def __init__(self, L, activation_fn='sigmoid', drop_prob=0.0):
        super(Encoder, self).__init__()
        if len(L) < 2:
            raise ValueError('L must contain at least the input size and one layer size')
        layers = self.create_nn_structure(L)
        self.num_layers = len(L)
        # name of the activation used on every layer except the last one
        self.activation_fn_nm = activation_fn
        # dropout module only exists when dropout is actually requested
        self._drop_prob = drop_prob
        if drop_prob > 0.0:
            self.dropout = nn.Dropout(drop_prob)
        # encoder layers followed by the mirrored decoder layers
        self.linears = nn.ModuleList([nn.Linear(n_in, n_out) for n_in, n_out in layers])

    def get_activation_fn(self):
        """Return a fresh activation module matching the configured name."""
        if self.activation_fn_nm == 'relu':
            return nn.ReLU()
        elif self.activation_fn_nm == 'lrelu':
            return nn.LeakyReLU()
        elif self.activation_fn_nm == 'sigmoid':
            return nn.Sigmoid()
        else:
            raise ValueError('Activation function type not defined')

    def forward(self, x):
        """
        Encode and reconstruct x.

        Every linear layer is followed by the activation except the last
        decoding layer, whose raw output is returned.
        """
        act_fn = self.get_activation_fn()
        last_index = len(self.linears) - 1
        # the first len(L) - 1 linear layers form the encoder
        n_encoder_layers = self.num_layers - 1
        for i, layer in enumerate(self.linears):
            x = layer(x)
            if i == last_index:
                # No activation on the last decoding layer
                break
            x = act_fn(x)
            if self._drop_prob > 0.0 and i < n_encoder_layers:
                # apply dropout only on the encoder layers
                x = self.dropout(x)
        return x

    def create_nn_structure(self, L):
        """
        Build the [in_size, out_size] pairs of every linear layer.

        Consecutive entries of L give the encoder layers; the decoder is the
        same list reversed with each pair flipped.
        """
        layers = [[L[i], L[i + 1]] for i in range(len(L) - 1)]
        # mirror the encoder to obtain the decoder
        layers.extend([pair[::-1] for pair in layers[::-1]])
        return layers

### Data Preprocessing

In [None]:
def create_index_mapping(L):
    '''
    return reindexed dict on user and items
    encoded indices starts from 1 and are assigned deterministically:
    unique values are numbered in sorted order, so the same input always
    yields the same mapping (a plain set would order str ids differently
    from one interpreter run to the next).
    input:
    * L: list of hashable ids (str or int), duplicates allowed
    outputs:
    * ind_2_item,item_2_ind: tuple of dictionary
    '''
    try:
        unique_values = sorted(set(L))
    except TypeError:
        # values that cannot be compared with each other: fall back to first-seen order
        unique_values = list(dict.fromkeys(L))

    #index start from 1
    ind_2_item = {position: value for position, value in enumerate(unique_values, start=1)}
    #invert the map
    item_2_ind = {value: position for position, value in ind_2_item.items()}
    return ind_2_item,item_2_ind

def reindexer(ratings_df,user_col,item_col,rating_col):
    '''
    Replace raw user / item ids by the indices produced by create_index_mapping.

    inputs:
    * ratings_df: pandas df containing ratings/affinity for user-item pairs
    * user_col: actual col name for users
    * item_col: actual col name for items
    * rating_col: actual col name for ratings
    outputs (in this order):
    * pandas df with columns 'encoded_users', 'encoded_items', 'rating_col'
    * ind_2_user, user_2_ind: index -> user id and user id -> index dictionaries
    * ind_2_item, item_2_ind: index -> item id and item id -> index dictionaries
    '''
    # one mapping per id space
    ind_2_user, user_2_ind = create_index_mapping(ratings_df[user_col].tolist())
    ind_2_item, item_2_ind = create_index_mapping(ratings_df[item_col].tolist())

    # work on a renamed copy so the caller's frame keeps its own column names
    renamed = ratings_df.rename(
        columns={user_col: 'user_col', item_col: 'item_col', rating_col: 'rating_col'}
    )

    # translate every raw id through its mapping
    renamed['encoded_users'] = renamed['user_col'].apply(lambda raw_user: user_2_ind[raw_user])
    renamed['encoded_items'] = renamed['item_col'].apply(lambda raw_item: item_2_ind[raw_item])

    encoded = renamed[['encoded_users', 'encoded_items', 'rating_col']]
    return encoded, ind_2_user, user_2_ind, ind_2_item, item_2_ind

In [None]:
def reindex_movies(movies, item_2_ind):
    """
    Re-key the movies table with encoded item ids and return an id -> title dict.

    The 'movieid' column of `movies` is overwritten in place: ids found in
    item_2_ind are replaced by their encoded index, all others become -1.
    The returned dict maps the new 'movieid' values to the 'movie' column.
    """
    movies['movieid'] = movies['movieid'].apply(lambda raw_id: item_2_ind.get(raw_id, -1))
    return movies.set_index('movieid')['movie'].to_dict()

def extract_genre_values(movies):
    """
    List every distinct genre appearing in the '|'-separated 'genre' column.

    The result is sorted case-insensitively.
    """
    # dict keys keep first-seen order and drop duplicates
    distinct = dict.fromkeys(
        name
        for combined in movies['genre'].unique()
        for name in combined.split("|")
    )
    return sorted(distinct, key=str.lower)

# get genre vector
def get_genre_vector(genre_row_val):
    """
    Multi-hot encode a '|'-separated genre string.

    Position i of the returned array is 1 when the i-th entry of the
    module-level `genre_values` list occurs in the string, 0 otherwise.
    """
    present = set(genre_row_val.split("|"))
    flags = np.zeros(len(genre_values))
    for position, name in enumerate(genre_values):
        if name in present:
            flags[position] = 1
    return flags

# Add Genre Vector to movies dataframe
def add_movies_genre(movies):
    """
    Attach a 'genre_vector' column to `movies` and return a movieid -> vector dict.

    The column is added to the passed frame in place; each entry is the
    array produced by get_genre_vector for that row's 'genre' string.
    """
    def to_vector(genre_string):
        return np.array(get_genre_vector(genre_string))

    movies['genre_vector'] = movies['genre'].apply(to_vector)

    return movies.set_index('movieid')['genre_vector'].to_dict()

# Add genre preferences to users
def get_user_genre(movies, ratings):
    """
    Sum the genre vectors of the movies a user rated above 2.

    `movies` and `ratings` are parallel sequences; genre vectors are looked
    up in the module-level `movie_2_genre` dict and the result has
    `nb_genres` entries.
    """
    profile = np.zeros(nb_genres)
    liked_movies = (movie_id for movie_id, score in zip(movies, ratings) if score > 2)
    for movie_id in liked_movies:
        profile += movie_2_genre[movie_id]
    return profile
    
    
def convert_with_side_info(data):
    """
    Turn (user, item, rating) rows into one dense row per user.

    `data` is indexed as a 2-D array whose columns 0, 1 and 2 hold the
    encoded user id, encoded item id and rating. For each user id from 1 to
    the module-level `nb_users`, the output row is a vector of `nb_movies`
    ratings (0 where the user has no rating) followed by the genre profile
    returned by get_user_genre. Returns a list of such rows.
    """
    rows = []
    for user_id in range(1, nb_users + 1):
        # select this user's interactions once and reuse the mask
        belongs_to_user = data[:, 0] == user_id
        rated_items = data[:, 1][belongs_to_user]
        given_ratings = data[:, 2][belongs_to_user]

        genre_profile = get_user_genre(rated_items, given_ratings)

        # encoded item ids start at 1, columns start at 0
        rating_vector = np.zeros(nb_movies)
        rating_vector[rated_items - 1] = given_ratings

        rows.append(list(np.append(rating_vector, genre_profile)))
    return rows

# Data loading and preprocessing

In [None]:
# Raw MovieLens-1M tables: '::'-separated, no header row.
ratings = pd.read_csv('MovieLens1M/ratings.dat', delimiter = '::',header=None, engine='python')
movies = pd.read_csv('MovieLens1M/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('MovieLens1M/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

# Name the columns of all three tables up front.
ratings.columns = ['userid','movieid','rating','timestamp']
movies.columns = ['movieid', 'movie', 'genre']
users.columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']

# Re-encode user and movie ids as indices starting from 1.
(ratings_reindex,
 ind_2_user, user_2_ind,
 ind_2_item, item_2_ind) = reindexer(ratings, 'userid', 'movieid', 'rating')

# Side information: one-hot gender flags on the users table.
users['female_user'] = users['gender'].eq('F').astype(int)
users['male_user'] = users['gender'].eq('M').astype(int)

# Side information: movie titles and genre vectors keyed by encoded movie id.
id_2_movie = reindex_movies(movies, item_2_ind)
genre_values = extract_genre_values(movies)
movie_2_genre = add_movies_genre(movies)

# 90/10 split, stratified so every user appears in both parts.
train, test = train_test_split(
    ratings_reindex,
    stratify=ratings_reindex['encoded_users'],
    test_size=0.1,
    random_state=42,
)

train_array = np.array(train, dtype='int')
test_array = np.array(test, dtype='int')

# Sizes are taken from the largest encoded id seen in either split.
nb_users = int(max(train_array[:, 0].max(), test_array[:, 0].max()))
nb_movies = int(max(train_array[:, 1].max(), test_array[:, 1].max()))
nb_genres = len(genre_values)

# One dense row per user: ratings followed by the genre profile.
training_set = torch.FloatTensor(convert_with_side_info(train_array))
test_set = torch.FloatTensor(convert_with_side_info(test_array))

print(f"Number of users: {nb_users}, Number of Movies: {nb_movies}, Number of Genres: {nb_genres}")
print(f"Training set size: {training_set.shape}, Validation set size: {test_set.shape}")

# Training and Validation

In [None]:
autoencoder_network = Encoder([(nb_movies+nb_genres), 20,10], activation, dropout)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(autoencoder_network.parameters(), lr = lr, weight_decay = 0.5)

validationLoss = float("inf")
minEpoch = epochs
nb_epoch = epochs


def _masked_rating_loss(network, input_row, target_row):
    """
    Loss of one user's reconstruction, restricted to the rated movies.

    :param network: model mapping a (1, nb_movies + nb_genres) row to a row of the same width
    :param input_row: row fed to the network
    :param target_row: row whose first nb_movies columns hold the true ratings
    :return: (loss, mean_corrector), or (None, None) when the target holds no rating.
      loss is the MSE over all nb_movies columns with predictions for unrated
      movies forced to 0; mean_corrector = nb_movies / number of rated movies
      rescales that mean so it only averages over rated movies.
    """
    target_ratings = target_row[:, :nb_movies]
    n_rated = int(torch.sum(target_ratings > 0))
    if n_rated == 0:
        return None, None
    predicted = network(input_row)[:, :nb_movies]
    # unrated positions must not contribute to the loss
    predicted = torch.where(target_ratings != 0, predicted, torch.zeros_like(predicted))
    return criterion(predicted, target_ratings), nb_movies / float(n_rated)


for epoch in range(1, nb_epoch+1):
    # Training: one optimisation step per user
    autoencoder_network.train()
    train_loss = 0
    train_s = 0.
    for id_user in range(nb_users):
        user_row = training_set[id_user].unsqueeze(0)
        loss, mean_corrector = _masked_rating_loss(autoencoder_network, user_row, user_row)
        if loss is None:
            continue
        # gradients accumulate by default: clear the previous user's before backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += torch.sqrt(loss.detach() * mean_corrector)
        train_s += 1.

    # Validation: predict from the training ratings, score on the held-out ones
    autoencoder_network.eval()
    test_loss = 0
    test_s = 0.
    with torch.no_grad():
        for id_user in range(nb_users):
            input_row = training_set[id_user].unsqueeze(0)
            target_row = test_set[id_user].unsqueeze(0)
            loss, mean_corrector = _masked_rating_loss(autoencoder_network, input_row, target_row)
            if loss is None:
                continue
            test_loss += torch.sqrt(loss * mean_corrector)
            test_s += 1.
    print(f"epoch: {epoch} train loss: {train_loss/train_s}|| test loss: {test_loss/test_s}")

    # Saving Model: keep the weights of the epoch with the lowest validation loss
    if validationLoss > (test_loss/test_s):
        validationLoss = (test_loss/test_s)
        minEpoch = epoch
        torch.save(autoencoder_network.state_dict(), os.path.join(MODEL_PATH,MODEL_FILE))

In [None]:
# float() accepts both a 0-dim tensor and the initial Python float("inf"),
# so this also works when no epoch improved on the starting value.
print(f"RMSE obtained: {float(validationLoss)} after {minEpoch} epochs")

# Making top-k recommendations

In [None]:
# Loading model
# Rebuild the network with the training architecture and restore the saved weights
autoencoder_network = Encoder([(nb_movies+nb_genres), 20,10], activation, dropout)
saved_weights = torch.load(os.path.join(MODEL_PATH,MODEL_FILE))
autoencoder_network.load_state_dict(saved_weights)
autoencoder_network.eval()

# Evidence = every known rating (all of ratings_reindex), one dense row per user
evidence = torch.FloatTensor(
    convert_with_side_info(np.array(ratings_reindex, dtype='int'))
)

In [None]:
def make_top_k_recommendations(encoder,evidence,k,filter_seen=True,n_items=None):
    '''
    Rank the columns of each user's reconstructed row and keep the best k.

    :param encoder: autoencoder instance (any callable mapping a (1, n) tensor to a (1, n) tensor);
                    it is called as-is, so put it in eval mode beforehand if it uses dropout
    :param evidence: 2-D tensor, one row per user, holding the full set of seen ratings
    :param k: top k items (by output score)
    :param filter_seen: (default True) filter controller to remove seen items from top k list
    :param n_items: (default None) number of leading columns that are real items; columns
                    from n_items onwards (e.g. appended side information) are never ranked.
                    None ranks every column.
    :return: one list per user of (column index, score) tuples, best score first. A list is
             shorter than k when fewer than k candidate columns exist for that user.
    '''
    if n_items is None:
        n_items = evidence.shape[1]

    final_itemsets = []
    # inference only: no need to track gradients
    with torch.no_grad():
        for id_user in range(evidence.shape[0]):
            encoder_input = evidence[id_user].unsqueeze(0)
            scores = encoder(encoder_input)[0, :n_items]

            if filter_seen:
                # seen items are removed from the candidates instead of being
                # scored 0, so they can never outrank a negative prediction
                candidates = torch.nonzero(evidence[id_user, :n_items] == 0).flatten()
            else:
                candidates = torch.arange(scores.shape[0])

            n_keep = max(0, min(k, candidates.shape[0]))
            top_scores, top_positions = torch.topk(scores[candidates], n_keep)
            top_columns = candidates[top_positions]

            final_itemsets.append(list(zip(top_columns.tolist(), top_scores.numpy())))

    return final_itemsets

In [None]:
# Ten best-scoring columns per user, already-rated ones filtered out (filter_seen default)
recommendations = make_top_k_recommendations(
    encoder=autoencoder_network,
    evidence=evidence,
    k=10,
)

In [None]:
# Each recommendation is a (column index, score) pair. Columns from nb_movies
# onwards hold the genre side information, not movies, so they are dropped.
recommendation_movies = [[r[0] for r in recommendation if r[0] < nb_movies]
                         for recommendation in recommendations]
movies_ratings = [[r[1] for r in recommendation if r[0] < nb_movies]
                  for recommendation in recommendations]

# Column c stores the rating of encoded movie id c + 1 (encoded ids start at 1,
# columns at 0), so the title lookup must shift the index by one.
user_recommendation = pd.DataFrame(
    [[id_2_movie[column + 1] for column in columns] for columns in recommendation_movies]
)

recommendation_ratings = pd.DataFrame(movies_ratings)

user_recommendation.to_csv(os.path.join(OUTPUT_PATH, OUTPUT_FILE))
recommendation_ratings.to_csv(os.path.join(OUTPUT_PATH, OUTPUT_R_FILE))
print(f"Successfully saved user recommendations to file: {os.path.join(OUTPUT_PATH, OUTPUT_FILE)}")