<a href="https://colab.research.google.com/github/soujanyarbhat/SWM_MovieRecommenderSystem/blob/main/SAE_ratings_genres.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

from sklearn.model_selection import train_test_split

In [2]:

MODEL_DIR = "Model"
MODEL_VER = "2"
MODEL_FILE = "model.pt"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_VER)

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

In [3]:
lr = 0.01
dropout = 0.1
epochs = 200
activation = "sigmoid"


In [5]:
def create_index_mapping(L):
    '''
    return reindexed dict on user and items
    encoded indices starts from 1
    input: 
    * L: list of str
    outputs:
    * ind_2_item,item_2_ind: tuple of dictionary
    '''
    L = set(L)
    ind_2_item = {}
    
    for i,v in enumerate(L):
        #index start from 1
        ind_2_item[i+1] = v
    #invert the map
    item_2_ind = {v: k for k, v in ind_2_item.items()}
    return ind_2_item,item_2_ind

def reindexer(ratings_df,user_col,item_col,rating_col):
    '''
    inputs:
    * ratings_df: pandas df containing ratings/affinity for user-item pairs
    * user_col: actual col name for users
    * item_col: actual col name for items
    * rating_col: actual col name for ratings
    output:
    * ratings_df: reindexed user and item column, pandas df
    '''
    users_list = ratings_df[user_col].tolist()
    item_list = ratings_df[item_col].tolist()
    
    ind_2_user,user_2_ind = create_index_mapping(users_list)
    ind_2_item,item_2_ind = create_index_mapping(item_list)
    
    #rename ratings df
    ratings_df = ratings_df.rename(columns={user_col:'user_col',
                                            item_col:'item_col',
                                            rating_col:'rating_col'})
    
    #encode df using the 2 mappings
    ratings_df['encoded_users'] = ratings_df['user_col'].apply(lambda x:user_2_ind[x])
    ratings_df['encoded_items'] = ratings_df['item_col'].apply(lambda x:item_2_ind[x])
    
    return ratings_df[['encoded_users','encoded_items','rating_col']], ind_2_user,user_2_ind, ind_2_item,item_2_ind

def convert(data):
    new_data = []
    for id_users in range(nb_users+1):
        # each user's watched movies
        # data[:,0], first column, all rows column users
        id_items = data[:,1][data[:,0] == id_users]
        # each user's rating for that item
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        # the positions of these items are filled with ratings, creating the matrix
        ratings[id_items-1] = id_ratings
        new_data.append(list(ratings))
    return new_data

In [6]:
def reindex_movies(movies, item_2_ind):
    movies['movieid'] = movies['movieid'].apply(lambda x:item_2_ind[x] if x in item_2_ind else -1)

def extract_genre_values(movies):
    genre = movies['genre']
    unique_genre = genre.unique()
    genre_values = []
    for movie_genre in unique_genre:
        mg = movie_genre.split("|")
        for g in mg:
            if g not in genre_values:
                genre_values.append(g)

    return sorted(genre_values, key=str.lower)

# get genre vector
def get_genre_vector(genre_row_val):
    mg = genre_row_val.split("|")
    gen_vec = np.zeros(len(genre_values))
    gen_index = 0
    for g in genre_values:
        if g in mg:
            gen_vec[gen_index] = 1
        gen_index += 1
    return gen_vec

# Add Genre Vector to movies dataframe
def add_movies_genre(movies):
#     movie_col = []

#     movie_2_genre = {}
#     for row in movies.iterrows():
#         print(row)
#         gen_vec = get_genre_vector(row['genre'])
#         movie_col.append(gen_vec)
#         movie_2_genre[row['movieid']] = np.array(gen_vec)

    movies['genre_vector'] = movies['genre'].apply(lambda x:np.array(get_genre_vector(x)))
    
    movie_2_genre = pd.Series(movies.genre_vector.values,index = movies.movieid).to_dict()

    return movie_2_genre


# def addgenrevector(data, movie_2_genre):
#     genre_array = []
#     movie_id_list = data['encoded_items'].tolist()
#     for movie_id in movie_id_list:
#         genre_array.append(movie_2_genre[movie_id])
#     data['genre_vector'] = genre_array
    
def get_user_genre(movies, ratings):
    genres = np.zeros(nb_genres)
    for movie, rating in zip(movies, ratings):
        if rating>3:
            genres += movie_2_genre[movie]
    return genres
    
    
def convert_with_side_info(data):
    new_data = []
    for id_users in range(nb_users+1):
        # each user's watched movies
        # data[:,0], first column, all rows column users
        id_items = data[:,1][data[:,0] == id_users]
        # each user's rating for that item
        id_ratings = data[:,2][data[:,0] == id_users]
        
        ratings = np.zeros(nb_movies)
        
        genres = get_user_genre(id_items, id_ratings)
                
        # the positions of these items are filled with ratings, creating the matrix
        ratings[id_items-1] = id_ratings
        
        new_data.append(list(np.append(ratings,genres)))
    return new_data

In [7]:
ratings = pd.read_csv('/content/ratings.dat', delimiter = '::',header=None, engine='python')
movies = pd.read_csv('/content/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('/content/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

ratings.columns = ['userid','movieid','rating','timestamp']
ratings_reindex, ind_2_user,user_2_ind , ind_2_item,item_2_ind = reindexer(ratings,'userid','movieid','rating')

# Adding Side Information
movies.columns = ['movieid', 'movie', 'genre']
users.columns = ['userid', 'gender', 'age', 'occupation', 'zipcode']

users['female_user'] = (users['gender'] == 'F').astype(int)
users['male_user'] = (users['gender'] == 'M').astype(int)

reindex_movies(movies, item_2_ind)
genre_values = extract_genre_values(movies)
movie_2_genre = add_movies_genre(movies)

train, test = train_test_split(ratings_reindex,
                               stratify=ratings_reindex['encoded_users'],
                               test_size=0.1,
                               random_state=42)

training_set = np.array(train, dtype='int')
test_set = np.array(test, dtype='int')

nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_genres = len(genre_values)

training_set = convert_with_side_info(training_set)
test_set = convert_with_side_info(test_set)

training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [16]:
input_columns = nb_movies + nb_genres

class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(input_columns, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, input_columns)
        self.activation = nn.Sigmoid()
        
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

In [19]:
nb_epoch = 200
for epoch in range(1, nb_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        #Select only rating related columns to compute loss
        target_ratings = target[:, :nb_movies]
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            output_ratings = output[:, :nb_movies]
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output_ratings, target_ratings)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data*mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))
    
    
# Testing the SAE
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user]).unsqueeze(0)
    target_ratings = target[:, :nb_movies]
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        output_ratings = output[:, :nb_movies]
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output_ratings, target_ratings)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        s += 1.
print('test loss: '+str(test_loss/s))

epoch: 1 loss: tensor(1.3888)
epoch: 2 loss: tensor(0.9542)
epoch: 3 loss: tensor(0.9209)
epoch: 4 loss: tensor(0.9093)
epoch: 5 loss: tensor(0.9036)
epoch: 6 loss: tensor(0.9004)
epoch: 7 loss: tensor(0.8985)
epoch: 8 loss: tensor(0.8970)
epoch: 9 loss: tensor(0.8949)
epoch: 10 loss: tensor(0.8942)
epoch: 11 loss: tensor(0.8933)
epoch: 12 loss: tensor(0.8927)
epoch: 13 loss: tensor(0.8927)
epoch: 14 loss: tensor(0.8930)
epoch: 15 loss: tensor(0.8909)
epoch: 16 loss: tensor(0.8903)
epoch: 17 loss: tensor(0.8892)
epoch: 18 loss: tensor(0.8911)
epoch: 19 loss: tensor(0.8896)
epoch: 20 loss: tensor(0.8884)
epoch: 21 loss: tensor(0.8885)
epoch: 22 loss: tensor(0.8894)
epoch: 23 loss: tensor(0.8873)
epoch: 24 loss: tensor(0.8876)
epoch: 25 loss: tensor(0.8886)
epoch: 26 loss: tensor(0.8895)
epoch: 27 loss: tensor(0.8886)
epoch: 28 loss: tensor(0.8885)
epoch: 29 loss: tensor(0.8861)
epoch: 30 loss: tensor(0.8822)
epoch: 31 loss: tensor(0.8837)
epoch: 32 loss: tensor(0.8861)
epoch: 33 loss: t

In [20]:
    print(f"epoch: {epoch} train loss: {train_loss/s}|| test loss: {test_loss/s}")
    
    # Saving Model
    if validationLoss > (test_loss/s):
        validationLoss = (test_loss/s)
        torch.save(sae.state_dict(), os.path.join(MODEL_PATH,MODEL_FILE))

epoch: 200 train loss: 0.8854984641075134|| test loss: 0.7386822700500488
