In [42]:
# import libraries
import os
import json
import re
import sys
import csv
import pandas as pd
import numpy as np
import seaborn as seab
import pickle

print('headers loaded.. plotly plotting set..')

# Normalize function- to normalize values for review score and tip score//
def normalize(x, new_min = 0, new_max = 100):
    output = []
    old_min, old_max = min(x), max(x)

    for iter_val in x:
        val = (new_max - new_min) / (old_max - old_min) * (iter_val - old_min) + new_min
        output.append(val)

    return output

headers loaded.. plotly plotting set..


In [2]:
# Read dataframes for google colab
# import json
# from google.colab import drive

# drive.mount('/content/drive')
# reviews_path = "/content/drive/MyDrive/Course Work/SEM2/Info Storage and Retrieval/project/yelp_dataset/yelp_academic_dataset_review.csv"
# business_path = "/content/drive/MyDrive/Course Work/SEM2/Info Storage and Retrieval/project/yelp_dataset/yelp_academic_dataset_business.csv"
# user_path = "/content/drive/MyDrive/Course Work/SEM2/Info Storage and Retrieval/project/yelp_dataset/yelp_academic_dataset_user.csv"

# reviews_df = pd.read_csv(reviews_path)
# user_df = pd.read_csv(user_path)
# business_df = pd.read_csv(business_path)

In [2]:
# Read dataframe for Local VS Code
reviews_df = pd.read_csv('yelp_academic_dataset_review.csv')
# user_df = pd.read_csv('yelp_academic_dataset_user.csv')
business_df = pd.read_csv('yelp_academic_dataset_business.csv')

In [43]:
# Get Top Ten States
business_df = business_df.dropna(subset=['categories'])
unique_states = business_df['state'].unique()
state_map = dict()
for s in unique_states:
  state_map[s] = business_df[business_df['state'] == s].shape[0]
# 'CA' 'MO' 'AZ' 'PA' 'TN' 'FL' 'IN' 'LA' 'AB' 'NV' 'ID' 'DE' 'IL' 'NJ' 'NC' 'CO' 'WA' 'HI' 'UT' 'TX' 'MT' 'MI' 'SD' 'XMS' 'MA' 'VI' 'VT'
top_states = [state[0] for state in sorted(sorted(state_map.items(), key=lambda x: x[1], reverse=True), key=lambda x: x[1], reverse=True)[:10]]
print(top_states)

# create an empty dictionary to store the dataframes
hotel_state_df_map = {}
restaurent_state_df_map = {}

# Creating mask for Hotels & Travel
hotel_mask = business_df['categories'].str.contains('Hotels & Travel')
hotel_df = business_df[hotel_mask]

# Creating mask for Restaurents
restaurent_mask = business_df['categories'].str.contains('Restaurants')
restaurent_df = business_df[restaurent_mask]

for state in top_states:
    df_name = f'business_df_{state}'

    hotel_state_df = hotel_df[hotel_df['state'] == state]
    restaurent_state_df = restaurent_df[restaurent_df['state'] == state]

    exec(f"{df_name} = hotel_state_df")
    # add the dataframe to the dictionary with the state abbreviation as the key
    hotel_state_df_map[state] = hotel_state_df

    exec(f"{df_name} = restaurent_state_df")
    # add the dataframe to the dictionary with the state abbreviation as the key
    restaurent_state_df_map[state] = restaurent_state_df


['PA', 'FL', 'TN', 'IN', 'MO', 'LA', 'AZ', 'NJ', 'NV', 'AB']


In [44]:
# Building Ratings Matrix
from scipy.sparse import coo_matrix
import numpy as np
from collections import defaultdict

class Recommendations:
  class Business:
    def __init__(self, name, address, city, state, postal_code, stars):
      self.name = name
      self.address = address
      self.city = city
      self.state = state
      self.postal_code = postal_code
      self.stars = stars
  
  def __init__(self, business_df, state_name, shorten=False):
    print(f"========Calculating For {state_name} State========")
    self.business_df = business_df
    self.rating_mat = []
    self.shorten = shorten if isinstance(shorten, bool) else False
    self.user_num_to_user_hash_dict = dict()
    self.user_hash_to_user_num_dict = dict()
    self.business_num_to_business_hash_dict = dict()
    self.business_hash_to_business_num_dict = dict()
    self.business_recommendations = []
    self.business_popularity = []
    self.calculateRatingMatrix()
    self.nonPersonalizedRecommendations()
  
  def calculateRatingMatrix(self):
    print("Calculating rating matrix...")
    business_list = list(self.business_df['business_id'])
    reviews_df_updated = reviews_df[reviews_df['business_id'].isin(business_list)]

    if (self.shorten):
          print(f"Size Before Cutting Down: {reviews_df_updated.shape[0]}")
          user_counts = reviews_df_updated.groupby('user_id').size().reset_index(name='count')

          # Sort the user_counts dataframe in descending order by count and select the top 100 user_ids
          top_users = user_counts.sort_values(by='count', ascending=False).head(100)['user_id'].tolist()

          # Filter the original dataframe to keep only the records that belong to the top 100 user_ids
          reviews_df_updated = reviews_df_updated[reviews_df_updated['user_id'].isin(top_users)]
          print(f"Size After Cutting Down: {reviews_df_updated.shape[0]}")


    unique_business_id = reviews_df_updated['business_id'].unique()
    unique_user_id = reviews_df_updated['user_id'].unique()

    j = 0
    for u in unique_user_id:
        self.user_hash_to_user_num_dict[u] = j
        self.user_num_to_user_hash_dict[j] = u
        j += 1

    j = 0
    for i in unique_business_id:
        self.business_hash_to_business_num_dict[i] = j
        self.business_num_to_business_hash_dict[j] = i
        j += 1

    # Then, use the generated dictionaries to reindex UserID and MovieID in the data_df
    user_list = reviews_df_updated['user_id'].values
    movie_list = reviews_df_updated['business_id'].values
    for j in range(len(reviews_df_updated)):
        user_list[j] = self.user_hash_to_user_num_dict[user_list[j]]
        movie_list[j] = self.business_hash_to_business_num_dict[movie_list[j]]
    reviews_df_updated['user_id'] = user_list
    reviews_df_updated['business_id'] = movie_list

    num_user = len(reviews_df_updated['user_id'].unique())
    num_movie = len(reviews_df_updated['business_id'].unique())

    self.ratings_mat = coo_matrix((reviews_df_updated['stars'].values, (reviews_df_updated['user_id'].values, reviews_df_updated['business_id'].values)), shape=(num_user, num_movie)).astype(float).toarray()
    print(f"Size of Ratings Matrix: {self.ratings_mat.shape[0]}, {self.ratings_mat.shape[1]}")
  
  def nonPersonalizedRecommendations(self):
    print("Calculating NPR...")
    n = len(self.ratings_mat) # number of users
    m = len(self.ratings_mat[0]) # number of movies

    # Creating popularity array - size number of movies
    self.business_popularity = np.zeros((m,))
    self.business_popularity = self.ratings_mat.sum(axis=0) # claculating the popularity of each movie by summing the values in each column

    self.business_recommendations = np.zeros((n, 50), dtype=np.int32)

    for u in range(self.ratings_mat.shape[0]):
      business_unvisited = np.where(self.ratings_mat[u] == 0)[0]
      unwatched_popularity = self.business_popularity[business_unvisited]
      # Sort the unwatched movies according to popularity and fetch top 50 to recommend
      self.business_recommendations[u] = business_unvisited[np.argsort(unwatched_popularity)[::-1]][:50]

    # print("Non personalized recommendations for first User:")
    # for i in range(5):
    #   business_hash = self.getBusinessHashFromBusinessNum(self.business_recommendations[0,i])
    #   business = self.getBusinessInfo(business_hash)
    #   print(f"Rank {i+1}: Business {self.business_recommendations[0,i]} - Name: {business.name} - state: {business.state} - stars: {business.stars}  - Popularity {business_popularity[self.business_recommendations[0,i]]}")

  def getNPRForuUser(self, user_num):
    print(f"Non personalized recommendations for User {user_num}:")
    for i in range(5):
      business_hash = self.getBusinessHashFromBusinessNum(self.business_recommendations[0,i])
      business = self.getBusinessInfo(business_hash)
      print(f"Rank {i+1}: Business {self.business_recommendations[0,i]} - Name: {business.name} - state: {business.state} - stars: {business.stars}  - Popularity {self.business_popularity[self.business_recommendations[0,i]]}")


  def getUserHashFromUserNum(self, user_num):
    return self.user_num_to_user_hash_dict[user_num]

  def getUserNumFromUserHash(self, user_hash):
    return self.user_hash_to_user_num_dict[user_hash]

  def getBusinessHashFromBusinessNum(self, business_num):
    return self.business_num_to_business_hash_dict[business_num]

  def getBusinessNumFromBusinessHash(self, business_hash):
    return self.business_hash_to_business_num_dict[business_hash]
  
  def getBusinessInfo(self, business_hash):
    bus_df = self.business_df[self.business_df['business_id'] == business_hash].iloc[0]
    return self.Business(bus_df['name'], bus_df['address'], bus_df['city'], bus_df['state'], bus_df['postal_code'], bus_df['stars'])


In [9]:
# PA_Hotel_Recommendation = Recommendations(hotel_state_df_map['PA'], 'PA')
# FL_Hotel_Recommendation = Recommendations(hotel_state_df_map['FL'], 'FL')
# TN_Hotel_Recommendation = Recommendations(hotel_state_df_map['TN'], 'TN')
# IN_Hotel_Recommendation = Recommendations(hotel_state_df_map['IN'], 'IN')
# MO_Hotel_Recommendation = Recommendations(hotel_state_df_map['MO'], 'MO')
# LA_Hotel_Recommendation = Recommendations(hotel_state_df_map['LA'], 'LA')
# AZ_Hotel_Recommendation = Recommendations(hotel_state_df_map['AZ'], 'AZ')
# NJ_Hotel_Recommendation = Recommendations(hotel_state_df_map['NJ'], 'NJ')
# NV_Hotel_Recommendation = Recommendations(hotel_state_df_map['NV'], 'NV')
# AB_Hotel_Recommendation = Recommendations(hotel_state_df_map['AB'], 'AB')

Calculating rating matrix...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df_updated['user_id'] = user_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df_updated['business_id'] = movie_list


Calculating NPR...


In [25]:
# with open('PA_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(PA_Hotel_Recommendation, file)
# print("Object successfully saved to PA_Hotel_Recommendation.pkl")

# with open('FL_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(FL_Hotel_Recommendation, file)
# print("Object successfully saved to FL_Hotel_Recommendation.pkl")

# with open('TN_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(TN_Hotel_Recommendation, file)
# print("Object successfully saved to TN_Hotel_Recommendation.pkl")

# with open('IN_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(IN_Hotel_Recommendation, file)
# print("Object successfully saved to IN_Hotel_Recommendation.pkl")

# with open('MO_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(MO_Hotel_Recommendation, file)
# print("Object successfully saved to MO_Hotel_Recommendation.pkl")

# with open('LA_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(LA_Hotel_Recommendation, file)
# print("Object successfully saved to LA_Hotel_Recommendation.pkl")

# with open('AZ_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(AZ_Hotel_Recommendation, file)
# print("Object successfully saved to AZ_Hotel_Recommendation.pkl")

# with open('NJ_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(NJ_Hotel_Recommendation, file)
# print("Object successfully saved to NJ_Hotel_Recommendation.pkl")

# with open('NV_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(NV_Hotel_Recommendation, file)
# print("Object successfully saved to NV_Hotel_Recommendation.pkl")

# with open('AB_Hotel_Recommendation.pkl', 'wb') as file:
#     pickle.dump(AB_Hotel_Recommendation, file)
# print("Object successfully saved to AB_Hotel_Recommendation.pkl")


Object successfully saved to FL_Hotel_Recommendation.pkl
Object successfully saved to TN_Hotel_Recommendation.pkl
Object successfully saved to IN_Hotel_Recommendation.pkl
Object successfully saved to MO_Hotel_Recommendation.pkl
Object successfully saved to LA_Hotel_Recommendation.pkl
Object successfully saved to AZ_Hotel_Recommendation.pkl
Object successfully saved to NJ_Hotel_Recommendation.pkl
Object successfully saved to NV_Hotel_Recommendation.pkl
Object successfully saved to AB_Hotel_Recommendation.pkl


In [45]:
import bz2file as bz2

def compressed_pickle(title, data):
    with bz2.BZ2File(title + ".pbz2", "w") as f:
        pickle.dump(data, f)

def decompress_pickle(file):
    data = bz2.BZ2File(file, "rb")
    data = pickle.load(data)
    return data

# compressed_pickle("PA_Hotel_Recommendation", PA_Hotel_Recommendation)
# compressed_pickle("FL_Hotel_Recommendation", FL_Hotel_Recommendation)
# compressed_pickle("TN_Hotel_Recommendation", TN_Hotel_Recommendation)
# compressed_pickle("IN_Hotel_Recommendation", IN_Hotel_Recommendation)
# compressed_pickle("MO_Hotel_Recommendation", MO_Hotel_Recommendation)
# compressed_pickle("LA_Hotel_Recommendation", LA_Hotel_Recommendation)
# compressed_pickle("AZ_Hotel_Recommendation", AZ_Hotel_Recommendation)
# compressed_pickle("NJ_Hotel_Recommendation", NJ_Hotel_Recommendation)
# compressed_pickle("NV_Hotel_Recommendation", NV_Hotel_Recommendation)
# compressed_pickle("AB_Hotel_Recommendation", AB_Hotel_Recommendation)

In [46]:
PA_Hotel_Recommendation = decompress_pickle("PA_Hotel_Recommendation.pbz2")
FL_Hotel_Recommendation = decompress_pickle("FL_Hotel_Recommendation.pbz2")
TN_Hotel_Recommendation = decompress_pickle("TN_Hotel_Recommendation.pbz2")
IN_Hotel_Recommendation = decompress_pickle("IN_Hotel_Recommendation.pbz2")
MO_Hotel_Recommendation = decompress_pickle("MO_Hotel_Recommendation.pbz2")
LA_Hotel_Recommendation = decompress_pickle("LA_Hotel_Recommendation.pbz2")
AZ_Hotel_Recommendation = decompress_pickle("AZ_Hotel_Recommendation.pbz2")
NJ_Hotel_Recommendation = decompress_pickle("NJ_Hotel_Recommendation.pbz2")
NV_Hotel_Recommendation = decompress_pickle("NV_Hotel_Recommendation.pbz2")
AB_Hotel_Recommendation = decompress_pickle("AB_Hotel_Recommendation.pbz2")

In [None]:
# with open('PA_Hotel_Recommendation.pkl', 'rb') as file:
#     PA_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from PA_Hotel_Recommendation.pkl")

# with open('FL_Hotel_Recommendation.pkl', 'rb') as file:
#     FL_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from FL_Hotel_Recommendation.pkl")

# with open('TN_Hotel_Recommendation.pkl', 'rb') as file:
#     TN_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from TN_Hotel_Recommendation.pkl")

# with open('IN_Hotel_Recommendation.pkl', 'rb') as file:
#     IN_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from IN_Hotel_Recommendation.pkl")

# with open('MO_Hotel_Recommendation.pkl', 'rb') as file:
#     MO_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from MO_Hotel_Recommendation.pkl")

# with open('LA_Hotel_Recommendation.pkl', 'rb') as file:
#     LA_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from LA_Hotel_Recommendation.pkl")

# with open('AZ_Hotel_Recommendation.pkl', 'rb') as file:
#     AZ_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from AZ_Hotel_Recommendation.pkl")

# with open('NJ_Hotel_Recommendation.pkl', 'rb') as file:
#     NJ_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from NJ_Hotel_Recommendation.pkl")

# with open('NV_Hotel_Recommendation.pkl', 'rb') as file:
#     NV_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from NV_Hotel_Recommendation.pkl")

# with open('AB_Hotel_Recommendation.pkl', 'rb') as file:
#     AB_Hotel_Recommendation = pickle.load(file)
# print("Object successfully loaded from AB_Hotel_Recommendation.pkl")



In [15]:
# PA_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['PA'], 'PA', True)
# FL_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['FL'], 'FL', True)
# TN_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['TN'], 'TN', True)
# IN_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['IN'], 'IN', True)
# MO_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['MO'], 'MO', True)
# LA_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['LA'], 'LA', True)
# AZ_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['AZ'], 'AZ', True)
# NJ_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['NJ'], 'NJ', True)
# NV_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['NV'], 'NV', True)
# AB_Restaurent_Recommendation = Recommendations(restaurent_state_df_map['AB'], 'AB', True)

Calculating rating matrix...
Size Before Cutting Down: 1100250
Size After Cutting Down: 36442
Size of Ratings Matrix: 100, 8206
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 792133
Size After Cutting Down: 32619
Size of Ratings Matrix: 100, 6373
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 434697
Size After Cutting Down: 17425
Size of Ratings Matrix: 100, 3274
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 335843
Size After Cutting Down: 21611
Size of Ratings Matrix: 100, 3238
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 354605
Size After Cutting Down: 24585
Size of Ratings Matrix: 100, 3255
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 558340
Size After Cutting Down: 22143
Size of Ratings Matrix: 100, 2948
Calculating NPR...
Calculating rating matrix...
Size Before Cutting Down: 267036
Size After Cutting Down: 16550
Size of Ratings Matrix: 100, 2

In [19]:
# compressed_pickle("PA_Restaurent_Recommendation", PA_Restaurent_Recommendation)
# compressed_pickle("FL_Restaurent_Recommendation", FL_Restaurent_Recommendation)
# compressed_pickle("TN_Restaurent_Recommendation", TN_Restaurent_Recommendation)
# compressed_pickle("IN_Restaurent_Recommendation", IN_Restaurent_Recommendation)
# compressed_pickle("MO_Restaurent_Recommendation", MO_Restaurent_Recommendation)
# compressed_pickle("LA_Restaurent_Recommendation", LA_Restaurent_Recommendation)
# compressed_pickle("AZ_Restaurent_Recommendation", AZ_Restaurent_Recommendation)
# compressed_pickle("NJ_Restaurent_Recommendation", NJ_Restaurent_Recommendation)
# compressed_pickle("NV_Restaurent_Recommendation", NV_Restaurent_Recommendation)
# compressed_pickle("AB_Restaurent_Recommendation", AB_Restaurent_Recommendation)

In [47]:
PA_Restaurent_Recommendation = decompress_pickle("PA_Restaurent_Recommendation.pbz2")
FL_Restaurent_Recommendation = decompress_pickle("FL_Restaurent_Recommendation.pbz2")
TN_Restaurent_Recommendation = decompress_pickle("TN_Restaurent_Recommendation.pbz2")
IN_Restaurent_Recommendation = decompress_pickle("IN_Restaurent_Recommendation.pbz2")
MO_Restaurent_Recommendation = decompress_pickle("MO_Restaurent_Recommendation.pbz2")
LA_Restaurent_Recommendation = decompress_pickle("LA_Restaurent_Recommendation.pbz2")
AZ_Restaurent_Recommendation = decompress_pickle("AZ_Restaurent_Recommendation.pbz2")
NJ_Restaurent_Recommendation = decompress_pickle("NJ_Restaurent_Recommendation.pbz2")
NV_Restaurent_Recommendation = decompress_pickle("NV_Restaurent_Recommendation.pbz2")
AB_Restaurent_Recommendation = decompress_pickle("AB_Restaurent_Recommendation.pbz2")

In [None]:
# with open('PA_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(PA_Restaurent_Recommendation, file)
# print("Object successfully saved to PA_Restaurent_Recommendation.pkl")

# with open('FL_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(FL_Restaurent_Recommendation, file)
# print("Object successfully saved to FL_Restaurent_Recommendation.pkl")

# with open('TN_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(TN_Restaurent_Recommendation, file)
# print("Object successfully saved to TN_Restaurent_Recommendation.pkl")

# with open('IN_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(IN_Restaurent_Recommendation, file)
# print("Object successfully saved to IN_Restaurent_Recommendation.pkl")

# with open('MO_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(MO_Restaurent_Recommendation, file)
# print("Object successfully saved to MO_Restaurent_Recommendation.pkl")

# with open('LA_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(LA_Restaurent_Recommendation, file)
# print("Object successfully saved to LA_Restaurent_Recommendation.pkl")

# with open('AZ_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(AZ_Restaurent_Recommendation, file)
# print("Object successfully saved to AZ_Restaurent_Recommendation.pkl")

# with open('NJ_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(NJ_Restaurent_Recommendation, file)
# print("Object successfully saved to NJ_Restaurent_Recommendation.pkl")

# with open('NV_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(NV_Restaurent_Recommendation, file)
# print("Object successfully saved to NV_Restaurent_Recommendation.pkl")

# with open('AB_Restaurent_Recommendation.pkl', 'wb') as file:
#     pickle.dump(AB_Restaurent_Recommendation, file)
# print("Object successfully saved to AB_Restaurent_Recommendation.pkl")

In [None]:
# with open('PA_Restaurent_Recommendation.pkl', 'rb') as file:
#     PA_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from PA_Restaurent_Recommendation.pkl")

# with open('FL_Restaurent_Recommendation.pkl', 'rb') as file:
#     FL_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from FL_Restaurent_Recommendation.pkl")

# with open('TN_Restaurent_Recommendation.pkl', 'rb') as file:
#     TN_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from TN_Restaurent_Recommendation.pkl")

# with open('IN_Restaurent_Recommendation.pkl', 'rb') as file:
#     IN_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from IN_Restaurent_Recommendation.pkl")

# with open('MO_Restaurent_Recommendation.pkl', 'rb') as file:
#     MO_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from MO_Restaurent_Recommendation.pkl")

# with open('LA_Restaurent_Recommendation.pkl', 'rb') as file:
#     LA_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from LA_Restaurent_Recommendation.pkl")

# with open('AZ_Restaurent_Recommendation.pkl', 'rb') as file:
#     AZ_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from AZ_Restaurent_Recommendation.pkl")

# with open('NJ_Restaurent_Recommendation.pkl', 'rb') as file:
#     NJ_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from NJ_Restaurent_Recommendation.pkl")

# with open('NV_Restaurent_Recommendation.pkl', 'rb') as file:
#     NV_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from NV_Restaurent_Recommendation.pkl")

# with open('AB_Restaurent_Recommendation.pkl', 'rb') as file:
#     AB_Restaurent_Recommendation = pickle.load(file)
# print("Object successfully loaded from AB_Restaurent_Recommendation.pkl")

In [48]:
class MF_implicit:
    def __init__(self, train_mat, latent=5, lr=0.01, reg=0.01):
        self.train_mat = train_mat  # the training rating matrix of size (#user, #movie)
        
        self.latent = latent  # the latent dimension
        self.lr = lr  # learning rate
        self.reg = reg  # regularization weight, i.e., the lambda in the objective function
        
        self.num_user, self.num_movie = train_mat.shape
        
        self.sample_user, self.sample_movie = self.train_mat.nonzero()  # get the user-movie paris having ratings in train_mat
        self.num_sample = len(self.sample_user)  # the number of user-movie pairs having ratings in train_mat

        # self.user_test_like = []
        # for u in range(self.num_user):
        #     self.user_test_like.append(np.where(self.test_mat[u, :] > 0)[0])

        self.P = np.random.random((self.num_user, self.latent))  # latent factors for users, size (#user, self.latent), randomly initialized
        self.Q = np.random.random((self.num_movie, self.latent))  # latent factors for users, size (#movie, self.latent), randomly initialized
        
    def negative_sampling(self):
        negative_movie = np.random.choice(np.arange(self.num_movie), size=(len(self.sample_user)), replace=True)
        true_negative = self.train_mat[self.sample_user, negative_movie] == 0
        negative_user = self.sample_user[true_negative]
        negative_movie = negative_movie[true_negative]
        return np.concatenate([self.sample_user, negative_user]), np.concatenate([self.sample_movie, negative_movie])

    def train(self, epoch=20):
        """
        Goal: Write your code to train your matrix factorization model for epoch iterations in this function
        Input: epoch -- the number of training epoch 
        """
        for ep in range(epoch):
            """ 
            Write your code here to implement the training process for one epoch, 
            at the end of each epoch, run self.test() to evaluate current version of MF.
            """
            # print("Epoch:", ep+1)
            s_user, s_movie_i = self.negative_sampling()
            data = np.column_stack((s_user, s_movie_i))
            np.random.shuffle(data)
            for u, i  in zip(data[:, 0], data[:, 1]):
              actual_rating = self.train_mat[u, i]
              pu = self.P[u, :]
              qi = self.Q[i, :]

              predicted_rating = np.dot(pu, qi)
              error = 2*(predicted_rating-actual_rating)

              grad_Pu = error * qi + 2*self.reg * pu
              grad_Qi = error * pu + 2*self.reg * qi

              self.P[u, :] -= self.lr * grad_Pu
              self.Q[i, :] -= self.lr * grad_Qi
            self.predict()

            
    def predict(self):
        """
        Write your code here to implement the prediction function, which generates the ranked lists of movies 
        by the trained MF for every user, store the result (named 'recommendation') in a numpy array of size (#user, 50), where entry (u, k) 
        represents the movie id that is ranked at position k in the recommendation list to user u. Return the 'recommendation' variable. 
        """
        prediction_mat = np.matmul(self.P, self.Q.T)
        recommendation = []
        for u in range(self.num_user):
          scores = prediction_mat[u]
          train_like = np.where(self.train_mat[u, :] > 0)[0]
          scores[train_like] = -9999
          top50_iid = np.argpartition(scores, -50)[-50:]
          top50_iid = top50_iid[np.argsort(scores[top50_iid])[-1::-1]]
          recommendation.append(top50_iid)
        recommendation = np.array(recommendation)
        return recommendation

In [49]:
def calculateMF(ratings_mat):
    mf_implicit = MF_implicit(ratings_mat, latent=5, lr=0.01, reg=0.0001)
    mf_implicit.train(epoch=20)
    recommendation = mf_implicit.predict()
    return recommendation

In [51]:
# PA_Hotel_MF = calculateMF(PA_Hotel_Recommendation.ratings_mat)
# compressed_pickle("PA_Hotel_MF", PA_Hotel_MF)
# PA_Restaurent_MF = calculateMF(PA_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("PA_Restaurent_MF", PA_Restaurent_MF)

# FL_Hotel_MF = calculateMF(FL_Hotel_Recommendation.ratings_mat)
# compressed_pickle("FL_Hotel_MF", FL_Hotel_MF)
# FL_Restaurent_MF = calculateMF(FL_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("FL_Restaurent_MF", FL_Restaurent_MF)

# TN_Hotel_MF = calculateMF(TN_Hotel_Recommendation.ratings_mat)
# compressed_pickle("TN_Hotel_MF", TN_Hotel_MF)
# TN_Restaurent_MF = calculateMF(TN_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("TN_Restaurent_MF", TN_Restaurent_MF)

# IN_Hotel_MF = calculateMF(IN_Hotel_Recommendation.ratings_mat)
# compressed_pickle("IN_Hotel_MF", IN_Hotel_MF)
# IN_Restaurent_MF = calculateMF(IN_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("IN_Restaurent_MF", IN_Restaurent_MF)

# MO_Hotel_MF = calculateMF(MO_Hotel_Recommendation.ratings_mat)
# compressed_pickle("MO_Hotel_MF", MO_Hotel_MF)
# MO_Restaurent_MF = calculateMF(MO_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("MO_Restaurent_MF", MO_Restaurent_MF)

# LA_Hotel_MF = calculateMF(LA_Hotel_Recommendation.ratings_mat)
# compressed_pickle("LA_Hotel_MF", LA_Hotel_MF)
# LA_Restaurent_MF = calculateMF(LA_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("LA_Restaurent_MF", LA_Restaurent_MF)

# AZ_Hotel_MF = calculateMF(AZ_Hotel_Recommendation.ratings_mat)
# compressed_pickle("AZ_Hotel_MF", AZ_Hotel_MF)
# AZ_Restaurent_MF = calculateMF(AZ_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("AZ_Restaurent_MF", AZ_Restaurent_MF)

# NJ_Hotel_MF = calculateMF(NJ_Hotel_Recommendation.ratings_mat)
# compressed_pickle("NJ_Hotel_MF", NJ_Hotel_MF)
# NJ_Restaurent_MF = calculateMF(NJ_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("NJ_Restaurent_MF", NJ_Restaurent_MF)

# NV_Hotel_MF = calculateMF(NV_Hotel_Recommendation.ratings_mat)
# compressed_pickle("NV_Hotel_MF", NV_Hotel_MF)
# NV_Restaurent_MF = calculateMF(NV_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("NV_Restaurent_MF", NV_Restaurent_MF)

# AB_Hotel_MF = calculateMF(AB_Hotel_Recommendation.ratings_mat)
# compressed_pickle("AB_Hotel_MF", AB_Hotel_MF)
# AB_Restaurent_MF = calculateMF(AB_Restaurent_Recommendation.ratings_mat)
# compressed_pickle("AB_Restaurent_MF", AB_Restaurent_MF)


In [None]:
# Implementing Autoencoder for recommendation(basic, top-10 recommendation for each user)
import numpy as np
import tensorflow as tf
from scipy.sparse import csr_matrix

def vanilla_autoencoder():
  ratings_mat_sparse = csr_matrix(ratings_mat)

  # load data
  #ratings = np.loadtxt('ratings.csv', delimiter=',')
  n_users, n_items = ratings_mat_sparse.shape

  # split data into training and validation sets
  split = int(0.8 * n_users)
  train_ratings = ratings_mat_sparse[:split]
  val_ratings = ratings_mat_sparse[split:]

  # define the autoencoder model
  input_layer = tf.keras.layers.Input(shape=(n_items,))
  encoded_layer = tf.keras.layers.Dense(16, activation='relu')(input_layer)
  decoded_layer = tf.keras.layers.Dense(n_items, activation='sigmoid')(encoded_layer)
  autoencoder = tf.keras.models.Model(input_layer, decoded_layer)

  # compile the model
  autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

  # train the model
  autoencoder.fit(train_ratings.toarray(), train_ratings.toarray(), epochs=20, batch_size=32, validation_data=(val_ratings, val_ratings))

  # predict ratings for all users and items
  predicted_ratings = autoencoder.predict(ratings_mat_sparse)

  # print top 10 recommended items for each user
  for i in range(n_users):
      top_items = np.argsort(predicted_ratings[i])[::-1][:10]
      print(f"User {i+1}: {top_items}")


In [None]:
new_rat_mat = ratings_mat[0:10]
print(type(new_rat_mat))
print(type(ratings_mat))
del ratings_mat

In [None]:
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model
from scipy.sparse import csr_matrix

def autoencoder_cf():
  ratings_mat_sparse = csr_matrix(new_rat_mat)

  # Split the data into training and testing sets
  train_size = int(0.8 * ratings_mat_sparse.shape[0])
  train_ratings = ratings_mat_sparse[:train_size, :]
  test_ratings = ratings_mat_sparse[train_size:, :]

  # Build the autoencoder model
  input_layer = Input(shape=(ratings_mat_sparse.shape[1],))
  encoded = Dense(16, activation='relu')(input_layer)
  decoded = Dense(ratings_mat_sparse.shape[1], activation='sigmoid')(encoded)
  autoencoder = Model(input_layer, decoded)

  # Compile the model
  autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

  # Train the model
  autoencoder.fit(train_ratings.toarray(), train_ratings.toarray(), epochs=20, batch_size=32, shuffle=True, validation_data=(test_ratings.toarray(), test_ratings.toarray()))

  # Extract the hidden layer output for all the users and items
  hidden_layer = Model(input_layer, encoded)
  batch_size = 32

  def embeddings_generator(ratings_mat_sparse, batch_size):
      num_users = ratings_mat_sparse.shape[0]
      indices = np.arange(num_users)
      for start_idx in range(0, num_users, batch_size):
          end_idx = min(start_idx + batch_size, num_users)
          batch_indices = indices[start_idx:end_idx]
          batch_ratings = ratings_mat_sparse[batch_indices]
          batch_embeddings = hidden_layer.predict(batch_ratings)
          yield batch_embeddings

  user_embeddings = np.concatenate(list(embeddings_generator(ratings_mat_sparse, batch_size)), axis=0)

  # Compute the similarity between users and items
  user_similarity = np.dot(user_embeddings, user_embeddings.T)
  item_similarity = np.dot(user_embeddings.T, user_embeddings)

  # Use the similarity scores to predict the ratings of the items for each user
  user_ratings = np.dot(user_similarity, ratings_mat_sparse) / np.sum(np.abs(user_similarity), axis=1)
  item_ratings = np.dot(item_similarity, ratings_mat_sparse.T) / np.sum(np.abs(item_similarity), axis=1)

In [None]:
# Get predicted ratings for user 
def get_user_recommendation(user_id):
  user_0_pred_ratings = user_ratings[user_id]
  N = 10
  # Sort predicted ratings in descending order
  sorted_item_indices = np.argsort(user_0_pred_ratings[0].toarray().flatten())[::-1][:N]
  print("Recommended items for user 0:", sorted_item_indices)

In [None]:
autoencoder_cf()
get_user_recommendation(0)