In [1]:
import pandas as pd
import numpy as np
import json
import pyarrow
import logging
import os
import findspark
findspark.init()
from collections import Counter

In [2]:
# Create (or reuse) the Spark session that the rest of the notebook reads data with.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local')
    .appName('Recommender_Engine')
    .getOrCreate()
)

In [3]:
# Load the notebook configuration (file paths etc.) from config.json.
# The context manager closes the file handle as soon as the JSON has been parsed,
# instead of leaving it open for the lifetime of the kernel.
with open("config.json") as config_file:
    configs = json.load(config_file)

# Write log records at INFO and above to <logging_path>/log.txt.
# filemode 'w' starts the log file afresh every time this cell configures logging.
logging.basicConfig(
    filename = os.path.join(configs['local_paths']['logging_path'], 'log.txt'),
    filemode = 'w',
    level = 'INFO',
)


In [4]:
# Read the analytics sandbox CSV (path comes from the config) with a header row
# and Spark-inferred column types.
analytics_sandbox_path = configs['local_paths']['analytics_sandbox']
df = spark.read.csv(analytics_sandbox_path, header=True, inferSchema=True)

# Take the target user's rows from the full data set, i.e. before the city filter
# below is applied.
user_profile = df.filter(df['user_id'] == 'rGd8YUtvhSdMm8-9cMdQ4Q')

# Narrow the working data to Montreal and bring it into pandas.
df = df.filter(df['city'] == 'Montreal')
temp_df = df.toPandas()

In [5]:
# Split the Montreal rows positionally: the first `train_size` rows become the working
# frame and everything after them goes to test_df.
# iloc's stop bound is exclusive, so both slices must use the same boundary; starting
# the second slice at train_size + 1 would leave row `train_size` in neither frame.
train_size = 5000
pandas_df = temp_df.iloc[:train_size, :]
test_df = temp_df.iloc[train_size:, :]

In [6]:
# Keep rows whose categories mention 'Restaurants' and whose user_review_count and
# review_count are both above 10.
is_restaurant = pandas_df['business_categories'].str.contains('Restaurants')
user_has_reviews = pandas_df['user_review_count'] > 10
business_has_reviews = pandas_df['review_count'] > 10
restaurant_df = pandas_df[is_restaurant & user_has_reviews & business_has_reviews]

# user_id x business_id table of review_stars (pivot_table aggregates duplicate
# user/business pairs with its default aggregation, the mean).
matrix_x = pd.pivot_table(
    data = restaurant_df,
    index = 'user_id',
    columns = 'business_id',
    values = 'review_stars',
)

In [7]:
# Convert the selected user's Spark rows to a pandas DataFrame.
# The guard keeps this cell re-runnable: after the first run `user_profile` is already
# a pandas DataFrame, which has no toPandas() method, so an unguarded second run would fail.
if hasattr(user_profile, 'toPandas'):
    user_profile = user_profile.toPandas()

In [190]:
class User_profile_recommender:
    '''
    Recommends businesses to a user by matching category tags.

    The user's profile rows are reduced to a tag -> count mapping (how often each
    category tag appears among the user's restaurant rows). Every candidate business
    is then scored by summing the counts of the tags found in its category string,
    and business names are returned from highest to lowest score.

    Parameters
    ----------
    user_profile_df : pandas.DataFrame
        Rows for one user; must contain a 'business_categories' column holding
        comma-separated category strings.
    restaurant_df : pandas.DataFrame
        Candidate businesses; must contain 'business_id', 'business_name' and
        'business_categories' columns.
    location : str, default 'Montreal'
        Stored on the instance; not used by the scoring itself.
    type : str, default 'Restaurants'
        Accepted for backward compatibility; currently unused.
    '''
    def __init__(self , user_profile_df , restaurant_df , location = 'Montreal' , type = 'Restaurants' ):
        self.user_profile_df = user_profile_df
        self.location = location
        self.restaurant_df = restaurant_df
        self.tags = self.fetch_user_favourite_tags()

    def fetch_user_favourite_tags(self):
        '''
        Build the user's tag frequencies from the profile dataframe.

        Step 1: keep only the profile rows whose categories mention 'Restaurants'
                (self.user_profile_df is replaced by this filtered frame).
        Step 2: split each category string on commas, strip whitespace, drop the
                generic 'Restaurants' and 'Food' tags, and count what remains.

        Returns
        -------
        collections.Counter
            Mapping of tag -> number of occurrences in the user's restaurant rows.
        '''
        # na = False: rows with a missing category string are treated as non-matching
        # instead of putting NaN into the boolean mask, which would make indexing fail.
        is_restaurant = self.user_profile_df.business_categories.str.contains('Restaurants', na = False)
        self.user_profile_df = self.user_profile_df[is_restaurant]

        generic_tags = {'Restaurants', 'Food'}
        stripped_tags = (
            tag.strip()
            for categories in self.user_profile_df.business_categories
            for tag in categories.split(",")
        )
        return Counter(tag for tag in stripped_tags if tag not in generic_tags)

    def get_recommendations(self):
        '''
        Rank the candidate businesses for this user.

        Each distinct (business_id, business_name, business_categories) row gets a
        rating equal to the sum of the user's tag counts for every tag that occurs
        in the row's category string.

        Returns
        -------
        list of str
            Business names ordered by descending rating; rows with equal rating keep
            their original relative order.
        '''
        business_columns = ['business_id', 'business_name', 'business_categories']
        distinct_business = (
            self.restaurant_df[business_columns]
            .drop_duplicates()
            .reset_index(drop = True)
        )

        # Score every business in one pass. The tags come from self.tags (the counter
        # built in __init__), not from a same-named global that may or may not exist.
        # NOTE(review): `tag in categories` is a substring test on the raw category
        # string, so a tag also matches inside a longer category name - confirm that
        # this is the intended matching rule.
        distinct_business['rating'] = [
            sum(count for tag, count in self.tags.items() if tag in categories)
            if isinstance(categories, str) else 0   # missing categories score 0
            for categories in distinct_business['business_categories']
        ]

        # mergesort is stable, so the order of equally rated businesses is deterministic.
        ranked = distinct_business.sort_values(by = 'rating', ascending = False, kind = 'mergesort')
        return ranked['business_name'].tolist()

In [191]:
if __name__ == "__main__":
    # Build the recommender for the selected user, then get the business names
    # ordered by how well their categories match the user's tags.
    ur = User_profile_recommender(user_profile_df = user_profile, restaurant_df = restaurant_df)
    user_recommendations = ur.get_recommendations()