In [1]:
import pandas as pd
import numpy as np
import json
import pyarrow
import logging
import os
import findspark
findspark.init()
from collections import Counter

In [2]:
# Spark entry point: import the session class, then start (or reuse) a local
# session registered under the application name 'Recommender_Engine'.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local')
    .appName('Recommender_Engine')
    .getOrCreate()
)

In [3]:
# Load the notebook configuration (file paths etc.) from config.json.
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving it open for the lifetime of the kernel.
with open("config.json") as config_file:
    configs = json.load(config_file)

# Send log records at INFO level and above to log.txt inside the configured
# logging directory; filemode 'w' truncates the log file on every run.
logging.basicConfig(
    filename=os.path.join(configs['local_paths']['logging_path'], 'log.txt'),
    filemode='w',
    level='INFO',
)


In [4]:
# Read the analytics sandbox CSV (path taken from the config) into Spark,
# treating the first row as the header and letting Spark infer column types.
analytics_sandbox_path = configs['local_paths']['analytics_sandbox']
df = spark.read.csv(
    analytics_sandbox_path,
    header=True,
    inferSchema=True,
)

# Pull the target user's rows from the full dataset *before* df is narrowed
# down to a single city below.
user_profile = df.filter(df['user_id'] == 'rGd8YUtvhSdMm8-9cMdQ4Q')

# From here on df holds only Montreal rows; materialise them as a pandas DataFrame.
df = df.filter(df['city'] == 'Montreal')
temp_df = df.toPandas()

In [5]:
# Positional train/test split of the Montreal rows.
# iloc slices are end-exclusive, so [:train_size] covers positions
# 0..train_size-1 and the test set must start at train_size. Starting it at
# train_size + 1 (as before) silently dropped the row at position train_size
# from both sets.
train_size = 5000
pandas_df = temp_df.iloc[:train_size, :]
test_df = temp_df.iloc[train_size:, :]

In [6]:
# Keep only rows whose categories mention 'Restaurants' and where both the
# user_review_count and the review_count exceed 10.
restaurant_df = pandas_df.loc[
    pandas_df['business_categories'].str.contains('Restaurants')
    & pandas_df['user_review_count'].gt(10)
    & pandas_df['review_count'].gt(10)
]

# user_id x business_id matrix of review_stars; duplicate (user, business)
# pairs are combined by pivot_table's default aggregation (mean).
matrix_x = restaurant_df.pivot_table(
    index='user_id',
    columns='business_id',
    values='review_stars',
)

In [7]:
# Convert the selected user's Spark DataFrame to pandas. The name is rebound in
# place, so guard on the Spark-only toPandas method: re-running this cell after
# the conversion is then a no-op instead of an AttributeError.
if hasattr(user_profile, 'toPandas'):
    user_profile = user_profile.toPandas()

In [8]:
# Alias only: user_temp_df refers to the same object as user_profile (no copy is made).
user_temp_df = user_profile

In [11]:
def fetch_user_favourite_tags(df):
    '''
    Count how often each category tag appears across a user's restaurant rows.

    Step 1: Keep only rows whose ``business_categories`` mentions 'Restaurants';
            rows with a missing category value are dropped.
    Step 2: Split every remaining category string on commas, strip whitespace,
            discard the generic 'Restaurants' / 'Food' tags and empty tags,
            and count what is left.

    Parameters
    ----------
    df : pandas.DataFrame
        User profile with a ``business_categories`` column holding
        comma-separated category tags.

    Returns
    -------
    collections.Counter
        Mapping of tag -> number of occurrences (Counter is a dict subclass).
    '''
    generic_tags = {'Restaurants', 'Food'}
    # na=False: a missing category counts as "not a restaurant" rather than
    # putting NaN into the mask, which boolean indexing rejects.
    # regex=False: 'Restaurants' is a plain substring, not a pattern.
    is_restaurant = df.business_categories.str.contains('Restaurants', na=False, regex=False)
    tag_counts = Counter()
    for categories in df.loc[is_restaurant, 'business_categories']:
        for raw_tag in categories.split(','):
            tag = raw_tag.strip()
            # Skip blanks produced by trailing or doubled commas.
            if tag and tag not in generic_tags:
                tag_counts[tag] += 1
    return tag_counts

In [16]:
# Print the tag counts computed from user_temp_df. The __main__ guard skips
# this call when the code is imported as a module rather than executed directly.
if __name__ == "__main__":
    print(fetch_user_favourite_tags(user_temp_df))

Counter({'Chinese': 48, 'Nightlife': 27, 'Japanese': 26, 'Bars': 26, 'Canadian (New)': 25, 'Sandwiches': 24, 'Italian': 21, 'Breakfast & Brunch': 20, 'Coffee & Tea': 20, 'Specialty Food': 17, 'Cafes': 16, 'Pizza': 15, 'Seafood': 15, 'Fast Food': 14, 'Sushi Bars': 12, 'French': 12, 'Desserts': 12, 'Burgers': 11, 'Dim Sum': 10, 'Asian Fusion': 10, 'American (Traditional)': 10, 'Vietnamese': 10, 'Korean': 9, 'Ethnic Food': 8, 'Mediterranean': 8, 'Bakeries': 8, 'Event Planning & Services': 8, 'Barbeque': 8, 'Thai': 7, 'Middle Eastern': 7, 'Noodles': 7, 'Comfort Food': 7, 'American (New)': 7, 'Diners': 6, 'Ramen': 6, 'Caribbean': 6, 'Salad': 6, 'Wine Bars': 6, 'Caterers': 6, 'Mexican': 6, 'Taiwanese': 6, 'Tea Rooms': 5, 'Soup': 5, 'Cocktail Bars': 5, 'Beer': 5, 'Wine & Spirits': 5, 'Ice Cream & Frozen Yogurt': 5, 'Food Trucks': 4, 'Arts & Entertainment': 4, 'Steakhouses': 4, 'Lounges': 4, 'Pubs': 4, 'Street Vendors': 3, 'Vegan': 3, 'Tapas/Small Plates': 3, 'Greek': 3, 'Spanish': 3, 'Indian'