# Importing Essential Libraries

In [None]:
from pprint import pprint # pretty print sometimes helps printing the data more distinguished
import time # to measure execution time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For reading and writing datasets
import csv
import json

# For clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

# For plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the Business dataset:

In [None]:
# Load the Yelp business records (the file holds one JSON object per line).
df_business = pd.read_json(
    "/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json",
    lines=True,
)

# Restrict the data to the businesses whose state is British Columbia.
in_british_columbia = df_business['state'] == 'BC'
df_business = df_business.loc[in_british_columbia]

# Optional further restriction to the city of Vancouver (not used for now):
# df_business[df_business['city'] == 'Vancouver']

df_business

# Data Cleaning for the Business dataset
Correct or replace any issue of:
* Misspellings {Montgromery street}
* Outliers {1,2,4,2,4,123,3,4}
* Incorrect values {invalid zip, neg number}
* Missing values {6,7,4,3,,4,5,6}
* Incorrect values {94025, -345,96066,…}
* Misspellings {Montgromery street}
* Outliers {1,2,4,2,4,123,3,4}
* Incorrect values {invalid zip, neg number}
* Missing values {6,7,4,3,,4,5,6}
* Incorrect values {94025, -345,96066,…}

In [None]:
# Count the businesses whose business_id is missing.
missing_business_ids = df_business['business_id'].isna().sum()
print("There are", missing_business_ids, "businesses without a business_id.")

In [None]:
# Count the businesses whose categories value is missing.
missing_categories = df_business['categories'].isna().sum()
print("There are", missing_categories, "businesses without categories.")

In [None]:
# Display the rows whose categories value is missing.
no_category_mask = df_business['categories'].isna()
df_business.loc[no_category_mask]

We've found four businesses with categories as None. Since the dataset is large, four instances are negligible. Still, we checked whether they could be fixed, because the "is_open" value is 1 for all of them. Searching for them by name and address, it seems that all of them except the third no longer exist at the claimed location.

The [third one](https://www.google.com/maps/place/Printplus/@49.2879393,-123.1245266,16.26z/data=!4m5!3m4!1s0x54867183e662c37d:0x136bc9b30dcb2d98!8m2!3d49.2877835!4d-123.1208748)... {we can fix it, is it worth the trouble? :D We straight up delete all the four for now}

In [None]:
# Keep only the businesses that have a categories value (removes the NaN rows).
df_business = df_business[df_business['categories'].notna()]
# To carry this change over to the other datasets, we may eliminate the rows
# whose business_id has no match in this dataframe.

#  Category distribution visualization
### Step 1: Pre-processing for the sub-categories:
The goal of this step is to extract the number of businesses associated with each individual category. (In other words, we want to build a dataframe that has all the categories as its index and a single integer column holding the number of businesses that list that specific category in their business description.)

In [None]:
# Every business stores its categories as one comma-separated string.
# Turn that string into a list of category names (an empty list when missing).
category_lists = df_business['categories'].apply(
    lambda raw: [] if raw is None else raw.split(', ')
)
df_categories = category_lists.to_frame()

# Explode the lists so each row holds a single category for one business
# (a business appears once per category it has), then group by category.
df_categories_exploded = df_categories.explode('categories').groupby('categories')

In [None]:
# Count how many businesses list each category. The rows are already grouped
# by category, so the size of each group is that category's business count.
# size() yields a Series indexed by 'categories'; unlike
# GroupBy.value_counts() it produces no duplicated index level, and its result
# does not depend on the name pandas gives the counts (pandas 2.0 changed that
# name to "count", which made the rename-then-sort approach fail).
cat_count = df_categories_exploded.size().rename('counts').to_frame()

# The only column is explicitly named "counts", so sorting by it cannot be
# confused with the index, which is named "categories".
cat_count = cat_count.sort_values(by="counts", ascending=False)

cat_count

### Step 2: Pre-processing for the main categories:

Categories in the Yelp dataset are grouped into main categories and defined on the [Yelp's official blog](https://blog.yelp.com/2018/01/yelp_category_list). The main categories are listed in the "category_group_titles" array, and the sub-categories are collected as .txt file formats in a dataset called [yelpcategorytitles](https://www.kaggle.com/yaldayazdanpanah/yelpcategorytitles). Note that there is no redundancy in sub-categories. The goal of this step is to calculate the number of businesses in each main category.

In [None]:
# Maps every main category to the number of businesses found in the
# sub-categories of that main category. All counts start at zero.
main_categories = dict.fromkeys(
    [
        "Active Life", "Arts and Entertainment", "Automotive",
        "Beauty and Spas", "Education", "Event Planning and Services",
        "Financial Services", "Food", "Health and Medical",
        "Home Services", "Hotels and Travel", "Local Flavor",
        "Local Services", "Mass Media", "Nightlife",
        "Pets", "Professional Services", "Public Services and Government",
        "Real Estate", "Religious Organizations", "Restaurants", "Shopping",
    ],
    0,
)

In [None]:
# Counter function
def sub_category_counter(df, category_group):
    """Return the number of businesses that fall in one category group.

    Parameters
        df: a dataframe with category names as its index and their counts
            in a column named 'counts'.
        category_group: a collection of category names (strings) that make
            up the group.

    Returns the sum of 'counts' over the rows whose index label appears in
    category_group; the sum is 0 when no row matches.
    """
    # Select the matching rows in one vectorised step instead of iterating
    # over the dataframe row by row.
    in_group = df.index.isin(category_group)
    return df.loc[in_group, 'counts'].sum()

In [None]:
# For each main category do the following.
for cat in main_categories:

    # Read the sub-categories of the cat category: one title per line of the
    # text file that carries the main category's name.
    path = "../input/yelpcategorytitles/" + cat + ".txt"
    # The context manager closes the file right after it has been read, so no
    # file handle is left open for each category.
    with open(path, "r") as sub_categories_file:
        sub_categories = [x.strip() for x in sub_categories_file]

    # Count the sub-categories using the function above.
    main_categories[cat] = sub_category_counter(cat_count, sub_categories)

In [None]:
# Order the main categories from the most to the fewest businesses so that the
# plot reads from the largest bar to the smallest.
ranked_items = sorted(main_categories.items(), key=lambda pair: pair[1], reverse=True)
main_categories_sorted = dict(ranked_items)

### Step 3: Plotting the category distribution:

In [None]:
# Apply the seaborn theme before any axes exist so that it styles the axes
# created below. set_theme() returns None, so its result is not kept.
sns.set_theme(style="whitegrid")

fig = plt.figure(figsize=(18,6))
ax = fig.add_axes([0,0,1,1])

# x and y are passed by keyword: seaborn no longer accepts them positionally.
# Drawing on the explicit axes keeps the bars on the figure created above.
ax = sns.barplot(x=list(main_categories_sorted.keys()),
                 y=list(main_categories_sorted.values()),
                 ax=ax)

plt.xticks(rotation=80)
plt.ylabel('# of Businesses', fontsize=14)
plt.xlabel('Category', fontsize=14)

plt.show()

# Other visualizations
In order to evaluate a business's success, we need to define our own customized metric. Let us call it the success metric for now. The variables we aim to use to extract the success metric are the number of reviews, the number of tips, the stars, and the number of check-ins for each business. We want to look closely into the interactions between the mentioned variables using the multivariate and univariate plots.

In [None]:
# Statistical descriptions
print(df_business.describe())

# One box-and-whisker plot, each in its own figure, for the columns
# latitude, longitude, stars, review_count and is_open of df_business.
from matplotlib import pyplot

for column in ('latitude', 'longitude', 'stars', 'review_count', 'is_open'):
    df_business[column].plot(kind='box', subplots=True, sharex=False, sharey=False)
    pyplot.show()

In [None]:
# One histogram, each in its own figure, for the columns
# latitude, longitude, stars, review_count and is_open of df_business.
for column in ('latitude', 'longitude', 'stars', 'review_count', 'is_open'):
    df_business[column].hist()
    pyplot.show()

# Analyzing the data to extract the traffic of each business:
Traffic shows how popular a specific business is, not necessarily how good its service is.

Storing all the information in dataframes uses too much memory. In addition, we only need the rows of the other datasets whose business_id appears in the business dataset (which is currently limited to British Columbia). For these two reasons, we cannot load the other datasets whole into Pandas DataFrames as we did with the business dataset. What's the alternative? We need to read them in _chunks_. You can refer to this [blog post](https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c#:~:text=Here%20comes%20the%20good%20news,fit%20into%20the%20local%20memory.) for more details about reading large data.

### Step 1: Converting the Traffic datasets from JSON to CSV:

In [None]:
# Column names of the combined traffic CSV built from the tip, review and
# check-in files: the business the event belongs to, the weighted star rating
# (written only for reviews; left empty for tips and check-ins), and the
# date of the event.
traffic_header = ['business_id', 'weighted_stars', 'date']

In [None]:
''' Tip dataset '''

# Creates the traffic CSV (overwriting any previous one), writes the header
# row and then one row per tip. Tips carry no star rating, so the
# weighted_stars field is left empty.
start_t = time.time()
# The JSON input is decoded as UTF-8 explicitly instead of relying on the
# platform default.
with open("/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json", encoding="utf-8") as f:
    # newline='' is what the csv module requires of the files it writes to, so
    # that it alone controls the line endings.
    with open("/kaggle/working/yelp_academic_dataset_traffic.csv", 'w', newline='') as out:

        traffic_writer = csv.writer(out)
        traffic_writer.writerow(traffic_header)

        # The file holds one JSON object per line.
        for line in f:
            line_dict = json.loads(line)
            traffic_writer.writerow([line_dict['business_id'],
                                     None,
                                     line_dict['date']])

# Elapsed wall-clock time in seconds.
print(time.time() - start_t)

In [None]:
''' Review dataset '''

# Appends one row per review to the traffic CSV, storing the review's stars
# weighted by how much engagement the review received.
start_t = time.time()
# The JSON input is decoded as UTF-8 explicitly instead of relying on the
# platform default.
with open("/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json", encoding="utf-8") as f:
    # newline='' is what the csv module requires of the files it writes to, so
    # that it alone controls the line endings.
    with open("/kaggle/working/yelp_academic_dataset_traffic.csv", 'a', newline='') as out:

        traffic_writer = csv.writer(out)

        # The file holds one JSON object per line.
        for line in f:
            line_dict = json.loads(line)

            # The more engagement a review has, the more reliable it is. However, since it can be zero, we add 1 to balance it.
            # If the engagement is zero, then star_reliability equals 1, and its multiplication by star wouldn't be effective.
            star_reliability = line_dict['useful'] + line_dict['funny'] + line_dict['cool'] + 1

            traffic_writer.writerow([line_dict['business_id'],
                                     line_dict['stars'] * star_reliability,
                                     line_dict['date']])

# Elapsed wall-clock time in seconds.
print(time.time() - start_t)

In [None]:
''' Check-in dataset '''

# Appends one row per check-in date to the traffic CSV. Check-ins carry no
# star rating, so the weighted_stars field is left empty.
start_t = time.time()

# The JSON input is decoded as UTF-8 explicitly instead of relying on the
# platform default.
with open("/kaggle/input/yelp-dataset/yelp_academic_dataset_checkin.json", encoding="utf-8") as f:
    # newline='' is what the csv module requires of the files it writes to, so
    # that it alone controls the line endings.
    with open("/kaggle/working/yelp_academic_dataset_traffic.csv", 'a', newline='') as out:

        traffic_writer = csv.writer(out)

        # The file holds one JSON object per line.
        for line in f:
            line_dict = json.loads(line)

            # All check-in dates of a business come as one string separated by
            # ', '. str.split always returns at least one element, so a single
            # date needs no special case; only an empty string has to be
            # skipped, otherwise it would be stored as a check-in with no date.
            for date in line_dict['date'].split(', '):
                if not date:
                    continue
                traffic_writer.writerow([line_dict['business_id'],
                                         None,
                                         date])

# Elapsed wall-clock time in seconds.
print(time.time() - start_t)

### Step 2: Reading the Traffic datasets (new csv file) using [Dask](https://docs.dask.org/en/latest/dataframe.html):

Another alternative is using [chunksize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html).

In [None]:
import dask.dataframe as dd

# Read the traffic CSV lazily. The path is the same absolute path the
# conversion cells write to, so the read does not depend on the current
# working directory. The dtypes are stated explicitly instead of being
# inferred from the beginning of the file, where the weighted_stars field
# of every tip row is empty.
dd_traffic = dd.read_csv("/kaggle/working/yelp_academic_dataset_traffic.csv",
                         dtype={'business_id': 'object', 'weighted_stars': 'float64'},
                         parse_dates=['date'])

### Step 3: Prune the Traffic datasets (delete the ones that are not in business dataset)

In [None]:
# Keep only the traffic rows whose business_id appears in df_business.
# This can take a while: the filter is evaluated over the whole CSV when
# compute() is called, and the result is returned as a regular dataframe.
belongs_to_known_business = dd_traffic.business_id.isin(df_business.business_id)
df_traffic = dd_traffic[belongs_to_known_business].compute()
df_traffic

# Identifying the influential variables of each category
### Step 1: Calculating the lifetime of businesses (the _classes_ or _y_ of our final training dataset).
df_traffic now contains all kinds of interactions documented by people, whether it's a review someone's written, a tip someone's given, or a check-in date that has been registered. The first interaction would be the date that store/business started their work. Similarly, the last date registered displays the closed date. However, there's one additional information in the df_business that comes in handy! To be more accurate, if the _is_open_ is 1, we set the closed date to *None* since it's no longer the closed date but the last date.

Note! Fortunately, there was at least one review for each business (17294 businesses were found).

In [None]:
# Group every recorded interaction by the business it belongs to.
grouped_traffic = df_traffic.groupby(by='business_id')

# Lifetime = last recorded date - first recorded date, kept in whole days
# (the hours/minutes/seconds part of the difference is discarded).
first_date = grouped_traffic['date'].min()
last_date = grouped_traffic['date'].max()
lifetime = (last_date - first_date).dt.days

# Traffic = total number of tips, reviews and check-ins per business.
traffic = grouped_traffic.size()

# weighted_stars = mean of the weighted stars per business, computed only
# over the rows that actually have a value.
rated_rows = df_traffic.dropna(subset=['weighted_stars'])
weighted_stars = rated_rows.groupby(by='business_id')['weighted_stars'].mean()

# Put the three per-business series side by side, one column each.
df_traffic_results = pd.concat(
    [traffic, weighted_stars, lifetime],
    axis=1,
    keys=['traffic', 'weighted_stars', 'lifetime'],
)
df_traffic_results

### Step 2: Merging the data together
Note that we kept the business_id across the different datasets so that we can glue them back together later. Once that is done, we no longer need it. Here we merge df_traffic_results, which we have just computed, into df_business, which we already had.

In [None]:
# Attach the per-business traffic results to the business rows, matching the
# business_id column of df_business against the index of df_traffic_results.
df_merged_business = df_business.join(df_traffic_results, on='business_id')

# Columns that might have an influence on a business' lifetime.
influential_columns = ['business_id', 'categories', 'latitude', 'longitude',
                       'traffic', 'stars', 'weighted_stars', 'lifetime']

# Renumber the rows from zero and discard the old index.
df_influential_vars = df_merged_business[influential_columns].reset_index(drop=True)
df_influential_vars

### A couple of observations!
Before we proceed to calculate the influential metrics, let's be sure of some values. To make a metric we have to come up with a combination of arithmetic operations. So, we have to know if there are any zeros in the values we want to work with.

In [None]:
# Is there any business with no traffic count?
# A business with no row in the traffic results gets NaN (not 0) from the
# join, so a missing count is treated as "no traffic" as well.
no_traffic = df_merged_business['traffic'].fillna(0) == 0
print("There are", no_traffic.sum(),
      "businesses without any traffic registered.")

# Is there any business with zero lifetime?
# (A lifetime of zero days means the first and last recorded dates are less
# than one day apart.)
print("There are", (df_merged_business['lifetime'] == 0).sum(),
      "businesses with zero lifetime.")

### Step 3: Embedding categories (convert them to vectors)

In [None]:
# Counter function
'''Parameters
        category_series: it expects a column of strings of categories as input.
        
   Returns vectors of the input categories. So that each business has a vector that shows its categories.
   For example if we have 7 categories overall, instead of [chicken, Food] it now has [1, 0, 0, 0, 1, 0, 0].
'''    

def get_business_cat_vector(category_series):
    """Return one 0/1 category vector per business.

    category_series holds one comma-separated category string (or None) per
    business. The result has one entry per business: an int32 numpy array
    with one position per category in the index of the global cat_count,
    set to 1 where the business lists that category and 0 elsewhere.
    """
    # cat_count is only read here; it supplies the full list of categories.
    global cat_count

    # Split every category string into a list (None becomes an empty list) and
    # renumber the rows from zero.
    # NOTE(review): columns=['categories'] presumably relies on the series
    # being named 'categories' - confirm against the caller.
    category_df = pd.DataFrame(category_series.apply(lambda x: x.split(', ') if x is not None else []),
                               columns=['categories']).reset_index(drop=True)
    
    # Distinct categories present in the input; add one zero-filled column for
    # each of them.
    neighbourhood_cats = category_df.explode('categories').drop_duplicates()
    category_df[list(neighbourhood_cats['categories'])] = 0

    def set_count(row):
        # Set to 1 the column of every category this business lists.
        for cat in row['categories']:
            row[cat] = 1
        return(row)
    
    # Apply the flags row by row, drop the list column and transpose so that
    # the categories become the row labels and the businesses the columns.
    counts_df = category_df.apply(set_count, axis=1).drop('categories', axis=1).transpose()
    # Line the flags up with every category in cat_count's index; categories
    # that do not occur in the input have no match and come out as NaN.
    all_cats_df = pd.DataFrame(cat_count.index).join(counts_df, on=['categories'])
    # Replace the NaN with 0, transpose back to one row per business and drop
    # the row that holds the category names.
    all_counts_df = all_cats_df.fillna(0).transpose().drop('categories')
    # Convert every business row into an int32 numpy array.
    vectors_df = all_counts_df.apply(lambda x: x.astype('int32').to_numpy(), axis=1)
    return vectors_df

In [None]:
# Build one category vector per business, then stack the per-business arrays
# into a single 2-D array with one row per business.
category_vectors = get_business_cat_vector(df_influential_vars['categories'])
category_vectors_np = np.stack(category_vectors.tolist())

# Run PCA
PCA is a dimensionality-reduction tool: it projects the data onto a smaller number of components while keeping as much of the variance as possible (some information is lost, but ideally very little). We run PCA on the category vectors. They used to have 915 columns, which is the number of categories in the dataset; after PCA they have 100.

In [None]:
from sklearn.decomposition import PCA

# Fit a 100-component PCA on the category vectors, then project the same
# vectors onto those components.
pca = PCA(n_components=100).fit(category_vectors_np)
category_vectors_pca = pca.transform(category_vectors_np)

# To skip the reduction, use the raw vectors instead:
# category_vectors_pca = category_vectors_np

# One column per component, named cat_0, cat_1, ...
component_names = ['cat_' + str(i) for i in range(category_vectors_pca.shape[1])]
df_cat_vec = pd.DataFrame(category_vectors_pca, columns=component_names)
df_cat_vec

In [None]:
# Place the category-vector columns next to the influential variables.
df_train = pd.concat([df_influential_vars, df_cat_vec], axis=1)

# Express the lifetime in whole years instead of days.
lifetime_in_years = df_train['lifetime'].div(365)
df_train['lifetime'] = lifetime_in_years.astype('int')
df_train

# Supervised Learning

In [None]:
# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import sklearn

# We don't want to change the actual df_train, so we get a copy of that.
df_temp = df_train.copy()
# df_temp = df_temp[df_temp['categories'].str.contains('Shopping')]

# The identifier and the raw category string are not numeric features; the
# lifetime (in years) is the target, everything that remains is a feature.
df_temp.pop('business_id')
df_temp.pop('categories')
y = df_temp.pop('lifetime').to_numpy().astype('float')
X = df_temp.to_numpy().astype('float')

# Sometimes scaling helps the modeling to classify better.
# X = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.01, random_state=10, shuffle=True)

# Fit every model on the training split and predict the test split.
LR = LinearRegression().fit(X_train, y_train).predict(X_test)
LGR = LogisticRegression(random_state=0).fit(X_train, y_train).predict(X_test)
SVM = SVC(kernel='rbf').fit(X_train, y_train).predict(X_test)
GNB = GaussianNB().fit(X_train, y_train).predict(X_test)
DT = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

# Ensemble prediction: element-wise mean of the five models' predictions.
y_pred = np.mean(np.array((LR, LGR, SVM, GNB, DT)), axis=0)

# print('PRED: ', y_pred)
# print('ACTUAL: ', y_test)


def _rmse(y_true, y_hat):
    # Root of the mean squared error, computed explicitly so that it does not
    # depend on the `squared` argument of mean_squared_error, which newer
    # scikit-learn releases no longer accept.
    return np.sqrt(mean_squared_error(y_true, y_hat))


# Root-mean-squared error of every model and of the ensemble.
for label, prediction in (('LR:', LR), ('LGR:', LGR), ('SVM:', SVM),
                          ('GNB:', GNB), ('DT:', DT), ('Total:', y_pred)):
    print(label, _rmse(y_test, prediction))

# Unsupervised Learning
### Clustering using Agglomerative clustering algorithm:

In [None]:
# Cluster the businesses by their coordinates.
coords = df_business[['longitude', 'latitude']].to_numpy()

# Distance threshold passed to the clustering, in the units of the
# coordinates (degrees).
cutoff = 0.0045

# n_clusters=None lets distance_threshold decide how many clusters are formed.
clustering = AgglomerativeClustering(n_clusters=None, linkage='complete', distance_threshold=cutoff)
cluster_ids = clustering.fit_predict(coords)

# assign() returns a new dataframe with the extra column, which avoids the
# chained-assignment warning raised when writing into the filtered dataframe.
df_business = df_business.assign(neighbourhood=cluster_ids)

# Labels start at 0, so the largest label is one less than the number of
# clusters; n_clusters_ holds the actual number found.
print("The number of neighbourhoods found in BC is:", clustering.n_clusters_)

# Clustering Visualization

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm

# Plot neighborhoods

# Keep only the points whose longitude (first coordinate column) is west of
# -74.5, together with their cluster labels; anything east of that is left
# out of the plot.
west_of_cutoff = coords[:, 0] < -74.5
plot_clusters = cluster_ids[west_of_cutoff]
plot_coords = coords[west_of_cutoff]

# The 'tab20' palette resampled to 12 entries, built once instead of on every
# call. pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
# available in recent matplotlib releases.
# NOTE(review): the colours repeat every 20 labels but the palette has only 12
# entries, so some labels within a cycle share a colour - confirm that 12 is
# intended.
_NEIGHBOURHOOD_CMAP = plt.get_cmap('tab20', 12)


def get_color(value):
    """Return the RGBA colour used to draw cluster label `value`.

    The label is wrapped into a cycle of 20, scaled to the range [0, 1] and
    looked up in the resampled 'tab20' colormap.
    """
    cycle = 20
    color_value = (value % cycle) / (cycle - 1)
    return _NEIGHBOURHOOD_CMAP(color_value)

# Large, high-resolution figure.
plt.figure(figsize=(12,8), dpi= 400)

# One colour per plotted point, chosen from its cluster label.
colors = [get_color(int(label)) for label in plot_clusters]

# Longitude on the x-axis, latitude on the y-axis, one small dot per business.
longitudes = plot_coords[:,0]
latitudes = plot_coords[:,1]
plt.scatter(longitudes, latitudes, c=colors, s=1)

plt.show()

### Another visualization for the neighbourhoods that also clusters them realtime:

In [None]:
import folium
from folium.plugins import FastMarkerCluster

# One (latitude, longitude) pair per business.
lats, lons = (df_business[column].tolist() for column in ('latitude', 'longitude'))
locations = list(zip(lats, lons))

# Interactive map opened on the coordinates below; the markers are clustered
# by the FastMarkerCluster plugin.
# NOTE(review): newer folium releases may no longer ship the "Stamen Terrain"
# tiles - confirm against the installed version.
map_centre = [49.2827, -123.1207]
map1 = folium.Map(location=map_centre, tiles="Stamen Terrain", zoom_start=12)
FastMarkerCluster(data=locations).add_to(map1)
map1

### Using Kmeans to cluster the businesses into different neighbourhoods:

In [None]:
# Features for clustering, selected by column position.
# NOTE(review): presumably columns 6 and 7 are latitude and longitude -
# confirm against the dataframe's column order.
x = df_business.iloc[:, [6 ,7]].values
cluster_range = range(1225, 1235)

# Fit one KMeans model per candidate cluster count and record its
# within-cluster sum of squares (inertia).
wcss = []
for n_clusters in cluster_range:
    print("looking at cluster: ", n_clusters)
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300,
                    n_init=10, random_state=0).fit(x)
    wcss.append(kmeans.inertia_)

# Line graph of the recorded values, used to look for 'the elbow'.
plt.plot(cluster_range, wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within cluster sum of squares
plt.show()

### Using Elbow Method to find the best K:

In [None]:
# Small K

# Elbow search over a small range of K using the yellowbrick visualizer.
from yellowbrick.cluster import KElbowVisualizer
    
# Instantiate the clustering model and visualizer
# NOTE(review): k=(4,12) is presumably the range of cluster counts to try -
# confirm against the yellowbrick documentation.
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,12))

visualizer.fit(x)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
# Large K

# Cluster with the number of clusters chosen from the elbow results.
kmeans = KMeans(n_clusters=1231, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(x)

fig, ax = plt.subplots(figsize=(12,8), dpi= 400)

# Second feature column on the x-axis, first on the y-axis; every point is
# coloured by the cluster it was assigned to.
ax.scatter(x[:,1], x[:,0], c=y_kmeans, alpha=0.9, s=1, cmap='jet')

# Overlay the cluster centroids, using the same column order as the points.
centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 1], centroids[:, 0], s=50, c='pink', label='Centroids', alpha=0.3)

# plt.legend()
plt.show()