# Analysis of the Restaurant datasets

# Installing required packages

In [None]:
!pip install beautifulsoup4
!pip install geocoder

# Importing all required packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import os
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import geopandas as gpd
import matplotlib.pyplot as plt
import descartes
from shapely.geometry import Point, Polygon
import geoplot
import folium
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

# Seeing all files present

In [None]:
# Print the full path of every file found below the Kaggle input directory.
input_paths = [
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for path in input_paths:
    print(path)

# A. Starting with Top 250 Restaurants

# Defining function to convert % columns to floats

In [None]:
# Directory prefix prepended to every CSV file name read in this notebook.
prefix = '/kaggle/input/restaurant-business-rankings-2020/'

def p2f(x):
    """Convert a percentage string such as ``'12.5%'`` to a fraction (0.125).

    Surrounding whitespace is ignored. A blank value yields NaN instead of
    raising, so a missing cell does not abort the whole CSV load.
    """
    text = str(x).strip().strip('%').strip()
    if not text:
        return float('nan')
    return float(text) / 100

# A.1 Loading the Top 250 dataframe

In [None]:
df_top250 = pd.read_csv(prefix + 'Top250.csv', converters={'YOY_Sales':p2f, 'YOY_Units':p2f})

df_top250

# A.2 Checking for NaNs
There are plenty of NaNs. The contents column especially has relatively little value for this analysis (more data may be relevant for sentiment analysis, potentially). For now it can be dropped.

In [None]:
df_top250.isna().sum()

In [None]:
df_top250 = df_top250.drop('Content', axis = 1)

# A.3 Filling in missing Headquarters
I believe the HQ location is an important feature for analysis. Where it is centrally located may well have a large bearing on how well it performs. So I do some data scraping from Wikipedia to grab as much location info as possible.

In [None]:
url = "https://en.wikipedia.org/wiki/List_of_restaurant_chains_in_the_United_States"

# Fail loudly on a hung connection or an HTTP error instead of silently
# parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
website_url = response.text

soup = BeautifulSoup(website_url,'lxml')

tables = soup.find_all('table',class_="wikitable")

# Cell positions inside one table row: the first cell is read as the
# restaurant name and the fourth as its headquarters.
# NOTE(review): assumes every wikitable on the page shares this layout - confirm.
NAME_CELL, HQ_CELL = 0, 3

df_list = []
for table in tables:
    records = []
    # Read the table row by row so that a row with an unexpected number of
    # cells is skipped rather than shifting every following name/HQ pair.
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) <= HQ_CELL:
            continue
        records.append((cells[NAME_CELL].get_text().strip('\n'),
                        cells[HQ_CELL].get_text().strip('\n')))

    df = pd.DataFrame(records, columns =['Restaurant', 'Headquarters_temp'])

    df_list.append(df)

name_hq = pd.concat(df_list).reset_index(drop = True)

# A.4 Merging HQ in original df with Wiki data
Some data was missing in Wiki but present in the original table. So I simply merge the two.

In [None]:
# Left-join the scraped headquarters onto the Top 250 table, keep the original
# 'Headquarters' value where present and fall back to the scraped one.
# NOTE(review): a left merge repeats a row when name_hq lists the same
# restaurant more than once - confirm the row count is unchanged.
new_df = df_top250.merge(name_hq, how='left', on='Restaurant')
new_df['Headquarters'] = new_df['Headquarters'].fillna(new_df['Headquarters_temp'])

df_top250 = new_df.drop(columns='Headquarters_temp')

# A.5 Check NaNs now...
There are considerably fewer now. However, 66 missing values is still too many, so I will try to find another way to fill this gap.

In [None]:
df_top250.isna().sum()

# A.6 Converting locations to a fixed format
I use geolocator to extract the locations from the city info only...

In [None]:
# Keep the text before the first comma as 'City'; the remainder goes into a
# throwaway 'Extra' column. `n` is passed by keyword because newer pandas
# versions no longer accept it positionally.
df_top250[['City', 'Extra']] = df_top250['Headquarters'].str.split(',', n=1, expand=True)

df_top250.drop(columns = ['Extra', 'Headquarters'], inplace = True)

In [None]:
geolocator = Nominatim(user_agent = "geoapiExercises")

def findfullad(city):
    """Geocode *city* with the module-level ``geolocator``.

    Returns the first element of the geocoder result, or the string
    'Unknown' when the geocoder returns None.
    """
    match = geolocator.geocode(city)
    return 'Unknown' if match is None else match[0]

df_top250['Headquarters'] = df_top250['City'].apply(findfullad)

# A.7 Fixing incorrect records
...however, there is a slight issue: some US cities share their names with cities (and countries) elsewhere in the world, so the geocoder returns the wrong place. I fix these records manually.

In [None]:
# Step 1: strip digits and the stray separators they leave behind.
# The digit pattern is a raw string so that '\d' is not parsed as a string escape.
for pattern in (r'\d+', ' ,', ' -,'):
    df_top250['Headquarters'] = df_top250['Headquarters'].str.replace(pattern, '', regex = True)

# Step 2: replace addresses the geocoder resolved to a same-named place abroad.
# These are plain substrings, so regex=False is passed explicitly rather than
# relying on the pandas default, which differs between versions.
# The replacements run in this order.
hq_corrections = {
    'University of Nottingham, Wollaton Vale, Wollaton, City of Nottingham, Nottinghamshire, East Midlands, England, NG RD, United Kingdom': 'University Park, Miami-Dade County, Florida, United States',
    'New York, United States': 'New York City, New York, United States',
    'Αθήνα, Δήμος Αθηναίων, Περιφερειακή Ενότητα Κεντρικού Τομέα Αθηνών, Περιφέρεια Αττικής, Αποκεντρωμένη Διοίκηση Αττικής, Ελλάς': 'Athens, Georgia, United States',
    'لبنان': 'Lebanon, Tennessee, United States',
    'Birmingham, West Midlands Combined Authority, West Midlands, England, United Kingdom': 'Birmingham, Alabama, United States',
    'Cheshire, England, United Kingdom': 'Cheshire, Connecticut, United States',
    'Maitland City Council, New South Wales, Australia': 'Maitland, Florida, United States',
    'Dublin, Dublin Leinster, Éire / Ireland': 'Dublin, Ohio, United States',
    'Toledo, Castilla-La Mancha, España': 'Toledo, Ohio, United States',
}
for wrong_address, right_address in hq_corrections.items():
    df_top250['Headquarters'] = df_top250['Headquarters'].str.replace(wrong_address, right_address, regex = False)

# A.8 Filling in final gaps
I thought of a way of using google searches recursively on the missing restaurants to find the headquarters by web scraping, but it was not placed commonly anywhere... so I did a manual insert on those records.

In [None]:
# Manually researched headquarters, keyed by the exact 'Restaurant' name used
# in the Top 250 table. Values follow the "City, State, Country" layout that the
# later State/Country split relies on (non-US entries carry only two parts).
dict_locs = {"Dunkin'": "Canton, Massachusetts, United States",
"Popeyes Louisiana Kitchen": "Miami, Florida, United States",
"Chili's Grill & Bar": "Dallas, Texas, United States",
"Papa John's": "Louisville, Kentucky, United States",
"Jimmy John's Gourmet Sandwiches": "Champaign, Illinois, United States",
"Hardee's": "Franklin, Tennessee, United States",
"Five Guys Burgers and Fries": "Lorton, Virginia, United States",
"Red Robin Gourmet Burgers and Brews": "Greenwood Village, Colorado, United States",
"Carl's Jr.": "Franklin, Tennessee, United States",
"Bojangles'": "Charlotte, North Carolina, United States",
"BJ's Restaurant & Brewhouse": "Huntington Beach, California, United States",
"P.F. Chang's": "Scottsdale, Arizona, United States",
"Qdoba Mexican Eats": "San Diego, California, United States",
"Bob Evans": "New Albany, Ohio, United States",
"Papa Murphy's Pizza": "Vancouver, Washington, United States",
"Captain D's Seafood Kitchen": "Nashville, Tennessee, United States",
"Perkins Restaurant & Bakery": "Memphis, Tennessee, United States",
"Checkers Drive-In Restaurants": "Tampa, Florida, United States",
"Jamba": "Atlanta, Georgia, United States",
"Portillo's": "Oak Brook, Illinois, United States",
"Potbelly sandwich Shop": "Chicago, Illinois, United States",
"Bahama Breeze Island Grille": "Orlando, Florida, United States",
"Pret A Manger": "London, United Kingdom",
"Mastro's Restaurants": "Newport Beach, California, United States",
"Uncle Julio's": "Irving, Texas, United States",
"Rubio's": "Carlsbad, California, United States",
"A&W All-American Food": "Lexington, Kentucky, United States",
"Brio Tuscan Grille": "Columbus, Ohio, United States",
"Lazy Dog Restaurant & Bar": "Huntington Beach, California, United States",
"Souplantation & Sweet Tomatoes": "San Diego, California, United States",
"Del Frisco's Double Eagle Steak House": "The Post Oak, Houston, Texas, United States",
"Which Wich": "Dallas, Texas, United States",
"Firebirds Wood Fired Grill": "Charlotte, North Carolina, United States",
"True Food Kitchen": "Phoenix, Arizona, United States",
"Mountain Mike's Pizza": "Hayward, California, United States",
"Bubba Gump Shrimp Co.": "Houston, Texas, United States",
"La Madeleine Country French Cafe": "Dallas, Texas, United States",
"Giordano's": "Chicago, Illinois, United States",
"Islands Fine Burgers & Drinks": "Carlsbad, California, United States",
"Mimi's Bistro & Bakery": "Dallas, Texas, United States",
"Beef 'O' Brady's": "Tampa, Florida, United States",
"Metro Diner": "Tampa, Florida, United States",
"Smokey Bones Bar & Fire Grill": "Tampa, Florida, United States",
"LaRosa's Pizzeria": "Cincinnati, Ohio, United States",
"Roosters": "Moore Park, Australia",
"Great Harvest Bread Co.": "Dillon, Montana, United States",
"Shari's Cafe and Pies": "Beaverton, Oregon, United States",
"Grand Lux Cafe": "Calabasas Hills, California, United States",
"Anthony's Coal Fired Pizza": "Fort Lauderdale, Florida, United States",
"Chicken Salad Chick": "Auburn, Alabama, United States",
"Paris Baguette": "Seongnam-si, South Korea",
"Eat'n Park": "Homestead, Pennsylvania, United States",
"Taziki's Mediterranean Cafe": "Birmingham, Alabama, United States",
"Duffy's Sports Grill": "Lake Worth, Florida, United States",
"Topgolf": "Dallas, Texas, United States",
"Ocean Prime": "Columbus, Ohio, United States",
"Old Country Buffet/HomeTown Buffet": "Hollywood Park, Texas, United States",
"Nobu": "New York, New York, United States",
"Mission BBQ": "Glen Burnie, Maryland, United States",
# Baton Rouge is in Louisiana; the previous value placed it in Los Angeles,
# California, which mislabelled both the State and the map position.
"Walk-On's Sports Bistreaux": "Baton Rouge, Louisiana, United States",
"WaBa Grill": "Los Angeles, California, United States",
"54th Street Restaurant & Drafthouse": "Kansas City, Missouri, United States",
"Costa Vida Fresh Mexican Grill": "Salt Lake City, Utah, United States",
"Gyu-Kaku": "New York City, New York, United States",
"PDQ": "Tampa, Florida, United States",
"Lupe Tortilla": "Houston, Texas, United States",
"Cook-Out Restaurant": "Thomasville, North Carolina, United States",
"Jollibee":"Quezon City, Luzon"}

In [None]:
# Overwrite the headquarters of every restaurant that has a manual entry in
# dict_locs, then discard the helper 'City' column.
has_manual_hq = df_top250['Restaurant'].isin(list(dict_locs))
df_top250.loc[has_manual_hq, 'Headquarters'] = df_top250.loc[has_manual_hq, 'Restaurant'].map(dict_locs)

df_top250.drop(columns = ['City'], inplace = True)

In [None]:
df_top250.isna().sum()

# All clean now! Time to use it...

# A.9 Finding lat-long geospatial info from locations
I find the lat-long and split them into two columns.

In [None]:
df_top250

In [None]:
def findlonglat(city):
    """Geocode *city* and return its coordinates as one comma-separated string.

    The second element of the geocoder result is converted with ``str`` and
    its first and last characters (the surrounding brackets) are removed;
    callers split the text on ',' into latitude and longitude.

    Returns None when the geocoder finds no match, instead of failing with a
    TypeError from subscripting None; the later ``str.split`` / ``to_numeric``
    steps turn that into NaN coordinates.
    """
    match = geolocator.geocode(city)
    if match is None:
        return None
    return str(match[1])[1:-1]



# Geocode every headquarters, split the returned coordinate text on the comma
# and store the two halves as numeric latitude / longitude columns.
headlonglat = ['Headquarters_Latitude','Headquarters_Longitude']
coordinate_text = df_top250['Headquarters'].apply(findlonglat)
df_top250[headlonglat] = coordinate_text.str.split(',', expand=True)
df_top250[headlonglat] = df_top250[headlonglat].apply(pd.to_numeric)

In [None]:
df_top250

# A.10 Plotting data on map
The plot is done using folium. It shows where the most dense HQ is. Clearly this is around the East Coast of the US!

The green points are in the top 50, whilst orange are all others. Apart from Starbucks on the West coast, all other top 50 companies are in the East Coast.

In [None]:
max_sales = max(df_top250['Sales'])

m = folium.Map(location=[50.70, -33.94], zoom_start=1, tiles='CartoDB positron')

def color_producer(val):
    """Return the marker colour for a rank: 'green' for the top 50, else 'orange'.

    The comparison is inclusive so that rank 50 itself counts as top 50.
    """
    return 'green' if val <= 50 else 'orange'

# One circle per restaurant: position = headquarters coordinates,
# radius = sales relative to the largest seller, colour = rank band.
for _, row in df_top250.iterrows():
    folium.CircleMarker(
        location = [row['Headquarters_Latitude'], row['Headquarters_Longitude']],
        radius = 10*row['Sales']/max_sales,
        tooltip = f"{row['Restaurant']} (Rank: {str(row['Rank'])})",
        popup = row['Headquarters'],
        color = color_producer(row['Rank']),
    ).add_to(m)

m

# A.11 Haversine Distance
The great-circle (haversine) distance can be calculated using this function, giving the distance in km between two restaurants' headquarters.

In [None]:
def haversine_distance(row):
    """Return the haversine distance in km between the two points stored in *row*.

    *row* must provide 'Headquarters_Latitude_x' / 'Headquarters_Longitude_x'
    for the first point and the matching '_y' keys for the second, all in
    decimal degrees.
    """
    lat_p, lon_p = row['Headquarters_Latitude_x'], row['Headquarters_Longitude_x']
    lat_d, lon_d = row['Headquarters_Latitude_y'], row['Headquarters_Longitude_y']
    radius = 6371 # km

    dlat = np.radians(lat_d - lat_p)
    dlon = np.radians(lon_d - lon_p)
    a = np.sin(dlat/2)**2 + np.cos(np.radians(lat_p)) * np.cos(np.radians(lat_d)) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make the square roots below NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    return radius * c

# A.12 Checking distance between Top N restaurants

In [None]:
def Distance_Check(df, top_n_row):
    """Return the pairwise headquarters distances of the first *top_n_row* rows.

    Parameters
    ----------
    df : DataFrame with 'Restaurant', 'Headquarters_Latitude' and
        'Headquarters_Longitude' columns.
    top_n_row : number of leading rows of *df* to compare.

    Returns
    -------
    DataFrame with one row per unordered pair of restaurants: the three input
    columns suffixed '_x' (earlier row) and '_y' (later row) plus 'distance'
    as computed by ``haversine_distance``, sorted by descending distance.
    Fewer than two rows yield an empty frame with the same columns.

    Every pair is generated exactly once. The previous implementation repeated
    each restaurant a fixed 4 times, so with more than 5 rows most pairs were
    never compared, and it removed mirrored pairs by de-duplicating on the
    distance value, which also dropped distinct pairs that happened to be
    equally far apart.
    """
    coord_cols = ['Restaurant','Headquarters_Latitude','Headquarters_Longitude']
    # Reset the index so positions are 0..n-1 whatever labels *df* carries.
    perm = df.head(top_n_row)[coord_cols].reset_index(drop = True)

    # Positions (i, j) with i < j, i.e. the strict upper triangle of the
    # n x n pair matrix.
    first_pos, second_pos = np.triu_indices(len(perm), k = 1)

    first = perm.iloc[first_pos].reset_index(drop = True).add_suffix('_x')
    second = perm.iloc[second_pos].reset_index(drop = True).add_suffix('_y')
    distance_df = pd.concat([first, second], axis = 1)

    if distance_df.empty:
        distance_df['distance'] = pd.Series(dtype = float)
        return(distance_df)

    distance_df['distance'] = distance_df.apply(haversine_distance, axis = 1)

    distance_df = distance_df.sort_values(by=['distance'], ascending=False).reset_index(drop = True)

    return(distance_df)

# A.13 Restaurant distance pair
An obvious pattern doesn't really exist, but it appears aside from the top few restaurants (which are on the West coast), the remaining restaurants are close together. This quantifies this closeness.

In [None]:
def Plot_Rest_Dist():
    """Bar-plot the distance of every restaurant pair in the module-level ``distance_df``.

    Side effect: adds a 'Restaurant_Pair' label column to ``distance_df``.
    """
    distance_df['Restaurant_Pair'] = distance_df['Restaurant_x'] + " - " + distance_df['Restaurant_y']

    sns.set_theme(style="whitegrid")
    # The unused `sns.load_dataset("tips")` call was removed: it fetched an
    # unrelated sample dataset on every plot.
    ax = sns.barplot(x="Restaurant_Pair", y="distance", data=distance_df, palette = 'ch:start=.2,rot=-.3')
    plt.setp(ax.get_xticklabels(), rotation=90)

distance_df = Distance_Check(df_top250, top_n_row = 10)
    
Plot_Rest_Dist()

# A.14 Top 20 Sales by Restaurants
The most sales are from McDonald's.

In [None]:
def Plot_Rest_Sales(df,var,top_n_row = 20):
    """Bar-plot column *var* for the first *top_n_row* rows of *df*, one bar per 'Restaurant'."""
    sns.set_theme(style="whitegrid")
    # The unused `sns.load_dataset("tips")` call was removed: it fetched an
    # unrelated sample dataset on every plot.
    ax = sns.barplot(x="Restaurant", y=var, data=df.head(top_n_row), palette = 'viridis')
    plt.setp(ax.get_xticklabels(), rotation=90)
    
Plot_Rest_Sales(df_top250, var = 'Sales')

# A.15 Top 20 Sales by Segment Category
Quick Service & Burger is clearly the best-selling segment.

In [None]:
top = 20

# Total sales per segment, largest first. Grouping by the column name and
# summing only 'Sales' keeps the text column out of the aggregation, so
# reset_index() cannot collide with an existing 'Segment_Category' column.
top_seg = (
    df_top250.groupby('Segment_Category')['Sales']
    .sum()
    .sort_values(ascending = False)
    .head(top)
    .reset_index()
)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="Segment_Category", y="Sales", data=top_seg, palette = "mako")
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# A.16 Starburst plot
There are quite significant contributors in the Quick Service & Burgers area - it is largely composed by McDonald's. Whilst Starbucks is Rank 2, it's still significantly smaller but still bolsters Quick Service & Coffee Cafe.

In [None]:
def StarburstPlot(df, var, sortvar):
    """Show a plotly sunburst of *sortvar* with *var* as the inner ring and 'Restaurant' as the outer ring.

    Only rows whose *var* value appears as a group key of the aggregation
    are plotted.
    """
    group_totals = df[[var, sortvar]].groupby(df[var]).sum().sort_values(sortvar, ascending = False)
    plotted_groups = list(group_totals.index)

    plotted_rows = df[df[var].isin(plotted_groups)]

    px.sunburst(plotted_rows, path = [var,'Restaurant'], values=sortvar).show()
    
StarburstPlot(df_top250, 'Segment_Category', 'Sales')

# A.17 Correlation map
This checks for the collinearity between variables. As I intend to do a regression model, this may be important.

In [None]:
def CorMap(df, cols):
    """Draw a pair grid of *df* without the columns in *cols* and return its correlation matrix.

    Parameters
    ----------
    df : source DataFrame.
    cols : column names to exclude before plotting and correlating.

    Returns
    -------
    DataFrame of pairwise correlations between the remaining numeric columns.
    """
    df_corr = df.drop(columns = cols)

    # Correlate numeric columns only. Newer pandas raises on leftover text
    # columns instead of silently ignoring them.
    corr = df_corr.select_dtypes(include = ['number', 'bool']).corr()

    g = sns.PairGrid(df_corr)
    g.map_diag(plt.hist)
    g.map_offdiag(plt.scatter)

    return(corr)
    
cols = ['Rank', 'Restaurant', 'Segment_Category', 'Headquarters','Headquarters_Longitude', 'Headquarters_Latitude']
    
corr = CorMap(df_top250, cols)

# A.18 Collinearity matrix
This quantifies the correlation between variables.

In [None]:
def Coll_Matrix():
    """Draw an annotated heatmap of the module-level ``corr`` matrix, hiding its upper triangle."""
    sns.set_theme(style="white")

    # True on and above the diagonal: those cells are masked out.
    hidden_cells = np.triu(np.ones_like(corr, dtype=bool))

    plt.subplots(figsize=(11, 9))

    diverging = sns.diverging_palette(230, 20, as_cmap=True)

    heatmap_options = dict(mask=hidden_cells, cmap=diverging, center=0,
                           square=True, linewidths=.5,
                           cbar_kws={"shrink": .5}, annot=True)
    sns.heatmap(corr, **heatmap_options)
    
Coll_Matrix()

# A.19 Segment Category count
There are not too many segment categories in total. The starburst plot earlier demonstrated that restaurant performance is quite heavily driven by this parameter.

In [None]:
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# newer seaborn releases.
ax = sns.countplot(x=df_top250.Segment_Category)
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# A.20 Extracting State and Country
The level of granularity I am interested in is State level - restaurants' general location may be important to determine sales.

In [None]:
# Split from the right into at most three parts and keep the last two as
# State and Country. `n` is passed by keyword because newer pandas versions
# no longer accept it positionally.
df_top250[["State","Country"]] = df_top250.Headquarters.str.rsplit(', ', n=2, expand=True).drop(0, axis = 1)

In [None]:
df_LocCat_top250 = df_top250.drop(columns = ['Rank','Restaurant', 'Headquarters', 'Headquarters_Latitude', 'Headquarters_Longitude'])

# Headquarters outside the US have only two address parts, so the generic
# State/Country split mislabels them. Fix them by restaurant name rather than
# by hard-coded row label: labels shift whenever the row count changes, and
# assigning to a label that does not exist makes .loc append a new row.
# NOTE(review): names inferred from the matching dict_locs entries - confirm
# they are the rows the original labels 152 / 207 / 216 / 250 referred to.
foreign_hq = {
    'Pret A Manger': ('London', 'United Kingdom'),
    'Roosters': ('Moore Park', 'Australia'),
    'Paris Baguette': ('Seongnam-si', 'South Korea'),
    'Jollibee': ('Quezon City', 'Luzon'),
}
for restaurant_name, (state_name, country_name) in foreign_hq.items():
    # df_LocCat_top250 shares its index with df_top250, so the mask aligns.
    is_restaurant = df_top250['Restaurant'] == restaurant_name
    df_LocCat_top250.loc[is_restaurant, 'State'] = state_name
    df_LocCat_top250.loc[is_restaurant, 'Country'] = country_name

In [None]:
df_LocCat_top250

# A.21 State count
Certain States certainly dominate the data. This indicates clusters of restaurants in particular regions.

In [None]:
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# newer seaborn releases.
ax = sns.countplot(x=df_LocCat_top250.State)
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# A.22 Performing regression
I am attempting a regression model to determine Sales from remaining parameters. There are not many records so overfitting is inevitable. However, it is a cool experiment anyway.

Units, as seen in the density plot above, decay exponentially, as do Sales, so I apply a log transform to both. I also one-hot encode the categorical variables (State, Segment Category and Country).

Together these transforms result in data within reasonable constraints for modelling.

In [None]:
X = df_LocCat_top250.drop('Sales', axis = 1)

y = df_LocCat_top250[['Sales']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

ohe_list = ['Segment_Category', 'State', 'Country']

# One-hot encode each categorical column separately for train and test.
for ohe in ohe_list:
    ohe_df = pd.get_dummies(X_train[ohe], prefix = ohe)
    X_train = pd.concat([X_train, ohe_df], axis=1).drop([ohe], axis=1)

    ohe_df = pd.get_dummies(X_test[ohe], prefix = ohe)
    X_test = pd.concat([X_test, ohe_df], axis=1).drop([ohe], axis=1)

# Log-transform Units exactly once. This used to sit inside the loop above,
# so the logarithm was applied once per encoded column (three times in total).
X_train['Units'] = np.log(X_train['Units'])
X_test['Units'] = np.log(X_test['Units'])

# Union of train and test columns (train order first), so both matrices share
# one layout; a dummy column missing on one side is filled with 0.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
col_list = pd.concat([X_train, X_test]).columns.tolist()

X_train = X_train.reindex(columns = col_list).fillna(0)
X_test = X_test.reindex(columns = col_list).fillna(0)

y_train = np.log(y_train)
y_test = np.log(y_test)

# A.23 Random forest model
I run a random forest regressor.

In [None]:
rf_model = RandomForestRegressor(verbose = 1)
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects to avoid scikit-learn's column-vector conversion warning.
rf_model.fit(X_train, y_train.values.ravel())
y_hat_rf = rf_model.predict(X_test)

# A.24 XGBoost model
I attempt an xgboost model, using squared error (`reg:squarederror`) as the training objective.

In [None]:
# DMatrix view of the training data (not consumed by the sklearn-style
# regressor fitted below).
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

# Hyper-parameters of the gradient-boosted regressor.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}
xgb_reg = xgb.XGBRegressor(**xgb_params)

xgb_reg.fit(X_train,y_train)

y_hat_xgb = xgb_reg.predict(X_test)

# A.25 Random Forest Error in model
This is the error in the random-forest model. Due to the log transforms, this value is difficult to interpret. It is NOT the log of the error, however.

In [None]:
mean_absolute_error(y_test, y_hat_rf)

# A.26 XGBoost Error in model
This is the error in the xgboost model. Due to the log transforms, this value is difficult to interpret. It is NOT the log of the error, however.

In [None]:
mean_absolute_error(y_test, y_hat_xgb)

# A.27 Combining the results from models
Here I merge the results from both models with the actuals to plot.

In [None]:
df1 = pd.DataFrame(data=y_hat_rf, columns=["Random_Forest"])
df2 = pd.DataFrame(data=y_hat_xgb, columns=["XGBoost"])

# Actuals and both predictions side by side, largest actual first.
df = pd.concat([y_test.reset_index(drop = True), df1, df2], axis=1).sort_values(by = 'Sales', ascending = False).reset_index(drop = True)

# Undo the log transform on every column at once. The previous per-column
# loop used `cols` as its loop variable, overwriting the module-level list of
# column names with a string.
df = np.exp(df)

# A.28 Viewing all estimates
It appears that for extreme values, the model becomes inaccurate, yet there is relatively higher accuracy for smaller values.

In [None]:
def Estimateplot(slicer, var):
    """Scatter the model estimates in the module-level ``df`` against the actual *var*.

    Rows before position *slicer* are left out of the scatter and of the axis
    limits; a dashed red identity line is drawn from the full ``df``.
    """
    shown = df.iloc[slicer:]
    estimate_label = 'Estimated ' + var

    long_form = shown.melt(var, var_name='Model', value_name=estimate_label)

    sns.scatterplot(x=var, y=estimate_label, hue='Model', data=long_form)
    axes = sns.lineplot(x = df[var], y = df[var], style=True, palette=['red'], dashes=[(2,2)])

    lower, upper = min(shown[var]), max(shown[var])
    axes.set(xlim=(lower, upper), ylim=(lower, upper))
    
Estimateplot(0,'Sales')

# A.29 Viewing smaller estimates
This removes the 5 highest values. The closeness of the estimated values to the actuals at lower values is clearer here.

The Random forest regressor is clearly a better fit.

In [None]:
Estimateplot(5, 'Sales')

# B. Starting with Independence 100 Restaurants data

# B.1 Loading Independence 100 data

In [None]:
df_ind100 = pd.read_csv(prefix + 'Independence100.csv')

df_ind100

# B.2 Finding HQ from Location data
Using Nominatim to get consistent location info.

In [None]:
df_ind100['Headquarters'] = df_ind100['City'].apply(findfullad)

# B.3 Correcting small errors

In [None]:
# Strip digits and the stray separators they leave behind. The digit pattern
# is a raw string so that '\d' is not parsed as a string escape.
for pattern in (r'\d+', ' ,', ' -,'):
    df_ind100['Headquarters'] = df_ind100['Headquarters'].str.replace(pattern, '', regex = True)
# Manual correction of one record, addressed by its row label.
df_ind100.loc[38,'Headquarters'] = 'Bal Harbour, Florida, United States'

# B.4 Grabbing Latitude-Longitude values
Once again using Nominatim to get Lat-Long values

In [None]:
# Geocode every headquarters, split the returned coordinate text on the comma
# and store the two halves as numeric latitude / longitude columns.
headlonglat = ['Headquarters_Latitude','Headquarters_Longitude']
coordinate_text = df_ind100['Headquarters'].apply(findlonglat)
df_ind100[headlonglat] = coordinate_text.str.split(',', expand=True)
df_ind100[headlonglat] = df_ind100[headlonglat].apply(pd.to_numeric)
    
df_ind100

# B.5 Visualising in Folium
Putting the top 20% in green on the Folium map.

In [None]:
max_sales = max(df_ind100['Sales'])

m = folium.Map(location=[40.70, -93.94], zoom_start=3, tiles='CartoDB positron')

def color_producer(val):
    """Return the marker colour for a rank: 'green' for the top 20, else 'orange'.

    The comparison is inclusive so that rank 20 itself counts as top 20.
    """
    return 'green' if val <= 20 else 'orange'

# One circle per restaurant: position = headquarters coordinates,
# radius = sales relative to the largest seller, colour = rank band.
for _, row in df_ind100.iterrows():
    folium.CircleMarker(
        location = [row['Headquarters_Latitude'], row['Headquarters_Longitude']],
        radius = 10*row['Sales']/max_sales,
        tooltip = f"{row['Restaurant']} (Rank: {str(row['Rank'])})",
        popup = row['Headquarters'],
        color = color_producer(row['Rank']),
    ).add_to(m)

m

# B.6 Plotting distances between Top 20 Restaurants

In [None]:
distance_df = Distance_Check(df_ind100, top_n_row = 20)

Plot_Rest_Dist()

# B.7 Top 20 Restaurants by Sales
Top performer is Carmine's (Times Square)

In [None]:
Plot_Rest_Sales(df_ind100, var = 'Sales')

# B.8 Correlation map

In [None]:
cols = ['Rank', 'Restaurant', 'Headquarters','Headquarters_Longitude', 'Headquarters_Latitude']
    
corr = CorMap(df_ind100, cols)

# B.9 Correlation Matrix
Grabbing correlation data between fields.

In [None]:
Coll_Matrix()

# B.10 Replacing New York with New York City

In [None]:
df_ind100.Headquarters = df_ind100.Headquarters.replace('New York, United States', 'New York City, New York, United States')

# B.11 Separating data into State and Country
All records are in United States so I drop the Country column.

In [None]:
# Split from the right into at most three parts and keep the last two as
# State and Country. `n` is passed by keyword because newer pandas versions
# no longer accept it positionally.
df_ind100[["State","Country"]] = df_ind100.Headquarters.str.rsplit(', ', n=2, expand=True).drop(0, axis = 1)

df_ind100.drop(columns = ['Country', 'City'], inplace = True)

df_ind100

# B.12 Drop unneeded columns for Location plot

In [None]:
df_LocCat_ind100 = df_ind100.drop(columns = ['Rank','Restaurant', 'Headquarters', 'Headquarters_Latitude', 'Headquarters_Longitude'], axis = 1)

In [None]:
df_LocCat_ind100

# B.13 Viewing Location data
Most restaurants are in New York. Clearly Location matters!

In [None]:
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# newer seaborn releases.
ax = sns.countplot(x=df_LocCat_ind100.State)
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# B.14 Setting up Regression models
I use log transforms once again to translate the fields. Technically a min-max scaler may be better here with a Yeo-Johnson transform.

In [None]:
X = df_LocCat_ind100.drop('Sales', axis = 1)

y = df_LocCat_ind100[['Sales']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# One-hot encode State separately for train and test.
ohe_df = pd.get_dummies(X_train['State'], prefix = 'State')
X_train = pd.concat([X_train, ohe_df], axis=1).drop(['State'], axis=1)

ohe_df = pd.get_dummies(X_test['State'], prefix = 'State')
X_test = pd.concat([X_test, ohe_df], axis=1).drop(['State'], axis=1)

# Union of train and test columns (train order first), so both matrices share
# one layout; a dummy column missing on one side is filled with 0.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
col_list = pd.concat([X_train, X_test]).columns.tolist()

X_train = X_train.reindex(columns = col_list).fillna(0)
X_test = X_test.reindex(columns = col_list).fillna(0)

log_cols = ['Average Check', 'Meals Served']

# `log_col` rather than `cols`, so the module-level list of column names used
# by the correlation cells is not overwritten with a string.
for log_col in log_cols:
    X_train[log_col] = np.log(X_train[log_col])
    X_test[log_col] = np.log(X_test[log_col])

y_train = np.log(y_train)
y_test = np.log(y_test)

# B.15 Random-Forest model
Running a Random-Forest model to predict sales.

In [None]:
rf_model = RandomForestRegressor(verbose = 1)
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects to avoid scikit-learn's column-vector conversion warning.
rf_model.fit(X_train, y_train.values.ravel())
y_hat_rf = rf_model.predict(X_test)

# B.16 Running XGBoost model

In [None]:
# DMatrix view of the training data (not consumed by the sklearn-style
# regressor fitted below).
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

# Hyper-parameters of the gradient-boosted regressor.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}
xgb_reg = xgb.XGBRegressor(**xgb_params)

xgb_reg.fit(X_train,y_train)

y_hat_xgb = xgb_reg.predict(X_test)

# B.17 Mean Absolute error for Random Forest model

In [None]:
mean_absolute_error(y_test, y_hat_rf)

# B.18 Mean Absolute error for XGBoost model
Due to earlier log transform, this is difficult to directly interpret.

In [None]:
mean_absolute_error(y_test, y_hat_xgb)

# B.19 Join XGBoost and Random forest with actuals

In [None]:
df1 = pd.DataFrame(data=y_hat_rf, columns=["Random_Forest"])
df2 = pd.DataFrame(data=y_hat_xgb, columns=["XGBoost"])

# Actuals and both predictions side by side, largest actual first.
df = pd.concat([y_test.reset_index(drop = True), df1, df2], axis=1).sort_values(by = 'Sales', ascending = False).reset_index(drop = True)

# Undo the log transform on every column at once. The previous per-column
# loop used `cols` as its loop variable, overwriting the module-level list of
# column names with a string.
df = np.exp(df)

# B.20 Estimates of Sales plotted
This is for XGBoost and Random forest against actuals.

In [None]:
Estimateplot(0,'Sales')

# B.21 Some estimates removed
Higher values are more difficult to estimate. The Random forest model performs better once again.

In [None]:
Estimateplot(5, 'Sales')

# C. Starting with Future 50 Restaurants data

# C.1 Loading Future 50 data
Both YOY_Sales and YOY_Units are present as % values again so once again transforms are applied.

In [None]:
df_fut50 = pd.read_csv(prefix + 'Future50.csv', converters={'YOY_Sales':p2f, 'YOY_Units':p2f})
df_fut50

# C.2 Extracting HQ data from Location
Nominatim is used to extract location data.

In [None]:
df_fut50['Headquarters'] = df_fut50['Location'].apply(findfullad)

# C.3 Cleaning records...

In [None]:
# Strip digits and the stray separators they leave behind. The digit pattern
# is a raw string so that '\d' is not parsed as a string escape.
for pattern in (r'\d+', ' ,', ' -,'):
    df_fut50['Headquarters'] = df_fut50['Headquarters'].str.replace(pattern, '', regex = True)

# Manual corrections of three records, addressed by their row labels.
df_fut50.loc[18,'Headquarters'] = 'Orlando, Florida, United States'
df_fut50.loc[19,'Headquarters'] = 'Orange Park, Florida, United States'
df_fut50.loc[20,'Headquarters'] = 'Doral, Florida, United States'

In [None]:
df_fut50.drop('Location', axis = 1, inplace = True)

# C.4 Finding Longitude and Latitude from HQ locations
Once again Nominatim is applied to find coordinates.

In [None]:
# Geocode every headquarters, split the returned coordinate text on the comma
# and store the two halves as numeric latitude / longitude columns.
headlonglat = ['Headquarters_Latitude','Headquarters_Longitude']
coordinate_text = df_fut50['Headquarters'].apply(findlonglat)
df_fut50[headlonglat] = coordinate_text.str.split(',', expand=True)
df_fut50[headlonglat] = df_fut50[headlonglat].apply(pd.to_numeric)
    
df_fut50

# C.5 Plotting on Folium graph
All Restaurants data is plotted on graph.

In [None]:
max_sales = max(df_fut50['Sales'])

m = folium.Map(location=[40.70, -93.94], zoom_start=3, tiles='CartoDB positron')

def color_producer(val):
    """Return the marker colour for a rank: 'green' below 10, else 'orange'."""
    return 'green' if val < 10 else 'orange'

# One circle marker per restaurant: the outline colour encodes the rank and
# the radius is the restaurant's Sales relative to the largest value (at most 10).
for _, r in df_fut50.iterrows():
    m.add_child(
        folium.CircleMarker(
            location=[r['Headquarters_Latitude'], r['Headquarters_Longitude']],
            radius=10*r['Sales']/max_sales,
            tooltip=f"{r['Restaurant']} (Rank: {str(r['Rank'])})",
            popup=r['Headquarters'],
            color=color_producer(r['Rank']),
        )
    )

m

# C.6 Inter-restaurant distances
Distances between the top 10 restaurants are computed and plotted.

In [None]:
# Distance_Check and Plot_Rest_Dist are helpers defined earlier in the notebook.
# NOTE(review): top_n_row=10 presumably limits the distance table to the 10
# best-ranked rows, and Plot_Rest_Dist takes no arguments, so it appears to
# read distance_df from the notebook globals — confirm against both helpers.
distance_df = Distance_Check(df_fut50, top_n_row = 10)
Plot_Rest_Dist()

# C.7 YOY_Sales plot
Top performer is Evergreens

In [None]:
# Plot_Rest_Sales is defined earlier in the notebook; here it is pointed at
# the 'YOY_Sales' column of the Future 50 frame.
Plot_Rest_Sales(df_fut50, var = 'YOY_Sales')

# C.8 Franchising as function of YOY_Sales
The majority of the Future 50 are franchises.

In [None]:
# StarburstPlot is defined earlier in the notebook; it is given the frame and
# the 'Franchising' and 'YOY_Sales' column names.
# NOTE(review): presumably the first name is the grouping level and the second
# the value being sized — confirm against StarburstPlot.
StarburstPlot(df_fut50, 'Franchising', 'YOY_Sales')

# C.9 Correlation map

In [None]:
# Column names handed to CorMap (defined earlier in the notebook).
# NOTE(review): these are identifier, location and categorical fields, so they
# are presumably the columns CorMap leaves OUT of the correlation — confirm.
cols = ['Rank', 'Restaurant', 'Headquarters','Headquarters_Longitude', 'Headquarters_Latitude', 'Franchising']
    
# The return value is kept in the notebook-level name `corr`.
corr = CorMap(df_fut50, cols)

# C.10 Correlation matrix

In [None]:
# Coll_Matrix is defined earlier in the notebook and takes no arguments.
# NOTE(review): it presumably reads `corr` from the previous cell via notebook
# globals, so the cells must be run in order — confirm.
Coll_Matrix()

# C.11 Finding State and Country data
Separating location info into State and Country.

In [None]:
# Split "<city>, <state>, <country>" from the right into at most three parts
# and keep the last two; part 0 (everything before the state) is discarded.
# `n` is passed by keyword because it is keyword-only from pandas 2.0 onwards
# (the keyword form also works on older versions).
# NOTE(review): a Headquarters value with fewer than two ', ' separators would
# leave State/Country misaligned or missing — confirm all rows have three parts.
df_fut50[["State","Country"]] = (
    df_fut50.Headquarters.str.rsplit(', ', n=2, expand=True).drop(columns=0)
)

# C.12 Drop Country
All States are in US so drop Country.

In [None]:
# Remove the 'Country' column in place.
df_fut50.drop(columns='Country', inplace=True)

In [None]:
# Display the frame after 'State' was added and 'Country' removed.
df_fut50

# C.13 Removing unneeded columns

In [None]:
# Modelling frame: drop identifier, free-text address and raw coordinate
# columns. df_fut50 itself is left untouched (drop returns a new frame).
df_LocCat_fut50 = df_fut50.drop(
    columns=[
        'Rank',
        'Restaurant',
        'Headquarters',
        'Headquarters_Latitude',
        'Headquarters_Longitude',
    ]
)

In [None]:
# Display the reduced modelling frame.
df_LocCat_fut50

# C.14 Counts of Location data
California is the most common location. Clearly location matters once again!

In [None]:
# Number of Future 50 restaurants per state.
# The column is named by keyword: since seaborn 0.12 the first positional
# argument of countplot is `data`, not `x`, so passing the Series positionally
# no longer plots its value counts.
ax = sns.countplot(data=df_LocCat_fut50, x='State')
# Rotate the state names so the tick labels do not overlap.
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# C.15 Visualising Franchise data
There is a clear imbalance between franchised and non-franchised restaurants, so it is an important feature.

In [None]:
# Number of franchised vs. non-franchised restaurants.
# The column is named by keyword: since seaborn 0.12 the first positional
# argument of countplot is `data`, not `x`.
ax = sns.countplot(data=df_LocCat_fut50, x='Franchising')
plot = plt.setp(ax.get_xticklabels(), rotation=90)

# C.16 Setting up Regression model
This time YOY_Sales is the dependent variable since Rating is quantified by this.

In [None]:
# Features are every column except the target. The target is kept as a
# one-column DataFrame so the 'YOY_Sales' label is still attached to y_test.
X = df_LocCat_fut50.drop('YOY_Sales', axis = 1)

y = df_LocCat_fut50[['YOY_Sales']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# One-hot encode 'State' separately per split; each split only produces
# indicator columns for the states it actually contains.
ohe_df = pd.get_dummies(X_train['State'], prefix = 'State')
X_train = pd.concat([X_train, ohe_df], axis=1).drop(['State'], axis=1)

ohe_df = pd.get_dummies(X_test['State'], prefix = 'State')
X_test = pd.concat([X_test, ohe_df], axis=1).drop(['State'], axis=1)

# Union of both column sets (train columns first, then test-only ones) so the
# two splits end up with identical columns. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same column order.
col_list = pd.concat([X_train, X_test]).columns.tolist()

# Indicator columns a split did not have are created as NaN and set to 0.
# NOTE(review): fillna(0) also zero-fills any genuinely missing feature value.
X_train = X_train.reindex(columns = col_list).fillna(0)
X_test = X_test.reindex(columns = col_list).fillna(0)

# Log-transform the numeric features.
# NOTE(review): np.log gives -inf/NaN for zero or negative input (e.g. a
# non-positive YOY_Units) — confirm these columns are strictly positive.
log_cols = ['Sales', 'Units', 'YOY_Units', 'Unit_Volume']

# The loop variable is `col`, not `cols`, so the notebook-level `cols` list is
# not overwritten by a string.
for col in log_cols:
    X_train[col] = np.log(X_train[col])
    X_test[col] = np.log(X_test[col])

# Encode the Yes/No flag as 1/0.
X_train.Franchising = X_train.Franchising.replace(['Yes', 'No'], [1,0])
X_test.Franchising = X_test.Franchising.replace(['Yes', 'No'], [1,0])

# A stray `df.replace(0, 5)` used to sit here; its result was discarded and
# `df` is not one of this section's frames, so it had no effect.

# The target is modelled on the log scale as well.
y_train = np.log(y_train)
y_test = np.log(y_test)

# C.17 Running Random Forest regressor

In [None]:
# Fit a default Random Forest (progress output enabled) on the training split
# and predict the held-out rows. Predictions are on the log scale because the
# target was log-transformed before fitting.
# NOTE(review): no random_state is set, so results vary between runs.
rf_model = RandomForestRegressor(verbose=1)
rf_model = rf_model.fit(X_train, y_train)
y_hat_rf = rf_model.predict(X_test)

# C.18 Running XGBoost

In [None]:
# DMatrix built from the training split; the scikit-learn style regressor
# below is fitted on X_train / y_train directly and does not use it.
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)
xgb_reg.fit(X_train, y_train)

# Predictions are on the log scale, like the target they were trained on.
y_hat_xgb = xgb_reg.predict(X_test)

# C.19 MAE for Random Forest model

In [None]:
# Mean absolute error of the Random Forest predictions. Both y_test and the
# predictions are on the log scale (the target was log-transformed before
# fitting), so this is an error in log(YOY_Sales), not in YOY_Sales itself.
mean_absolute_error(y_test, y_hat_rf)

# C.20 MAE for XGBoost model
Clearly this is MUCH worse than Random Forest.

In [None]:
# Mean absolute error of the XGBoost predictions, on the same log scale as
# y_test (the target was log-transformed before fitting).
mean_absolute_error(y_test, y_hat_xgb)

# C.21 Joining YOY_Sales actuals onto forecasts

In [None]:
# Wrap each prediction array in a one-column frame named after its model.
df1 = pd.DataFrame(data=y_hat_rf, columns=["Random_Forest"])
df2 = pd.DataFrame(data=y_hat_xgb, columns=["XGBoost"])

# Line the actuals up with both forecasts by position (the test index is
# reset first), then order the rows by actual value, largest first.
df = pd.concat([y_test.reset_index(drop=True), df1, df2], axis=1)
df = df.sort_values(by='YOY_Sales', ascending=False).reset_index(drop=True)

# Undo the log transform, column by column.
for cols in df.columns:
    df[cols] = np.exp(df[cols])

# C.22 Visualising forecasts vs actuals

In [None]:
# Plot both models' estimates against the actual 'YOY_Sales' values.
# Estimateplot is defined earlier in the notebook.
# NOTE(review): it is not passed a frame, so it presumably reads the
# notebook-level `df` built in the previous cell — confirm.
Estimateplot(0,'YOY_Sales')

# C.23 Subsetting forecasts
Lower values are easier to forecast, and the Random Forest model clearly performs much better here.

In [None]:
# Same Estimateplot call as the previous cell, with 5 instead of 0 as the
# first argument.
# NOTE(review): presumably this leaves out the 5 largest actuals — confirm
# against the Estimateplot definition.
Estimateplot(5,'YOY_Sales')

# D. Comparing all data

# D.1 Plotting all data on one graph
All three datasets are plotted on one map. There are clearly some clusters in the data.

In [None]:
def _add_hq_circles(fmap, frame, list_name, color):
    """Add one folium.Circle per row of *frame* to *fmap*.

    Parameters
    ----------
    fmap : folium.Map
        Map the circles are added to (modified in place).
    frame : pandas.DataFrame
        Must provide 'Restaurant', 'Rank', 'Headquarters',
        'Headquarters_Latitude' and 'Headquarters_Longitude'.
    list_name : str
        Name of the ranking, shown in the tooltip after the rank.
    color : str
        Outline colour of the circles.
    """
    for _, r in frame.iterrows():
        fmap.add_child(
            folium.Circle(
                location=[r['Headquarters_Latitude'], r['Headquarters_Longitude']],
                tooltip=f"{r['Restaurant']} (Rank: {str(r['Rank'])} for {list_name})",
                popup=r['Headquarters'],
                color=color,
            )
        )


m = folium.Map(location=[50.70, -33.94], zoom_start=1, tiles='CartoDB positron')

# Same drawing order as before: Top 250 first, then Independent 100, then
# Future 50, so later layers sit on top of earlier ones.
_add_hq_circles(m, df_top250, 'Top 250', 'blue')
_add_hq_circles(m, df_ind100, 'Ind 100', 'red')
_add_hq_circles(m, df_fut50, 'Future 50', 'green')

m

# D.2 Stacking all datasets
All three dataframes with coordinates are stacked.

In [None]:
# Columns that all three rankings have in common.
stack_cols = ['Headquarters','State','Restaurant','Headquarters_Latitude','Headquarters_Longitude']

# .assign() returns a new frame with the extra 'Type' column, so the label is
# never written into a column-subset of the source frame (which is what
# triggered SettingWithCopyWarning before).
stack_df_top250 = df_top250[stack_cols].assign(Type='Top 250 Restaurants')
stack_df_ind100 = df_ind100[stack_cols].assign(Type='Independent 100 Restaurants')
stack_df_fut50 = df_fut50[stack_cols].assign(Type='Future 50 Restaurants')

# ignore_index gives the stacked frame a fresh 0..n-1 index.
stack_df = pd.concat([stack_df_top250, stack_df_ind100, stack_df_fut50], ignore_index=True)

# D.3 Data converted into Array for DBScan

In [None]:
# (latitude, longitude) pairs in degrees, one row per restaurant, as a plain
# NumPy array. Latitude comes first, longitude second.
coords = stack_df.loc[:, ['Headquarters_Latitude', 'Headquarters_Longitude']].to_numpy()

# D.4 Calculating clusters using distances
The distance is set to 350 km, to get 14 clusters.

In [None]:
def cluster_map(distance, points=None):
    """Cluster coordinates with DBSCAN using the haversine metric.

    Parameters
    ----------
    distance : float
        Neighbourhood radius in kilometres. It is divided by the Earth-radius
        constant below to obtain DBSCAN's ``eps`` in radians.
    points : array-like of shape (n, 2), optional
        (latitude, longitude) pairs in degrees. When omitted, the
        notebook-level ``coords`` array is used, as before.

    Returns
    -------
    pandas.DataFrame
        A single 'Clusters' column holding one label per input row, on a
        default 0..n-1 index.

    The number of distinct labels is also printed.
    """
    if points is None:
        points = coords

    kms_per_radian = 6371.0088  # Earth radius in km
    epsilon = distance / kms_per_radian

    db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine')
    # The points are converted from degrees to radians for the haversine metric.
    cluster_labels = db.fit(np.radians(points)).labels_

    print("Total cluster count: " + str(len(np.unique(cluster_labels))))

    return pd.DataFrame(cluster_labels, columns = ['Clusters'])

In [None]:
# Cluster with a 350 km neighbourhood radius.
clusters = cluster_map(distance=350)

# Attach the labels as an extra column; both frames use a default 0..n-1
# index, so the rows line up by position.
result = pd.concat((stack_df, clusters), axis='columns')

In [None]:
# One colour per cluster label, looked up by position.
# The list started out as folium's marker-icon palette, but the colours are
# used for vector circles, which need CSS colour names. 'lightred' and
# 'darkpurple' are not CSS colours, so they are replaced by the closest named
# ones ('salmon' and 'indigo'); all other entries are valid CSS names.
# NOTE(review): 'white' is valid but hard to see on the light basemap.
color_list = ['red', 'blue', 'green', 'purple',
              'orange', 'darkred', 'salmon',
              'beige', 'darkblue', 'darkgreen',
              'cadetblue', 'indigo', 'white',
              'pink', 'lightblue', 'lightgreen',
              'gray', 'black', 'lightgray']

# D.5 Visualising cluster data
All clusters are placed on one graph.
Clearly there is a strong preference for the West Coast.

In [None]:
m = folium.Map(location=[40.70, -93.94], zoom_start=3, tiles='CartoDB positron')

# One circle per restaurant, coloured by its cluster label.
for _, r in result.iterrows():
    # Wrap around the palette so that a run producing more clusters than there
    # are colours reuses colours instead of raising IndexError.
    fill_color = color_list[r['Clusters'] % len(color_list)]

    tooltip = f"{r['Restaurant']}; List: {r['Type']}; Cluster: {r['Clusters']}"
    location = [r['Headquarters_Latitude'], r['Headquarters_Longitude']]
    popup = r['Headquarters']

    cm = folium.Circle(location = location,
                        tooltip = tooltip, popup = popup,
                        color = fill_color)

    m.add_child(cm)

m

# D.6 Bar chart for each type
All three listings are shown on one chart. This shows that restaurants tend to cluster similarly for each listing.

Cluster 0, 1 and 2 are the primary clusters. Cluster 0 is the East coast, Cluster 1 is the West coast whilst 2 is on the south end of the East coast.

In [None]:
# Wide figure so the per-cluster bars stay readable.
fig, ax = plt.subplots(figsize=(12,6))

# Count of restaurants per cluster label, with one bar per listing ('Type')
# within each cluster.
sns.countplot(data=result, x='Clusters', hue='Type', ax=ax)