# The main goal of this analysis is to explore the data and simplify the datasets so that they can be used in other BI tools like Tableau by people from a non-coding background.

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter

import os
#Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# A. Top 250 Restaurants

In [None]:
df_T250 = pd.read_csv('/kaggle/input/restaurant-business-rankings-2020/Top250.csv',index_col=0)
df_T250.head()

**Observation(s):**
**We can see that the 'YOY_Sales' and 'YOY_Units' columns are stored as percentages; they need to be converted to plain numbers before they can be analysed.**

# A. 1. Exploring Top 250 Dataframe.

In [None]:
df_T250.dtypes

In [None]:
df_T250.info()

In [None]:
df_T250.describe()

In [None]:
#Function to strip '%' from the data.
def Nopercentage(x):
    """Convert a percentage string such as ``'5.2%'`` into a fraction (``0.052``).

    Written to be passed as a ``converters=`` callable to ``pd.read_csv``.

    Parameters
    ----------
    x : str
        Raw cell text, optionally surrounded by whitespace and/or ``'%'``.
        ``None`` and other values are accepted and handled via ``str(x)``.

    Returns
    -------
    float
        The numeric value divided by 100, or ``nan`` when the input is
        ``None`` or blank (e.g. an empty CSV cell), so one missing value
        does not abort the whole load.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if x is None:
        return float('nan')
    # Strip whitespace first: a trailing space would otherwise shield the '%'.
    text = str(x).strip().strip('%').strip()
    if not text:
        return float('nan')
    return float(text) / 100

In [None]:
#Reloading the CSV, this time with converters so the percentage strings in 'YOY_Sales' and 'YOY_Units' are parsed into fractions by Nopercentage.
df_T250 = pd.read_csv(r'/kaggle/input/restaurant-business-rankings-2020/Top250.csv',converters={'YOY_Sales':Nopercentage, 'YOY_Units':Nopercentage})

In [None]:
#To have a higher level scatterplot view of all the numeric data against eachother.
sns.pairplot(df_T250)

In [None]:
#To create scatterplot between 'Sales' against 'Rank' and 'Units.'
runall = ['Rank','Units']
for c in runall:
    plt.figure(figsize = (10, 5))
    sns.regplot(data=df_T250, x='Sales', y=c)

In [None]:
#To create scatterplot of 'YOY_Sales' against 'YOY_Units.'
plt.figure(figsize = (10, 5))
sns.regplot(data=df_T250, x='YOY_Sales', y='YOY_Units')

In [None]:
#To create scatterplot between 'Rank' and 'Units.'
plt.figure(figsize = (10, 5))
sns.regplot(data=df_T250, x='Rank', y='Units')

In [None]:
#To print all the unique values in dataframe.
df_T250.nunique()

**Observation(s): 'Segment_Category' is categorical but has too many distinct categories. We will try to reduce their number in order to get clearer visualizations.**

# A. 2. Looking for Outliers.

In [None]:
#In order to detect outliers and count their frequency.
def detect_outliers(df, features, n=2):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    features : iterable of str
        Names of the numeric columns to check.
    n : int, default 2
        A row is reported only when it is an outlier in *more than* ``n``
        of the given features. The default keeps the original behaviour
        (at least three offending features).

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_indices = []

    for c in features:
        # nanpercentile ignores missing values; plain np.percentile would
        # return NaN quartiles and silently flag nothing for this column.
        Q1 = np.nanpercentile(df[c], 25)
        Q3 = np.nanpercentile(df[c], 75)
        IQR = Q3 - Q1
        outlier_step = IQR * 1.5
        below = df[c] < Q1 - outlier_step
        above = df[c] > Q3 + outlier_step
        # Store the index labels of this column's outliers.
        outlier_indices.extend(df[below | above].index)

    # Count in how many features each row was flagged.
    hits_per_row = Counter(outlier_indices)
    return [label for label, hits in hits_per_row.items() if hits > n]

In [None]:
#To detect outliers on following columns.
df_T250.loc[detect_outliers(df_T250, ["Sales", "Units", "YOY_Sales","YOY_Units"])]

**Observation(s): No outliers found.**

# A. 3. Looking for Null values.

In [None]:
sns.heatmap(df_T250.isnull())

In [None]:
df_T250.isnull().sum()

**Observation(s): We can see here that almost 75% of the values in the 'Content' and 'Headquarters' columns are null, hence both columns will be dropped.**

In [None]:
#To eliminate column 'Content' and 'Headquarters.'
df_T250=df_T250.drop(['Content','Headquarters'], axis=1)

# A. 4. We will try to minimize the category for clearer visualization by grouping similar category together.   

In [None]:
df_T250['Segment_Category'].unique()

In [None]:
#To create a function that will put the values belonging to similar category into the same category.
def replace(text):
    """Collapse a 'Segment_Category' value into a smaller set of categories.

    Two steps are applied in order:

    1. The service-style prefix ('Quick Service & ', 'Fast Casual & ',
       'Casual Dining & ', 'Fine Dining & ', 'Family Dining & ') is removed.
    2. Near-duplicate cuisine labels are merged ('Family Style' and
       'Family Casual' -> 'Family', 'Asian/Noodle' -> 'Asian',
       'Italian/Pizza' -> 'Pizza').

    Both steps always run, so a value such as
    'Casual Dining & Italian/Pizza' ends up in the same bucket ('Pizza') as a
    bare 'Italian/Pizza', and 'Family Casual' is no longer swallowed by a
    generic 'Casual' check.

    Parameters
    ----------
    text : str
        Original category label. Non-string values (e.g. NaN) are returned
        unchanged.

    Returns
    -------
    str
        The simplified category label.
    """
    if not isinstance(text, str):
        return text

    service_prefixes = (
        'Quick Service & ',
        'Fast Casual & ',
        'Casual Dining & ',
        'Fine Dining & ',
        'Family Dining & ',
    )
    for prefix in service_prefixes:
        text = text.replace(prefix, '')

    # Merged only after the prefixes are gone, so 'Fast Casual & ...' and
    # 'Family Casual' cannot be confused with each other.
    merged_labels = {
        'Family Style': 'Family',
        'Family Casual': 'Family',
        'Asian/Noodle': 'Asian',
        'Italian/Pizza': 'Pizza',
    }
    for old_label, new_label in merged_labels.items():
        text = text.replace(old_label, new_label)
    return text


df_T250['Segment_Category'] = df_T250['Segment_Category'].apply(replace)

In [None]:
df_T250['Segment_Category'].unique()

# A. 5. Visualizing by Category.

In [None]:
#To create barplot on the basis of 'Segment_Category'
runall = ['Sales','Units','YOY_Sales','YOY_Units']
for c in runall:
    plt.figure(figsize = (20, 5))
    sns.barplot(x = 'Segment_Category', y = c , data = df_T250)
    plt.xticks(rotation = 45)

In [None]:
#To create piechart on 'Sales' on the basis of 'Segment_Category' and the restaurants that falls under them.
fig = px.sunburst(df_T250, path=['Segment_Category', 'Restaurant'], values='Sales')
fig.show()

In [None]:
#To create piechart on 'Units' on the basis of 'Segment_Category' and the restaurants that falls under them.
fig = px.sunburst(df_T250, path=['Segment_Category', 'Restaurant'], values='Units')
fig.show()

# B. Future50 Restaurants.

# B. 1. Loading Future50 dataframe but with the percentage converter.

In [None]:
#Downloading with a converter (changing all percentage value to regular value).
df_F50 = pd.read_csv('/kaggle/input/restaurant-business-rankings-2020/Future50.csv',converters={'YOY_Sales':Nopercentage, 'YOY_Units':Nopercentage})

# B. 2. Exploring Future50 Dataframe.

In [None]:
df_F50.head()

In [None]:
#Summary statistics for the Future50 dataframe (this section explores df_F50, not df_T250).
df_F50.describe()

In [None]:
df_F50.info()

In [None]:
#To have a higher level scatterplot view of all the numeric data against eachother.
sns.pairplot(df_F50)

In [None]:
#To create scatterplot 'Sales' against 'Rank','Units', and 'Unit_Volume'
runall = ['Rank','Units','Unit_Volume']
for c in runall:
    plt.figure(figsize = (10, 5))
    sns.regplot(data=df_F50, x='Sales', y=c)

In [None]:
#To create scatterplot between 'YOY_Sales' against 'YOY_Units.'
plt.figure(figsize = (10, 5))
sns.regplot(data=df_F50, x='YOY_Sales', y='YOY_Units')

# B. 3. Looking for Outliers.

In [None]:
df_F50.loc[detect_outliers(df_F50, ["Sales", "Units", "YOY_Sales","YOY_Units", "Unit_Volume"])]

**Observation(s): Since there is only one outlier, we will drop that entire row from the DataFrame.**

In [None]:
#Drop the rows reported by detect_outliers rather than a hard-coded position, so the row removed is always the detected outlier.
outlier_rows = detect_outliers(df_F50, ["Sales", "Units", "YOY_Sales","YOY_Units", "Unit_Volume"])
df_F50 = df_F50.drop(index=outlier_rows)

# B. 4. Looking for Null values.

In [None]:
sns.heatmap(df_F50.isnull())

In [None]:
#Count null values per column in the Future50 dataframe (this section inspects df_F50, not df_T250).
df_F50.isnull().sum()

**Observation(s): No Null values found.**

In [None]:
df_F50.nunique()

**Observation(s): We can see that in column 'Location' the state and city are stored in the same record; they need to be separated for simplicity and for consistency with the other dataframes.**

In [None]:
#To create a dictionary for state.
#Maps the state abbreviations used in the datasets to two-letter postal codes.
state_dict = {
    'N.Y.': 'NY',
    'Fla.': 'FL',
    'D.C.': 'DC',
    'Ill.': 'IL',
    'Nev.': 'NV',
    'N.C.': 'NC',
    'Ind.': 'IN',
    'Texas': 'TX',
    'Pa.': 'PA',
    'Calif.': 'CA',
    'Ga.': 'GA',
    'Mich.': 'MI',
    'Mass.': 'MA',
    'Ore.': 'OR',
    'N.J.': 'NJ',
    'Tenn.': 'TN',
    'Colo.': 'CO',
    'Va.': 'VA',
    'Ky.': 'KY',
    'Ohio': 'OH',
    'Mo.': 'MO',
    # NOTE(review): this key has no trailing period, unlike the other
    # abbreviations. Substring lookups still match 'Ariz.', but exact-key
    # lookups will not - confirm against the data before changing it.
    'Ariz': 'AZ',
    'Neb.': 'NE',
    'Ark.': 'AR',
    # Washington state is 'WA'; the District of Columbia is the 'D.C.' entry.
    'Wash.': 'WA',
    'S.C.': 'SC',
}

In [None]:
#To create a list out of data in column 'Location.'
list_of_location = df_F50['Location'].to_list()

In [None]:
#To create two new lists for state and city and add them to the dataframe.
new_city = []
new_state_1 = []
for location in list_of_location:
    #Collapse repeated whitespace once per record.
    location = " ".join(location.split())
    #The state is looked up in the text after the last comma (the whole string when there is no comma),
    #so an abbreviation that happens to appear inside a city name cannot be picked up.
    state_text = location.rpartition(',')[2]
    #Append exactly one state per row (None when nothing matches) so both lists always
    #have the same length as the dataframe; appending once per matching key could add zero or several.
    matched_state = next((code for abbr, code in state_dict.items() if abbr in state_text), None)
    new_state_1.append(matched_state)
    #The city is the text before the first comma.
    new_city.append(location.split(',')[0])
df_F50['State'] = new_state_1
df_F50['City'] = new_city

In [None]:
#To drop 'location' column from dataframe.
df_F50.drop(['Location'],axis=1, inplace=True)

**Note: Two new columns, 'State' and 'City', have been added as a replacement for the 'Location' column.**

# B. 5. Visualizing by state.

In [None]:
#To create barplot on the basis of 'State'
runall = ['Sales','Units','YOY_Sales','YOY_Units','Unit_Volume']
for c in runall:
    plt.figure(figsize = (10, 5))
    sns.barplot(x = 'State', y = c , data = df_F50)
    plt.xticks(rotation = 45)

# B. 6. Visualizing by city.

In [None]:
#To create barplot on the basis of 'City'
runall = ['Sales','Units','YOY_Sales','YOY_Units','Unit_Volume']
for c in runall:
    plt.figure(figsize = (20, 4))
    sns.barplot(x = 'City', y = c , data = df_F50)
    plt.xticks(rotation = 45)

# B. 7. Visualizing by Franchise.

In [None]:
#To create barplot on the basis of 'Franchise'
runall = ['Rank','Sales','Units','YOY_Sales','YOY_Units','Unit_Volume']
for c in runall:
    plt.figure(figsize = (3, 5))
    sns.barplot(x = 'Franchising', y = c , data = df_F50)
    plt.xticks(rotation = 45)

# B. 8. Visualizing by Sales.

In [None]:
#To create piechart on 'Sales' on the basis of 'Franchising', 'State', and 'City' that falls under them.
fig = px.sunburst(df_F50, path=['Franchising','State', 'City'], values='Sales')
fig.show()

# B. 9. Visualizing by Units.

In [None]:
#To create piechart on 'Units' on the basis of 'Franchising', 'State', and 'City' that falls under them.
fig = px.sunburst(df_F50, path=['Franchising','State', 'City'], values='Units')
fig.show()

# B. 10. Visualizing by Unit_Volume.

In [None]:
#To create piechart on 'Unit_Volume' on the basis of 'Franchising', 'State', and 'City' that falls under them.
fig = px.sunburst(df_F50, path=['Franchising','State', 'City'], values='Unit_Volume')
fig.show()

# C. Independence100

# C. 1. Loading Independence100 Dataframe

In [None]:
df_I100 = pd.read_csv('/kaggle/input/restaurant-business-rankings-2020/Independence100.csv')

# C. 2. Exploring Independence Dataframe.

In [None]:
df_I100.head()

In [None]:
df_I100.nunique()

In [None]:
df_I100.describe()

# C. 3. Looking for Null Values.

In [None]:
sns.heatmap(df_I100.isnull())

# C. 4. Since column 'State' is not in a proper format and in order to keep consistency, the data will be changed to universal format.

In [None]:
list_of_state = df_I100['State'].to_list()

In [None]:
#Map each state to its two-letter code. A value with no entry in state_dict is kept (stripped) instead of being
#skipped, so new_state always has one element per row and can be assigned back to the dataframe.
new_state = []
for raw_state in list_of_state:
    cleaned = raw_state.strip()
    new_state.append(state_dict.get(cleaned, cleaned))

In [None]:
df_I100['State'] = new_state

In [None]:
df_I100.head()

# C. 5. Changing the column names in order to follow the naming convention used by the other two dataframes for consistency.

In [None]:
df_I100.rename(columns = {'Average Check' : 'Average_Check', 'Meals Served' : 'Meals_Served'}, inplace = True)

# C. 6. Looking for Outliers.

In [None]:
df_I100.loc[detect_outliers(df_I100, ["Sales", "Average_Check", "Meals_Served"])]

# C. 7. Looking for possible correlation.

In [None]:
sns.pairplot(df_I100)

In [None]:
runall = ['Rank','Average_Check','Meals_Served']
for c in runall:
    plt.figure(figsize = (10, 5))
    sns.regplot(data=df_I100, x='Sales', y=c)

In [None]:
plt.figure(figsize = (10, 5))
sns.regplot(data=df_I100, x='Average_Check', y='Meals_Served')

# C. 8. Visualizing by State.

In [None]:
runall = ['Sales','Meals_Served','Average_Check']
for c in runall:
    plt.figure(figsize = (10, 4))
    sns.barplot(x = 'State', y = c , data = df_I100)
    plt.xticks(rotation = 45)

# C. 9. Visualizing by city.

In [None]:
runall = ['Sales','Meals_Served','Average_Check']
for c in runall:
    plt.figure(figsize = (20, 4))
    sns.barplot(x = 'City', y = c , data = df_I100)
    plt.xticks(rotation = 45)

# C. 10. Visualizing by Sales.

In [None]:
fig = px.sunburst(df_I100, path=['State', 'City'], values='Sales')
fig.show()

# C. 11. Visualizing by Average Check.

In [None]:
fig = px.sunburst(df_I100, path=['State', 'City'], values='Average_Check')
fig.show()

# C. 12. Visualizing by Meals Served.

In [None]:
fig = px.sunburst(df_I100, path=['State', 'City'], values='Meals_Served')
fig.show()

# To be Continued