In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 2020 Restaurant's Sales

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib.ticker import FuncFormatter
import numpy as np
import seaborn as sns
import wordcloud

In [None]:
# Load the Future50 rankings CSV into `data` and show the dtype of each column.
future50_csv = "../input/restaurant-business-rankings-2020/Future50.csv"
data = pd.read_csv(filepath_or_buffer=future50_csv)
data.dtypes

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(5)

In [None]:
# Bar chart of Sales per restaurant, with dashed reference lines at 20 and 50.
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(data["Restaurant"], data["Sales"])

# Axis labels, tick label styling and title
ax.set_xlabel("Restaurant", fontsize=18)
ax.set_ylabel("Sales", fontsize=18)
plt.setp(ax.get_xticklabels(), rotation="vertical", fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
ax.set_title("2020 Restaurant's Sales")

# Both reference lines run from x = -1 to x = 50.
reference_lines = (
    (20, "r", "Sale = 20"),
    (50, "greenyellow", "Sale = 50"),
)
for level, colour, text in reference_lines:
    ax.hlines(level, -1, 50, color=colour, linestyles="--", linewidth=3, label=text)
ax.legend(loc="upper center", fontsize=20)

plt.show()

From the chart above, we can see that all of the restaurants' sales fall between 20 and 50.

# 2. Data Cleaning
1. Separate the Location column into City and Region, so we can find out which Region in America has the most Sales.
2. If we group by Region directly, we might find two Calif. entries in the result when we actually need only one. It turns out that there are spaces in front of Calif., so we need to fix that.

In [None]:
# Convert the "YOY_Sales" and "YOY_Units" text columns (object dtype) to floats:
# keep the text before the first "%" and store it in a "New_"-prefixed column.
for source_col in ("YOY_Sales", "YOY_Units"):
    percent_text = data[source_col]
    data["New_" + source_col] = percent_text.map(lambda s: s.partition("%")[0]).astype(float)

# Split "Location" on commas: first piece -> "City", second piece -> "Region".
# The pieces are not stripped here, so surrounding whitespace is kept.
location_parts = data["Location"].map(lambda loc: loc.split(","))
data["City"] = location_parts.map(lambda parts: parts[0])
data["Region"] = location_parts.map(lambda parts: parts[1])

In [None]:
# List the distinct Region values produced by the split.
data["Region"].unique()

From the output above, we can see that there are both 'Calif.' and '   Calif.'
We need to remove the blanks from these strings.

In [None]:
# Trim leading/trailing whitespace from every Region value, then re-check the distinct values.
data["Region"] = data["Region"].map(str.strip)
data["Region"].unique()

# 3. 2020 Restaurant's YOY_Sales

In [None]:
# Preview the first five rows, now including the derived columns.
data.head(5)

In [None]:
# Bar chart of the numeric New_YOY_Sales value for each restaurant.
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(data["Restaurant"], data["New_YOY_Sales"])

# Axis labels and tick label styling
ax.set_xlabel("Restaurant", fontsize=18)
ax.set_ylabel("YOY_Sales", fontsize=18)
plt.setp(ax.get_xticklabels(), rotation="vertical", fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
ax.set_title("2020 Restaurant's YOY_Sales", fontsize=20)

def to_percent(temp, position):
    """Format a tick value as a percentage string with one decimal place.

    Parameters
    ----------
    temp : number
        The tick value to format.
    position :
        Unused; present because the function is wrapped in ``FuncFormatter``,
        which passes two arguments.

    Returns
    -------
    str
        ``temp`` rendered with one decimal followed by "%", e.g. ``"40.0%"``.
    """
    return "{:.1f}%".format(temp)
# Render the y tick values as percentages on the current axes.
current_axes = plt.gca()
current_axes.yaxis.set_major_formatter(FuncFormatter(to_percent))

# Draw a horizontal reference line at 40%
current_axes.hlines(40, -1, 50, color="r", linestyles="--", linewidth=3, label="YOY_Sales=40%")
current_axes.legend(loc="upper center", fontsize=16)
plt.show()

Due to Covid-19, most restaurants' YOY_Sales are below 40%.

# 4. 2020 Region's Sales

In [None]:
# Per-region totals of the numeric columns, with Region restored as a regular column.
# numeric_only=True is required: without it, pandas >= 2.0 also "sums" the text
# columns (Restaurant, Location, ...) by concatenating their strings, and older
# versions emit a deprecation warning while silently dropping them.
region_ = data.groupby("Region").sum(numeric_only=True).reset_index()
region_.head()

In [None]:
# Bar chart of total Sales per Region.
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(region_["Region"], region_["Sales"])

# Axis labels, tick label styling and title
ax.set_xlabel("Region", fontsize=18)
ax.set_ylabel("Sales", fontsize=18)
plt.setp(ax.get_xticklabels(), rotation="vertical", fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
ax.set_title("2020 Region's Sales", fontsize=20)

plt.show()

We can find that: Calif. > N.Y. > Texas

Next, let's try to find out why sales in Calif. are the highest.

In [None]:
# Count the restaurants in each region; the result is a one-column DataFrame
# ("Restaurant") indexed by Region.
region__ = data.groupby("Region")["Restaurant"].count().to_frame()

In [None]:
# Bar chart of the number of restaurants per Region.
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(region__.index, region__["Restaurant"])

# Axis labels, tick label styling and title
ax.set_xlabel("Region", fontsize=18)
ax.set_ylabel("The Number of Restaurant", fontsize=18)
plt.setp(ax.get_xticklabels(), rotation="vertical", fontsize=12)
plt.setp(ax.get_yticklabels(), fontsize=12)
ax.set_title("The Number of Restaurant in Each Region", fontsize=20)

plt.show()

So, we can see that there are 10 restaurants in Calif. but only 4 in Ohio. This is why the Sales in Calif. are the highest.

# 5. Relation between YOY_Sales and YOY_Units

In [None]:
# Preview the first five rows before plotting YOY_Sales against YOY_Units.
data.head(5)

In [None]:
plt.figure(figsize=(10, 7))
sns.set(context="notebook"
       ,style="darkgrid")

# Scatter plot of YOY_Units (x) against YOY_Sales (y), coloured by Franchising.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments and raises a TypeError; keywords work on older versions too.
sns.scatterplot(data=data
               ,x="New_YOY_Units"
               ,y="New_YOY_Sales"
               ,hue="Franchising")

# Axis labels and tick label sizes
plt.xlabel("YOY_Units"
          ,fontsize=18)
plt.ylabel("YOY_Sales"
          ,fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

def to_percent(temp, position):
    """Format a tick value as a percentage string with one decimal place.

    Parameters
    ----------
    temp : number
        The tick value to format.
    position :
        Unused; present because the function is wrapped in ``FuncFormatter``,
        which passes two arguments.

    Returns
    -------
    str
        ``temp`` rendered with one decimal followed by "%", e.g. ``"40.0%"``.
    """
    return "{:.1f}%".format(temp)
# Show both axes' tick values as percentages (each axis gets its own formatter instance).
current_axes = plt.gca()
for axis in (current_axes.xaxis, current_axes.yaxis):
    axis.set_major_formatter(FuncFormatter(to_percent))

current_axes.legend(fontsize=18)
plt.show()

# 6. Franchising vs. Not Franchising

In [None]:
# For each distinct Franchising value, count the non-null Restaurant entries
# in the rows carrying that value. categories_ and counts_ are parallel lists.
franchising_col = data["Franchising"]
categories_ = list(franchising_col.unique())
counts_ = [
    data.loc[franchising_col == category, "Restaurant"].count()
    for category in categories_
]

In [None]:
# Pie chart of restaurant counts per Franchising category, with percentage annotations.
fig, ax = plt.subplots(figsize=(10, 7))
patches, l_text, p_text = ax.pie(
    counts_,
    labels=categories_,
    autopct="%0.1f%%",
    radius=1,
    pctdistance=0.6,
    labeldistance=1.1,
)

# Enlarge both the category labels and the percentage texts.
for text in (*l_text, *p_text):
    text.set_size(18)

ax.set_title("Franchising vs. Not Franchising", fontsize=20)
plt.show()

In [None]:
# Total Sales per Franchising category, as a one-column DataFrame indexed by Franchising.
franchising_ = data.groupby("Franchising")["Sales"].sum().to_frame()
franchising_.head()

In [None]:
# Bar chart of total Sales for each Franchising category.
plt.figure(figsize=(10, 7))
plt.bar(franchising_.index, franchising_.loc[:, "Sales"])

plt.xlabel("Franchising"
          ,fontsize=18)
plt.ylabel("Sales"
          ,fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# The title was an empty string, leaving this figure unlabeled; give it a
# descriptive title in the same font size as the notebook's other figures.
plt.title("Total Sales: Franchising vs. Not Franchising"
         ,fontsize=20)
plt.show()

# 7. Display key restaurant by wordcloud

In [None]:
# Combine every restaurant name into one space-separated text for the word cloud.
words = " ".join(data.loc[:, "Restaurant"])

# Pin random_state so the generated image is the same on every
# Restart & Run All; the value 42 is an arbitrary fixed seed.
w = wordcloud.WordCloud(random_state=42)
w.generate(words)

plt.figure(figsize=(10, 7))
plt.imshow(w, interpolation="gaussian")
plt.axis("off")

plt.show()