# Michelin Star Restaurant Guide Dashboard

## Dataset Attributes

- **Name**: The name of the Michelin-starred restaurant.
- **Address**: The full street address of the restaurant.
- **Location**: The city and country where the restaurant is located.
- **Price**: Price range indicator, using $ symbols (e.g. $$$$ for very expensive).
- **Cuisine**: The type or style of cuisine served at the restaurant.
- **Longitude**: The geographic longitude coordinate of the restaurant's location.
- **Latitude**: The geographic latitude coordinate of the restaurant's location.
- **PhoneNumber**: The contact phone number for the restaurant.
- **Url**: The URL of the restaurant's page on the official Michelin Guide website.
- **WebsiteUrl**: The URL of the restaurant's own official website.
- **Award**: The Michelin star rating awarded to the restaurant (e.g. "3 Stars").
- **GreenStar**: A binary indicator (0 or 1) of whether the restaurant has received a Michelin Green Star for sustainability.
- **FacilitiesAndServices**: A list of amenities and services offered by the restaurant.
- **Description**: A brief description of the restaurant, often including details about the chef and cuisine.

## Dependencies

In [None]:
# %pip install -r .\requirements.txt
# %pip install -q pandas plotly dash dash-bootstrap-components pyarrow python-dotenv
# %pip freeze > requirements.txt  # WARNING: run this only on a Linux distro or WSL

### Imports

In [None]:
import pandas as pd
import pyarrow as pa
from pandas import DataFrame
from pandas._typing import ArrayLike
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# Never truncate the column list when rendering a DataFrame, and turn on
# pandas' copy-on-write mode.
pd.options.display.max_columns = None
pd.set_option("mode.copy_on_write", True)

## Dataset

In [None]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/michelin_by_Jerry_Ng.csv")

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

## Data cleaning

### Location columns

In [None]:
"""
Cell generated by Data Wrangler.
"""


def clean_data(df: DataFrame):
    """Split ``Location`` into city/country columns and fill missing countries.

    The ``Location`` text is split on commas and the resulting pieces are
    inserted immediately before the original ``Location`` column, which is
    kept. The first piece becomes ``Location_city`` and the second
    ``Location_country``; any further pieces keep their ``Location_<n>``
    names. Whitespace around every piece is removed, so ``"Paris, France"``
    yields ``"Paris"`` / ``"France"`` instead of ``"Paris"`` / ``" France"``.

    Rows that have no country piece are filled from a fixed city-to-country
    lookup; cities that are not in the lookup keep a missing country.

    Args:
        df: Frame with a string ``Location`` column.

    Returns:
        A new frame with the city/country columns added. The frame passed in
        is not modified.
    """
    # Countries for locations that consist of a single name only.
    city_country_map = {
        "Singapore": "Singapore",
        "Hong Kong": "China",
        "Macau": "China",
        "Dubai": "United Arab Emirates",
        "Luxembourg": "Luxembourg",
        "Abu Dhabi": "United Arab Emirates",
    }

    loc_0 = df.columns.get_loc("Location")

    # Split on commas, swallowing the whitespace on either side of each comma
    # (and at both ends of the text) so no piece carries stray spaces.
    df_clean_split = (
        df["Location"]
        .str.strip()
        .str.split(pat=r"\s*,\s*", regex=True, expand=True)
    )

    # `expand=True` only creates as many columns as the longest split. Make
    # sure the city and country pieces exist even when no row has a comma.
    for part in (0, 1):
        if part not in df_clean_split.columns:
            df_clean_split[part] = pd.Series(
                None, index=df_clean_split.index, dtype=object
            )

    df_clean_split = df_clean_split.add_prefix("Location_").rename(
        columns={"Location_0": "Location_city", "Location_1": "Location_country"}
    )

    # Insert the new columns right before the original 'Location' column.
    df = pd.concat([df.iloc[:, :loc_0], df_clean_split, df.iloc[:, loc_0:]], axis=1)

    # Fill missing country values from the lookup above.
    df["Location_country"] = df["Location_country"].fillna(
        df["Location_city"].map(city_country_map)
    )
    return df


# Clean a copy so the raw frame `df` stays exactly as loaded.
df_clean = clean_data(df=df.copy())
df_clean.head(n=5)


def select_unique_location_city_where_location_country_is_missing(
    df_clean_1: DataFrame,
) -> ArrayLike:
    """Return the distinct cities whose country is missing.

    Args:
        df_clean_1: Frame with ``Location_city`` and ``Location_country``
            columns.

    Returns:
        The unique ``Location_city`` values of the rows whose
        ``Location_country`` is NA, in order of first appearance; empty when
        every row has a country.
    """
    country_is_missing = df_clean_1["Location_country"].isna()
    cities_without_country = df_clean_1.loc[country_is_missing, "Location_city"]
    return cities_without_country.unique()


missing_countries = select_unique_location_city_where_location_country_is_missing(
    df_clean.copy()
)
# Data-quality gate: stop the notebook if any city still has no country, and
# name the offending cities in the error so the lookup can be extended.
if missing_countries.size > 0:
    # Bare expression kept from the original cell; presumably meant to show
    # the cities as cell output before the error is raised — TODO confirm.
    missing_countries
    # ValueError is a subclass of Exception, so `except Exception` still works.
    raise ValueError(
        f"Missing countries found for cities: {list(missing_countries)}"
    )


### Price column

In [None]:
"""
Cell generated by Data Wrangler.
"""


def standardize_price(price):
    """Convert a price marker to the same number of ``$`` characters.

    Args:
        price: Price marker text, or a missing value.

    Returns:
        ``"Unknown"`` when ``price`` is missing according to ``pd.isna``;
        otherwise a string of ``"$"`` repeated ``len(price)`` times.
    """
    return "Unknown" if pd.isna(price) else "$" * len(price)


def clean_data(df: DataFrame):
    """Add a ``Standardized_Price`` column computed from ``Price``.

    Each ``Price`` value is passed through ``standardize_price``. The new
    column is written onto the frame that was passed in, and that same frame
    is returned.
    """
    standardized = df["Price"].apply(standardize_price)
    df["Standardized_Price"] = standardized
    return df


# Work on a copy so `df_clean` does not gain the new price column.
df_clean_1 = clean_data(df=df_clean.copy())
df_clean_1.head(n=5)

### FacilitiesAndServices column

In [None]:
# Long-format table: one row per (restaurant, facility/service) pair.
# The comma-separated text is split into a list, each list item is moved to
# its own row, and surrounding whitespace is trimmed from every item.
df_facilitiesandservices = (
    df[["Name", "Address", "FacilitiesAndServices"]]
    .assign(
        FacilitiesAndServices=lambda frame: frame["FacilitiesAndServices"].str.split(
            ","
        )
    )
    .explode("FacilitiesAndServices")
    .assign(
        FacilitiesAndServices=lambda frame: frame["FacilitiesAndServices"].str.strip()
    )
)

# The index is deliberately not reset: exploded rows keep the row label of the
# restaurant they came from.

df_facilitiesandservices.head()

### Cuisine column

In [None]:
# Create a new DataFrame with Name, Address, and all cuisines in one column
df_cuisine = df[["Name", "Address", "Cuisine"]].copy()
df_cuisine["Cuisine"] = df_cuisine["Cuisine"].str.split(",")
df_cuisine = df_cuisine.explode("Cuisine")
df_cuisine["Cuisine"] = df_cuisine["Cuisine"].str.strip()

# Reset the index
# df_cuisine = df_cuisine.reset_index(drop=True)

# Display the first few rows of the new DataFrame
df_cuisine.head()

### Duplicate rows

#### Primary key columns (Name, Address)

In [None]:
# Number of rows per (Name, Address) pair; the pair is treated as the key.
# NOTE(review): value_counts() drops rows with a missing Name or Address by
# default, so such rows are not checked here — confirm that is intended.
primary_col = df_clean_1[["Name", "Address"]].value_counts()

# Compute the duplicated keys once and reuse them for display and the error.
duplicate_records = primary_col[primary_col > 1]
if not duplicate_records.empty:
    # Bare expression kept from the original cell; presumably meant to show
    # the duplicates as cell output before the error is raised — TODO confirm.
    duplicate_records
    # ValueError is a subclass of Exception, so `except Exception` still works.
    raise ValueError("Duplicate records found:\n" + duplicate_records.to_string())

### Missing values

In [None]:
# Count missing values per column and show only the columns that have any.
# A descriptive name is used instead of `_`, which IPython reserves for the
# most recent cell output and stops updating once it is assigned.
missing_counts = df_clean_1.isna().sum()
missing_counts[missing_counts > 0]

## EDA

In [None]:
# Side-by-side summary statistics for the categorical fields.
pd.concat(
    objs=[
        # Text columns of the cleaned frame.
        df_clean_1.describe(include=["object"])[
            ["Location_city", "Location_country", "Standardized_Price", "Award"]
        ],
        # GreenStar is cast to object so it is summarised as a category
        # rather than as a number.
        df["GreenStar"].astype("object").describe(),
        # One-value-per-row tables built from the comma-separated columns.
        df_cuisine.describe()["Cuisine"],
        df_facilitiesandservices.describe()["FacilitiesAndServices"],
    ],
    axis=1,
)
