In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import folium

In [None]:
# Notebook-wide plot styling: default figure size and layout, serif text,
# axis label/title typography, hidden top and right spines, tick label sizes.
matplotlib.rcParams.update({
    'figure.figsize': (9, 6),
    'figure.autolayout': True,
    'font.family': 'serif',
    'axes.labelsize': 12,
    'axes.labelpad': 10,
    'axes.titlesize': 20,
    'axes.titlepad': 15,
    'axes.titleweight': 700,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
})
# Default colour palette used by seaborn/matplotlib for the plots below
sns.set_palette('light:#80EAFF_r')

# Load and clean-up data

In [None]:
# Read the raw restaurant table, using its 'case' column as the row index,
# then preview the first rows.
restaurant_data = pd.read_csv(
    "../input/asian-restaurants/asia.csv",
    index_col='case',
)
restaurant_data.head()

In [None]:
# Give the columns more-intuitive names
data_dict = {
    'gReviews': 'google_reviews',
    'gRating': 'google_rating',
    'ddReviews': 'doordash_reviews',
    'ddRating': 'doordash_rating',
    'ddFee': 'doordash_fee',
    'yReviews': 'yelp_reviews',
    'yRating': 'yelp_rating',
}
restaurant_data = restaurant_data.rename(columns=data_dict)

# Create a column of actual price-ranges.
# The middle label deliberately contains a literal backslash before the second
# dollar sign (presumably so matplotlib does not read the text between the two
# `$` as math — verify if the labels are used elsewhere). It is written as
# '\\$' because a bare '\$' is an invalid escape sequence in a normal string.
prices = {
    1: '$10 and below',
    2: '$10 to \\$30',
    3: '$30 and above',
}
restaurant_data['price_range'] = restaurant_data['price'].map(prices)
restaurant_data.info()

> Some restaurants have a `DoorDash` rating of zero and **no review/fee info**.

In [None]:
def highlight_doordash_info(data):
    """Build a table of CSS styles that highlights the DoorDash columns.

    Meant to be passed to ``Styler.apply(..., axis=None)``, which expects a
    frame of CSS strings shaped like the frame being styled.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame being styled; expected to contain the columns
        'doordash_reviews', 'doordash_rating' and 'doordash_fee'.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as `data`. The three DoorDash columns hold
        'background-color: yellow'; every other cell is an empty string.
    """
    # Start from an all-blank style table built directly as strings, instead
    # of creating a NaN-filled frame and overwriting the NaNs afterwards.
    styles = pd.DataFrame(
        '', index=data.index, columns=data.columns, dtype=object
    )
    styles[
        ['doordash_reviews', 'doordash_rating', 'doordash_fee']
    ] = 'background-color: yellow'
    return styles


# Show the first 5 restaurants whose DoorDash rating is zero, with the
# DoorDash review / rating / fee columns highlighted.
(
    restaurant_data
    .loc[lambda frame: frame['doordash_rating'].eq(0)]
    .head()
    .style
    .apply(highlight_doordash_info, axis=None)
)

> These instances result in an unusual peak at `doordash_rating==0`.

In [None]:
# Histogram of the raw DoorDash ratings on a smaller-than-default figure
_ = restaurant_data['doordash_rating'].plot(
    kind='hist', title='DoorDash Ratings', figsize=(6, 4)
)

> Let's assume that these instances represent missing values, and replace the zeros with `NaN` so that they don't affect statistics (mean, mode, median, ...).

In [None]:
# For each platform, replace the rating with NaN wherever the restaurant has
# no reviews on that platform (review count is zero or missing). Only the
# review count is tested; the rating value itself is not inspected.
for platform in ['doordash', 'google', 'yelp']:
    rating_col = f'{platform}_rating'
    has_reviews = restaurant_data[f'{platform}_reviews'] > 0
    # Mask just the rating column rather than the whole frame; a missing
    # column raises KeyError here instead of silently producing None.
    restaurant_data[rating_col] = restaurant_data[rating_col].where(has_reviews)

# Calculate average rating (row-wise mean of the three platform ratings;
# NaN ratings are skipped by pandas' default skipna behaviour)
restaurant_data['average_rating'] = (
    restaurant_data[['doordash_rating', 'google_rating', 'yelp_rating']]
    .mean(axis=1)
)

In [None]:
# Histogram of the DoorDash ratings as they currently stand in restaurant_data
_ = restaurant_data['doordash_rating'].plot(
    kind='hist', title='Updated DoorDash Ratings'
)

# 1. Ratings by Platform

## 1.1 Google

In [None]:
def print_summary(platform, data=None):
    """Print a brief summary of the restaurant reviews and ratings at the
    specified platform.

    The summary reports the highest and lowest rating (each with the set of
    restaurants holding it), the total number of reviews, and the most
    reviewed restaurant(s) with their review count.

    Parameters
    ----------
    platform : {'doordash', 'google', 'yelp'}
        Prefix of the ``<platform>_rating`` and ``<platform>_reviews``
        columns to summarise.
    data : pandas.DataFrame, optional
        Table holding those two columns and a 'restaurant' column. When
        omitted, the notebook-level ``restaurant_data`` is looked up at call
        time, so the summary never uses a stale frame captured when this
        cell was executed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    if data is None:
        data = restaurant_data

    ratings = data[f"{platform}_rating"]
    reviews = data[f"{platform}_reviews"]

    def names_where(mask):
        # Set of restaurant names on the rows selected by the boolean mask
        return set(data.loc[mask, 'restaurant'])

    # Compute each extreme once and reuse it for both the value and the mask
    top_rating = ratings.max()
    low_rating = ratings.min()
    most_reviews = reviews.max()
    site = platform.title()

    print(
        f"The highest restaurant rating on {site} is {top_rating} "
        f"{names_where(ratings == top_rating)}, "
        f"and the minimum is {low_rating} "
        f"{names_where(ratings == low_rating)}.\n\n"
        f"There are {reviews.sum():,.0f} reviews on {site}, "
        "with the most reviewed restaurant(s) "
        f"{names_where(reviews == most_reviews)} "
        f"having {most_reviews:,.0f}.\n"
    )

In [None]:
print_summary('google')

# Bar chart of how many restaurants hold each distinct Google rating
ax = restaurant_data['google_rating'].value_counts().sort_index().plot.bar()
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Count-plot of Google Ratings')
ax.grid(axis='y')

# Write each count just above its bar, centred horizontally so the labels
# line up the same way as in the DoorDash and Yelp count-plots
for bar in ax.patches:
    ax.text(
        s=f'{bar.get_height():.0f}', x=bar.get_x()+bar.get_width()/2,
        y=bar.get_height()*1.02, ha='center'
    )

## 1.2 DoorDash

In [None]:
print_summary('doordash')

# Bar chart of how many restaurants hold each distinct DoorDash rating
ax = (
    restaurant_data['doordash_rating']
    .value_counts()
    .sort_index()
    .plot.bar()
)
ax.set(xlabel='Rating', ylabel='Count', title='Count-plot of DoorDash Ratings')
ax.grid(axis='y')

# Annotate every bar with its count, centred just above the bar top
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(x=bar_centre, y=bar_top * 1.02, s=f'{bar_top:.0f}', ha='center')

## 1.3 Yelp

In [None]:
print_summary('yelp')

# Bar chart of how many restaurants hold each distinct Yelp rating
yelp_rating_counts = restaurant_data['yelp_rating'].value_counts()
ax = yelp_rating_counts.sort_index().plot.bar()
ax.set_title('Count-plot of Yelp Ratings')
ax.set_ylabel('Count')
ax.set_xlabel('Rating')
ax.grid(axis='y')

# Put the count (size-11 text) centred slightly above each bar
for bar in ax.patches:
    label_position = {
        'x': bar.get_x() + bar.get_width() / 2,
        'y': bar.get_height() * 1.02,
    }
    ax.text(s=f'{bar.get_height():.0f}', ha='center', size=11, **label_position)

## 1.4 Average Ratings

In [None]:
# Histogram (15 bins) of each restaurant's average rating across platforms
ax = restaurant_data['average_rating'].plot.hist(bins=15)
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Histogram of Average(Combined) Ratings')
ax.grid(axis='y')

# Label each bin with its count, centred over the bin. Scaling the left edge
# by a constant factor would shift labels by an amount that grows with the
# x value, so the bin centre is used instead.
for bar in ax.patches:
    ax.text(
        s=f'{bar.get_height():.0f}', x=bar.get_x()+bar.get_width()/2,
        y=bar.get_height()*1.02, ha='center'
    )

In [None]:
# One bar per column whose name contains 'rating', showing that column's mean
ax = restaurant_data.filter(like='rating').mean().plot.bar()
# The x axis lists the rating columns and the y axis is their mean value,
# so label them accordingly (not 'Rating' / 'Count' as in the count-plots)
ax.set_xlabel('Platform')
ax.set_ylabel('Average rating')
ax.set_title('Average Rating Across Platforms')
ax.grid(axis='y')

# Print the mean (2 decimals) centred just above each bar
for bar in ax.patches:
    ax.text(
        x=bar.get_x()+bar.get_width()/2, y=bar.get_height()*1.02, size=11,
        s=f'{bar.get_height():.2f}', ha='center', fontfamily='serif'
    )

# 2. Ratings by Price

In [None]:
# 2x2 grid of box plots: one panel per platform rating plus the average rating,
# each split by price range. The trailing `fig` expression displays the figure.
fig = matplotlib.figure.Figure(figsize=(12, 10))
axes = fig.subplots(ncols=2, nrows=2)
platforms = ['doordash', 'google', 'yelp', 'average']

# Category order for the x axis. The middle label contains a literal backslash
# and must equal the value stored in `price_range`; it is spelled '\\$' because
# a bare '\$' is an invalid escape sequence in a normal string literal.
price_order = ['$10 and below', '$10 to \\$30', '$30 and above']

for platform, ax in zip(platforms, axes.flatten()):
    sns.boxplot(
        data=restaurant_data, y=f'{platform}_rating', x='price_range',
        order=price_order, ax=ax
    )
    ax.set_title(f'{platform.title()} Ratings by Price', size=15)
fig

Pricier restaurants seem to have better ratings, perhaps because "special dishes" and high-quality ingredients tend to be costlier.

# 3. Ratings by Number of Reviews

In [None]:
# 3x2 grid, one row per platform:
#   left  - rating against number of reviews, with a fitted regression line
#   right - distribution of the number of reviews (histogram + KDE)
fig = matplotlib.figure.Figure(figsize=(12, 14))
axes = fig.subplots(ncols=2, nrows=3)
platforms = ['doordash', 'google', 'yelp']

# Iterating over `axes` yields one (left, right) pair of Axes per row, which
# replaces the duplicated platform list and manual odd/even counter.
for platform, (relation_ax, distribution_ax) in zip(platforms, axes):
    sns.regplot(
        data=restaurant_data, x=f'{platform}_reviews', y=f'{platform}_rating',
        ax=relation_ax
    )
    relation_ax.set_title(
        f'{platform.title()} Ratings by No. of Reviews', size=15
    )

    sns.histplot(
        restaurant_data[f'{platform}_reviews'], kde=True, ax=distribution_ax
    )
    distribution_ax.set_title(
        f'Distribution of {platform.title()} Reviews', size=15
    )
fig

There doesn't seem to be a significant linear relationship between the ratings and the number of reviews made. Both the highest and lowest ratings mostly stem from restaurants having few reviews.

# 4. Ratings by Town

In [None]:
# Per-town mean of every numeric column, ordered from the lowest to the
# highest average rating. `numeric_only=True` is required because the frame
# also holds text columns (e.g. `price_range`), on which pandas >= 2.0 raises
# a TypeError instead of silently dropping them.
town_averages = (
    restaurant_data
    .groupby('town')
    .mean(numeric_only=True)
    .sort_values(by='average_rating')
)
# Show only the rating columns, with in-cell bars for quick comparison
town_averages.filter(like='rating').style.bar(color="#00ccff")

In [None]:
# Map centred on the mean of the per-town coordinates
m = folium.Map(
    zoom_start=9, min_zoom=8, max_zoom=10,
    location=tuple(town_averages[['latitude', 'longitude']].mean())
)

# One colour per town from the 'Blues' palette, as hex strings. Hex is used
# because the palette yields three-component float tuples, which do not form
# a valid 0-255 `rgba(...)` colour when simply scaled and formatted.
colors = sns.color_palette('Blues', n_colors=len(town_averages)).as_hex()

# town_averages is iterated in its stored row order, so the i-th town gets
# the i-th palette colour; the marker radius scales with the average rating.
for (town, row), color in zip(town_averages.iterrows(), colors):
    folium.CircleMarker(
        location=[row.latitude, row.longitude],
        radius=row.average_rating * 4,
        tooltip=f"<b>{town.title()}</b><br>Average rating = {row.average_rating:.4f}",
        color=color,
        fill=True,
    ).add_to(m)
m