In [None]:
# Download the NYC restaurant inspection data and geojson files
#
# https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j/about_data
# https://www.kaggle.com/datasets/saidakbarp/nyc-zipcode-geodata

In [71]:
import math
import json
import gzip

import pandas as pd
import numpy as np
from jinja2 import Template

import plotly.express as px
import plotly.graph_objects as go

def _iter_positions(coordinates):
    """Yield every ``[lon, lat, ...]`` position found in nested GeoJSON coordinates.

    The nesting depth depends on the geometry type (a ring is a list of
    positions, a polygon a list of rings, a multi-polygon a list of polygons),
    so the structure is walked recursively until a list starting with a number
    is reached. Empty lists yield nothing.
    """
    if coordinates and isinstance(coordinates[0], (int, float)):
        yield coordinates
        return
    for part in coordinates:
        yield from _iter_positions(part)


# Load the ZIP-code polygons. The context manager closes the file handle
# deterministically, and the encoding is stated explicitly rather than left to
# the platform default.
with open(
    './data/nyc-restaurant-inspection-result/nyc-zip-code-tabulation-areas-polygons.geojson',
    encoding='utf-8',
) as geojson_file:
    nyc = json.load(geojson_file)

# Postal codes are stored as strings in the GeoJSON properties; the rest of the
# notebook works with integer ZIP codes.
zip_codes = [int(feature['properties']['postalCode']) for feature in nyc['features']]
zip_code_to_borough = {
    int(feature['properties']['postalCode']): feature['properties']['borough']
    for feature in nyc['features']
}

# get bounding box of NYC
# Starting from +/-inf means the first real coordinate always replaces the seed.
min_lon = math.inf
max_lon = -math.inf
min_lat = math.inf
max_lat = -math.inf

for nyc_i in nyc['features']:
    # Walk every position of the geometry, whatever its nesting depth; features
    # without coordinates simply contribute nothing.
    for lon, lat, *_ in _iter_positions(nyc_i['geometry']['coordinates']):
        min_lon = min(min_lon, lon)
        max_lon = max(max_lon, lon)
        min_lat = min(min_lat, lat)
        max_lat = max(max_lat, lat)

# Midpoint of the bounding box, used to centre the maps.
center_lon = (min_lon+max_lon)/2.0
center_lat = (min_lat+max_lat)/2.0

# Inspection years kept when the per-year cuisine shares are computed.
years = [2021, 2022, 2023, 2024, 2025]


# Fixed colour (hex string) for each cuisine label used by the dominant-cuisine map.
# Built from an ordered list of (label, colour) pairs; the blank lines keep the
# visual grouping of neighbouring entries.
colormap = dict([
    ('Chinese', '#CC5151'),
    ('Korean', '#FF8A8A'),
    ('Asian/Asian Fusion', '#FF60C5'),
    ('Japanese', '#BC002D'),

    ('African', '#C2669A'),
    ('Caribbean', '#F6C4E0'),

    ('American', '#687FE5'),
    ('New American', '#021259'),
    ('Italian/Pizza', '#A7C1A8'),

    ('Latin American', '#517450'),
    ('Brazilian', '#36AC32'),
    ('Tex-Mex', '#FFC004'),
    ('Mexican', '#FDD354'),
    ('Spanish', '#FBE6A7'),

    ('Mediterranean', '#82F0FF'),
    ('French', '#F6F0F0'),
    ('Greek', '#00B3FF'),
    ('Jewish/Kosher', '#90C4FF'),
    ('Creole', '#98A1BC'),
    ('Eastern European', '#7C444F'),
    ('Donuts', '#FFF9BD'),

    ('Indian', '#FF820C'),
    ('Bangladeshi', '#FFC39B'),
])

In [72]:
# Load the DOHMH restaurant inspection export.
data = pd.read_csv('./data/nyc-restaurant-inspection-result/DOHMH_New_York_City_Restaurant_Inspection_Results_20250803.csv')

# Number of distinct CAMIS ids in the raw export, before any filtering.
print(len(data['CAMIS'].unique()))

# Cuisine labels excluded from the analysis.
cuisines_to_drop = [
    'Coffee/Tea',
    'Other',
    'Bagels/Pretzels',
    'Sandwiches/Salads/Mixed Buffet',
    'Bottled Beverages',
    'Soups/Salads/Sandwiches',
    'Not Listed/Not Applicable',
    'Fruits/Vegetables',
    'Soups',
    'Pancakes/Waffles',
    'Nuts/Confectionary',
    'Bakery Products/Desserts',
    'Juice, Smoothies, Fruit Salads',
    'Sandwiches',
    'Frozen Desserts',
    'Chicken',
    'Seafood',
    'Donuts',
    'Hotdogs/Pretzels',
    'Hamburgers',
    'Continental',
    'Southwestern',
    'Chimichurri',
    'Haute Cuisine',
    'Chinese/Japanese',
    'Salads',
    'Hotdogs',
    # 'Steakhouse',
    # 'American'
]

# drop some zip codes
# these zip codes are for single buildings or blocks, so they are hard to visualize
zip_codes_to_drop = [11005, 10048, 10279, 10165, 10168, 10055, 10176, 10167, 10151, 10106, 10158, 10041, 10118, 10169, 10080, 10154]

# Cuisine labels merged into a broader label: Russian becomes Eastern European,
# and Pizza and Italian are combined.
cuisine_renames = {
    'Russian': 'Eastern European',
    'Pizza': 'Italian/Pizza',
    'Italian': 'Italian/Pizza',
}

# Each step returns a new frame, in the same order as the individual filters:
# cuisine filter -> year column -> ZIP code clean-up -> ZIP code filter -> renames.
data = (
    data
    # keep rows that have a cuisine and whose cuisine is not excluded
    .loc[lambda df: ~df['CUISINE DESCRIPTION'].isin(cuisines_to_drop) & df['CUISINE DESCRIPTION'].notna()]
    # the year is the last '/'-separated field of the inspection date
    .assign(Year=lambda df: df['INSPECTION DATE'].str.split('/').str[-1].astype(int))
    # filter out nan ZIP codes so the column can be cast to int
    .loc[lambda df: df['ZIPCODE'].notna()]
    .assign(ZIPCODE=lambda df: df['ZIPCODE'].astype(int))
    # keep ZIP codes present in the GeoJSON, minus the explicitly dropped ones
    .loc[lambda df: df['ZIPCODE'].isin(zip_codes) & ~df['ZIPCODE'].isin(zip_codes_to_drop)]
    .assign(**{'CUISINE DESCRIPTION': lambda df: df['CUISINE DESCRIPTION'].replace(cuisine_renames)})
)

# Cuisine labels that survive the filtering, in order of first appearance.
cuisines = data['CUISINE DESCRIPTION'].unique().tolist()
data['borough'] = data['ZIPCODE'].map(zip_code_to_borough)

# group by year, zipcode, and drop duplicates by CAMIS
# (each CAMIS id is kept once per year and ZIP code; groups are concatenated
# in the sorted key order that groupby yields)
deduped_groups = [
    group.drop_duplicates(subset=['CAMIS'])
    for _, group in data.groupby(['Year', 'ZIPCODE'])
]
data = pd.concat(deduped_groups)


30274


In [73]:
# Year shown on the maps; defined once so the filter and the count table agree.
TARGET_YEAR = 2024

# For each (year, zip code) pair, record the share of every cuisine in that zip
# code and flag the single most common cuisine.
records = []
for (year, zip_code_i), data_sub in data.groupby(['Year', 'ZIPCODE']):
    # zip_code_to_borough has exactly the ZIP codes of the GeoJSON features, so
    # the dict lookup replaces a linear scan of the zip_codes list.
    if year not in years or zip_code_i not in zip_code_to_borough:
        continue
    counts = data_sub['CUISINE DESCRIPTION'].value_counts()
    total = counts.sum()

    # get most common cuisine
    most_common_cuisine = counts.idxmax()
    borough = zip_code_to_borough[zip_code_i]

    for cuisine_i in cuisines:
        percentage = counts.get(cuisine_i, 0) / total * 100
        records.append((year, zip_code_i, borough, cuisine_i, percentage, cuisine_i == most_common_cuisine))

results = pd.DataFrame(records, columns=['year', 'zipcode', 'borough', 'cuisine', 'percentage', 'is_most_common'])

# Keep one row per zip code: its most common cuisine in the target year.
# .copy() makes the result an independent frame before columns are added.
results = results[(results['year'] == TARGET_YEAR) & (results['is_most_common'])].copy()

# Per-cuisine row counts for every remaining zip code in the target year.
# One crosstab replaces a full-frame boolean scan per (cuisine, zip code) pair
# and the column-by-column insertion that went with it.
all_cuisines = data['CUISINE DESCRIPTION'].unique().tolist()
data_target_year = data[data['Year'] == TARGET_YEAR]
cuisine_counts = (
    pd.crosstab(data_target_year['ZIPCODE'], data_target_year['CUISINE DESCRIPTION'])
    # align rows with `results` and columns with `all_cuisines`; combinations
    # that never occur get a count of 0
    .reindex(index=results['zipcode'], columns=all_cuisines, fill_value=0)
    .rename_axis(columns=None)
)
# Rows are already in `results` order, so reuse its index labels for the join.
cuisine_counts.index = results.index
results = pd.concat([results, cuisine_counts], axis=1)

results

Unnamed: 0,year,zipcode,borough,cuisine,percentage,is_most_common,American,Mediterranean,Vegan,Latin American,...,Californian,Chinese/Cuban,Creole/Cajun,Scandinavian,Moroccan,Brazilian,Basque,Czech,Creole,Armenian
33000,2024,10001,Manhattan,American,40.701754,True,116,10,2,1,...,1,1,0,0,0,0,0,0,0,0
33060,2024,10002,Manhattan,American,21.538462,True,56,2,1,6,...,1,0,1,0,0,0,1,0,0,1
33120,2024,10003,Manhattan,American,30.322581,True,94,6,3,5,...,0,0,0,0,0,0,0,0,0,0
33180,2024,10004,Manhattan,American,42.857143,True,30,1,0,2,...,0,0,0,0,0,1,0,0,0,0
33240,2024,10005,Manhattan,American,54.838710,True,17,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44288,2024,11691,Queens,Caribbean,23.076923,True,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
44340,2024,11692,Queens,American,42.857143,True,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44400,2024,11693,Queens,American,50.000000,True,8,0,0,2,...,0,0,0,0,0,0,0,0,0,0
44460,2024,11694,Queens,American,57.894737,True,11,0,0,2,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Choropleth of the most common cuisine per ZIP code: each area is coloured by
# the `cuisine` column of `results` using the fixed palette in `colormap`.
fig = px.choropleth_map(
    results,
    geojson=nyc,
    locations="zipcode",
    featureidkey="properties.postalCode",
    color="cuisine",
    color_discrete_map=colormap,
    labels={
        'zipcode':'Zip Code',
        'cuisine': 'Cuisine Type',
        'count': 'Restaurant Count',
        'borough': 'Borough'
    },
    center=dict(lat=center_lat, lon=center_lon),
    zoom=9.8,
    opacity=0.9,
    map_style='carto-positron',
)

# No update_geos() call here: it only touches `geo` subplots, and this figure
# is drawn on a tile-map subplot, so it had no effect.

fig.update_layout(
    margin={"r":0,"t":0,"l":0,"b":0, 'autoexpand': True}, autosize=True,
    font=dict(family="Helvetica, sans-serif", size=18, color="#000"),
)

# to_json() goes through Plotly's own encoder, so array-valued properties are
# serialised correctly. to_plotly_json() may return JSON-invalid types (for
# example numpy arrays) that the standard json module cannot dump.
fig_json = fig.to_json()

with gzip.open("./plots/nyc_cuisines.json.gz", "wt", encoding="utf-8") as f:
    f.write(fig_json)

# Static export, kept for reference:
# fig.show(renderer='png', width=800, height=800)
# fig.write_image('./plots/250805-nyc-restarants.svg', width=1000, height=1000)

In [None]:
# One choropleth trace per cuisine showing the per-ZIP-code count columns of
# `results`; a dropdown toggles which single trace is visible.

# The first cuisine is the one shown before the dropdown is used.
default_cuisine = all_cuisines[0]

# Same borough lookup for every trace, so build it once.
borough_customdata = results[['borough']].values.tolist()

# NOTE(review): every trace is given the full `nyc` GeoJSON, so the serialised
# figure probably repeats it once per cuisine - check the output size.
traces = [
    go.Choroplethmap(
        z=results[cuisine_i],
        geojson=nyc,
        locations=results["zipcode"],
        featureidkey="properties.postalCode",
        name=cuisine_i,
        colorscale='Blues',
        showlegend=True,
        visible=cuisine_i == default_cuisine,
        colorbar=dict(
            title='',
            len=0.5,
        ),
        customdata=borough_customdata,
        hovertemplate='<b>Count:</b> %{z}<br><b>Zip Code:</b> %{location}<br><b>Borough:</b> %{customdata[0]}<br>'
    )
    for cuisine_i in all_cuisines
]

# One dropdown entry per cuisine: show only the matching trace and retitle the
# figure. The traces carry their own colour bars, so no shared coloraxis needs
# updating.
buttons = [
    dict(
        label=col,
        method="update",
        args=[
            {"visible": [i == j for j in range(len(all_cuisines))]},
            {"title": {'text': f"{col} Cuisine", 'x': 0.5}},
        ],
    )
    for i, col in enumerate(all_cuisines)
]

layout = go.Layout(
    map_style="carto-positron",
    map_zoom=9.8,
    map_center=dict(lat=center_lat, lon=center_lon),
    margin=dict(r=0, t=40, l=0, b=0),
    width=900,
    height=800,
    title=dict(
        text=f"{default_cuisine} Cuisine",
        x=0.5,
    ),
    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=0.05,
            xanchor="left",
            y=1.1,
            yanchor="top"
        )
    ]
)

fig = go.Figure(data=traces, layout=layout)

# to_json() goes through Plotly's own encoder, so array-valued properties are
# serialised correctly. to_plotly_json() may return JSON-invalid types (for
# example numpy arrays) that the standard json module cannot dump.
fig_json = fig.to_json()
with gzip.open("./plots/nyc_cuisines_per_cuisine.json.gz", "wt", encoding="utf-8") as f:
    f.write(fig_json)