# CAPSTONE PROJECT - SEARCH FOR THE SMART CHOICE OF RESTAURANT AS A BUSINESS IN NYC

#### install and import necessary libraries

In [1]:
! pip install BeautifulSoup4

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 8.8MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.2 soupsieve-1.9.5


In [2]:
! pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
[K     |████████████████████████████████| 102kB 7.5MB/s ta 0:00:011
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0


In [3]:
import numpy as np
import pandas as pd
# Disable truncation when rendering DataFrames: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
# json_normalize has been a top-level pandas function since pandas 1.0, and the
# old pandas.io.json location is no longer importable on pandas 2.x. Try the
# current location first and fall back to the legacy path for older releases.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print("Done importing libraries.")

Done importing libraries.


## Obtain the data set: open data on NYC restaurant inspections

#### convert to dataframe

In [4]:
# Location of the source CSV, stored under the 'data' key.
url = dict(data='https://data.ny.gov/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD&sorting=true')

# Download the CSV and parse it into a DataFrame, then preview the first rows.
csv_location = url["data"]
df = pd.read_csv(csv_location)
df.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,40965997,BONSIGNOUR,Manhattan,35,JANE STREET,10014.0,2122299700,American,06/25/2019,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish a...,Y,23.0,,,01/29/2020,Cycle Inspection / Initial Inspection,40.738136,-74.004471,102.0,3.0,7700.0,1077100.0,1006260000.0,MN23
1,50017465,PRO THAI,Manhattan,1575,LEXINGTON AVE,10029.0,9174751494,Thai,01/10/2019,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rins...",Y,15.0,,,01/29/2020,Cycle Inspection / Initial Inspection,40.788733,-73.948824,111.0,8.0,16600.0,1051813.0,1016280000.0,MN33
2,50006614,HUERTAS,Manhattan,107,1ST AVE,10003.0,2122284490,Spanish,12/10/2019,Violations were cited in the following area(s).,06E,"Sanitized equipment or utensil, including in-u...",Y,21.0,,,01/29/2020,Cycle Inspection / Initial Inspection,40.726712,-73.98594,103.0,2.0,3800.0,1006286.0,1004480000.0,MN22
3,50064312,BETTY BAKERY,Brooklyn,448,ATLANTIC AVE,11217.0,7182372271,Bakery,05/16/2018,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,N,7.0,A,05/16/2018,01/29/2020,Cycle Inspection / Initial Inspection,40.686396,-73.9835,302.0,33.0,4100.0,3000934.0,3001840000.0,BK38
4,50006763,LUDWIG'S AT THE YACHT CLUB,Queens,533,BEACH 126TH ST,11694.0,7186344939,American,06/27/2018,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,N,13.0,A,06/27/2018,01/29/2020,Cycle Inspection / Re-inspection,40.581287,-73.848,414.0,32.0,93401.0,4303974.0,4161940000.0,QN10


## Data cleaning

#### Remove rows with NaN or blank values, and rows with no distinct cuisine type listed

In [5]:
# Keep only the columns used by the cuisine and location analysis below.
nycrest = df[["DBA", "BORO", "CUISINE DESCRIPTION", "Latitude", "Longitude"]]

# Drop incomplete rows BEFORE the string filters: .str.contains() yields NaN
# for missing values, and a mask containing NaN cannot be inverted with `~`.
nycrest = nycrest.dropna()

# Remove rows whose borough contains '0' (presumably a placeholder for an
# unknown borough -- confirm against the data dictionary) and rows without a
# real cuisine type. regex=False makes both literal substring matches.
nycrest = nycrest[~nycrest['BORO'].str.contains('0', regex=False)]
nycrest = nycrest[~nycrest['CUISINE DESCRIPTION'].str.contains('Not Listed/Not Applicable', regex=False)]

# Sanity check: every per-column null count should now be zero.
print('Check number of null or nan rows:',nycrest.isnull().sum())
nycrest.head()

Check number of null or nan rows: DBA                    0
BORO                   0
CUISINE DESCRIPTION    0
Latitude               0
Longitude              0
dtype: int64


Unnamed: 0,DBA,BORO,CUISINE DESCRIPTION,Latitude,Longitude
0,BONSIGNOUR,Manhattan,American,40.738136,-74.004471
1,PRO THAI,Manhattan,Thai,40.788733,-73.948824
2,HUERTAS,Manhattan,Spanish,40.726712,-73.98594
3,BETTY BAKERY,Brooklyn,Bakery,40.686396,-73.9835
4,LUDWIG'S AT THE YACHT CLUB,Queens,American,40.581287,-73.848


## Check the "absent" cuisines relative to others

#### Cuisine types with the lowest overall counts across NYC

In [7]:
# Narrow the frame to the cuisine label and the establishment name.
cuisinecount = nycrest.loc[:, ["CUISINE DESCRIPTION", "DBA"]]

# Count the non-null DBA entries (rows) per cuisine and show the 25 cuisines
# with the smallest counts, indexed by cuisine description.
(
    cuisinecount
    .groupby(["CUISINE DESCRIPTION"])["DBA"]
    .count()
    .reset_index(name="count")
    .nsmallest(25, 'count')
    .set_index('CUISINE DESCRIPTION')
)

Unnamed: 0_level_0,count
CUISINE DESCRIPTION,Unnamed: 1_level_1
Basque,7
Chilean,34
Czech,43
Nuts/Confectionary,43
Iranian,66
Scandinavian,76
Southwestern,80
Fruits/Vegetables,82
Cajun,86
Californian,93


#### Least common cuisines per borough

In [6]:
# Narrow the frame to the columns needed for per-borough cuisine counts.
forcounts = nycrest.loc[:, ["DBA", "BORO", "CUISINE DESCRIPTION"]]
print('Check which cuisine type is not the most prevalent per NYC Boro')

# Count the non-null DBA entries (rows) for every (borough, cuisine) pair,
# then show the 40 pairs with the smallest counts, indexed by borough.
boro_cuisine_counts = forcounts.groupby(["BORO", "CUISINE DESCRIPTION"])["DBA"].count()
boro_cuisine_counts = boro_cuisine_counts.reset_index(name="count")
boro_cuisine_counts.nsmallest(40, 'count').set_index('BORO')

Check which cuisine type is not the most prevalent per NYC Boro


Unnamed: 0_level_0,CUISINE DESCRIPTION,count
BORO,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,Pakistani,3
Bronx,Hotdogs,4
Brooklyn,English,4
Bronx,Vegetarian,5
Manhattan,Creole/Cajun,6
Bronx,Armenian,7
Manhattan,Basque,7
Queens,Fruits/Vegetables,7
Staten Island,Continental,7
Staten Island,Tapas,7


## Prepare coordinate data for mapping

In [8]:
# Nominatim's usage policy asks for an application-specific user agent, and
# geopy rejects its documented sample values such as "my-application".
geo = Nominatim(user_agent="nyc-restaurant-capstone")

# Look up the coordinates used to centre the map. The explicit timeout is more
# forgiving than geopy's short default on a slow connection.
nycloc = geo.geocode('New York', timeout=10)
if nycloc is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise RuntimeError("Nominatim returned no result for 'New York'")
nyclat = nycloc.latitude
nyclon = nycloc.longitude

# Base map centred on the geocoded location.
nycrest_map = folium.Map(location=[nyclat, nyclon], zoom_start=10)

#### Mapping for hotdogs

In [9]:
# Select the rows whose cuisine description contains 'Hotdogs'.
hotdogs = nycrest[nycrest['CUISINE DESCRIPTION'].str.contains('Hotdogs')]

# Draw on a fresh map centred on the geocoded NYC coordinates, so re-running
# this cell never stacks duplicate markers and the shared base map
# (nycrest_map) is not mutated by this section.
hotdogs_map = folium.Map(location=[nyclat, nyclon], zoom_start=10)

# One circle marker per row; the popup shows the borough.
for lt, ln, bor in zip(hotdogs['Latitude'], hotdogs['Longitude'], hotdogs['BORO']):
    folium.CircleMarker(
        [lt, ln],
        radius=2.5,
        popup=folium.Popup(str(bor), parse_html=True),
        color='black',
        fill=True,
        # Must be a CSS colour; 'PuBu' is a palette name and is not one.
        fill_color='#3186cc',
        fill_opacity=0.5).add_to(hotdogs_map)

hotdogs_map

#### Mapping for English style food

In [10]:
# Select the rows whose cuisine description contains 'English'.
english_food = nycrest[nycrest['CUISINE DESCRIPTION'].str.contains('English')]

# Draw on a fresh map centred on the geocoded NYC coordinates, so re-running
# this cell never stacks duplicate markers and the shared base map
# (nycrest_map) is not mutated by this section.
english_food_map = folium.Map(location=[nyclat, nyclon], zoom_start=10)

# One circle marker per row; the popup shows the borough.
for lt, ln, bor in zip(english_food['Latitude'], english_food['Longitude'], english_food['BORO']):
    folium.CircleMarker(
        [lt, ln],
        radius=2.5,
        popup=folium.Popup(str(bor), parse_html=True),
        color='black',
        fill=True,
        # Must be a CSS colour; 'PuBu' is a palette name and is not one.
        fill_color='#3186cc',
        fill_opacity=0.5).add_to(english_food_map)

english_food_map

#### Mapping for pancakes

In [11]:
# Select the rows whose cuisine description contains 'Pancakes'.
pancakes = nycrest[nycrest['CUISINE DESCRIPTION'].str.contains('Pancakes')]

# Draw on a fresh map centred on the geocoded NYC coordinates, so re-running
# this cell never stacks duplicate markers and the shared base map
# (nycrest_map) is not mutated by this section.
pancakes_map = folium.Map(location=[nyclat, nyclon], zoom_start=10)

# One circle marker per row; the popup shows the borough.
for lt, ln, bor in zip(pancakes['Latitude'], pancakes['Longitude'], pancakes['BORO']):
    folium.CircleMarker(
        [lt, ln],
        radius=2.5,
        popup=folium.Popup(str(bor), parse_html=True),
        color='black',
        fill=True,
        # Must be a CSS colour; 'PuBu' is a palette name and is not one.
        fill_color='#3186cc',
        fill_opacity=0.5).add_to(pancakes_map)

pancakes_map