# Coursera Capstone Project
## Choosing the ideal location to open a Chinese Restaurant in Chicago

In [8]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported.')

Libraries imported.


### 1.Scrape the Wikipedia page for neighborhood information

In [9]:
# import the library we use to open URLs
import urllib.request

# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/Community_areas_in_Chicago"
# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

### Using BeautifulSoup package to scrape the table

In [10]:
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup
import requests
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Community_areas_of_Chicago").text
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neigh = []

### Get the dataframe of 78 community areas in Chicago

In [11]:
right_table=soup.find_all('div', class_='mw-category')[0].findAll('li')

for row in right_table:
    neigh.append(row.text)

neigh = neigh[4:]
    
df = pd.DataFrame({'Neighborhood': neigh})
df['Neighborhood'] = df.Neighborhood.str.replace(', Chicago','', regex=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Neighborhood
0,Albany Park
1,Archer Heights
2,Armour Square
3,Ashburn
4,Auburn Gresham


In [14]:
# print the number of rows of the dataframe
df.head()

Unnamed: 0,Neighborhood
0,Albany Park
1,Archer Heights
2,Armour Square
3,Ashburn
4,Auburn Gresham


### 2.Get the geographical coordinates of the community areas

Import relevant libraries

In [12]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Folium installed
Libraries imported.


In [13]:
import geocoder
# define a function to get coordinates
def get_coor(neighborhood):
    # initialize your variable to None
    lat_long_coords = None
    # loop until you get the coordinates
    while(lat_long_coords is None):
        g = geocoder.arcgis('{},IL,USA'.format(neighborhood))
        lat_long_coords = g.latlng
    return lat_long_coords

In [14]:
coords = [ get_coor(nei) for nei in df['Neighborhood'].tolist() ]

In [16]:
df_coor = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coor['Latitude']
df['Longitude'] = df_coor['Longitude']
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Albany Park,41.96829,-87.72338
1,Archer Heights,41.81154,-87.72556
2,Armour Square,41.83458,-87.63189
3,Ashburn,41.74785,-87.70995
4,Auburn Gresham,41.74319,-87.65504


In [35]:
address = 'Albany Park Chicago'

lat = df.loc[0,'Latitude']
long = df.loc[0,'Longitude']

In [69]:
# create map of Chicago neighborhoods using latitude and longitude values
map_chicago = folium.Map(location=[lat,long], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], 
                                           df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago

In [20]:
# save the map as HTML file
map_chicago.save('map_chicago.html')

### 3. Use the Foursquare API to explore the neighborhoods

In [21]:
CLIENT_ID = 'C41VKH2DGZUFOXKPMNF51ZIWFVDWXXNXPEBCP3DLJBLBRFCY' # your Foursquare ID
CLIENT_SECRET = 'WMFRFL3TVPRTKOU3ZL1ZFXYBJKGNLGE13BQUG2YJZHDSU5M0' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: C41VKH2DGZUFOXKPMNF51ZIWFVDWXXNXPEBCP3DLJBLBRFCY
CLIENT_SECRET:WMFRFL3TVPRTKOU3ZL1ZFXYBJKGNLGE13BQUG2YJZHDSU5M0


### Search for different types of restaurants around Chicago

In [28]:
venues = []
search_query = 'Chinese Restaurant'

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [96]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# Filter with Chinese restaurant category
Crestaurants_df = venues_df[venues_df['VenueCategory'].str.contains(search_query)]
Crestaurants_df.reset_index(drop=True, inplace=True)

Jrestaurants_df = venues_df[venues_df['VenueCategory'].str.contains('Japanese')]
Jrestaurants_df.reset_index(drop=True, inplace=True)

Frestaurants_df = venues_df[venues_df['VenueCategory'].str.contains('French')]
Frestaurants_df.reset_index(drop=True, inplace=True)

In [86]:
Crestaurants_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Albany Park,41.96829,-87.72338,Peking Mandarin Resturant,41.968292,-87.715783,Chinese Restaurant
1,Armour Square,41.83458,-87.63189,Han 202,41.837958,-87.641753,Chinese Restaurant
2,Armour Square,41.83458,-87.63189,Chinese Kitchen,41.838226,-87.637581,Chinese Restaurant
3,Armour Square,41.83458,-87.63189,New Furama Restaurant,41.841439,-87.631359,Chinese Restaurant
4,Avalon Park,41.74507,-87.58816,Hoe Toy Chop Suey,41.738951,-87.585135,Chinese Restaurant


In [93]:
Jrestaurants_df

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Near West Side,41.88003,-87.66672,Gyu-Kaku Japanese BBQ,41.884449,-87.661385,Japanese Restaurant
1,North Park,41.98294,-87.71915,Midori,41.983062,-87.711808,Japanese Restaurant
2,West Town,41.89329,-87.65743,Tsukiji Fish Market,41.891172,-87.656812,Japanese Restaurant


In [97]:
Frestaurants_df

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Edison Park,42.00789,-87.81399,Café Touché,42.003467,-87.817173,French Restaurant
1,Near North Side,41.90021,-87.63433,Kiki's Bistro,41.898947,-87.635895,French Restaurant


In [167]:
la = 41.851832
longg =  -87.623177

venues_map = folium.Map(location=[la, longg], zoom_start=10) # generate map centred around the Conrad Hotel

# add a red circle marker to represent the Chicago Loop Community area

for l, lo, lab in zip(Crestaurants_df.Latitude, Crestaurants_df.Longitude, 
                           Crestaurants_df.Neighborhood):
    folium.features.CircleMarker(
        [l, lo],
        radius=10,
        color='red',
        popup= lab,
        fill = True,
        fill_color = 'red',
        fill_opacity = 0.6
    ).add_to(venues_map)

# add the Chinese restaurants as blue circle markers
for lat, lng, label in zip(Crestaurants_df.VenueLatitude, 
                           Crestaurants_df.VenueLongitude, Crestaurants_df.VenueName):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup = folium.Popup(label, parse_html=True),
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
    
# add the French restaurants as orange circle markers
for lat, lng, label in zip(Frestaurants_df.VenueLatitude, 
                           Frestaurants_df.VenueLongitude, Frestaurants_df.VenueName):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='orange',
        popup = folium.Popup(label, parse_html=True),
        fill = True,
        fill_color='orange',
        fill_opacity=0.6
    ).add_to(venues_map)


# add the Japanese restaurants as green circle markers
for lat, lng, label in zip(Jrestaurants_df.VenueLatitude, 
                           Jrestaurants_df.VenueLongitude, Jrestaurants_df.VenueName):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='green',
        popup = folium.Popup(label, parse_html=True),
        fill = True,
        fill_color='green',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

In [95]:
venues_map.save('venues_map.html')

### 4. Cluster Community Areas

Run k-means to cluster the community areas in Chicago into 3 clusters.

In [126]:
count = pd.DataFrame(Crestaurants_df.groupby('Neighborhood').size())
count.reset_index(inplace=True)
count.columns = ['Neighborhood','size']
chi_res = pd.merge(left = df,right=count,left_on='Neighborhood',right_on='Neighborhood',how='left')

chi_res['size'].fillna(0,inplace=True)
chi_res.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,size
0,Albany Park,41.96829,-87.72338,1.0
1,Archer Heights,41.81154,-87.72556,0.0
2,Armour Square,41.83458,-87.63189,3.0
3,Ashburn,41.74785,-87.70995,0.0
4,Auburn Gresham,41.74319,-87.65504,0.0


In [156]:
# Number of clusters
cluster_size = 3

chi_clustering = chi_res.drop(["Neighborhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=cluster_size, random_state=0).fit(chi_clustering)
labels = pd.DataFrame(kmeans.labels_,columns = ['Cluster'])

# check how many cluster labels are generated
set(kmeans.labels_)

{0, 1, 2}

In [168]:
la = 39.742043
longg =  -104.991531
colo = ['blue','green','red']

restaurants_map = folium.Map(location=[la, longg], zoom_start=5) # generate map centred around the Conrad Hotel


for l, lo, lab,name in zip(df.Latitude, df.Longitude, 
                           labels.Cluster,df.Neighborhood):
    folium.features.CircleMarker(
        [l, lo],
        radius=8,
        color = colo[lab],
        popup = folium.Popup(name, parse_html=True),
        fill = True,
        fill_color = colo[lab],
        fill_opacity = 0.75
    ).add_to(restaurants_map)
    
    
# display map
restaurants_map

In [166]:
restaurants_map.save('restaurants_map.html')

### 5.Observations:

Most of the Chinese restaurants are concentrated in the central area of Chicago, with the highest number in cluster 0 and moderate number in cluster 2. On the other hand, cluster 1 has very low number to totally no Chinese restaurants in the neighborhoods, which seems to be an outlier.

This represents a great opportunity and high potential areas to open new restaurants in Cluster 2 areas since there will be high competition in areas like downtown Chicago in Cluster 0. Cluster 1 will result in lack of customers since there are no restaurants nearby. 

Therefore, this project recommends property developers to capitalize on these findings to open new restaurants in cluster 2 which has moderate competition and potential customers. 

Property developers with unique selling propositions to stand out from the competition can also consider open new restaurants in Cluster 0 areas since the crowdedness of already opened restaurants indicates there are many potential customers. Existence of places like ChinaTown will attract customers and bring cash flows to the restaurants. Lastly, property developers are advised to avoid neighborhoods in Cluster 1.