# Final Capstone Project

This notebook is going to be the final notebook for the capstone project. 

## Importing appropriate packages

In [1]:
# standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# packages for clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as hc
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

# preprocessing packages
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import folium # map rendering library

print('All packages are imported.')

All packages are imported.


## Webscrape a webpage for each of the zipcodes of New York City

In [2]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_the_Bronx").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
bx = pd.DataFrame({"Neighborhood": neighborhoodList})


bx['Neighborhood'] = bx['Neighborhood'].astype(str).str[1:]


sep = '('
bx['Neighborhood'] = bx['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
bx['Neighborhood'] = bx['Neighborhood'].str.strip()

In [4]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Bronx, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in bx["Neighborhood"].tolist() ]

In [5]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
bx['Latitude'] = df_coords['Latitude']
bx['Longitude'] = df_coords['Longitude']

In [6]:
# save the DataFrame as CSV file
bx.to_csv("bx_neighborhoods.csv", index=False)

# check the neighborhoods and the coordinates
print(bx.shape)
bx

(31, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Bedford Park, Bronx‎",40.86999,-73.88573
1,"Belmont, Bronx‎",40.85546,-73.88569
2,"City Island, Bronx‎",40.85564,-73.79171
3,"Co-op City, Bronx‎",40.879939,-73.82317
4,"Concourse, Bronx‎",40.82763,-73.92533
5,"Eastchester, Bronx‎",40.88811,-73.82835
6,"Fordham, Bronx‎",40.85894,-73.89885
7,"Highbridge, Bronx‎",40.842117,-73.9282
8,"Hunts Point, Bronx‎",40.81242,-73.8845
9,"Jerome Park, Bronx‎",40.86547,-73.89896


## Manhattan

In [7]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Manhattan").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
mh = pd.DataFrame({"Neighborhood": neighborhoodList})


mh['Neighborhood'] = mh['Neighborhood'].astype(str).str[1:]


sep = '('
mh['Neighborhood'] = mh['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
mh['Neighborhood'] = mh['Neighborhood'].str.strip()

In [8]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Manhattan, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in mh["Neighborhood"].tolist() ]

In [9]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
mh['Latitude'] = df_coords['Latitude']
mh['Longitude'] = df_coords['Longitude']

# save the DataFrame as CSV file
mh.to_csv("mh_neighborhoods.csv", index=False)

# check the neighborhoods and the coordinates
print(mh.shape)
mh

(44, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Battery Park City‎,40.71131,-74.0159
1,Bowery‎,40.72328,-73.992893
2,"Chelsea, Manhattan‎",40.7461,-74.00045
3,"Civic Center, Manhattan‎",40.71337,-74.0038
4,Columbus Circle‎,40.76573,-73.98338
5,East Harlem‎,40.79828,-73.94081
6,"East Village, Manhattan‎",40.72804,-73.98499
7,"Financial District, Manhattan‎",40.70826,-74.0141
8,"Five Points, Manhattan‎",45.858578,-111.328778
9,Flatiron District‎,40.73942,-73.99035


## Brooklyn

In [10]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Brooklyn").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
bk = pd.DataFrame({"Neighborhood": neighborhoodList})


bk['Neighborhood'] = bk['Neighborhood'].astype(str).str[1:]


sep = '('
bk['Neighborhood'] = bk['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
bk['Neighborhood'] = bk['Neighborhood'].str.strip()

In [11]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Brooklyn, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in bk["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
bk['Latitude'] = df_coords['Latitude']
bk['Longitude'] = df_coords['Longitude']

In [12]:
# save the DataFrame as CSV file
bk.to_csv("bk_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(bk.shape)
bk

(40, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Bay Ridge, Brooklyn‎",40.6196,-74.027569
1,"Bedford–Stuyvesant, Brooklyn‎",40.681945,-73.933294
2,"Bensonhurst, Brooklyn‎",40.60482,-73.99528
3,Boerum Hill‎,40.68943,-73.98801
4,"Borough Park, Brooklyn‎",40.63882,-73.98912
5,Brighton Beach‎,40.57457,-73.95343
6,Brooklyn Heights‎,40.69535,-73.99405
7,Brooklyn Navy Yard‎,40.705177,-73.971624
8,"Brownsville, Brooklyn‎",40.671134,-73.913476
9,"Bushwick, Brooklyn‎",40.713488,-73.941454


## Queens

In [14]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Queens,_New_York").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
qn = pd.DataFrame({"Neighborhood": neighborhoodList})


In [15]:
qn['Neighborhood'] = qn['Neighborhood'].astype(str).str[1:]

sep = '('
qn['Neighborhood'] = qn['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
qn['Neighborhood'] = qn['Neighborhood'].str.strip()

In [16]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Brooklyn, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in qn["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
qn['Latitude'] = df_coords['Latitude']
qn['Longitude'] = df_coords['Longitude']

In [17]:
# save the DataFrame as CSV file
qn.to_csv("qn_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(qn.shape)
qn

(29, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Astoria, Queens‎",40.77205,-73.92606
1,"Bayside, Queens‎",40.77731,-73.78068
2,"Corona, Queens‎",40.74637,-73.85483
3,"Douglaston–Little Neck, Queens‎",40.768561,-73.747657
4,"East Elmhurst, Queens‎",40.76439,-73.87402
5,"Elmhurst, Queens‎",40.74361,-73.88433
6,"Floral Park, New York‎",40.664506,-73.979842
7,"Flushing, Queens‎",40.706477,-73.925219
8,"Forest Hills, Queens‎",40.72266,-73.84791
9,"Fresh Meadows, Queens‎",40.74162,-73.78299


## Staten Island

In [22]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Staten_Island").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
si = pd.DataFrame({"Neighborhood": neighborhoodList})


In [19]:
si['Neighborhood'] = si['Neighborhood'].astype(str).str[1:]

sep = '('
si['Neighborhood'] = si['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
si['Neighborhood'] = si['Neighborhood'].str.strip()

In [23]:
si

Unnamed: 0,Neighborhood
0,List of Staten Island neighborhoods
1,"Annadale, Staten Island"
2,"Arden Heights, Staten Island"
3,"Arlington, Staten Island"
4,"Arrochar, Staten Island"
...,...
69,"West New Brighton, Staten Island"
70,"West Shore, Staten Island"
71,"Westerleigh, Staten Island"
72,"Willowbrook, Staten Island"


In [20]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Brooklyn, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in si["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
si['Latitude'] = df_coords['Latitude']
si['Longitude'] = df_coords['Longitude']

In [21]:
# save the DataFrame as CSV file
si.to_csv("si_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(si.shape)
si

(74, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,ist of Staten Island neighborhoods,40.642420,-74.075270
1,"nnadale, Staten Island",40.690475,-73.992467
2,"rden Heights, Staten Island",40.554140,-74.173910
3,"rlington, Staten Island",40.690475,-73.992467
4,"rrochar, Staten Island",40.690475,-73.992467
...,...,...,...
69,"est New Brighton, Staten Island",40.642070,-74.093360
70,"est Shore, Staten Island",40.690475,-73.992467
71,"esterleigh, Staten Island",40.621020,-74.132050
72,"illowbrook, Staten Island",40.603170,-74.139050


In [None]:
# get the coordinates of New York City
address = 'Bronx, New York'

geolocator = Nominatim(user_agent="coursera-capstone-project")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bronx, New York {}, {}.'.format(latitude, longitude))

In [None]:
# create map of Toronto using latitude and longitude values
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(bx['Latitude'], bx['Longitude'], bx['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ny)  
    
map_ny