# Final Capstone Project

This notebook is going to be the final notebook for the capstone project. 

## Importing appropriate packages

In [18]:
# standard packages
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# packages for clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as hc
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

# preprocessing packages
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import folium # map rendering library

print('All packages are imported.')

All packages are imported.


## Webscrape a webpage for each of the zipcodes of New York City

#### Since New York is divided by borough, each their own county, they need to be webscraped individually. 

In [30]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_the_Bronx").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
bx = pd.DataFrame({"Neighborhood": neighborhoodList})


bx['Neighborhood'] = bx['Neighborhood'].astype(str).str[1:]


sep = '('
bx['Neighborhood'] = bx['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
bx['Neighborhood'] = bx['Neighborhood'].str.strip()

In [31]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Bronx, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in bx["Neighborhood"].tolist() ]

In [32]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
bx['Latitude'] = df_coords['Latitude']
bx['Longitude'] = df_coords['Longitude']

In [33]:
# save the DataFrame as CSV file
bx.to_csv("bx_neighborhoods.csv", index=False)

# check the neighborhoods and the coordinates
print(bx.shape)
bx

(31, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Bedford Park, Bronx‎",40.86999,-73.88573
1,"Belmont, Bronx‎",40.85546,-73.88569
2,"City Island, Bronx‎",40.85564,-73.79171
3,"Co-op City, Bronx‎",40.879939,-73.82317
4,"Concourse, Bronx‎",40.82763,-73.92533
5,"Eastchester, Bronx‎",40.88811,-73.82835
6,"Fordham, Bronx‎",40.85894,-73.89885
7,"Highbridge, Bronx‎",40.842117,-73.9282
8,"Hunts Point, Bronx‎",40.81242,-73.8845
9,"Jerome Park, Bronx‎",40.86547,-73.89896


## Manhattan

In [34]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Manhattan").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
mh = pd.DataFrame({"Neighborhood": neighborhoodList})


mh['Neighborhood'] = mh['Neighborhood'].astype(str).str[1:]


sep = '('
mh['Neighborhood'] = mh['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
mh['Neighborhood'] = mh['Neighborhood'].str.strip()

In [35]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Manhattan, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in mh["Neighborhood"].tolist() ]

In [36]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
mh['Latitude'] = df_coords['Latitude']
mh['Longitude'] = df_coords['Longitude']

# save the DataFrame as CSV file
mh.to_csv("mh_neighborhoods.csv", index=False)

# check the neighborhoods and the coordinates
print(mh.shape)
mh

(44, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Battery Park City‎,40.71131,-74.0159
1,Bowery‎,40.72328,-73.992893
2,"Chelsea, Manhattan‎",40.7461,-74.00045
3,"Civic Center, Manhattan‎",40.71337,-74.0038
4,Columbus Circle‎,40.76573,-73.98338
5,East Harlem‎,40.79828,-73.94081
6,"East Village, Manhattan‎",40.72804,-73.98499
7,"Financial District, Manhattan‎",40.70826,-74.0141
8,"Five Points, Manhattan‎",45.858578,-111.328778
9,Flatiron District‎,40.73942,-73.99035


## Brooklyn

In [37]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Brooklyn").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
bk = pd.DataFrame({"Neighborhood": neighborhoodList})


bk['Neighborhood'] = bk['Neighborhood'].astype(str).str[1:]


sep = '('
bk['Neighborhood'] = bk['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
bk['Neighborhood'] = bk['Neighborhood'].str.strip()

In [38]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Brooklyn, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in bk["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
bk['Latitude'] = df_coords['Latitude']
bk['Longitude'] = df_coords['Longitude']

In [39]:
bk = bk.drop(32,axis = 0)

In [40]:
# save the DataFrame as CSV file
bk.to_csv("bk_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(bk.shape)
bk

(39, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Bay Ridge, Brooklyn‎",40.6196,-74.027569
1,"Bedford–Stuyvesant, Brooklyn‎",40.681945,-73.933294
2,"Bensonhurst, Brooklyn‎",40.60482,-73.99528
3,Boerum Hill‎,40.68943,-73.98801
4,"Borough Park, Brooklyn‎",40.63882,-73.98912
5,Brighton Beach‎,40.57457,-73.95343
6,Brooklyn Heights‎,40.69535,-73.99405
7,Brooklyn Navy Yard‎,40.705177,-73.971624
8,"Brownsville, Brooklyn‎",40.671134,-73.913476
9,"Bushwick, Brooklyn‎",40.713488,-73.941454


## Queens

In [11]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Queens,_New_York").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
qn = pd.DataFrame({"Neighborhood": neighborhoodList})


In [12]:
qn['Neighborhood'] = qn['Neighborhood'].astype(str).str[1:]

sep = '('
qn['Neighborhood'] = qn['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
qn['Neighborhood'] = qn['Neighborhood'].str.strip()

In [13]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Brooklyn, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in qn["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
qn['Latitude'] = df_coords['Latitude']
qn['Longitude'] = df_coords['Longitude']

In [14]:
qn = qn.drop([20,22],axis = 0)


In [15]:
# save the DataFrame as CSV file
qn.to_csv("qn_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(qn.shape)
qn

(27, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Astoria, Queens‎",40.77205,-73.92606
1,"Bayside, Queens‎",40.77731,-73.78068
2,"Corona, Queens‎",40.74637,-73.85483
3,"Douglaston–Little Neck, Queens‎",40.768561,-73.747657
4,"East Elmhurst, Queens‎",40.76439,-73.87402
5,"Elmhurst, Queens‎",40.74361,-73.88433
6,"Floral Park, New York‎",40.664506,-73.979842
7,"Flushing, Queens‎",40.706477,-73.925219
8,"Forest Hills, Queens‎",40.72266,-73.84791
9,"Fresh Meadows, Queens‎",40.74162,-73.78299


## Staten Island

In [67]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Staten_Island").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
si = pd.DataFrame({"Neighborhood": neighborhoodList})


In [68]:
si = si.drop([0],axis = 0)

In [69]:
si

Unnamed: 0,Neighborhood
1,"Annadale, Staten Island"
2,"Arden Heights, Staten Island"
3,"Arlington, Staten Island"
4,"Arrochar, Staten Island"
5,Aspen Knolls
6,"Bay Terrace, Staten Island"
7,"Bloomfield, Staten Island"
8,"Brighton Heights, Staten Island"
9,"Bulls Head, Staten Island"
10,"Castleton Corners, Staten Island"


In [70]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Staten Island, New York'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in si["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
si['Latitude'] = df_coords['Latitude']
si['Longitude'] = df_coords['Longitude']

In [71]:
si

Unnamed: 0,Neighborhood,Latitude,Longitude
1,"Annadale, Staten Island",40.559891,-74.198791
2,"Arden Heights, Staten Island",40.642395,-74.11082
3,"Arlington, Staten Island",40.64242,-74.07527
4,"Arrochar, Staten Island",40.562344,-74.184025
5,Aspen Knolls,40.554536,-74.135866
6,"Bay Terrace, Staten Island",40.610592,-74.179655
7,"Bloomfield, Staten Island",40.504032,-74.243297
8,"Brighton Heights, Staten Island",40.64242,-74.07527
9,"Bulls Head, Staten Island",40.62122,-74.12915
10,"Castleton Corners, Staten Island",40.549418,-74.216839


In [72]:
# save the DataFrame as CSV file
si.to_csv("si_neighborhoods.csv", index=False)


# check the neighborhoods and the coordinates
print(si.shape)
si

(73, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
1,"Annadale, Staten Island",40.559891,-74.198791
2,"Arden Heights, Staten Island",40.642395,-74.11082
3,"Arlington, Staten Island",40.64242,-74.07527
4,"Arrochar, Staten Island",40.562344,-74.184025
5,Aspen Knolls,40.554536,-74.135866
6,"Bay Terrace, Staten Island",40.610592,-74.179655
7,"Bloomfield, Staten Island",40.504032,-74.243297
8,"Brighton Heights, Staten Island",40.64242,-74.07527
9,"Bulls Head, Staten Island",40.62122,-74.12915
10,"Castleton Corners, Staten Island",40.549418,-74.216839


In [87]:
ny = pd.concat([bx,mh,bk,qn,si], ignore_index = True)

In [88]:
ny

Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Bedford Park, Bronx‎",40.86999,-73.88573
1,"Belmont, Bronx‎",40.85546,-73.88569
2,"City Island, Bronx‎",40.85564,-73.79171
3,"Co-op City, Bronx‎",40.879939,-73.82317
4,"Concourse, Bronx‎",40.82763,-73.92533
5,"Eastchester, Bronx‎",40.88811,-73.82835
6,"Fordham, Bronx‎",40.85894,-73.89885
7,"Highbridge, Bronx‎",40.842117,-73.9282
8,"Hunts Point, Bronx‎",40.81242,-73.8845
9,"Jerome Park, Bronx‎",40.86547,-73.89896


In [89]:
ny = ny.drop([39,47,213],axis= 0)

In [90]:
ny.to_csv('ny_neighborhood.csv',index=False)


## Map the points using Folium

In [91]:
# get the coordinates of New York City
address = 'New York, New York'

geolocator = Nominatim(user_agent="coursera-capstone-project")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York, New York {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York, New York 40.7127281, -74.0060152.


In [93]:
# create map of Toronto using latitude and longitude values
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(ny['Latitude'], ny['Longitude'], ny['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ny)  
    
map_ny

## Do the Same with Los Angeles

#### This should be faster because the Wikipedia page isn't split by county here

In [97]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Los_Angeles").text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
la = pd.DataFrame({"Neighborhood": neighborhoodList})


la['Neighborhood'] = la['Neighborhood'].astype(str).str[1:]


sep = '('
la['Neighborhood'] = la['Neighborhood'].apply(lambda x: x.split(sep,1)[0])
la['Neighborhood'] = la['Neighborhood'].str.strip()

In [98]:
la

Unnamed: 0,Neighborhood
0,Historic districts in Los Angeles‎
1,People by Los Angeles district or neighborhood‎
2,Los Angeles Historic Preservation Overlay Zones‎
3,Central Los Angeles‎
4,Districts of Downtown Los Angeles‎
5,Eastside Los Angeles‎
6,Northeast Los Angeles‎
7,Northwest Los Angeles‎
8,San Fernando Valley‎
9,South Los Angeles‎


In [100]:
la.drop([0,1,2],axis=0,inplace=True)

In [101]:
la

Unnamed: 0,Neighborhood
3,Central Los Angeles‎
4,Districts of Downtown Los Angeles‎
5,Eastside Los Angeles‎
6,Northeast Los Angeles‎
7,Northwest Los Angeles‎
8,San Fernando Valley‎
9,South Los Angeles‎
10,Westside
11,"Arleta, Los Angeles‎"
12,"Atwater Village, Los Angeles‎"


In [105]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Los Angeles, California'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in la["Neighborhood"].tolist() ]

# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

# merge the coordinates into the original dataframe
la['Latitude'] = df_coords['Latitude']
la['Longitude'] = df_coords['Longitude']

In [109]:
la

Unnamed: 0,Neighborhood,Latitude,Longitude
3,Central Los Angeles‎,33.96011,-118.26919
4,Districts of Downtown Los Angeles‎,33.92068,-118.27298
5,Eastside Los Angeles‎,34.18195,-118.35952
6,Northeast Los Angeles‎,33.97458,-118.30042
7,Northwest Los Angeles‎,33.8044,-118.21453
8,San Fernando Valley‎,34.24905,-118.43349
9,South Los Angeles‎,34.1197,-118.25887
10,Westside,34.02157,-118.36765
11,"Arleta, Los Angeles‎",34.08361,-118.43483
12,"Atwater Village, Los Angeles‎",34.096533,-118.403295


In [110]:
la.drop([90,91,92],axis = 0, inplace = True)

In [111]:
la.to_csv('la_neighborhoods.csv',index=False)

In [112]:
# get the coordinates of New York City
address = 'Los Angeles, California'

geolocator = Nominatim(user_agent="coursera-capstone-project")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Los Angeles, California {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Los Angeles, California 34.0536909, -118.242766.


In [113]:
# create map of Toronto using latitude and longitude values
map_la = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(la['Latitude'], la['Longitude'], la['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_la)  
    
map_la