**Capstone Project – The Battle of Neighborhoods | Finding a Better Place in North London**

**1. Installing and Importing Python Libraries and Dependencies**

In [1]:
!pip install geocoder
!pip install folium

In [2]:
import pandas as pd
import requests
import numpy as np
import geocoder
import folium
import requests 
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
import xml
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print("All Required Libraries Imported!")

**2. Data Extraction and Cleaning**

Using BeautifulSoup to scrape the list of postal codes from the Wikipedia page below. Link: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
# Download the Wikipedia page and parse it with BeautifulSoup.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate, readable
# failure instead of a confusing parse error further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
extracting_data = response.text
wiki_data = BeautifulSoup(extracting_data, 'lxml')

Converting the content of the postal-code HTML table into a DataFrame

In [4]:
# Turn the first table of the article body into a DataFrame with one row per
# table row.
column_names = ['Postalcode','Borough','Neighborhood']

content = wiki_data.find('div', class_='mw-parser-output')
table = content.table.tbody

# Collect plain dicts and build the frame once at the end: growing a
# DataFrame row by row is quadratic, and DataFrame.append no longer exists in
# pandas 2.x.
records = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    # Rows without three <td> cells (e.g. the header row, which only has <th>
    # cells) carry no complete record, so skip them instead of re-using the
    # values left over from the previous row.
    if len(cells) < 3:
        continue
    records.append({
        # Cell text ends with a newline; strip it so later comparisons such
        # as Borough != 'Not assigned' and the geocoding queries see clean
        # values.
        'Postalcode': cells[0].text.strip(),
        'Borough': cells[1].text.strip(),
        'Neighborhood': cells[2].text.strip().replace(']', ''),
    })

Ontario = pd.DataFrame(records, columns=column_names)

In [5]:
# Clean the scraped table.
# Drop rows whose borough is the 'Not assigned' placeholder, and rows whose
# borough is still the integer 0 used as the initial value while scraping.
Ontario = Ontario[Ontario.Borough != 'Not assigned']
Ontario = Ontario[Ontario.Borough != 0]
Ontario = Ontario.reset_index(drop=True)

# Where the neighbourhood is unassigned, fall back to the borough name.
# A single .loc assignment is used because the chained form
# Ontario.iloc[i][2] = ... may write to a temporary copy and leave the frame
# unchanged.
unassigned = Ontario['Neighborhood'] == 'Not assigned'
Ontario.loc[unassigned, 'Neighborhood'] = Ontario.loc[unassigned, 'Borough']

In [6]:
# Collapse rows sharing a postal code and borough into one row whose
# Neighborhood value is the comma-separated list of their names.
neighborhood_groups = Ontario.groupby(['Postalcode', 'Borough'])['Neighborhood']
df = neighborhood_groups.apply(', '.join).reset_index()
df.head()

In [7]:
# Summary statistics of the grouped postal-code table.
df.describe()

In [8]:
# Discard rows with missing values, then rows where any of the three columns
# still holds the 'Not assigned' placeholder.
empty = 'Not assigned'
df = df.dropna()
is_assigned = df.Postalcode.ne(empty) & df.Borough.ne(empty) & df.Neighborhood.ne(empty)
df = df[is_assigned]

In [9]:
# Preview the filtered table.
df.head()

In [10]:
def neighborhood_list(grouped):
    """Return the group's 'Neighborhood' values, sorted and joined with ', '."""
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# Group by postal code and borough, then rebuild a flat table whose
# Neighborhood column is the string produced by neighborhood_list for each
# group.
grp = df.groupby(['Postalcode', 'Borough'])
df_2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [11]:
# Summary statistics of the regrouped table.
df_2.describe()

In [12]:
# Report the (rows, columns) size of the table, then preview it.
print(df_2.shape)
df_2.head()

In [13]:
def get_latilong(postal_code, max_attempts=10, pause_seconds=1.0):
    """Geocode a postal code with the ArcGIS provider of ``geocoder``.

    The query sent is '<postal_code>, Toronto, Ontario'. The lookup is
    retried while the provider returns no coordinates, but only up to
    ``max_attempts`` times, so an unresolvable code or an unreachable service
    cannot hang the notebook forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up.
    max_attempts : int, optional
        Maximum number of lookups before giving up.
    pause_seconds : float, optional
        Delay between two consecutive attempts.

    Returns
    -------
    The ``latlng`` value reported by the geocoder (latitude, longitude).

    Raises
    ------
    RuntimeError
        If no attempt produced coordinates.
    """
    import time

    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempt in range(1, max_attempts + 1):
        lati_long_coords = geocoder.arcgis(query).latlng
        if lati_long_coords is not None:
            return lati_long_coords
        # Back off briefly before retrying, except after the final attempt.
        if attempt < max_attempts:
            time.sleep(pause_seconds)
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts))
    
# Try the helper on a single postal code.
get_latilong('M4G')

In [14]:
# Geocode every postal code, keeping the order of the table rows.
postal_codes = df_2['Postalcode']
coords = list(map(get_latilong, postal_codes.tolist()))

In [15]:
# Copy the geocoded coordinates into df_2 as Latitude and Longitude columns.
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
for coord_column in ('Latitude', 'Longitude'):
    df_2[coord_column] = df_coords[coord_column]

In [16]:
# Spot-check the row for postal code M5G.
df_2[df_2.Postalcode == 'M5G']

In [17]:
# Preview the first ten rows, now including the coordinate columns.
df_2.head(10)

In [18]:
# Geocode the address used as the map centre.
address = 'Guelph,Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a clear
# message rather than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude_x = location.latitude
longitude_y = location.longitude
# Report the address that was actually looked up (the message previously named
# an unrelated city).
print('The Geograpical Co-ordinate of {} are {}, {}.'.format(address, latitude_x, longitude_y))

**3. Map of Guelph**

In [19]:
# Base map centred on the geocoded address.
map_Guelph = folium.Map(location=[latitude_x, longitude_y], zoom_start=10)

# Appearance shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per table row; the popup shows the neighbourhood text.
for lat, lng, nei in zip(df_2['Latitude'], df_2['Longitude'], df_2['Neighborhood']):
    label = folium.Popup('{}'.format(nei), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_Guelph)

map_Guelph

In [20]:
# Geocode the address whose surroundings are explored with Foursquare.
address = 'Guelph,Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a clear
# message rather than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude_n1 = location.latitude
longitude_n1 = location.longitude
# Print the coordinates computed in this cell (the message previously printed
# latitude_x / longitude_y from an earlier cell).
print('The Geograpical Co-ordinate of Neighborhood_1 are {}, {}.'.format(latitude_n1, longitude_n1))

In [21]:
# @hidden_cell
import os

# Foursquare credentials are read from the environment so they never appear
# in the notebook source or its saved output.
# SECURITY: an earlier revision of this cell embedded a real client ID and
# secret; those keys should be treated as compromised and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')          # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare Secret
VERSION = '20180604'
LIMIT = 30

if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before calling the Foursquare API.')

print('Your credentails:')
print('CLIENT_ID: '+CLIENT_ID)
# Show only the length of the secret, never its value.
print('CLIENT_SECRET: ' + '*' * len(CLIENT_SECRET))

In [22]:
# Ask Foursquare for venues within `radius` of the geocoded point.
radius = 700
# NOTE: this rebinds the LIMIT set in the credentials cell.
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    latitude_n1,
    longitude_n1,
    radius,
    LIMIT)
# A timeout keeps a stalled connection from blocking the notebook, and
# raise_for_status() reports a rejected request (bad credentials, quota, ...)
# here instead of as a KeyError when the response is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [23]:
# Flatten the nested venue records into a table with dotted column names.
venues = results['response']['groups'][0]['items']
# pd.json_normalize is the supported entry point; importing json_normalize
# from pandas.io.json is deprecated.
nearby_venues = pd.json_normalize(venues)
nearby_venues.columns

In [24]:
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from ``row['categories']`` or, when that key is
    absent, from ``row['venue.categories']``.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

**4. Nearby Venues/Locations**

In [25]:
# Keep only the venue name, category list and coordinates.
filtered_columns = [
    'venue.name',
    'venue.categories',
    'venue.location.lat',
    'venue.location.lng',
]
nearby_venues = nearby_venues[filtered_columns]
nearby_venues.head()

**5. Categories of Nearby Venues/Locations**

In [26]:
# Replace each venue's category list with the name of its first category.
category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = category_names

# Shorten the dotted column names to their last component.
nearby_venues.columns = [name.rsplit(".", 1)[-1] for name in nearby_venues.columns]

nearby_venues.head(5)

In [27]:
# The ten most frequent venue categories.
a = pd.Series(nearby_venues['categories'])
a.value_counts().head(10)

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=700):
    """Query Foursquare's explore endpoint around each location.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore.
    radius : int, optional
        Search radius passed to the API.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # Bound the wait and surface HTTP errors instead of failing later with
        # a KeyError on the response body.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()["response"].get('groups') or []
        venue_results = groups[0]['items'] if groups else []

        for v in venue_results:
            venue = v['venue']
            # A venue may have no category; record None rather than raising
            # IndexError on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps the result well-formed
    # when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [29]:
# Collect the venues around every row of df_2.
Guelph_venues = getNearbyVenues(df_2['Neighborhood'], df_2['Latitude'], df_2['Longitude'])

In [30]:
# Report how many distinct venue categories were returned, then show the
# per-neighbourhood row counts for the first few neighbourhoods.
print('There are {} Uniques Categories.'.format(len(Guelph_venues['Venue Category'].unique())))
Guelph_venues.groupby('Neighborhood').count().head()

**One Hot Encoding of Features**

In [31]:
# One-hot encode the venue categories (one indicator column per category).
Guelph_onehot = pd.get_dummies(Guelph_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category can itself be called 'Neighborhood'. Rename that indicator
# so the label column added below does not silently overwrite it.
if 'Neighborhood' in Guelph_onehot.columns:
    Guelph_onehot = Guelph_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Put the neighbourhood label in the first column. insert() places it
# explicitly, so this no longer relies on the label being the last column.
Guelph_onehot.insert(0, 'Neighborhood', Guelph_venues['Neighborhood'])

# Mean of the indicators per neighbourhood = share of venues in each category.
Guelph_grouped = Guelph_onehot.groupby('Neighborhood').mean().reset_index()
Guelph_onehot.head(5)

In [32]:
# Print, for every neighbourhood, its most frequent venue categories.
num_top_venues = 5
for hood in Guelph_grouped['Neighborhood']:
    print("---- "+hood+" ----")
    # Transpose the neighbourhood's row so that each category becomes a row.
    temp =Guelph_grouped[Guelph_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the first row of the transposed table before converting to float.
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # Highest frequency first, limited to num_top_venues rows.
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [33]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped; the rest are ranked from largest to
    smallest value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

**Most Common venues near neighborhood**

In [34]:
import numpy as np
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank takes 'th'.
indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood', '1st Most Common Venue', '2nd ...', ...
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Start from an empty table and copy in the neighbourhood names.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Guelph_grouped['Neighborhood']

# Fill each row with that neighbourhood's top categories, best first.
for row_number in range(Guelph_grouped.shape[0]):
    grouped_row = Guelph_grouped.iloc[row_number, :]
    neighborhoods_venues_sorted.iloc[row_number, 1:] = return_most_common_venues(grouped_row, num_top_venues)

neighborhoods_venues_sorted.head()

**K-Means Clustering Approach**

In [35]:
# Using K-Means to cluster neighborhood into 3 clusters
# The axis is selected by keyword: the positional form drop('Neighborhood', 1)
# is no longer accepted by pandas 2.x.
Guelph_grouped_clustering = Guelph_grouped.drop(columns='Neighborhood')
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(Guelph_grouped_clustering)
kmeans.labels_

In [36]:
# Attach the cluster label of every neighbourhood as the first column.
# insert() raises if the column already exists, so on a re-run of this cell
# the existing column is overwritten instead.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# NOTE(review): only the first 16 rows of df_2 are kept; the reason for this
# cut-off is not visible in the notebook - confirm it is intended.
Guelph_merged = df_2.iloc[:16, :]

# Add the cluster label and top-venue columns to each row, matched on the
# neighbourhood name.
Guelph_merged = Guelph_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Guelph_merged.head()  # check the last columns!

**Map of Clusters**

In [37]:
# Number of clusters, taken from the fitted model so it cannot drift from the
# n_clusters value actually used for K-Means (it was hard-coded to 10 here
# while the model was fitted with 3).
kclusters = kmeans.n_clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude_x, longitude_y], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster. The palette is stored in `rainbow` so the imported
# `matplotlib.colors` module (bound to `colors`) is not shadowed.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Guelph_merged['Latitude'], Guelph_merged['Longitude'], Guelph_merged['Neighborhood'], Guelph_merged['Cluster Labels']):
    # Rows that found no match in the join have no cluster label; skip them.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    # Colour by cluster label. Indexing a fixed colour list by row position
    # ran past the end of the list and did not reflect cluster membership.
    marker_color = rainbow[cluster % len(rainbow)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
    markers_colors.append(marker_color)

map_clusters

**6. Examining Clusters**

**Cluster 1**

In [39]:
# Rows whose cluster label is 0, showing column 0 and columns 4 onward.
Guelph_merged.loc[Guelph_merged['Cluster Labels'] == 0, Guelph_merged.columns[[0] + list(range(4, Guelph_merged.shape[1]))]]

**Cluster 2**

In [40]:
# Rows whose cluster label is 1, showing column 0 and columns 4 onward.
Guelph_merged.loc[Guelph_merged['Cluster Labels'] == 1, Guelph_merged.columns[[0] + list(range(4, Guelph_merged.shape[1]))]]

**Cluster 3**

In [41]:
# Rows whose cluster label is 2, showing column 0 and columns 4 onward.
Guelph_merged.loc[Guelph_merged['Cluster Labels'] == 2, Guelph_merged.columns[[0] + list(range(4, Guelph_merged.shape[1]))]]

**School Rating by Clusters**

In [43]:
# One frame per cluster label (0, 1, 2), each keeping column 2 and columns 5
# onward of the merged table.
cluster_view_columns = Guelph_merged.columns[[2] + list(range(5, Guelph_merged.shape[1]))]
df1, df2, df3 = (
    Guelph_merged.loc[Guelph_merged['Cluster Labels'] == cluster_label, cluster_view_columns]
    for cluster_label in (0, 1, 2)
)

In [44]:
# Side-by-side table of the neighbourhoods in each cluster, aligned on the
# original row index.
clusters = pd.DataFrame({"Cluster1": df1["Neighborhood"],
                         "Cluster2": df2["Neighborhood"],
                         # The third column was mislabelled "Cluster4"; it
                         # holds the neighbourhoods of the third cluster.
                         "Cluster3": df3["Neighborhood"]})
# A neighbourhood belongs to one cluster only, so the other cells of its row
# are missing; show them as empty strings.
clusters = clusters.fillna('')

In [45]:
# Copy of the merged table indexed by neighbourhood name.
new_Guelph=Guelph_merged.set_index("Neighborhood",drop=True)
#Source:https://www.greatschools.org
# Hand-entered school ratings paired with the Neighborhood column of `df`.
# NOTE(review): the list holds exactly 20 values, so this presumably requires
# `df` to have exactly 20 rows in the same order - verify against the data;
# also confirm that `df` (rather than df_2 or Guelph_merged) is the intended
# source of the names.
Guelph_school_ratings=pd.DataFrame({"Neighborhood":df["Neighborhood"],
                                      "Top School Rating":[7,9,5,8,10,10,7,10,1,2,1,2,7,2,3,2,6,
                                                           5,4,8
                                                         ]})

In [46]:
# Index the ratings table by neighbourhood name.
Guelph_school_ratings = Guelph_school_ratings.set_index('Neighborhood', drop=True)

In [47]:
# Bar chart of the top school rating per neighbourhood. A title and axis
# labels are set so the figure can be read on its own.
ax = Guelph_school_ratings.plot(kind='bar',figsize=(16,10),color='green',alpha=0.75)
ax.set_title('Top School Rating by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Top School Rating');