# HELLO CAPSTONE PROJECT COURSE!

This notebook is created in the scope of "Data Science Professional Certificate" course track provided by IBM in Coursera. Aim of the project is to use the things taught throughout the project to come up with a creative analysis called **"The Battle of Neighborhoods"**. 

So what we will do is given a city like the City of Toronto, we will segment it into different neighborhoods using the geographical coordinates of the center of each neighborhood, and then using a combination of location data and machine learning, we will group the neighbourhoods into clusters.

*__Let's dive into it!__*

# Week 3 - Part 1 : Scraping data 

In [None]:
!conda install beautifulsoup4
!conda install lxml
!conda install requests

print("Downloaded!")

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'   # this is the wikipedia page we want to scrape from

page_response = requests.get(url, timeout=5)  # here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "lxml")  #we use the lxml parser to parse the url content and store it in a variable
# print(page_content.prettify()) 


In [4]:
match=page_content.find('tbody') #finding the relevant part in html 
rows=match.find_all('tr') 
# print(rows)


In [5]:
import re   

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)

[M9Z
, Not assigned
, 
]


str

In [6]:
import pandas as pd
import numpy as np

df = pd.DataFrame(list_rows)  #turn the list into dataframe to allow operations
df.head(10)

Unnamed: 0,0
0,[]
1,"[M1A\n, Not assigned\n, \n]"
2,"[M2A\n, Not assigned\n, \n]"
3,"[M3A\n, North York\n, Parkwoods\n]"
4,"[M4A\n, North York\n, Victoria Village\n]"
5,"[M5A\n, Downtown Toronto\n, Regent Park / Harb..."
6,"[M6A\n, North York\n, Lawrence Manor / Lawrenc..."
7,"[M7A\n, Downtown Toronto\n, Queen's Park / Ont..."
8,"[M8A\n, Not assigned\n, \n]"
9,"[M9A\n, Etobicoke\n, Islington Avenue\n]"


In [7]:
df1 = df[0].str.split(',', expand=True) #expanding the data into columns
df1.head(10)

Unnamed: 0,0,1,2,3
0,[],,,
1,[M1A\n,Not assigned\n,\n],
2,[M2A\n,Not assigned\n,\n],
3,[M3A\n,North York\n,Parkwoods\n],
4,[M4A\n,North York\n,Victoria Village\n],
5,[M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n],
6,[M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n],
7,[M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n],
8,[M8A\n,Not assigned\n,\n],
9,[M9A\n,Etobicoke\n,Islington Avenue\n],


In [8]:
# renaming the columns

indexes=pd.Series(['PostalCode','Borough','Neighborhood']) #creating another dataframe and concatenating it to main dataframe
ind_df=pd.DataFrame([indexes]) 
df2=pd.concat([ind_df,df1], ignore_index=True) 

df2 = df2.rename(columns=df2.iloc[0]) #renaming the columns
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,nan
0,PostalCode,Borough,Neighborhood,
1,[],,,
2,[M1A\n,Not assigned\n,\n],
3,[M2A\n,Not assigned\n,\n],
4,[M3A\n,North York\n,Parkwoods\n],


In [9]:
# data wrangling (preparing data to analysis)

df2.drop([0], axis=0, inplace=True) #dropping an unnecessary row
df2.drop([1], axis=0, inplace=True) #dropping an unnecessary row
df2.drop(df2.columns[[3]], axis=1,inplace=True) #dropping an unnecessary column


# data cleaning

df2['PostalCode'] = df2['PostalCode'].str.strip('[\n]')
df2['Neighborhood'] = df2['Neighborhood'].str.strip(']')
df2['Neighborhood'] = df2['Neighborhood'].str.strip('\n')
df2['Borough'] = df2['Borough'].str.strip('\n')
df2['PostalCode'] = df2['PostalCode'].str.strip('\n')
df2.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M1A,Not assigned,
3,M2A,Not assigned,
4,M3A,North York,Parkwoods
5,M4A,North York,Victoria Village
6,M5A,Downtown Toronto,Regent Park / Harbourfront


In [10]:
df2.astype('str').dtypes #arranging data types for further processes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [11]:
# finding indexes to eliminate Not assigned values

index=df2[df2['Borough']==' Not assigned'].index 
index

Int64Index([  2,   3,   9,  12,  17,  18,  21,  26,  27,  30,  31,  35,  36,
             37,  39,  40,  44,  45,  46,  53,  54,  55,  62,  63,  64,  71,
             72,  73,  80,  81,  89,  90,  98,  99, 103, 107, 108, 112, 117,
            120, 121, 125, 126, 127, 129, 130, 133, 134, 135, 136, 138, 139,
            142, 143, 147, 148, 151, 152, 156, 157, 160, 161, 163, 164, 165,
            166, 168, 169, 172, 173, 174, 175, 176, 177, 178, 179, 181],
           dtype='int64')

In [12]:
# dropping related rows

df2.drop([2,   3,   9,  12,  17,  18,  21,  26,  27,  30,  31,  35,  36,
             37,  39,  40,  44,  45,  46,  53,  54,  55,  62,  63,  64,  71,
             72,  73,  80,  81,  89,  90,  98,  99, 103, 107, 108, 112, 117,
            120, 121, 125, 126, 127, 129, 130, 133, 134, 135, 136, 138, 139,
            142, 143, 147, 148, 151, 152, 156, 157, 160, 161, 163, 164, 165,
            166, 168, 169, 172, 173, 174, 175, 176, 177, 178, 179, 181], axis=0, inplace=True)

In [13]:
df2.shape

(103, 3)

In [14]:
df2.reset_index(drop=True, inplace=True)
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


# Week 3 - Part 2 Adding Location Data 

In [15]:
#uploading the location data

df_loc=pd.read_csv('https://cocl.us/Geospatial_data')
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
#editing the column names in order to make it same with the main one

df_loc.columns=['PostalCode', 'Latitude', 'Longitude']

#merging two dataframes
result = pd.merge(df2, df_loc, on='PostalCode')
result.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [17]:
result.shape

(103, 5)

# Week 3 - Part 3 Clustering the Neighborhoods

### Define Foursquare Credentials and Version

 For this part of the project, I decided to analyze the neighborhoods that contain the word **'York'**. My list shrinked down to the list below.

In [18]:
#shrinking the dataset

york=result[result['Borough'].str.contains('York')]   
york.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937


In [19]:
CLIENT_ID = '1DFALF2T3LISBTCHBTQJXRUEXYSPVR0DEUFHQ35XCTVPV0LF' # your Foursquare ID
CLIENT_SECRET = 'VGDXYNJXG2UGAIZPS0WSAMRSJTHFC0HE20VCJIT3LVGOB0S1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500 
LIMIT=100   #limit venue count to 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1DFALF2T3LISBTCHBTQJXRUEXYSPVR0DEUFHQ35XCTVPV0LF
CLIENT_SECRET:VGDXYNJXG2UGAIZPS0WSAMRSJTHFC0HE20VCJIT3LVGOB0S1


In [20]:
#function to get nearby venues in neighborhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
#calling all venues in york boroughs

york_venues = getNearbyVenues(names=york['Neighborhood'],
                                   latitudes=york['Latitude'],
                                   longitudes=york['Longitude']
                                  )
print(york_venues.shape)
york_venues.head()

 Parkwoods
 Victoria Village
 Lawrence Manor / Lawrence Heights
 Don Mills
 Parkview Hill / Woodbine Gardens
 Glencairn
 Don Mills
 Woodbine Heights
 Humewood-Cedarvale
 Caledonia-Fairbanks
 Leaside
 Hillcrest Village
 Bathurst Manor / Wilson Heights / Downsview North
 Thorncliffe Park
 Fairview / Henry Farm / Oriole
 Northwood Park / York University
 East Toronto
 Bayview Village
 Downsview
 York Mills / Silver Hills
 Downsview
 North Park / Maple Leaf Park / Upwood Park
 Humber Summit
 Willowdale / Newtonbrook
 Downsview
 Bedford Park / Lawrence Manor East
 Del Ray / Mount Dennis / Keelsdale and Silverthorn
 Humberlea / Emery
 Willowdale
 Downsview
 Runnymede / The Junction North
 Weston
 York Mills West
 Willowdale
(342, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [22]:
#expanding the venue categories of neighborhoods

york_exp=pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")
york_exp.head()

Unnamed: 0,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,Bar,...,Theater,Toy / Game Store,Trail,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# add neighborhood column back to dataframe
york_exp['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_exp.columns[-1]] + list(york_exp.columns[:-1])
york_exp = york_exp[fixed_columns]

york_exp.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,...,Theater,Toy / Game Store,Trail,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
york_neighbor = york_exp.groupby('Neighborhood').mean().reset_index()
york_neighbor

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,...,Theater,Toy / Game Store,Trail,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Bathurst Manor / Wilson Heights / Downsview N...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bedford Park / Lawrence Manor East,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
4,Del Ray / Mount Dennis / Keelsdale and Silver...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
5,Don Mills,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Fairview / Henry Farm / Oriole,0.0,0.0,0.014706,0.0,0.014706,0.0,0.0,0.029412,0.029412,...,0.014706,0.029412,0.0,0.0,0.014706,0.0,0.0,0.0,0.029412,0.0
9,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbor_sorted = pd.DataFrame(columns=columns)
neighbor_sorted['Neighborhood'] = york_neighbor['Neighborhood']

for ind in np.arange(york_neighbor.shape[0]):
    neighbor_sorted.iloc[ind, 1:] = return_most_common_venues(york_neighbor.iloc[ind, :], num_top_venues)

neighbor_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bathurst Manor / Wilson Heights / Downsview N...,Coffee Shop,Bank,Supermarket,Middle Eastern Restaurant,Fried Chicken Joint
1,Bayview Village,Japanese Restaurant,Chinese Restaurant,Café,Bank,Dog Run
2,Bedford Park / Lawrence Manor East,Sandwich Place,Coffee Shop,Italian Restaurant,Pizza Place,Restaurant
3,Caledonia-Fairbanks,Park,Women's Store,Market,Yoga Studio,Distribution Center
4,Del Ray / Mount Dennis / Keelsdale and Silver...,Coffee Shop,Skating Rink,Turkish Restaurant,Sandwich Place,Discount Store


# Clustering

In [27]:
#import required libraries

import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs 
%matplotlib inline

In [28]:
# set number of clusters
kclusters = 4

york_clustering = york_neighbor.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_clustering)

print(kmeans.labels_[0:10]) # check cluster labels generated for each row in the dataframe


# add clustering labels 
neighbor_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

[0 0 0 1 0 0 0 1 0 0]


In [29]:
#merge it with the first dataset to get latitude and longtitude values
york_merged = york

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = pd.merge(neighbor_sorted.set_index('Neighborhood'),york, on='Neighborhood')

york_merged

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,PostalCode,Borough,Latitude,Longitude
0,Bathurst Manor / Wilson Heights / Downsview N...,0,Coffee Shop,Bank,Supermarket,Middle Eastern Restaurant,Fried Chicken Joint,M3H,North York,43.754328,-79.442259
1,Bayview Village,0,Japanese Restaurant,Chinese Restaurant,Café,Bank,Dog Run,M2K,North York,43.786947,-79.385975
2,Bedford Park / Lawrence Manor East,0,Sandwich Place,Coffee Shop,Italian Restaurant,Pizza Place,Restaurant,M5M,North York,43.733283,-79.41975
3,Caledonia-Fairbanks,1,Park,Women's Store,Market,Yoga Studio,Distribution Center,M6E,York,43.689026,-79.453512
4,Del Ray / Mount Dennis / Keelsdale and Silver...,0,Coffee Shop,Skating Rink,Turkish Restaurant,Sandwich Place,Discount Store,M6M,York,43.691116,-79.476013
5,Don Mills,0,Restaurant,Japanese Restaurant,Coffee Shop,Gym,Beer Store,M3B,North York,43.745906,-79.352188
6,Don Mills,0,Restaurant,Japanese Restaurant,Coffee Shop,Gym,Beer Store,M3C,North York,43.7259,-79.340923
7,Downsview,0,Grocery Store,Park,Discount Store,Liquor Store,Baseball Field,M3K,North York,43.737473,-79.464763
8,Downsview,0,Grocery Store,Park,Discount Store,Liquor Store,Baseball Field,M3L,North York,43.739015,-79.506944
9,Downsview,0,Grocery Store,Park,Discount Store,Liquor Store,Baseball Field,M3M,North York,43.728496,-79.495697


# Mapping the clusters

In [30]:
# !conda install -c conda-forge folium=0.5.0 --yes 

In [31]:
#import required libraries

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [32]:
latitude=43.753259
longitude=-79.329656

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color = rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters