# <u>Part 1</u>

### Reading the Toronto FSA table from the postal codes of Canada wiki and assigning it to a dataframe

In [249]:
import pandas as pd

headings = ['Postcode', 'Borough', 'Neighbourhood']
tables_in_wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)

toronto_fsa_table = []
for each_table in tables_in_wiki:
    current_headings = each_table.columns.values[:3]
    if current_headings.tolist()==headings:
        toronto_fsa_table = each_table

print("Shape of Toronto FSA dataframe is " + str(toronto_fsa_table.shape))
toronto_fsa_table.head()

Shape of Toronto FSA dataframe is (288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Removing the cells from Toronto FSA dataframe with a Borough that is `Not assigned`

In [250]:
toronto_fsa_table = toronto_fsa_table[toronto_fsa_table['Borough'] != 'Not assigned']
print("Shape of dataframe after droppping cells with Borough = 'Not assigned' is " + str(toronto_fsa_table.shape))
toronto_fsa_table.head()

Shape of dataframe after droppping cells with Borough = 'Not assigned' is (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Combining multiple rows having same postal code area into one row with the neighborhoods separated by comma

In [251]:
unique_postal_code = set(list(toronto_fsa_table['Postcode']))
dict_of_postal_code = { i : ['',''] for i in unique_postal_code }

# Creating a dictonary (hash_map) in format {"Postalcode":["Borough","Neighbourhood_1,Neighbourhood_2"]}
for _,row in toronto_fsa_table.iterrows():
    if dict_of_postal_code[row['Postcode']][0] == '':
        dict_of_postal_code[row['Postcode']][0] = row['Borough']
    if dict_of_postal_code[row['Postcode']][1] == '':
        dict_of_postal_code[row['Postcode']][1] = row['Neighbourhood']
    else:
        dict_of_postal_code[row['Postcode']][1] += (", " + row['Neighbourhood'])

# Converting {"Postalcode":["Borough","Neighbourhood_1,Neighbourhood_2"]} 
#               to
# [["Postalcode_1","Borough_1","Neighbourhood_1,Neighbourhood_2"],
#  ["Postalcode_2","Borough_3","Neighbourhood_1,Neighbourhood_2"],]
list_of_rows = []        
for key, value in dict_of_postal_code.items():
    temp_row = [key,value[0],value[1]]
    list_of_rows.append(temp_row)

# Converting list of rows into dataframe
final_dataframe = pd.DataFrame(listOfUniquePostalCode, columns = ['Postcode','Borough','Neighbourhood']) 
print("Shape of dataframe after merging the rows with same postal code: " + str(final_dataframe.shape))
final_dataframe.head()

Shape of dataframe after merging the rows with same postal code: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M7R,Mississauga,Canada Post Gateway Processing Centre
1,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
2,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."
3,M4B,East York,"Woodbine Gardens, Parkview Hill"
4,M2R,North York,Willowdale West


## Setting neighborhood = borough, when neighborhood = `Not assigned`

In [252]:
for _,row in final_dataframe.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
        print("The Postal_Code = {} with row having neighborhood = 'Not assigned'".format(row["Postcode"]))

The Postal_Code = M7A with row having neighborhood = 'Not assigned'


## Printing the shape of final dataframe

In [253]:
final_dataframe.shape

(103, 3)

# <u>Part 2</u>

## Since the Geocoder package is unreliable, we are using the given CSV to read geographical coordinates of the neighborhoods

In [254]:
geo_coordinates_df = pd.read_csv("http://cocl.us/Geospatial_data") 
geo_coordinates_df = geo_coordinates_df.rename(columns={"Postal Code": "Postcode"})
print("The shape of the geo cordinate dataframe is " + str(geo_coordinates_df.shape))

The shape of the geo cordinate dataframe is (103, 3)


## Using the Panda dataframe merge function to merge Toronto FSA dataframe that was generated in Part 1 with the geographical coordinates of the neighborhoods

In [255]:
toronto_fsa_merged_coordinated_df = pd.merge(final_dataframe, geo_coordinates_df, on="Postcode")
print("The shape of toronto_fsa dataframe merged with geo coordinate is " + str(toronto_fsa_merged_coordinated_df.shape))
toronto_fsa_merged_coordinated_df.head()

The shape of toronto_fsa dataframe merged with geo coordinate is (103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
1,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
2,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
3,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
4,M2R,North York,Willowdale West,43.782736,-79.442259


# <u>Part 3</u>

## As suggested in instruction dropping all the rows which does not have Toronto in it

In [256]:
toronto_fsa_merged_coordinated_df = toronto_fsa_merged_coordinated_df[toronto_fsa_merged_coordinated_df.Borough.str.contains("Toronto")]
toronto_fsa_merged_coordinated_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
13,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
14,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


## Defining the functions to fetch the venues from foursquare

In [257]:
import requests;
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            'WYLLBB0B32YKF53GZ5RNYINXDIM25UHWMCNC2AYCCKS3D0PD', 
            'WMSGADHT3X2GEI30YGNGG0I03VWHQG0HNPOZM0DLXY4PJC42', 
            '20180605', 
            lat, 
            lng, 
            500, 
            10)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Neighbourhood', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [258]:
toronto_venues =  getNearbyVenues(names=toronto_fsa_merged_coordinated_df['Postcode'],
                                   latitudes=toronto_fsa_merged_coordinated_df['Latitude'],
                                   longitudes=toronto_fsa_merged_coordinated_df['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Neighbourhood,Venue Latitude,Venue Longitude,Venue Category
0,M5V,43.628947,-79.39442,Billy Bishop Toronto City Airport (YTZ) (Billy...,43.631579,-79.395605,Airport
1,M5V,43.628947,-79.39442,Porter Lounge,43.63068,-79.395756,Airport Lounge
2,M5V,43.628947,-79.39442,Toronto Harbour,43.633045,-79.396484,Harbor / Marina
3,M5V,43.628947,-79.39442,Billy Bishop Café,43.631132,-79.396139,Airport Food Court
4,M5V,43.628947,-79.39442,Air Canada Check-In Counter,43.631226,-79.395987,Airport Terminal


## Lets see how many datapoint we were able to gather per postal code

In [259]:
toronto_venues.groupby('Postal Code').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Neighbourhood,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4E,5,5,5,5,5,5
M4K,10,10,10,10,10,10
M4L,10,10,10,10,10,10
M4M,10,10,10,10,10,10
M4N,4,4,4,4,4,4
M4P,9,9,9,9,9,9
M4R,10,10,10,10,10,10
M4S,10,10,10,10,10,10
M4T,4,4,4,4,4,4
M4V,10,10,10,10,10,10


## Using one hot encoding to convert each venue category to column, this makes clustering better. The model adjusts well with this transformation.

In [260]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postal Code,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Swim School,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M5V,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5V,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5V,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5V,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5V,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Next, let's group rows by postal code and by taking the mean of the frequency of occurrence of each category

In [261]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Swim School,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now let's create the new dataframe and display the top 10 venues for each postal code.

In [262]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Pub,Health Food Store,Coffee Shop,Neighborhood,Trail,Dim Sum Restaurant,Dessert Shop,Dance Studio,Cuban Restaurant,Yoga Studio
1,M4K,Greek Restaurant,Ice Cream Shop,Yoga Studio,Brewery,Italian Restaurant,Pub,Cosmetics Shop,Creperie,Cuban Restaurant,Dance Studio
2,M4L,Italian Restaurant,Sushi Restaurant,Pub,Liquor Store,Burger Joint,Brewery,Ice Cream Shop,Fish & Chips Shop,Park,Gym
3,M4M,Comfort Food Restaurant,Chinese Restaurant,Bookstore,Ice Cream Shop,Fish Market,Cheese Shop,Café,Coffee Shop,Neighborhood,Sandwich Place
4,M4N,Park,Bus Line,Swim School,Dim Sum Restaurant,Eastern European Restaurant,Comfort Food Restaurant,Comic Shop,Concert Hall,Cosmetics Shop,Creperie


## Run *k*-means to cluster the postal code into 5 clusters.

In [263]:
from sklearn.cluster import KMeans
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 3, 3, 1, 3, 3, 1, 1, 4, 1], dtype=int32)

## Let's create a new dataframe that includes the cluster as well as the top 10 venues for each postal code.

In [264]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_fsa_merged_coordinated_df.rename(columns={"Postcode":"Postal Code"})

toronto_merged.head()
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Postal Code'), on='Postal Code')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,1,Airport Terminal,Airport Lounge,Airport,Harbor / Marina,Coffee Shop,Plane,Airport Gate,Airport Food Court,Cosmetics Shop,Creperie
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Yoga Studio,Sporting Goods Shop,Diner,Dessert Shop,Mexican Restaurant,Coffee Shop,Restaurant,Salon / Barbershop,Spa,Chinese Restaurant
8,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,1,Sushi Restaurant,Coffee Shop,Tea Room,Bookstore,Burrito Place,Fish & Chips Shop,Pub,Food,Café,Diner
13,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049,1,College Gym,Dessert Shop,Italian Restaurant,Bookstore,Beer Bar,Bar,Bakery,Restaurant,Japanese Restaurant,French Restaurant
14,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,4,Park,Playground,Gym,Restaurant,Dog Run,College Gym,Comfort Food Restaurant,Comic Shop,Concert Hall,Cosmetics Shop


## Finally, let's visualize the resulting clusters

In [267]:
# create map
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[43.7184038,-79.5181413], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postal Code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters