## --- Start of Question 1 ---

### Import table from wiki website

In [1]:
import pandas as pd

table = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

df_table = table[0]
df_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Size of table

In [2]:
#df_table.shape
df_table.dtypes

Postal code     object
Borough         object
Neighborhood    object
dtype: object

### Drop the rows with 'Not assigned' in columns 'Borough'

In [3]:
df_table.drop(df_table[df_table['Borough']=='Not assigned'].index, inplace=True)
df_table.reset_index(inplace=True)
df_table.drop('index', axis=1, inplace=True)
df_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Size of table (after processed)

In [4]:
df_table.shape

(103, 3)

### Combine those with same 'Postal code' with 'Neighborhood' separated by commas

In [5]:
df_table["Neighborhood"]= df_table["Neighborhood"].str.split("/") 
df_table["Neighborhood"] = df_table["Neighborhood"].str.join(",") 
df_table.groupby(['Postal code'], as_index = False).agg({'Neighborhood': ', '.join})

df_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### Assign 'Neighborhood' same as 'Borough' if it's not assigned

In [6]:
df_table['Neighborhood'] = df_table[['Neighborhood']].fillna('No value')
for x in range(df_table.shape[0]):
    if(df_table['Neighborhood'][x]=='No value'): #  np.nan):
        df_table['Neighborhood'][x]=df_table['Borough'][x]

df_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### Number of rows in the dataframe

In [7]:
df_table.shape

(103, 3)

### === End of Question 1 ===

## --- Start of Question 2 ---

### Import Geospatial data

In [8]:
geospatial = pd.read_csv("http://cocl.us/Geospatial_data")
geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge dataframe with Latitude, Longitude

In [9]:
geospatial.rename(columns={"Postal Code":"Postal code"}, inplace=True)
df_newtable = pd.merge(df_table, geospatial, on='Postal code')
df_newtable.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### ===  End of Question 2 ===

## --- Start of Question 3 ---

In [10]:
toronto_data = df_newtable[df_newtable['Borough'].str.contains('Toronto')].reset_index(drop=True)

toronto_data

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond , Adelaide , King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669005,-79.442259


In [11]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

address = 'Downtown Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


In [12]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [13]:
toronto_onehot = pd.get_dummies(toronto_data['Borough'], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_data['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,"Regent Park , Harbourfront",0,1,0,0
1,"Queen's Park , Ontario Provincial Government",0,1,0,0
2,"Garden District, Ryerson",0,1,0,0
3,St. James Town,0,1,0,0
4,The Beaches,0,0,1,0


In [14]:
toronto_onehot.shape

(39, 5)

In [15]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,Berczy Park,0,1,0,0
1,"Brockton , Parkdale Village , Exhibition Place",0,0,0,1
2,Business reply mail Processing CentrE,0,0,1,0
3,"CN Tower , King and Spadina , Railway Lands , ...",0,1,0,0
4,Central Bay Street,0,1,0,0
5,Christie,0,1,0,0
6,Church and Wellesley,0,1,0,0
7,"Commerce Court , Victoria Hotel",0,1,0,0
8,Davisville,1,0,0,0
9,Davisville North,1,0,0,0


In [16]:
toronto_grouped.shape

(39, 5)

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [18]:
import numpy as np

num_top_venues = 4

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,Berczy Park,Downtown Toronto,West Toronto,East Toronto,Central Toronto
1,"Brockton , Parkdale Village , Exhibition Place",West Toronto,East Toronto,Downtown Toronto,Central Toronto
2,Business reply mail Processing CentrE,East Toronto,West Toronto,Downtown Toronto,Central Toronto
3,"CN Tower , King and Spadina , Railway Lands , ...",Downtown Toronto,West Toronto,East Toronto,Central Toronto
4,Central Bay Street,Downtown Toronto,West Toronto,East Toronto,Central Toronto


In [19]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

  return_n_iter=True)


array([4, 2, 3, 4, 4, 4, 4, 4, 1, 1], dtype=int32)

In [20]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,4,Downtown Toronto,West Toronto,East Toronto,Central Toronto
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,4,Downtown Toronto,West Toronto,East Toronto,Central Toronto
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,4,Downtown Toronto,West Toronto,East Toronto,Central Toronto
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,4,Downtown Toronto,West Toronto,East Toronto,Central Toronto
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,East Toronto,West Toronto,Downtown Toronto,Central Toronto


In [21]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### ===  End of Question 3 ===