# Getting the table from Wikipedia

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Getting the table
# Fetch the page with a timeout and fail loudly on an HTTP error, so that an
# error page is never parsed as if it were the postal-code table.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')

# The postal-code table is taken to be the first <table> on the page.
tables = soup.find_all('table')
if not tables:
    raise ValueError("No <table> element found on the postal-code page")
table = tables[0]

# read_html is given a file-like object: passing literal HTML as a plain
# string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]
postcode = df["Postcode"].tolist()
borough = df["Borough"].tolist()
neigh = df["Neighbourhood"].tolist()

# Creating the dataframe

In [2]:
# Assemble the three scraped lists into one dataframe, one column per list
neighborhoods = pd.DataFrame(
    dict(
        zip(
            ["Postal Code", "Borough", "Neighbourhood"],
            [postcode, borough, neigh],
        )
    )
)

# Cleaning the data

In [3]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# The mask is computed on `neighborhoods` itself (not on the scraped `df`) and
# applied as a boolean filter: unlike an in-place drop by label, this does not
# depend on the two frames sharing an index and can be re-run without a KeyError.
neighborhoods = neighborhoods[neighborhoods['Borough'] != "Not assigned"]

In [4]:
# More than one neighborhood can exist in one postal code area.
# For example, in the table on the Wikipedia page,
# M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
# Such rows are combined into one row per postal code, with the distinct
# values of each column separated by a comma.
# Series.unique() keeps first-appearance order, so the joined string is the
# same on every run (iterating a set of strings is not, because string hashes
# are randomised per interpreter session).
neighborhoods1 = neighborhoods.groupby("Postal Code").agg(lambda x: ','.join(x.unique()))

# If a cell has a borough but a Not assigned neighborhood,
# then the neighborhood will be the same as the borough.
# So for the 9th cell in the table on the Wikipedia page,
# the value of the Borough and the Neighborhood columns will be Queen's Park.
unnamed = neighborhoods1['Neighbourhood'] == "Not assigned"
neighborhoods1.loc[unnamed, 'Neighbourhood'] = neighborhoods1.loc[unnamed, 'Borough']

neighborhoods1.shape



(103, 2)

In [5]:
neighborhoods1

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,West Hill,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Kennedy Park,Ionview"
M1L,Scarborough,"Clairlea,Oakridge,Golden Mile"
M1M,Scarborough,"Scarborough Village West,Cliffside,Cliffcrest"
M1N,Scarborough,"Cliffside West,Birch Cliff"


# Using the csv file 

In [6]:
# Load the table of postal codes with their latitude / longitude
geo_data = pd.read_csv(
    filepath_or_buffer="https://cocl.us/Geospatial_data",
)
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


# Creating the new dataframe

In [7]:
# Attach the coordinates by matching on the postal code instead of by row
# position. neighborhoods1 is indexed by "Postal Code", so assigning a Series
# carrying the same kind of index lets pandas align every coordinate with its
# own postal code, even if the two tables differ in order or length
# (postal codes missing from geo_data end up as NaN).
geo_by_code = geo_data.set_index('Postal Code')
neighborhoods1['Latitude'] = geo_by_code['Latitude']
neighborhoods1['Longitude'] = geo_by_code['Longitude']

neighborhoods1

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,West Hill,Morningside",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park,Kennedy Park,Ionview",43.727929,-79.262029
M1L,Scarborough,"Clairlea,Oakridge,Golden Mile",43.711112,-79.284577
M1M,Scarborough,"Scarborough Village West,Cliffside,Cliffcrest",43.716316,-79.239476
M1N,Scarborough,"Cliffside West,Birch Cliff",43.692657,-79.264848


In [9]:
# Shell escape: install folium pinned to version 0.5.0 from the conda-forge
# channel (--yes answers the confirmation prompt), then import it for the maps below.
!conda install -c conda-forge folium=0.5.0 --yes
import folium


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

# Explore and cluster the neighborhoods 

## Only boroughs that contain the word Toronto

In [16]:
# Keep only the rows whose borough name contains "Toronto"
# (a missing borough name counts as "no match"), then renumber the rows.
is_toronto_borough = neighborhoods1['Borough'].str.contains('Toronto', na = False)
torontos = neighborhoods1[is_toronto_borough].reset_index(drop=True)
torontos.shape

(38, 4)

## Creating a map 

In [19]:
# Base map for the Toronto neighbourhood markers
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# add markers to map: one circle per row, with the neighbourhood name as popup
marker_rows = zip(torontos['Latitude'], torontos['Longitude'], torontos['Neighbourhood'])
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Define Foursquare Credentials and Version

In [39]:
# Foursquare credentials are read from the environment so that no secret is
# stored in (or leaked through) the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here rather than producing failed API calls later.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# published and should be revoked / rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Function to repeat the same process for all the neighborhoods

In [40]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the venues Foursquare reports around each neighbourhood.

    For every (name, latitude, longitude) triple the Foursquare
    ``venues/explore`` endpoint is queried once and each returned venue
    becomes one row of the result.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the neighbourhoods; they are consumed
        with ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Columns ``Neighborhood``, ``Neighborhood Latitude``,
        ``Neighborhood Longitude``, ``Venue``, ``Venue Latitude``,
        ``Venue Longitude`` and ``Venue Category``. The frame is empty (but
        still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; the query string is built (and escaped) by requests
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without any category cannot take part in the
                # category-based analysis; skip it instead of failing on [0].
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor keeps the schema even when
    # there are no rows at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

## Using the function

In [41]:
# Query the venues around every Toronto neighbourhood (radius left at its default)
torontos2 = getNearbyVenues(
    torontos['Neighbourhood'],
    torontos['Latitude'],
    torontos['Longitude'],
)

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Rathnelly,Summerhill West,South Hill,Deer Park,Forest Hill SE
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Regent Park,Harbourfront
Garden District,Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide,Richmond,King
Harbourfront East,Union Station,Toronto Islands
Toronto Dominion Centre,Design Exchange
Victoria Hotel,Commerce Court
Roselawn
Forest Hill West,Forest Hill North
Yorkville,The Annex,North Midtown
University of Toronto,Harbord
Kensington Market,Grange Park,Chinatown
King and Spadina,CN Tower,Harbourfront West,South Niagara,Railway Lands,Bathurst Quay,Island airport
Stn A PO Boxes 25 The Esplanade
Underground city,First Canadian Place
Christie
Dovercourt Village,Dufferin
Trinity,Little Portugal
Exhibition Place,Brockton,Parkdale Village
The Junction South,High Park
Roncesvalles,Parkda

In [42]:
# Size of the venue table, followed by its first five rows
venue_table_shape = torontos2.shape
print(venue_table_shape)
torontos2.head(n=5)

(1678, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
4,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors


In [44]:
# Number of non-null entries per column for each neighbourhood
venues_by_hood = torontos2.groupby('Neighborhood')
venues_by_hood.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,Richmond,King",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
Business Reply Mail Processing Centre 969 Eastern,15,15,15,15,15,15
"Cabbagetown,St. James Town",46,46,46,46,46,46
Central Bay Street,79,79,79,79,79,79
Christie,17,17,17,17,17,17
Church and Wellesley,85,85,85,85,85,85
Davisville,32,32,32,32,32,32
Davisville North,7,7,7,7,7,7
"Dovercourt Village,Dufferin",14,14,14,14,14,14


# Analyze Each Neighborhood

In [47]:
# one hot encoding
torontos3 = pd.get_dummies(torontos2[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
torontos3['Neighborhood'] = torontos2['Neighborhood'] 

# move neighborhood column to the first column
cols=list(torontos3.columns.values)
cols.pop(cols.index('Neighborhood'))
torontos3=torontos3[['Neighborhood']+cols]

# rename Neighborhood for Neighbourhood so that future merge works
torontos3.rename(columns = {'Neighborhood': 'Neighbourhood'}, inplace = True)
torontos3.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Printing each neighborhood along with its top 5 most common venues

In [50]:
# Per neighbourhood, the mean of every indicator column
# (i.e. the share of its venues that fall in each category)
indicators_by_hood = torontos3.groupby('Neighbourhood')
tg = indicators_by_hood.mean().reset_index()

In [54]:
num_top_venues = 5

for hood in tg['Neighbourhood']:
    print("----"+hood+"----")

    # Turn the neighbourhood's single row into a two-column (venue, freq) table
    hood_row = tg[tg['Neighbourhood'] == hood]
    temp = hood_row.T.reset_index()
    temp.columns = ['venue','freq']

    # Drop the leading 'Neighbourhood' entry, then make the frequencies numeric
    # and round them to two decimals
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Adelaide,Richmond,King----
              venue  freq
0       Coffee Shop  0.08
1              Café  0.05
2        Steakhouse  0.04
3               Bar  0.04
4  Sushi Restaurant  0.03


----Berczy Park----
            venue  freq
0     Coffee Shop  0.07
1      Steakhouse  0.04
2  Farmers Market  0.04
3        Beer Bar  0.04
4            Café  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0  Gym / Fitness Center  0.07
1         Auto Workshop  0.07
2         Garden Center  0.07
3                Garden  0.07
4    Light Rail Station  0.07


----Cabbagetown,St. James Town----
                venue  freq
0         Coffee Shop  0.07
1                 Pub  0.04
2  Italian Restaurant  0.04
3         Pizza Place  0.04
4                Park  0.04


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.14
1      Ice Cream Shop  0.05
2  Italian Restaurant  0.05
3      Sandwich Place  0.04
4                Café  0.04


---

In [57]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped, the remaining entries are sorted
    in descending order, and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

## new dataframe with the most common venues 

In [61]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    rank = int(ind) + 1
    # English ordinals: 1st / 2nd / 3rd follow the last digit, except 11th-13th.
    # An explicit test replaces the former bare `except:`, which relied on an
    # IndexError for control flow and produced "21th", "22th", ...
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[rank % 10 - 1]
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe with one row per neighbourhood of tg (same row order)
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = tg['Neighbourhood']

# fill the ranking columns row by row from the matching row of tg
for ind in np.arange(tg.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tg.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Bar,Steakhouse,Restaurant,Asian Restaurant,Bakery,Hotel,Thai Restaurant,American Restaurant
1,Berczy Park,Coffee Shop,Seafood Restaurant,Bakery,Steakhouse,Beer Bar,Cheese Shop,Cocktail Bar,Café,Farmers Market,Irish Pub
2,Business Reply Mail Processing Centre 969 Eastern,Skate Park,Garden,Burrito Place,Fast Food Restaurant,Auto Workshop,Farmers Market,Spa,Brewery,Restaurant,Garden Center
3,"Cabbagetown,St. James Town",Coffee Shop,Restaurant,Pub,Bakery,Park,Chinese Restaurant,Market,Pharmacy,Pizza Place,Italian Restaurant
4,Central Bay Street,Coffee Shop,Ice Cream Shop,Italian Restaurant,Sandwich Place,Burger Joint,Café,Bubble Tea Shop,Gym / Fitness Center,Japanese Restaurant,Dessert Shop
5,Christie,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Candy Store,Baby Store,Restaurant,Convenience Store
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Pub,Men's Store,Mediterranean Restaurant,Italian Restaurant,Hotel
7,Davisville,Sandwich Place,Dessert Shop,Gym,Café,Italian Restaurant,Coffee Shop,Sushi Restaurant,Pizza Place,Fried Chicken Joint,Restaurant
8,Davisville North,Gym,Park,Breakfast Spot,Clothing Store,Hotel,Food & Drink Shop,Sandwich Place,Dessert Shop,Eastern European Restaurant,Dumpling Restaurant
9,"Dovercourt Village,Dufferin",Pharmacy,Bakery,Supermarket,Music Venue,Middle Eastern Restaurant,Café,Brewery,Bar,Bank,Park


# Cluster Neighbourhoods

## Run k-means with k = 5 and create a new dataframe that includes the cluster labels

In [68]:
kclusters = 5

# Feature matrix: every column of tg except the neighbourhood name.
# The `columns=` keyword replaces the positional axis argument
# (`drop('Neighbourhood', 1)`), which was deprecated and later removed from pandas.
toronto_grouped_clustering = tg.drop(columns='Neighbourhood')

# random_state is fixed so that the clustering is reproducible
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [70]:
# kmeans was fitted on tg, so kmeans.labels_ follows the row order of tg.
# neighborhoods_venues_sorted was built from tg['Neighbourhood'] and shares
# that order, so the labels are attached there and then joined onto torontos
# by neighbourhood name. Assigning the labels positionally onto torontos would
# pair them with the wrong neighbourhoods (its rows are in a different order)
# and fails outright when a neighbourhood returned no venues.
cluster_info = neighborhoods_venues_sorted.copy()
cluster_info.insert(0, 'Cluster Labels', kmeans.labels_)

# join() returns a new frame, so torontos itself is left unmodified
toronto_merged = torontos.join(cluster_info.set_index('Neighbourhood'), on='Neighbourhood')

# Neighbourhoods without any venue have no cluster; drop them and restore the
# integer dtype that the NaN introduced by the join had turned into float.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

# Clusters visualization - Toronto Neighborhoods

In [71]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Base map for the clustered neighbourhoods
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# One hex colour per cluster, spread evenly over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    cluster = int(cluster)  # list indexing needs a plain integer label
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 keeps the original colour assignment (cluster 0 wraps around
    # to the last colour of the palette)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters