# Data Analysis

## Importing libraries

In [59]:
import pandas as pd
import numpy as np
import geocoder
from opencage.geocoder import OpenCageGeocode
import folium
import requests
from sklearn.cluster import KMeans


## Scraping data

Using the pandas library to read the html

In [2]:
wiki_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
wiki_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## Cleaning the data

- Removing the not assigned borough
- Replacing the '/' with a ',' where we have multiple entries for Neighborhood
- There are no enteries where we donot have a value for Neighborhood

In [3]:
neighborhood_data = pd.DataFrame(columns = ['Postal code','Borough', 'Neighborhood'], 
                                 data = wiki_data[wiki_data.Borough!='Not assigned'].values)
neighborhood_data.Neighborhood = neighborhood_data.Neighborhood.replace({'/':','},regex=True)
neighborhood_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [4]:
print("The size of the data frame read is {}".format(neighborhood_data.shape))

The size of the data frame read is (103, 3)


## Fetching the latitude and longitude

Using geocoder package to get the latitude and longitude
- But this failed so used the csv file provided.

In [16]:
# for i,data in neighborhood_data.iterrows():
#     lat_lng = None
#     while(lat_lng==None):
#         location = geocoder.google('{}, Toronto, Ontario'.format(data['Postal code']))
#         lat_lng = location.latlng
#     neighborhood_data['Latitude'] = lat_lng[0]
#     neighborhood_data['Longitude'] = lat_lng[1]
#     print(i, data['Postal code'])
#     time.sleep(1.0)

### Reading the coordinates data

The index of the data frame is set to be the Postal code column.

In [6]:
lat_lng = pd.read_csv('./Geospatial_Coordinates.csv')
lat_lng.set_index('Postal Code', inplace=True)
lat_lng.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


## Merging the latitude and longitude data with neighborhood data

In [8]:
neighborhood_data['Latitude'] = lat_lng.loc[neighborhood_data['Postal code'], 'Latitude'].values
neighborhood_data['Longitude'] = lat_lng.loc[neighborhood_data['Postal code'], 'Longitude'].values
neighborhood_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


## Analysing and understanding the data

Let's check how many unique Borough's and Neighborhood's are present.

In [10]:
print('The data has {} boroughs and {} neighborhoods.'.format(
        len(neighborhood_data['Borough'].unique()),
        len(neighborhood_data['Neighborhood'].unique())
    )
)

The data has 10 boroughs and 98 neighborhoods.


## Getting the latitude and longitude of the Toronto 

Geocoder package was not working so used open cage geocode

In [18]:
# lat_lng = None
# while(lat_lng==None):
#     location = geocoder.google('Toronto, Ontario')
#     lat_lng = location.latlng
#  = location.latlng[0]
#  = location.latlng[1]
geolocator = OpenCageGeocode("d9151bc21e3c47daa72ac27bf5422c76")
location = geolocator.geocode("Toronto", country="CA")
latitude_TO = location[0]['geometry']['lat']
longitude_TO = location[0]['geometry']['lng']
print("The latitude and longitude of Toronto, CA are {}, {}".format(latitude_TO, longitude_TO))


The latitude and longitude of Toronto, CA are 43.6534817, -79.3839347


## Plotting the neighborhoods on a Map

In [22]:
map_toronto = folium.Map(location=[latitude_TO, longitude_TO], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhood_data['Latitude'], neighborhood_data['Longitude'], neighborhood_data['Borough'], neighborhood_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Declaring foursquare api 

In [23]:
CLIENT_ID = 'UFX0WT0O50CEQBP4GIIDCPB5FELMNMCMDN50MAJGB0GDZDBV' # your Foursquare ID
CLIENT_SECRET = 'WCNIIINW5HL1SHJBEUSBIRA5FC35GB4BHYAPN2UYCHWYFPRF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UFX0WT0O50CEQBP4GIIDCPB5FELMNMCMDN50MAJGB0GDZDBV
CLIENT_SECRET:WCNIIINW5HL1SHJBEUSBIRA5FC35GB4BHYAPN2UYCHWYFPRF


## Explore the Neighborhoods

In [29]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Venues withing 500m radius and top 100

In [31]:
toronto_venues = getNearbyVenues(names=neighborhood_data['Neighborhood'],
                                   latitudes=neighborhood_data['Latitude'],
                                   longitudes=neighborhood_data['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


### The size of the data frame with venues

In [33]:
print(toronto_venues.shape)

(2136, 7)


### The number of unique categories from each venue

In [34]:
print("The number of unique categories are {}".format(len(toronto_venues['Venue Category'].unique())))

The number of unique categories are 269


### One hot encoding

In [52]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# # move neighborhood column to the first column
neighborhoodIndex = list(toronto_onehot.columns).index('Neighborhood')
fixed_columns = [toronto_onehot.columns[neighborhoodIndex]] + list(toronto_onehot.columns[:neighborhoodIndex])+list(toronto_onehot.columns[neighborhoodIndex+1:])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
print("The size of the onehot dataframe is {}".format(toronto_onehot.shape))

The size of the onehot dataframe is (2136, 269)


### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [54]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood , Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor , Wilson Heights , Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park , Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,"Willowdale , Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
90,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
91,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.100000,0.0,0.0,0.0,0.0,0.0,0.0
92,"York Mills , Silver Hills",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
print("The new size is {}".format(toronto_grouped.shape))

The new size is (94, 269)


### Top 10 common venues

In [56]:
# num_top_venues = 10

# for hood in toronto_grouped['Neighborhood']:
#     print("----"+hood+"----")
#     temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
#     temp.columns = ['venue','freq']
#     temp = temp.iloc[1:]
#     temp['freq'] = temp['freq'].astype(float)
#     temp = temp.round({'freq': 2})
#     print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
#     print('\n')

----Agincourt----
                        venue  freq
0              Breakfast Spot  0.25
1                      Lounge  0.25
2                Skating Rink  0.25
3   Latin American Restaurant  0.25
4               Metro Station  0.00
5  Modern European Restaurant  0.00
6           Mobile Phone Shop  0.00
7          Miscellaneous Shop  0.00
8   Middle Eastern Restaurant  0.00
9          Mexican Restaurant  0.00


----Alderwood , Long Branch----
                        venue  freq
0                 Pizza Place  0.22
1                        Pool  0.11
2                 Coffee Shop  0.11
3                Skating Rink  0.11
4                    Pharmacy  0.11
5                         Gym  0.11
6              Sandwich Place  0.11
7                         Pub  0.11
8  Modern European Restaurant  0.00
9           Mobile Phone Shop  0.00


----Bathurst Manor , Wilson Heights , Downsview North----
                       venue  freq
0                Coffee Shop  0.09
1                       Ba

                       venue  freq
0                     Bakery  0.12
1                   Pharmacy  0.12
2                        Bar  0.06
3              Grocery Store  0.06
4           Recording Studio  0.06
5  Middle Eastern Restaurant  0.06
6                       Café  0.06
7                       Bank  0.06
8                Supermarket  0.06
9       Brazilian Restaurant  0.06


----East Toronto----
                             venue  freq
0                      Coffee Shop  0.33
1                Convenience Store  0.33
2                             Park  0.33
3                    Metro Station  0.00
4  Molecular Gastronomy Restaurant  0.00
5       Modern European Restaurant  0.00
6                Mobile Phone Shop  0.00
7               Miscellaneous Shop  0.00
8        Middle Eastern Restaurant  0.00
9               Mexican Restaurant  0.00


----Eringate , Bloordale Gardens , Old Burnhamthorpe , Markland Wood----
            venue  freq
0     Pizza Place  0.12
1      Beer Store 

9                  Bakery  0.06


----Moore Park , Summerhill East----
                        venue  freq
0                 Summer Camp  0.33
1                        Park  0.33
2                  Playground  0.33
3           Accessories Store  0.00
4  Modern European Restaurant  0.00
5           Mobile Phone Shop  0.00
6          Miscellaneous Shop  0.00
7   Middle Eastern Restaurant  0.00
8          Mexican Restaurant  0.00
9               Metro Station  0.00


----New Toronto , Mimico South , Humber Bay Shores----
                  venue  freq
0           Pizza Place  0.08
1                Bakery  0.08
2            Restaurant  0.08
3                  Café  0.08
4    Seafood Restaurant  0.08
5  Fast Food Restaurant  0.08
6           Flower Shop  0.08
7          Liquor Store  0.08
8   Fried Chicken Joint  0.08
9              Pharmacy  0.08


----North Park , Maple Leaf Park , Upwood Park----
                             venue  freq
0       Construction & Landscaping  0.25
1          

                             venue  freq
0                      Pizza Place  0.25
1                     Hockey Arena  0.25
2            Portuguese Restaurant  0.25
3                      Coffee Shop  0.25
4                Accessories Store  0.00
5        Middle Eastern Restaurant  0.00
6              Monument / Landmark  0.00
7  Molecular Gastronomy Restaurant  0.00
8       Modern European Restaurant  0.00
9                Mobile Phone Shop  0.00


----Westmount----
                      venue  freq
0               Pizza Place  0.29
1               Coffee Shop  0.14
2            Sandwich Place  0.14
3        Chinese Restaurant  0.14
4            Discount Store  0.14
5              Intersection  0.14
6           Organic Grocery  0.00
7       Moroccan Restaurant  0.00
8            Medical Center  0.00
9  Mediterranean Restaurant  0.00


----Wexford , Maryvale----
                       venue  freq
0                     Bakery  0.14
1      Vietnamese Restaurant  0.14
2              Shoppi

In [57]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [61]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Electronics Store,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
1,"Alderwood , Long Branch",Pizza Place,Skating Rink,Gym,Pharmacy,Coffee Shop,Pub,Sandwich Place,Pool,Donut Shop,Discount Store
2,"Bathurst Manor , Wilson Heights , Downsview North",Bank,Coffee Shop,Deli / Bodega,Shopping Mall,Sandwich Place,Restaurant,Diner,Middle Eastern Restaurant,Supermarket,Ice Cream Shop
3,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Yoga Studio
4,"Bedford Park , Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Restaurant,Fast Food Restaurant,Café,Sushi Restaurant,Spa,Pizza Place,Thai Restaurant


## Clustering the Neighborhoods

Using k-means clustering

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 