In [24]:
# Download libraries for processing XML and HTML with Python, mapping and geocoding

!pip install lxml
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge geopy --yes 

# Standard library
import os  # read API credentials from environment variables

# Third-party
import folium # plotting library & maps
import lxml.html as lh
import pandas as pd
import requests  # for sending requests to and retrieving data from the web
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [26]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M"
# and load its first HTML table (Postal Code, Borough, Neighborhood).

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read from `url` instead of repeating the address as a second literal, so the
# page location is defined in exactly one place.
data = pd.read_html(url)[0]

# Work on a copy so that `data` keeps the table exactly as it was downloaded.
df = data.copy()
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [27]:
# Dimensions of the downloaded table as (rows, columns)
df.shape

(180, 3)

In [28]:
# Only postal codes that belong to a borough are of interest: keep the rows
# whose 'Borough' is anything other than 'Not assigned' and renumber them from 0.

df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Report the size of the filtered table, padded with blank lines
print('\n')
print('Size of dataframe after removing  --Borough-- that is --Not assigned--')
print('# Rows, Columns',df.shape)
print('\n')



Size of dataframe after removing  --Borough-- that is --Not assigned--
# Rows, Columns (103, 3)




In [None]:
# Check the data to ensure the right records are displayed

In [32]:
# Preview the first rows after filtering; no 'Not assigned' borough should remain
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [None]:
# Import the latitude and longitude for each postal code

In [33]:
# Location of the CSV that maps each postal code to its latitude and longitude
filename = 'http://cocl.us/Geospatial_data'

In [34]:
# Read the file with postal codes and latitude/longitude into a data frame.
# The CSV carries its own header row, so the column names come from the file.

headers = ['Postal Code','Latitude','Longitude']

# `usecols` makes the expected schema explicit: read_csv raises a ValueError
# here if one of these columns is missing, instead of failing later in the merge.
df_pcode_long_lat = pd.read_csv(filename, usecols=headers)

# Display the first 5 rows of the data frame

df_pcode_long_lat.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Attach the coordinates to every borough/neighborhood row by joining the two
# tables on their shared 'Postal Code' column (inner join)

merged_PCode = df.merge(df_pcode_long_lat, on='Postal Code')

# Make sure both coordinate columns are stored as floats
merged_PCode = merged_PCode.astype({'Latitude': float, 'Longitude': float})

# (rows, columns) of the merged table
merged_PCode.shape

(103, 5)

In [36]:
# Preview the first three merged rows: Postal Code, Borough, Neighborhood, Latitude, Longitude
merged_PCode.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636


In [None]:
# Segmentation and clustering

In [37]:
# How many neighborhood entries does each borough of Toronto have?
# count() tallies the non-null 'Neighborhood' values per borough.

CntNeighByBorough = merged_PCode.loc[:, ['Borough', 'Neighborhood']]

CntNeighByBorough.groupby('Borough', as_index=False).count()

Unnamed: 0,Borough,Neighborhood
0,Central Toronto,9
1,Downtown Toronto,19
2,East Toronto,5
3,East York,5
4,Etobicoke,12
5,Mississauga,1
6,North York,24
7,Scarborough,17
8,West Toronto,6
9,York,5


In [None]:
# For this assignment, we pick up Boroughs containing the string 'Toronto'.
# Customers typically prefer a local area/region as the first choice.

# Depending on our analysis, we would either find an area of interest locally or
# expand our search to other surrounding areas or to a completely different location

In [38]:
# Boolean mask marking the rows whose borough name contains 'Toronto'
list_tor = merged_PCode['Borough'].str.contains('Toronto')

# Keep the matching rows and renumber them from 0
Toronto_data = merged_PCode.loc[list_tor].reset_index(drop=True)

Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [39]:
# Create a function to check/explore all the neighborhoods in the extracted data set

def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare "explore" endpoint once per neighborhood and tabulate the venues.

    Reads the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT,
    which must be defined before this function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood label and its latitude / longitude. The three are walked
        in lockstep with zip(), so iteration stops at the shortest one.
    radius : number, default 500
        Search radius sent as the ``radius`` query parameter (the Foursquare
        API interprets it in meters).
    timeout : number, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (bad credentials, quota, ...).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors here rather than as a
        # KeyError while digging through an error payload
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()

        # A neighborhood without recommendations contributes no rows
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of failing on [0]
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards raises a length-mismatch error.
    return pd.DataFrame(rows, columns=columns)

In [43]:
LIMIT = 200 # Limit the number results returned

# NOTE: `radius` is not passed to getNearbyVenues below, so the search runs with
# that function's own default radius. The Foursquare API measures the radius in
# meters, not miles.
radius = 30

# Define Four Square credentials
# They are read from environment variables so that no secret is stored in the
# notebook. The key pair that used to be written here in plain text has been
# exposed and should be regenerated in the Foursquare developer console.

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180604'

# Retrieve all the venues, categories, latitude / longitude of each neighborhood in the data subset
# This data set 'Toronto_data', only has neighborhoods for boroughs with name 'Toronto'

Toronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )

In [44]:
# (rows, columns) of the collected venue table, followed by a preview of its first rows
print(Toronto_venues.shape)
Toronto_venues.head()

(1614, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [45]:
# Start looking at how the venue categories are distributed.

# Narrow the venue table down to the two columns needed to count
# categories per neighborhood.

Neigh_Cat_Count_Tmp = Toronto_venues.loc[:, ['Neighborhood', 'Venue Category']]

In [48]:
# Tag every venue row with Count = 1 so that grouped counts can be taken later.
#
# assign() returns a new frame and leaves Neigh_Cat_Count_Tmp untouched. It
# replaces the concat-with-an-empty-frame followed by a chained in-place
# fillna, which newer pandas versions warn about and which does not update the
# frame when copy-on-write is enabled.

Neigh_Cat_Count = Neigh_Cat_Count_Tmp.reset_index(drop=True).assign(Count=1)

print('# of Venues per category per Neighborhood :', Neigh_Cat_Count.shape)

# of Venues per category per Neighborhood : (1614, 3)


In [49]:
# Count how many venues fall into each (Neighborhood, Venue Category) pair

Neigh_Cat_CountGrpBy = (
    Neigh_Cat_Count
    .groupby(['Neighborhood', 'Venue Category'], as_index=False)["Count"]
    .count()
)
Neigh_Cat_CountGrpBy.head()

Unnamed: 0,Neighborhood,Venue Category,Count
0,Berczy Park,Art Gallery,1
1,Berczy Park,BBQ Joint,1
2,Berczy Park,Bagel Shop,1
3,Berczy Park,Bakery,2
4,Berczy Park,Basketball Stadium,1


In [51]:
# Shape of the grouped table: one row per distinct (Neighborhood, Venue Category) pair
print('# of Venues per category per Neighborhood :', Neigh_Cat_CountGrpBy.shape)

# of Venues per category per Neighborhood : (1089, 3)


In [52]:
# Sanity check: the per-group counts should add back up to the number of
# venue rows that existed before grouping
print('Verification only :',Neigh_Cat_CountGrpBy['Count'].sum())

Verification only : 1614


In [53]:
# Rank the venue categories by how many venue rows carry them, most frequent first

VenCatCnt = (
    Neigh_Cat_Count
    .groupby(['Venue Category'], as_index=False)["Count"]
    .count()
    .sort_values(by=['Count'], ascending=False)
    .reset_index(drop=True)
)
VenCatCnt.head(10)

Unnamed: 0,Venue Category,Count
0,Coffee Shop,142
1,Café,90
2,Restaurant,55
3,Italian Restaurant,42
4,Park,36
5,Hotel,33
6,Bakery,32
7,Japanese Restaurant,32
8,Bar,27
9,Pizza Place,26


In [57]:
# Select the neighborhood/category rows whose venue category looks outdoor-related.
#
# The keywords were picked from the FourSquare API category documentation and
# cover anything that may relate to the outdoors, e.g. Park, Lake, River, etc.
# A row is kept when any keyword occurs as a substring of its 'Venue Category',
# so 'Park' also matches categories such as 'Skate Park'.

outdoor_keywords = [
    "Lake", "Park", "Outdoors & Recreation", "Scenic", "Beach", "Mountain",
    "Botanical", "River", "Nature", "Other Great Outdoors", "Forest",
]

# One boolean mask replaces the row-by-row loop that grew the result with
# DataFrame.append, a method that was removed in pandas 2.0.
# astype(bool) keeps the mask boolean even when the input table is empty.
is_outdoor = Neigh_Cat_CountGrpBy['Venue Category'].apply(
    lambda category: any(keyword in category for keyword in outdoor_keywords)
).astype(bool)

# Matching rows keep their original index labels
Toronto_Outdoor_data = Neigh_Cat_CountGrpBy.loc[
    is_outdoor, ['Neighborhood', 'Venue Category', 'Count']
]

In [58]:
# Toronto_Outdoor_data lists, per row, a Neighborhood, a Venue Category and the
# Count of venues for that pair. Renumber its rows from 0 before previewing it.

Toronto_Outdoor_data = Toronto_Outdoor_data.reset_index(drop=True)

Toronto_Outdoor_data.head(5)

Unnamed: 0,Neighborhood,Venue Category,Count
0,Berczy Park,Beach,1
1,Berczy Park,Park,1
2,"Business reply mail Processing Centre, South C...",Park,1
3,"Business reply mail Processing Centre, South C...",Skate Park,1
4,Central Bay Street,Park,1


In [59]:
# Prints the (rows, columns) of the outdoor table. Each row is a
# neighborhood/category pair, so a neighborhood with several outdoor
# categories is counted once per category.
print('# of Neighborhoods which have outdoor venues :', Toronto_Outdoor_data.shape)

# of Neighborhoods which have outdoor venues : (32, 3)


In [60]:
# Which outdoor categories occur, and in how many rows of the outdoor table?
# Group by 'Venue Category', count the rows of each group and list the most
# frequent categories first.

VenCatCntOut = (
    Toronto_Outdoor_data
    .groupby(['Venue Category'], as_index=False)["Count"]
    .count()
    .sort_values(by=['Count'], ascending=False)
    .reset_index(drop=True)
)

VenCatCntOut.head(50)

Unnamed: 0,Venue Category,Count
0,Park,26
1,Lake,2
2,Beach,1
3,Other Great Outdoors,1
4,Scenic Lookout,1
5,Skate Park,1


In [62]:
# Look up the postal code, borough and latitude/longitude of every outdoor row
# by joining Toronto_Outdoor_data with Toronto_data on the neighborhood name.
# The left join keeps every row of Toronto_Outdoor_data.
# The coordinates are what a map visualization needs.

Toronto_Outdoor_data_map = Toronto_Outdoor_data.merge(
    Toronto_data,
    how='left',
    on='Neighborhood',
)

# Report the (rows, columns) of the merged table
print('Verification only - # of Neighorhoods with out-door activity :', Toronto_Outdoor_data_map.shape)

Verification only - # of Neighorhoods with out-door activity : (32, 7)


In [63]:
# Preview the outdoor rows together with their postal code, borough and coordinates
Toronto_Outdoor_data_map.head()

Unnamed: 0,Neighborhood,Venue Category,Count,Postal Code,Borough,Latitude,Longitude
0,Berczy Park,Beach,1,M5E,Downtown Toronto,43.644771,-79.373306
1,Berczy Park,Park,1,M5E,Downtown Toronto,43.644771,-79.373306
2,"Business reply mail Processing Centre, South C...",Park,1,M7Y,East Toronto,43.662744,-79.321558
3,"Business reply mail Processing Centre, South C...",Skate Park,1,M7Y,East Toronto,43.662744,-79.321558
4,Central Bay Street,Park,1,M5G,Downtown Toronto,43.657952,-79.387383


In [64]:
# Create a map of Toronto to visualize the distribution of Neighborhoods

# Retrieve the latitude, longitude for Toronto, Canada first, and then map the out-door locations
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="Tor_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude

# create map of Toronto using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# The table holds one row per (neighborhood, venue category) pair. Keep one row
# per neighborhood so each location gets a single marker instead of several
# identical markers stacked on top of each other.
neighborhood_points = Toronto_Outdoor_data_map.drop_duplicates(subset=['Neighborhood'])

# add markers to map for the individual neighborhoods that have out-door activities
for lat, lng, borough, neighborhood in zip(neighborhood_points['Latitude'],
                                           neighborhood_points['Longitude'],
                                           neighborhood_points['Borough'],
                                           neighborhood_points['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)

    # parse_html is an option of folium.Popup (not of CircleMarker), so it is
    # given to the popup only
    popup = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tor)
map_tor


In [None]:
###  The end