# Data Science Capstone Notebook

This notebook will primarily be used in my capstone project. 

In [8]:
import pandas as pd
import numpy as np
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


### WEEK 3
##### Use bs4 to scrape wikipedia, find Toronto data

In [9]:
from bs4 import BeautifulSoup
import requests

In [10]:
#Fetch
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [11]:

soup = BeautifulSoup(source, 'lxml')
# print(soup.prettify())

In [12]:
table = soup.find('table')

In [13]:
columns = list()
table.tr.find_all('th')


[<th>Postal code
 </th>,
 <th>Borough
 </th>,
 <th>Neighborhood
 </th>]

##### Remove non bourough assigned elements

In [14]:
data = list()
for rows in table.find_all('tr'):
    
    row = rows.find_all('td')
    if row:
        postalcode = row[0].text.rstrip()
        borough = row[1].text.rstrip()
        neighborhood = row[2].text.rstrip()
        if borough != 'Not assigned':
            if neighborhood == 'Not assigned':
                neighborhood = borough
            data.append([postalcode, borough, neighborhood])

col_head = list()
for cols in table.tr.find_all('th'):
    col_head.append(cols.text.strip())

In [15]:
col_head

['Postal code', 'Borough', 'Neighborhood']

In [16]:
df = pd.DataFrame(data, columns = col_head)
print(df.shape)

(103, 3)


In [17]:
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M4M,North York,Downsview
freq,1,24,4


##### Group

In [18]:
df = df.groupby('Postal code').agg(
    {
        'Borough':'first', 
        'Neighborhood': ', '.join,}
    ).reset_index()

In [19]:
df.describe()

Unnamed: 0,Postal code,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M4M,North York,Downsview
freq,1,24,4


In [20]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [21]:
df.shape

(103, 3)

In [22]:
columns = list()
table.tr.find_all('th')

[<th>Postal code
 </th>,
 <th>Borough
 </th>,
 <th>Neighborhood
 </th>]

##### CLeam Data (Column names)

In [23]:
df.rename(columns={'Postal code' : 'Postcode'}, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### Get Geos

In [24]:
dfgeo = pd.read_csv(r'http://cocl.us/Geospatial_data')
dfgeo.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
dfgeo.rename(columns={'Postal Code' : 'Postcode'}, inplace=True)
dfgeo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Merge Geos and Scraped Data by Postcode

In [27]:
df2 = pd.merge(df, dfgeo, on="Postcode", how='left')
df2.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


##### Clustering

In [28]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from IPython.display import Image 
from IPython.core.display import HTML 
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Folium installed
Libraries imported.


In [29]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coords, City of Toronto: {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


Coords, City of Toronto: 43.6534817, -79.3839347.


###### Map it

In [30]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3).add_to(map_toronto)  
    
map_toronto

##### Get Foursquare API set up and get data

In [31]:
CLIENT_ID = '1PQYYST2E0P5CGBCCFYBBSQ0EXTOEAQ4CPU24WDIAT2S41H2' # your Foursquare ID
CLIENT_SECRET = 'QDT5PSYKEN0RJNL415D2DOBG1FHLUG5U0QGPJ51ODMKM3RQF' # your Foursquare Secret
VERSION = '20180605'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1PQYYST2E0P5CGBCCFYBBSQ0EXTOEAQ4CPU24WDIAT2S41H2
CLIENT_SECRET:QDT5PSYKEN0RJNL415D2DOBG1FHLUG5U0QGPJ51ODMKM3RQF


In [32]:
df3 = df2[df2['Borough'].str.contains('Toronto')]

df4 = df3.reset_index(drop=True)
df4.head(11)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


In [33]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3).add_to(map_toronto)
    
map_toronto

In [34]:
df4.loc[0, 'Neighborhood']

'The Beaches'

In [35]:
neighborhood_lat = df4.loc[0, 'Latitude'] 
neighborhood_long = df4.loc[0, 'Longitude'] 
neighborhood_name = df4.loc[0, 'Neighborhood'] 

In [36]:
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_lat, 
    neighborhood_long, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=1PQYYST2E0P5CGBCCFYBBSQ0EXTOEAQ4CPU24WDIAT2S41H2&client_secret=QDT5PSYKEN0RJNL415D2DOBG1FHLUG5U0QGPJ51ODMKM3RQF&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [37]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ea2e81dedbcad001ba12170'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [38]:
'{} places near {}'.format(len(results['response']['groups'][0]['items']),neighborhood_name)

'4 places near The Beaches'

In [39]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [40]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [41]:
map_toronto = folium.Map(location=[neighborhood_lat, neighborhood_long], zoom_start=16)

folium.features.CircleMarker(
    [neighborhood_lat, neighborhood_long],
    radius=12,
    color='red',
    popup= neighborhood_name,
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(map_toronto)

for lat, lng, name, categories in zip(nearby_venues['lat'], nearby_venues['lng'], nearby_venues['name'], nearby_venues['categories']):
    label = '{}, {}'.format(name, categories)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3).add_to(map_toronto)  

map_toronto