In [1]:

!conda install -c conda-forge geocoder geopy folium beautifulsoup4 --yes

import numpy as np
import pandas as pd
import requests

# Lift pandas' display truncation so that every column and every row is rendered.
# NOTE(review): with max_rows=None a bare DataFrame expression prints the whole
# frame, so keep using .head()/.shape when inspecting data in later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from bs4 import BeautifulSoup

Solving environment: done


  current version: 4.4.10
  latest version: 4.6.8

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /anaconda3

  added / updated specs: 
    - beautifulsoup4
    - folium
    - geocoder
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libblas-3.8.0              |       4_openblas           6 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    ratelim-0.1.6              |           py36_0           5 KB  conda-forge
    pandas-0.24.2              |   py36h0a44026_0        10.1 MB  conda-forge
    libcblas-3.8.0             |       4_openblas           6 KB  conda-forge
    numpy-1.16.2               |   py36hbb3c62a_1         4.1 MB  conda-forge
    openssl-1.0.2r             |       h1de35cc_0         3.0 MB  conda-forge
    branca-0.3.1  

In [6]:
# Download the Wikipedia page listing Toronto ("M") postal codes and collect
# the first three cells of every table row into three parallel lists.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
response.raise_for_status()
sourcedata = response.text

soup = BeautifulSoup(sourcedata, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

postalCodes = []
boroughs = []
hoods = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. header rows) and rows with fewer than
    # three cells cannot supply all three values, so they are skipped.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace from every cell so that later exact-match
    # comparisons (such as against "Not assigned") are not defeated by a
    # trailing newline.
    postalCodes.append(cells[0].text.strip())
    boroughs.append(cells[1].text.strip())
    hoods.append(cells[2].text.strip())

In [7]:
# Assemble the three scraped lists into one frame, one column per list.
temp = dict(PostalCode=postalCodes, Borough=boroughs, Neighborhood=hoods)
postal_codes_df = pd.DataFrame(temp)
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [8]:
# Clean the scraped table:
#   1. treat the literal "Not assigned" as a missing value,
#   2. drop rows that have no borough,
#   3. where the neighborhood is missing, fall back to the borough name.
# The result is built as a new frame and rebound to the same name. The previous
# `postal_codes_df['Neighborhood'].fillna(..., inplace=True)` operated on a
# column selection, which is not guaranteed to write back to the frame (and
# does not under pandas Copy-on-Write).
postal_codes_df = (
    postal_codes_df
    .replace("Not assigned", np.nan)
    .dropna(subset=['Borough'])
    .assign(Neighborhood=lambda df: df['Neighborhood'].fillna(df['Borough']))
)

In [9]:
def _join_with_commas(values):
    """Concatenate the grouped string values, separated by commas."""
    return ','.join(values)


# Collapse rows sharing a postal code and borough into a single row whose
# remaining column holds the comma-joined values of the group.
grouped_codes = postal_codes_df.groupby(['PostalCode', 'Borough'], as_index=False)
postal_codes_df = grouped_codes.agg(_join_with_commas)
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Show the (rows, columns) dimensions of the grouped frame.
postal_codes_df.shape

(103, 3)

In [11]:
# Part 2: add latitude and longitude to each postal code

In [12]:
# Load the coordinates CSV and rename its 'Postal Code' column to 'PostalCode',
# the column name used by postal_codes_df.
coords_df = (
    pd.read_csv("https://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'PostalCode'})
)
coords_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Index both frames by postal code, then place them side by side, keeping only
# the postal codes present in both (inner join on the index).
tempdf1, tempdf2 = (
    frame.set_index('PostalCode') for frame in (postal_codes_df, coords_df)
)
toronto_coords_df = pd.concat([tempdf1, tempdf2], axis=1, join='inner').reset_index()
toronto_coords_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [14]:
# Part 3: map the Toronto postal codes and their neighborhoods

In [15]:
from pandas.io.json import json_normalize
import json
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

from geopy.geocoders import Nominatim
import geocoder

from sklearn.cluster import KMeans

In [16]:
# Look up Toronto's coordinates with the ArcGIS geocoder to centre the map.
g = geocoder.arcgis('Toronto Ontario')
# A failed lookup leaves no usable coordinates; stop with a clear message
# instead of failing on the indexing below.
if not g.latlng:
    raise RuntimeError("Geocoding 'Toronto Ontario' returned no coordinates; check connectivity and re-run this cell")
toronto_lat = g.latlng[0]
toronto_lng = g.latlng[1]

map_tr = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=11)

# Add one circle marker per row of toronto_coords_df, with a popup showing
# "<borough> (<postal code>): <neighborhoods>".
marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, long, post, borough, hood in zip(*(toronto_coords_df[col] for col in marker_columns)):
    label = "{} ({}): {}".format(borough, post, hood)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tr)
# Display the finished map as the cell's output.
map_tr

In [27]:
#notes!

In [18]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup

In [20]:
# Fetch the Wikipedia page listing Toronto ("M") postal codes.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Surface HTTP errors here; otherwise an error page would be parsed silently
# and simply yield no postal codes further down.
response.raise_for_status()
source = response.text

In [21]:
# Parse the downloaded HTML using the 'html5lib' parser.
# NOTE(review): html5lib is not among the packages installed by the conda
# command in the first cell, and the earlier scrape used 'html.parser' —
# confirm html5lib is available in this environment.
soup = BeautifulSoup(source, 'html5lib')

In [22]:
import warnings

# Build {postal code: {'borough': ..., 'neighborhoods': ...}} from every <td>
# on the page that matches the expected cell layout.
postal_codes_dict = {}
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text          # bold text inside the cell's <p>
        neighborhoods_data = table_cell.span.text  # text of the cell's <span>
    except AttributeError:
        # A missing <p>, <b> or <span> makes the attribute chain hit None: the
        # cell does not have the expected layout, so skip it. Only this error
        # is ignored; anything else propagates instead of being swallowed.
        continue

    # Unassigned cells are left out of the dictionary.
    if neighborhoods_data == 'Not assigned':
        continue

    # The text before the first '(' is taken as the borough; the text between
    # the first and second '(' holds the '/'-separated neighborhood names.
    parts = neighborhoods_data.split('(')
    borough = parts[0]
    if len(parts) > 1:
        # Drop closing parentheses, then trim each name and join with ', '.
        neighborhoods = parts[1].replace(')', ' ')
        neighborhoods_clean = ', '.join(name.strip() for name in neighborhoods.split('/'))
    else:
        # No parenthesised list: use the borough itself as the neighborhood.
        borough = borough.strip('\n')
        neighborhoods_clean = borough

    # add borough and neighborhood to dictionary
    postal_codes_dict[postal_code] = {
        'borough': borough,
        'neighborhoods': neighborhoods_clean,
    }

# An empty result means no cell matched the expected layout; say so rather
# than silently handing an empty dictionary to the following cells.
if not postal_codes_dict:
    warnings.warn(
        "No postal codes were parsed: no <td> contained the expected "
        "<p><b>...</b></p> and <span>...</span> structure"
    )

In [23]:
# create an empty dataframe
# Column layout of the result table, then an empty frame with those columns.
columns = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
toronto_data = pd.DataFrame(data=None, columns=columns)
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood


In [24]:
# Build the frame in one step from the parsed dictionary: one row per postal
# code. Appending row by row with DataFrame.append re-copied the frame on every
# iteration, duplicated rows when the cell was re-run, and no longer exists in
# pandas 2.x. The explicit column list keeps the three columns even when the
# dictionary is empty.
toronto_data = pd.DataFrame(
    [
        {"PostalCode": postal_code,
         "Borough": details['borough'],
         "Neighborhood": details['neighborhoods']}
        for postal_code, details in postal_codes_dict.items()
    ],
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [25]:

# Number of rows in toronto_data.
len(toronto_data)

0

In [26]:

# Show the dataframe's dimensions as a (rows, columns) tuple.
toronto_data.shape

(0, 3)