In [3]:
# Importing all dependencies we'll need
import numpy as np 

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import warnings
warnings.filterwarnings('ignore')


print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [7]:
import bs4 # BeautifulSoup Package
from bs4 import BeautifulSoup
import csv

In [8]:
### Reading wiki link
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
canada_data = wiki_response.text
soup1 = BeautifulSoup(canada_data,'lxml')

### Extracting the data 
# Take the first <table> on the page and collect the text of each <td> cell.
tag1 = soup1.find('table')
if tag1 is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('No <table> element found on the postal code page')
post_list = [post.text for post in tag1.find_all('td')]
# Cell text carries newline characters; remove them so each value is one CSV field.
post_list1 = [el.replace('\n', '') for el in post_list]

### Save the list into a csv file
# The flat cell list is written three cells per CSV row (assumes the table has
# three <td> cells per row -- TODO confirm against the live page layout).
# newline='' is what the csv module requires to avoid blank lines on Windows;
# an explicit UTF-8 encoding keeps non-ASCII names intact on every platform.
with open("listofpostalcodesofcanada.csv", "w", newline='', encoding='utf-8') as output:
    writer = csv.writer(output)
    writer.writerows(post_list1[i:i+3] for i in range(0,len(post_list1),3))


Create and process the DataFrame


In [9]:
### Create df_can dataframe from csv file
# The CSV has no header row, so column names are assigned explicitly.
df_can = pd.read_csv('listofpostalcodesofcanada.csv',header=None)
df_can.columns = ['PostalCode','Borough','Neighborhood']

### Create a new data frame df_can_filtered after filtering rows with borough='Not assigned'
# reset_index(drop=True) returns a new, independent frame with a clean 0..n-1
# index, so the assignment below modifies this frame and not a slice of df_can.
df_can_filtered = df_can[df_can.Borough != 'Not assigned'].reset_index(drop=True)

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# Use a single .loc assignment instead of chained indexing, which pandas does
# not guarantee to write through to the underlying frame.
neighborhood_missing = df_can_filtered.Neighborhood == 'Not assigned'
df_can_filtered.loc[neighborhood_missing, 'Neighborhood'] = df_can_filtered.loc[neighborhood_missing, 'Borough']

### Group neighborhoods based on postal code area
# Series.unique() drops duplicate names while keeping first-seen order, so the
# joined string is the same on every run (joining a set() is not order-stable).
df_can_grouped = (
    df_can_filtered
    .groupby(['PostalCode','Borough'])['Neighborhood']
    .agg(lambda names: ", ".join(names.unique()))
    .reset_index()
)
df_can_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
### Show the dimensions of the grouped dataframe as (rows, columns)
df_can_grouped.shape

(103, 3)

# Task 2: Add the latitude and longitude coordinates to the dataframe

In [15]:
### Read the geospatial csv data
# The column names are overwritten positionally so the key column is called
# 'PostalCode', matching the key column of df_can_grouped.
df_can_pos = pd.read_csv('https://cocl.us/Geospatial_data')
df_can_pos.columns = ['PostalCode','Latitude','Longitude']

### Join the neighborhood data from df_can_grouped and geospatial data
# Inner join on PostalCode: postal codes missing from either frame are dropped.
# validate='one_to_one' makes pandas raise a MergeError if a postal code is
# duplicated on either side, instead of silently multiplying rows.
df_can_geo_data = pd.merge(
    df_can_grouped,
    df_can_pos,
    how='inner',
    on=['PostalCode'],
    validate='one_to_one',
)
df_can_geo_data.head(30)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [16]:

### Show the dimensions of the merged dataframe as (rows, columns)
df_can_geo_data.shape

(103, 5)