In [2]:
#!conda install -c conda-forge beautifulsoup4
from bs4 import BeautifulSoup
#!conda install -c conda-forge lxml
import requests
from geopy.geocoders import Nominatim
!conda install -c conda-forge folium
import folium
import pandas as pd
from pandas.io.json import json_normalize

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.9.1               |             py_0          59 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will be IN

### 1. Scrape the Wikipedia page to get data

In [3]:
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
#print(soup.prettify())

In [4]:
table=soup.find('table',class_='wikitable sortable')
data=table.find_all('td')
Postcode=[]
Borough=[]
Neighborhood=[]
for item in data[0::3]:
    Postcode.append(item.text)
for item in data[1::3]:
    try:
        Borough.append(item.a.text)
    except:
        Borough.append(None)
for item in data[2::3]:
    try:
        Neighborhood.append(item.a.text)
    except:
        Neighborhood.append(None)

### 2. Create the dataframe

In [5]:
df_data={'Postcode':Postcode,'Borough':Borough,'Neighborhood':Neighborhood}
df=pd.DataFrame(df_data,columns=['Postcode','Borough','Neighborhood'])
df.shape

(288, 3)

#### Remove cells with a borough that is Not assigned

In [6]:
df.dropna(subset=['Borough'],inplace=True)
df.reset_index(drop=True,inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Assign neighborhoods that are Not assigned to their borough

In [7]:
missing_index=df[df['Neighborhood'].isna()].index
df.loc[missing_index,'Neighborhood']=df.loc[missing_index,'Borough']
df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### Combine multiple neighborhoods with the same postcode togeter, separated with ',' 

In [8]:
df.drop_duplicates(inplace=True)
Toronto_Neighborhoods=df.groupby(['Postcode','Borough'])['Neighborhood'].apply(lambda x: ', '.join(x)).to_frame().reset_index()
Toronto_Neighborhoods.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Scarborough, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Scarborough
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Scarborough, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough"
9,M1N,Scarborough,"Birch Cliff, Scarborough"


In [9]:
Toronto_Neighborhoods.shape

(100, 3)

#### Get the location information (latitude, longitude)

In [10]:
df_geo=pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
df_Toronto_Neighborhoods=pd.merge(Toronto_Neighborhoods,df_geo,left_on='Postcode',right_on='Postal Code')
df_Toronto_Neighborhoods.drop(columns=['Postal Code'],inplace=True)
df_Toronto_Neighborhoods.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Scarborough, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Scarborough,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Scarborough, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Scarborough",43.692657,-79.264848
