In [5]:
!conda install -c anaconda beautifulsoup4

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    beautifulsoup4: 4.6.0-py35h442a8c9_1 --> 4.6.3-py35_0 anaconda

beautifulsoup4 100% |################################| Time: 0:00:00   6.12 MB/s


In [6]:
!conda install -c anaconda lxml

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    libgcc-ng: 7.2.0-h7cc24e2_2     --> 8.2.0-hdf63c60_1     anaconda
    libxml2:   2.9.4-h6b072ca_5     --> 2.9.8-hf84eae3_0     anaconda
    libxslt:   1.1.29-hcf9102b_5    --> 1.1.33-h7d1a2b0_0    anaconda
    lxml:      4.1.0-py35ha401a81_0 --> 4.2.5-py35hefd8a0e_0 anaconda

libgcc-ng-8.2. 100% |################################| Time: 0:00:00  70.97 MB/s
libxml2-2.9.8- 100% |################################| Time: 0:00:00  19.31 MB/s
libxslt-1.1.33 100% |################################| Time: 0:00:00  51.73 MB/s
lxml-4.2.5-py3 100% |################################| Time: 0:00:00  48.04 MB/s


In [10]:
!conda install -c anaconda requests

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    requests: 2.18.4-py35hb9e6ad1_1 --> 2.19.1-py35_0 anaconda

requests-2.19. 100% |################################| Time: 0:00:00  34.72 MB/s


# Website Scraping and Transforming Data

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import csv

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
soup = BeautifulSoup(source, 'lxml')

In [4]:
table = soup.find_all('table')[0]

In [5]:
# import this HTML table in a DataFrame
dfs = pd.read_html(str(table), header = 0)

In [6]:
# function read_html always returns a list of DataFrame objects
df = dfs[0]

In [7]:
# renaming of columns headers
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [8]:
# Remove Boroughs that are not assigned
df = df[df['Borough'] != 'Not assigned']

In [9]:
# copying the Borough name onto Neighborhood only if Neighborhood name is not assigned
df['Neighborhood'] = df['Borough'].where(df['Neighborhood'] == 'Not assigned', df['Neighborhood'])
# or can do this >>> df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [10]:
# join all neighborhoods with the same postalcode and borough
grouped = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: ', '.join(x))

# convert a groupby object to dataframe
grouped.reset_index()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
df_nbh = pd.DataFrame(grouped)
dfnbh = df_nbh.reset_index()
dfnbh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
dfnbh.shape

(103, 3)

# Geocoding Neighborhoods

In [None]:
# insert geospatial data via Geocoder Python package
# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [13]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df_merged = pd.merge(left=dfnbh, right=geo, left_on='PostalCode', right_on='Postal Code', how='left')
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [15]:
df_merged.drop(['Postal Code'], axis=1, inplace=True)
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
