# Part 1

In [2]:
# Install necessary packages if it is not present
# !pip install wikipedia  https://pypi.org/project/wikipedia/
# !pip install beautifulsoup4

import pandas as pd 
import wikipedia as wp
from bs4 import BeautifulSoup

- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
- More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.
- Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
- In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [20]:
# Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")

df = pd.read_html(html, header = 0)[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


In [21]:
# Ignore cells with 'Not assigned'.
df = df.loc[df.Borough!='Not assigned']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


In [22]:
# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, 
# you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. 
#These two rows will be combined into one row with the neighborhoods separated with a comma

df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(list).apply(lambda x:', '.join(x)).to_frame().reset_index()
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [25]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as 
# the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the 
# Neighborhood columns will be Queen's Park.
for index, row in df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [26]:
#In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.
df.shape

(103, 3)

# Part 2 
Use the csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [27]:
import requests
import io

r=requests.get("http://cocl.us/Geospatial_data").content
r

b'Postal Code,Latitude,Longitude\r\nM1B,43.8066863,-79.1943534\r\nM1C,43.7845351,-79.1604971\r\nM1E,43.7635726,-79.1887115\r\nM1G,43.7709921,-79.2169174\r\nM1H,43.773136,-79.2394761\r\nM1J,43.7447342,-79.2394761\r\nM1K,43.7279292,-79.2620294\r\nM1L,43.7111117,-79.2845772\r\nM1M,43.716316,-79.2394761\r\nM1N,43.692657,-79.2648481\r\nM1P,43.7574096,-79.273304\r\nM1R,43.7500715,-79.2958491\r\nM1S,43.7942003,-79.2620294\r\nM1T,43.7816375,-79.3043021\r\nM1V,43.8152522,-79.2845772\r\nM1W,43.7995252,-79.3183887\r\nM1X,43.8361247,-79.2056361\r\nM2H,43.8037622,-79.3634517\r\nM2J,43.7785175,-79.3465557\r\nM2K,43.7869473,-79.385975\r\nM2L,43.7574902,-79.3747141\r\nM2M,43.789053,-79.4084928\r\nM2N,43.7701199,-79.4084928\r\nM2P,43.7527583,-79.4000493\r\nM2R,43.7827364,-79.4422593\r\nM3A,43.7532586,-79.3296565\r\nM3B,43.7459058,-79.352188\r\nM3C,43.7258997,-79.340923\r\nM3H,43.7543283,-79.4422593\r\nM3J,43.7679803,-79.4872619\r\nM3K,43.7374732,-79.4647633\r\nM3L,43.7390146,-79.5069436\r\nM3M,43.72849

In [31]:
coord=pd.read_csv(io.StringIO(r.decode('utf-8')))
print(coord)
#rename the POstal Code --> PostCode to merge with the wikipedia df 
coord.columns = ['Postcode', 'Latitude', 'Longitude']
dfmerge = pd.merge(coord, df, on='Postcode')
dfmerge

    Postal Code   Latitude  Longitude
0           M1B  43.806686 -79.194353
1           M1C  43.784535 -79.160497
2           M1E  43.763573 -79.188711
3           M1G  43.770992 -79.216917
4           M1H  43.773136 -79.239476
..          ...        ...        ...
98          M9N  43.706876 -79.518188
99          M9P  43.696319 -79.532242
100         M9R  43.688905 -79.554724
101         M9V  43.739416 -79.588437
102         M9W  43.706748 -79.594054

[103 rows x 3 columns]


Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,York,Weston
99,M9P,43.696319,-79.532242,Etobicoke,Westmount
100,M9R,43.688905,-79.554724,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,43.739416,-79.588437,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [32]:
# show the merged dataframe as needed
dfmerge = dfmerge[['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]
dfmerge.to_csv('torontoNeighbourhoods.csv')
dfmerge

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
