### Obtain table from Wikipedia page and transform the data into a pandas dataframe

In [24]:
import pandas as pd
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Rename 'Postcode' into 'PostalCode' and 'Neighbourhood' into 'Neighborhood'

In [25]:
df.rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}, inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Drop rows where Borough is 'Not assigned'

In [26]:
# Get names of indexes for which column Borough has value 'Not Assigned'
indexNames = df[df['Borough'] == 'Not assigned'].index
 
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


### Make sure that Neighborhood has 'Not assigned' values

In [27]:
df.Neighborhood.unique()

array(['Parkwoods', 'Victoria Village', 'Harbourfront',
       'Lawrence Heights', 'Lawrence Manor', "Queen's Park",
       'Not assigned', 'Rouge', 'Malvern', 'Don Mills North',
       'Woodbine Gardens', 'Parkview Hill', 'Ryerson', 'Garden District',
       'Glencairn', 'Cloverdale', 'Islington', 'Martin Grove',
       'Princess Gardens', 'West Deane Park', 'Highland Creek',
       'Rouge Hill', 'Port Union', 'Flemingdon Park', 'Don Mills South',
       'Woodbine Heights', 'St. James Town', 'Humewood-Cedarvale',
       'Bloordale Gardens', 'Eringate', 'Markland Wood',
       'Old Burnhamthorpe', 'Guildwood', 'Morningside', 'West Hill',
       'The Beaches', 'Berczy Park', 'Caledonia-Fairbanks', 'Woburn',
       'Leaside', 'Central Bay Street', 'Christie', 'Cedarbrae',
       'Hillcrest Village', 'Bathurst Manor', 'Downsview North',
       'Wilson Heights', 'Thorncliffe Park', 'Adelaide', 'King',
       'Richmond', 'Dovercourt Village', 'Dufferin',
       'Scarborough Village', 'Fairv

### Replace Neighborhood with 'Not assigned' value with Borough's value of the same row

In [28]:
df.loc[df.Neighborhood == 'Not assigned', 'Neighborhood'] = df['Borough']
df.Neighborhood.unique()

array(['Parkwoods', 'Victoria Village', 'Harbourfront',
       'Lawrence Heights', 'Lawrence Manor', "Queen's Park", 'Rouge',
       'Malvern', 'Don Mills North', 'Woodbine Gardens', 'Parkview Hill',
       'Ryerson', 'Garden District', 'Glencairn', 'Cloverdale',
       'Islington', 'Martin Grove', 'Princess Gardens', 'West Deane Park',
       'Highland Creek', 'Rouge Hill', 'Port Union', 'Flemingdon Park',
       'Don Mills South', 'Woodbine Heights', 'St. James Town',
       'Humewood-Cedarvale', 'Bloordale Gardens', 'Eringate',
       'Markland Wood', 'Old Burnhamthorpe', 'Guildwood', 'Morningside',
       'West Hill', 'The Beaches', 'Berczy Park', 'Caledonia-Fairbanks',
       'Woburn', 'Leaside', 'Central Bay Street', 'Christie', 'Cedarbrae',
       'Hillcrest Village', 'Bathurst Manor', 'Downsview North',
       'Wilson Heights', 'Thorncliffe Park', 'Adelaide', 'King',
       'Richmond', 'Dovercourt Village', 'Dufferin',
       'Scarborough Village', 'Fairview', 'Henry Farm', 'Or

In [29]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


### Group dataframe based on PostalCode and Borough and combine Neighborhood value for each group

In [30]:
df = df.groupby(["PostalCode", "Borough"])
df = df["Neighborhood"].agg(lambda column: ", ".join(column))
df = df.reset_index(name="Neighborhood")
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Check the number of rows and column of the dataframe

In [31]:
df.shape

(103, 3)

### Try using the Geocoder Python package to get the latitude and the longitude coordinates of each neighborhood

In [82]:
#import geocoder
import geocoder

#initialize your variable to None
lat_lng_coords=None

#loop until you get the coordinates
while (lat_lng_coords is None):
    g=geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords=g.latlng
    
latitude=lat_lng_coords[0]
longitude=lat_lng_coords[1]

ModuleNotFoundError: No module named 'geocoder'

#### Let's try the other method!

###  Load the csv file that has the geographical coordinates of each postal code

In [32]:
coord_list=pd.read_csv("http://cocl.us/Geospatial_data")
coord_list

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


### Concatenate columns in df and coord_list dataframes

In [33]:
df = pd.concat([df, coord_list], axis=1)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848


### Eliminate 'Postal Code' column

In [34]:
df.drop(df.columns[3], axis = 1, inplace = True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
