# Week 3 Assignment

## Code to parse the wikipage and build dataframe

In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
filename = 'wiki_scrape.csv'  # nothing in this cell writes this file; the name is only kept defined

# Fixed column labels: the groupby below and the later merge on 'Postcode'
# need these exact names, so they must not depend on the page's header text.
headers = ['Postcode', 'Borough', 'Neighbourhood']

# Locate the first table carrying the "wikitable" class.
wikitable = soup.find('table', class_='wikitable')
if wikitable is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

# Use the <tbody> when the markup has one, otherwise search the table itself.
wikitbody = wikitable.find('tbody')
if wikitbody is None:
    wikitbody = wikitable

# Collect one [postcode, borough, neighbourhood] record per usable table row.
dataList = []
for trow in wikitbody.find_all('tr'):
    # Rows that contain <th> cells are header rows and hold no data.
    if trow.find_all('th'):
        continue

    cols = trow.find_all('td')
    # Rows with fewer than three cells cannot be mapped onto the three columns.
    if len(cols) < 3:
        continue

    postcode = cols[0].text.strip()
    # .text covers both plain-text cells and cells whose content is a link.
    borough = cols[1].text.strip()
    neighborhood = cols[2].text.strip()

    # Postal codes without a borough are not loaded at all.
    if borough.lower() == 'not assigned':
        continue

    # A missing neighbourhood falls back to the borough name.
    if neighborhood.lower() == 'not assigned':
        neighborhood = borough

    dataList.append([postcode, borough, neighborhood])

if not dataList:
    raise ValueError('No postal code rows could be parsed from the wikitable')

# Build the frame once from the collected records, then combine all
# neighbourhoods that share a (Postcode, Borough) pair into one
# comma-separated string.
tor_df = (
    pd.DataFrame(dataList, columns=headers)
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
tor_df.head()
                      

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### The geocoder did not return any values, so its code is not used; the CSV file provided with the assignment is used instead. Double-click <b>here</b> for the solution.

<!-- 
# !pip install geocoder
# The following code did not work because geocoder did not return a value on any iteration.
# It is therefore not used; the CSV file provided with the assignment is used instead.

import geocoder
# initialize your variable to None
lat_lng_coords = None
i=0
# loop until you get the coordinates
while(lat_lng_coords is None):
    i=i+1
    print( i)
    gvar = geocoder.google('{}, Toronto, Ontario'.format(tor_df['Postcode'][0]))
    lat_lng_coords = gvar.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

-->

In [4]:
# Read the latitude/longitude lookup table from the assignment's data URL
geo_data_url = 'http://cocl.us/Geospatial_data'
lat_lng_df = pd.read_csv(geo_data_url)

# Preview the first rows
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Rename the first column so it can be merged with the Toronto dataframe

In [5]:
# Rename the "Postal Code" column to "Postcode" so it matches the column name
# used for postal codes in tor_df. Only the column label changes; the row index
# is left as it is.
lat_lng_df = lat_lng_df.rename(columns={"Postal Code": "Postcode"})

### Merge both dataframes to get the final dataframe with boroughs, latitudes and longitudes

In [7]:
# Inner-join the neighbourhood table with the coordinate table on the shared
# 'Postcode' column; rows without a match on both sides are left out.
tor_df_loc = tor_df.merge(lat_lng_df, how='inner', on='Postcode')
tor_df_loc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [8]:
# Count the distinct borough values and the rows of the merged frame, then report both
borough_count = tor_df_loc['Borough'].nunique(dropna=False)
row_count = len(tor_df_loc)
print('The dataframe has {} boroughs and {} postcodes.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 postcodes.


In [9]:
from geopy.geocoders import Nominatim

# Look up the coordinates of Toronto; they are used as the centre of the map.
address = 'Toronto, Ontario'

# An explicit timeout gives the public Nominatim service more time to answer
# than geopy's short default.
geolocator = Nominatim(user_agent="toronto_explorer", timeout=10)
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronoto are 43.653963, -79.387207.


In [10]:
import folium
# Build a map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of the merged frame, labelled "neighbourhood, borough"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in tor_df_loc[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighbourhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one - confirm whether this argument can be dropped.
        parse_html=False)
    marker.add_to(map_toronto)

# Last expression of the cell: render the map
map_toronto