## Notebook to scrape the Wikipedia page <a href="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M">List of postal codes of Canada: M</a>

In [None]:
! pip install beautifulsoup4
! pip install lxml
! pip install geocoder
# problem installing below 2 packages in local machine
! pip install geopy
! pip install folium

In [None]:
! pip install requests
! pip install html5lib
! pip install bs4

In [12]:
import os
import urllib.request
from io import StringIO

import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.cluster import KMeans
%matplotlib inline

### Getting the HTML of the page is just the first step. The next step is to create a Beautiful Soup object from the HTML by passing it to the BeautifulSoup() function. The Beautiful Soup package parses the HTML, that is, it takes the raw HTML text and breaks it into Python objects. The second argument (for example 'lxml') names the HTML parser to use; its details are not important at this point.

In [17]:
# Print the version of the Python interpreter running this notebook.
from platform import python_version
print(python_version())

3.7.3


In [13]:
from bs4 import BeautifulSoup

In [None]:
import folium

In [None]:
from geopy.geocoders import Nominatim

In [7]:
# Address of the Wikipedia page "List of postal codes of Canada: M" to scrape.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [8]:
import bs4
from urllib.request import urlopen

In [None]:
# Download the page body and close the HTTP connection as soon as it has been
# read; `html` holds the raw bytes, which BeautifulSoup accepts directly.
with urlopen(url) as response:
    html = response.read()

In [None]:
# Parse the downloaded page into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

### Import the libraries needed for web scraping.
The urllib.request module is used to open URLs.
The Beautiful Soup package is used to extract data from html files.
The Beautiful Soup library's name is bs4 which stands for Beautiful Soup version 4.


In [None]:
# Print the page's <title> element as a quick check that the page was parsed.
title = soup.title
print(title)

In [None]:
# Look up the coordinates of Toronto; they are used later to centre the map.
addr = 'Toronto, Canada'
geoloc = Nominatim(user_agent="toronto_explorer")
# An explicit timeout avoids hanging on a slow geocoding service.
location = geoloc.geocode(addr, timeout=10)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(addr))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# Collect the sortable wiki tables of the page under their own name. `soup`
# is left untouched so that this cell can be re-run: rebinding `soup` to the
# result list made a second run fail because the list has no find_all().
wiki_tables = soup.find_all('table', class_='wikitable sortable')

In [None]:
import requests
# Fetch the page; give up after 30 s and raise on an HTTP error status rather
# than silently parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# NOTE(review): assumes the first <table> on the page is the postal-code table.
table = soup.find_all('table')[0]
# read_html expects a path, URL or file-like object; wrap the markup in
# StringIO because passing literal HTML as a string is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

In [None]:
df.head()

### Clean the data by filtering out the rows whose Borough is "Not assigned"
df_data = df['Borough'].ne('Not assigned')
df_main = df.loc[df_data]
df_main.head()

In [None]:
### Collapse the rows that share a Postcode and Borough into a single row whose
### Neighbourhood is the comma-separated list of their neighbourhoods
neighbourhoods_by_code = df_main.groupby(['Postcode','Borough'])['Neighbourhood']
df1 = neighbourhoods_by_code.apply(', '.join).reset_index()

In [None]:
### Where the Neighbourhood is "Not assigned", use the Borough name instead
is_unassigned = df1['Neighbourhood'] == 'Not assigned'
df1['Neighbourhood'] = df1['Neighbourhood'].mask(is_unassigned, df1['Borough'])

In [None]:
# NOTE(review): a cell only displays its last expression, so the head()
# result is discarded here and only the shape is shown.
df1.head()
df1.shape

In [None]:
## Load the geospatial coordinates from a hosted CSV file and preview the first rows
geodata = "https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"
df_geodata = pd.read_csv(geodata)
df_geodata.head()

In [None]:
# Rename the key column so it matches the column name used in the geodata frame.
df1 = df1.rename(columns={"Postcode": "Postal Code"})

In [None]:
# NOTE(review): neither result is assigned and inplace=True is not passed, so
# df1 and df_geodata themselves are left unchanged; the first result is
# discarded and the second is only displayed as the cell output.
df1.set_index(['Postal Code'])
df_geodata.set_index(['Postal Code'])

In [None]:
## Join both dataframes on their common key column "Postal Code";
## the inner join keeps only postal codes present in both frames

df_main = df1.merge(df_geodata, how='inner', on='Postal Code')

## Now let's explore the neighbourhoods in Toronto

In [None]:
# Foursquare API settings used by getVenues().
LIMIT = 100            # sent as the `limit` query parameter of each request
VERSION = '20180604'   # sent as the `v` (API version date) query parameter
# Credentials are read from the environment instead of being committed with
# the notebook. The key pair that used to be hard-coded here has been
# published with the notebook and should be revoked and replaced.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'environment variables before requesting venues.')
# Create a new function
def getVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues around each location from the Foursquare explore endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label, latitude and longitude of each location. They are walked in
        lockstep with zip(), so iteration stops at the shortest one.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        # Surface API failures (bad credentials, quota, ...) as an HTTP error
        # instead of a KeyError while digging through the JSON payload.
        response.raise_for_status()

        # A location without recommendations may come back without groups.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # Keep venues that carry no category; their category is None.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names here also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Query the venues around every neighbourhood of the merged frame.
venues_of_tornoto = getVenues(
    df_main['Neighbourhood'],
    df_main['Latitude'],
    df_main['Longitude'],
)

In [None]:
# Print the size of the venue table, then preview its first rows.
print(venues_of_tornoto.shape)
venues_of_tornoto.head()

### Encode the data

In [None]:
# one indicator column per venue category
trans_data = pd.get_dummies(venues_of_tornoto[['Venue Category']], prefix="", prefix_sep="")
# append the neighbourhood of each venue as a new column
trans_data['Neighbourhood'] = venues_of_tornoto['Neighbourhood']
# rotate the column order so that the last column becomes the first one
all_columns = trans_data.columns.tolist()
fixed_columns = all_columns[-1:] + all_columns[:-1]
trans_data = trans_data[fixed_columns]
trans_data.head()

### Now group rows by neighbourhood taking mean of the frequency of occurrence of each category

In [None]:
# mean of every category indicator per neighbourhood
venue_means = trans_data.groupby('Neighbourhood').mean()
# turn the neighbourhood index back into an ordinary column
group_by_venue = venue_means.reset_index()
group_by_venue.head()

### Let's print each neighbourhood along with top 5 common venues

In [None]:
topVenues = 5

for hood in group_by_venue['Neighbourhood']:
    print("----"+hood+"----")
    # transpose the row of this neighbourhood into a (venue, freq) table
    hood_row = group_by_venue[group_by_venue['Neighbourhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']
    freq_table = (
        freq_table.iloc[1:]                      # drop the Neighbourhood entry itself
        .astype({'freq': float})
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(freq_table.head(topVenues))
    print('\n')

## Function to sort the venues in descending order

In [None]:
def common_venues(venue_var, top_how_many):
    """Return the labels of the largest entries of a row, largest first.

    The first element of `venue_var` is skipped before ranking; the
    remaining entries are sorted in descending order and the index labels
    of the first `top_how_many` of them are returned as an array.
    """
    ranked = venue_var.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:top_how_many]

## Now create a new dataframe and display the top 10 venues for each neighborhood.

In [None]:
topVenues = 10

# ordinal suffixes of the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']
# one column per rank: '1st Most Frequent Venue', '2nd Most Frequent Venue', ...
column_names = ['Neighbourhood']
for ind in range(topVenues):
    # Choose the suffix with an explicit bounds check instead of a bare
    # `except:`, which would also hide unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    column_names.append('{}{} Most Frequent Venue'.format(ind+1, suffix))

# frame with one row per neighbourhood and one column per rank
sorted_v = pd.DataFrame(columns=column_names)
sorted_v['Neighbourhood'] = group_by_venue['Neighbourhood']

# fill the rank columns of each row with its most frequent venue categories
for row_pos in range(len(group_by_venue)):
    sorted_v.iloc[row_pos, 1:] = common_venues(group_by_venue.iloc[row_pos, :], topVenues)

sorted_v.head()

### Run k-means to cluster the neighborhoods

In [None]:
# number of clusters to make
no_c = 5

# Feature matrix: every column except the neighbourhood label. The column is
# dropped by keyword because recent pandas releases no longer accept the
# axis as a positional argument of drop().
venue_clusters = group_by_venue.drop(columns='Neighbourhood')

# Run k-means clustering. n_init is pinned to 10 so the number of restarts
# does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=no_c, random_state=0, n_init=10).fit(venue_clusters)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

In [None]:
# add cluster labels
sorted_v.insert(0, 'Clusters', kmeans.labels_)

tor_df = df_main

# merge main dataframe with previous dataframe to add lat long for the neighbourhoods
tor_df = tor_df.join(sorted_v.set_index('Neighbourhood'), on='Neighbourhood')
tor_df = tor_df.fillna(0)
tor_df["Clusters"] = tor_df["Clusters"].astype(int)
tor_df.head()

### Visualize the clusters

In [None]:
# create the map centred on the Toronto coordinates
cl_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# one evenly spaced rainbow colour per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, no_c))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(tor_df['Latitude'], tor_df['Longitude'], tor_df['Neighbourhood'], tor_df['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index cluster-1 wraps around for cluster 0, which takes the last colour
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(cl_map)

cl_map