# 1. Creation of DataFrame

<span> Importing request,BeautifulSoup and pandas libraries </span>

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

<span> Creating Dictionary with the specified columns PostalCode, Borough, and Neighborhood </span>
<br>
<span> Using Beautiful Soup Library to scrape the data from the given URL</span>

In [5]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
table = soup.table
headers = table.find_all('th')
rows = table.find_all('tr')

In [10]:
dict_df = {}
for i in range(len(headers)):
    col_content = headers[i].text.strip()
    dict_df[col_content.replace(" ","")] = []
dict_df

{'PostalCode': [], 'Borough': [], 'Neighborhood': []}

<span>Populating the dictionary with the row data values with respect to the columns</span>

In [11]:
for i in range(1,len(rows)):
    row_content = rows[i].find_all('td')
    for j in range(len(row_content)):
        cell_content = row_content[j].text.rstrip()
        if j == 0:
            dict_df['PostalCode'].append(cell_content)
        if j == 1:
            dict_df['Borough'].append(cell_content)
        if j == 2:
            dict_df['Neighborhood'].append(cell_content)    

Using pandas library to convert dictionary into a dataframe

In [12]:
df = pd.DataFrame.from_dict(dict_df)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Ignoring 'Not assigned' values of 'Borough' columns

In [13]:
df = df[df['Borough'] != 'Not assigned']

Creating a function for assignment of 'Neighborhood' column

In [14]:
def neighborhood_assignment(row):
    if row['Neighborhood'] == '':
        val = row['Borough']
    else:
        val = row['Neighborhood']
    return val

df['Neighborhood'] = df.apply(neighborhood_assignment,axis=1)

Grouping the dataframe by 'PostalCode' for combining the 'Neighborhood' values into a single row

In [15]:
df.groupby('PostalCode')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Calling shape method on the dataframe to retrieve the rows and columns size

In [17]:
df.shape

(103, 3)

# 2. Assignment of Latitude and Longitude values to each Postal Code

<span> Using the 'Geospatial_Coordinates.csv' for fetching geospatial coordinate values</span>

In [70]:
geo_spatial_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_spatial_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Defining a function to retrieve latitude and longitude values from the geospatial dataframe

In [53]:
def retrieve_coordinates(postal_code):
    return geo_spatial_df.loc[geo_spatial_df['Postal Code'] == postal_code].iloc[0]

Retrieving the geospatial coordinates for each postal code and assigning it to the data frame

In [57]:
latitudes = []
longitudes = []
for index, row in df.iterrows():
    geo_spacial_data = retrieve_coordinates(row['PostalCode'])
    latitudes.append(geo_spacial_data['Latitude'])
    longitudes.append(geo_spacial_data['Longitude'])
df['Latitude'] = latitudes
df['Longitude'] = longitudes
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# 3. Segmentation and Clustering of Toronto Neighborhood

In [65]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\saisu\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.8.3                |   py37hc8dfbb8_1         3.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    python_abi-3.7             |          1_cp37m           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0
  python_abi



  current version: 4.8.2
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda




Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\saisu\anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    certifi-2019.11.28         |           py37_0         148 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
         

In [66]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [67]:
neighborhoods = df
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


<h3>Mapping out the Toronto map for segmentation and clustering of neighborhoods in the dataframe</h3>

In [68]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [69]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto