# Applied Data Science Capstone --- Week 3 assignment 

## by Rhys Davies (July 2019)

### Set-up

In [1]:
# Prepare
# Install BeautifulSoup
!conda install beautifulsoup4

# Install lxml
!conda install lxml

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lxml-4.3.4                 |   py36hefd8a0e_0         1.5 MB

The following packages will be UPDATED:

    lxml: 4.3.1-py36hefd8a0e_0 --> 4.3.4-py36hefd8a0e_0


Downloading and Extracting Packages
lxml-4.3.4           | 1.5 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


### Import Data

In [2]:
# Scrape the postal-code table

# Import libraries
import pandas as pd

# Page that holds the table
DataURL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html gives back a list of tables; the first entry is the one used below
df = pd.read_html(DataURL)
postcode_table = df[0]

# Report the source, the basic dimensions and a preview of the table
print('Show source : ', DataURL)
print(
    'Shape of data frame : ', postcode_table.shape,
    ' Size of data frame : ', postcode_table.size,
    ' Dimension of data frame : ', postcode_table.ndim,
)
print('Show top 5 rows of dataframe below : ')
postcode_table.head()

Show source :  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Shape of data frame :  (288, 3)  Size of data frame :  864  Dimension of data frame :  2
Show top 5 rows of dataframe below : 


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Prepare data

In [7]:
# Prepare the data (part 1: Name columns, eliminate 'Not assigned' in Borough column)
# Work on a copy: assigning .columns on df[0] itself would also rename the
# columns of the scraped table that the earlier cell displayed.
clean_df1 = df[0].copy()
clean_df1.columns = ['PostalCode', 'Borough', 'InitialNeighborhood']

# Keep only the rows whose borough has been assigned
clean_df1 = clean_df1[clean_df1.Borough != 'Not assigned']
clean_df1.head(5)

Unnamed: 0,PostalCode,Borough,InitialNeighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [9]:
# Prepare the data (part 2: Group by PostalCode)
# Collapse the rows of each postal code into one row, joining the distinct
# values of every other column with ', '.
# dict.fromkeys() drops duplicates while keeping first-seen order; set() has no
# defined order for strings, so the joined text could change between kernel
# sessions.
clean_df2 = clean_df1.groupby("PostalCode").agg(lambda x: ', '.join(dict.fromkeys(x)))
clean_df2.head(5)

Unnamed: 0_level_0,Borough,InitialNeighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
M1E,Scarborough,"Guildwood, West Hill, Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [10]:
# Prepare the data (part 3: Remove 'Not assigned' in the Neighborhood column)
# A neighbourhood that reads 'Not assigned' takes the name of its borough;
# every other neighbourhood is kept unchanged.
neighborhood = clean_df2.InitialNeighborhood.where(
    clean_df2.InitialNeighborhood != 'Not assigned',
    clean_df2.Borough,
)

# Build the result as a new frame instead of editing clean_df2 in place, so the
# cell can be re-run without failing on an already-dropped InitialNeighborhood.
clean_df3 = (
    clean_df2
    .assign(Neighborhood=neighborhood)
    .drop(["InitialNeighborhood"], axis=1)
    .reset_index()
)
clean_df3.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, West Hill, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
# Report the dimensions of the cleaned data frame
frame_shape = clean_df3.shape
print('Show shape of dataframe : ', frame_shape)
print('Show number of rows in dataframe : ', frame_shape[0])

Show shape of dataframe :  (103, 3)
Show number of rows in dataframe :  103


### Examine geolocation 

In [131]:
# Reference: coordinates can be obtained with the geocoder package (https://geocoder.readthedocs.io/index.html) or read from the prepared CSV (http://cocl.us/Geospatial_data); the CSV is used below.

In [12]:
# Import location data
# Address of the CSV holding a latitude and longitude per postal code
LocURL = 'http://cocl.us/Geospatial_data'

# Read the CSV into a data frame
loc_df = pd.read_csv(LocURL)

# Report its dimensions and preview the first rows
loc_shape = loc_df.shape
print('Shape of loc_df : ', loc_shape)
loc_df.head(5)

Shape of loc_df :  (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Give the key column the same name in both frames
clean_loc_df = loc_df.rename(columns={"Postal Code": "PostalCode"})

# Index both frames by postal code
postal_indexed = clean_df3.set_index('PostalCode')
coords_indexed = clean_loc_df.set_index('PostalCode')

# Attach the coordinates to each postal code, then turn the key back into a column
total_df = postal_indexed.join(coords_indexed).reset_index()

# Preview the combined frame
total_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Examine Clustering

In [14]:
# Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

# TODO: add enough Markdown cells to explain what was decided and to report any observations made.
# TODO: generate maps to visualize the neighborhoods and how they cluster together.

In [142]:
# Load required libraries
# NOTE(review): requests and json_normalize are not referenced by any cell
# visible in this part of the notebook — confirm they are still needed.

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pd.json_normalize —
# confirm this import path against the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab RUN!
import folium # map rendering library

### Visualise data on map

In [136]:
# Centre the map on the mean latitude / longitude of all postal codes.
# Series.mean() replaces the hand-rolled sum(...) / len(...); float() keeps the
# result a plain Python float, as before.
centrelatitude = float(total_df['Latitude'].mean())
centrelongitude = float(total_df['Longitude'].mean())
print('Map centred at : ', centrelatitude, ' ', centrelongitude)

# Create a map
map_canada = folium.Map(location=[centrelatitude, centrelongitude], zoom_start=10)

# Add one circle marker per postal code; the popup text shows the postal code,
# its neighbourhood(s) and its borough.
marker_rows = zip(
    total_df['PostalCode'],
    total_df['Borough'],
    total_df['Neighborhood'],
    total_df['Latitude'],
    total_df['Longitude'],
)
for pc, borough, neighborhood, lat, lng in marker_rows:
    popup_text = '{}, {}, {}'.format(pc, neighborhood, borough)
    popup = folium.Popup(popup_text, parse_html=True)
    # NOTE(review): parse_html looks like a Popup option rather than a
    # CircleMarker one — confirm it has an effect here before relying on it.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_canada)

# Display the map as the cell output
map_canada

Map centred at :  43.70460773398059   -79.39715291165048
