## NOTEBOOK on IBM Watson Studio shared via GitHub

This notebook will be used for Applied Data Science Capstone Project week 3 assignment (this is part 2 - adding coordinates to the data framework, as per instructions for 2 points)

In [2]:
import pandas as pd
import numpy as np
import requests
import urllib.request
from bs4 import BeautifulSoup
import lxml

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  49.35 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.39 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  40.05 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  44.97 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |###################

Setting up the basics:

In [3]:
# Data will be retrieved from the given wiki page
wikipedia_link_to_Canada_postal_codes='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Request and sava data in 'page' object
page = requests.get(wikipedia_link_to_Canada_postal_codes)
# Use 'BeautifulSoup' tool to work with retrieved data
soup = BeautifulSoup(page.text, 'lxml')
# Creating an empty data frame to store the data 
df = pd.DataFrame()

Next, the data from the messy page needs to be retrieved. The needed data is in the only table within that page

In [4]:
# Finding the table in the page
match_table = soup.find('table', class_='wikitable sortable')

# Needed infrmation first will be collected in lists
List_PostalCode = []
List_Borough = []
List_Neighborhood = []

# The following cyclicly goes through each row of the table and collects information
i=1
for match_element in match_table.find_all('td'):
    if i==1: List_PostalCode.append(match_element.text)
    if i==2: List_Borough.append(match_element.text)
    if i==3: 
        List_Neighborhood.append(match_element.text)
        i=0
    i=i+1

Collected data lists are assigned to the data frame

In [5]:
df['PostalCode']=List_PostalCode
df['Borough']=List_Borough
df['Neighborhood']=List_Neighborhood

Next, transformations are applied to the data to form the data frame required by this exercise.

In [6]:
# Cleaning up neighbouhood names
for index, element in df.iterrows():
    element['Neighborhood']=element['Neighborhood'].strip("\n")

# Removing rows which do not have Boroughs assigned
# Where neighbourhoods are not assigned, the Borough name is used
for index, element in df.iterrows():
    if element['Borough'] == 'Not assigned': df=df.drop(index)
    if element['Neighborhood'] == 'Not assigned': element['Neighborhood']=element['Borough']

# Dataframe is reindexed
df = df.reset_index(drop=True)

# Neighborboods under the same Boroughs are merged and separated by commas, as per example
df = df.groupby(['PostalCode','Borough'], sort=False, as_index=False).agg(', '.join)

Verifying that the resulting data frame is as requested per exercise

In [7]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [8]:
df.shape

(103, 3)

Coordinates will be retrieved from given URL with csv file:

In [9]:
df_coordinates = pd.read_csv("http://cocl.us/Geospatial_data")

# Colums for Latitude and Longitude are created
df['Latitude']=''
df['Longitude']=''

The following goes through the file with coordinates and iteratively retrieves the ones matching a certain Postal Code:

In [10]:
Lat=0
Lon=0
i=-1
for index, element in df.iterrows():
    if i<len(df):
        i=i+1
        # Finding the right coordinates
        row_coordinate = df_coordinates.loc[df_coordinates['Postal Code'] == element['PostalCode']]
        # Adding the coordinates to a list element
        Lat = row_coordinate['Latitude'].tolist()
        Lon = row_coordinate['Longitude'].tolist()
        # Assigning the values to the data frame
        df.at[i,'Latitude']=Lat[0]
        df.at[i,'Longitude']=Lon[0]

In [11]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Not assigned,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
7,M3B,North York,Don Mills North,43.7459,-79.3522
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.7064,-79.3099
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789


In [12]:
df.shape

(103, 5)

### Data Analysis

##### For the sake of the exercise, the analysis will be similar to the example by the professors but slightly less complicated:
##### 1. Subset of Toronto neighborhoods will be chosen and displayed
##### 2. All coffee shops from the neighboorhoods will be taken from Foursquare API
##### 3. Out of all coffee shops, the biggest branches will be chosen
##### 4. Analysis will show amount of coffee shops of biggest branches per neighborhoods (note: however, with distinct coordinates. Assumption is that if two neighborhoods have the same coordinates, only one of them is chosen)
##### 5. Kmeans algorithm will be used to cluster the coffee shops and result displayed on the map

 ### BUSINESS PROBLEM: A new Coffee shop branch would like to come into Downtown Toronto, however, the coffee shops will be opened starting with neighborhoods where there is a least amount of other famous Coffee shops having multiple places 

## 1. Choosing initial data frame subset with Toronto neighborhoods

Starting with data frame:

In [14]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Not assigned,43.6623,-79.3895


For this exercise, the interest is only in the neighborhoods in Borough of Downtown Toronto:

In [15]:
# PostalCode will not be needed - we'll drop the column
df=df.drop(['PostalCode'], axis=1)
# We'll chose only Downtown Toronto Borough
for index, element in df.iterrows():
    if "Downtown Toronto" not in element['Borough']: df=df.drop(index)
df=df.reset_index(drop=True)

In [16]:
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
1,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789
2,Downtown Toronto,St. James Town,43.6515,-79.3754
3,Downtown Toronto,Berczy Park,43.6448,-79.3733
4,Downtown Toronto,Central Bay Street,43.658,-79.3874


## 2. Finding Coffee shops from Foursquare API 

In [17]:
address = 'Downtown Toronto'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

  from ipykernel import kernelapp as app


The geograpical coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.


In [18]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
# Contains my foursquare Credentials
# This code is removed for sharing

In [38]:
i=-1
# Data frame for coffee shops
nearby_coffee_shops_final_df = pd.DataFrame()
for index, element in df.iterrows():
    i=i+1
    if i<len(df):
        
        # Find the coffee shops from Foursquare
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{},&query={},&v={}'.format(CLIENT_ID, CLIENT_SECRET, element['Latitude'], element['Longitude'], search_query, VERSION)
        results = requests.get(url).json()
        foursquare_response = results['response']['venues']

        nearby_coffee_shops_df = json_normalize(foursquare_response) # flatten JSON
        
        # Taking Address, Coffee Shop name, Latitude and Longitude
        filtered_columns = ['location.address', 'name', 'location.lat', 'location.lng']
        nearby_coffee_shops_df = nearby_coffee_shops_df.loc[:, filtered_columns]
        
        # Renaming the columns properly
        nearby_coffee_shops_df.columns = ['Address', 'Name', 'Latitude', 'Longitude']
        
        # Adding the neighborhood name
        nearby_coffee_shops_df['Neighborhood']=element['Neighborhood']
    
    # Finally adding the results to initially created data frame
    nearby_coffee_shops_final_df = pd.concat([nearby_coffee_shops_final_df, nearby_coffee_shops_df])
    
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.reset_index(drop=True)

In [39]:
nearby_coffee_shops_final_df.head()

Unnamed: 0,Address,Name,Latitude,Longitude,Neighborhood
0,120 Lombard St,Fahrenheit Coffee,43.652384,-79.372719,"Harbourfront, Regent Park"
1,354 Queen St. E,Redline Coffee and Espresso,43.655692,-79.364095,"Harbourfront, Regent Park"
2,519 Parliament St.,Jetfuel Coffee,43.665295,-79.368335,"Harbourfront, Regent Park"
3,10 Market Street,Balzac's Coffee,43.648457,-79.37179,"Harbourfront, Regent Park"
4,479 Broadview Ave,Rooster Coffee House,43.669177,-79.353134,"Harbourfront, Regent Park"


In [40]:
nearby_coffee_shops_final_df.shape

(540, 5)

#### CONCLUSION: So, we have 540 Coffee Shops within Neighborhoods of Downtown Toronto Borough

### 3. Finding the branches with the most Coffee shops

Now, we'll find the biggest branches. The assumption is that a branch is 'a big branch' if it has 20 and more coffee shops in Downtown Toronto

In [41]:
# First, we'll count the Coffee Shops
biggest_competitors_df=nearby_coffee_shops_final_df
biggest_competitors_df=biggest_competitors_df.groupby('Name', as_index=False).count()

# Second, we chose only the ones with 20 or more coffee shops
for index, element in biggest_competitors_df.iterrows():
    if element['Address'] < 20: biggest_competitors_df=biggest_competitors_df.drop(index)

biggest_competitors_df=biggest_competitors_df.reset_index(drop=True)

# adding those to series
biggest_competitors_series=[]
for index, element in biggest_competitors_df.iterrows():
      biggest_competitors_series.append(element['Name'])

# Finally, we form a list where only the biggest competitors are present
i=-1
for index, element in nearby_coffee_shops_final_df.iterrows():
    if i<len(nearby_coffee_shops_final_df):
        i=i+1
        if element['Name'] not in biggest_competitors_series: nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.drop(index)
            
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.reset_index(drop=True)

In [42]:
nearby_coffee_shops_final_df.head()

Unnamed: 0,Address,Name,Latitude,Longitude,Neighborhood
0,120 Lombard St,Fahrenheit Coffee,43.652384,-79.372719,"Harbourfront, Regent Park"
1,10 Market Street,Balzac's Coffee,43.648457,-79.37179,"Harbourfront, Regent Park"
2,479 Broadview Ave,Rooster Coffee House,43.669177,-79.353134,"Harbourfront, Regent Park"
3,122 Bond Street,Balzac's Coffee,43.657854,-79.3792,"Harbourfront, Regent Park"
4,1 Trinity Street,Balzac's Coffee,43.649797,-79.359142,"Harbourfront, Regent Park"


In [43]:
nearby_coffee_shops_final_df.shape

(378, 5)

#### CONCLUSION: So, we have 378 Coffee Shops within Neighborhoods of Downtown Toronto Borough, where branch holds 20 or more of them

## 4. Analysing which neighborhoods are with biggest amount of Coffee Shops owned by Big Branches

Counting coffee shops per neighboorhood (or rather separate collections of neighborhoods with given the same coordinates, as it was given like that from the beginning of this exercise)

In [44]:
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.groupby(['Neighborhood'], as_index=False).count()

# Getting rid of the columns which are not needed
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.drop(['Address'], axis=1)
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.drop(['Latitude'], axis=1)
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.drop(['Longitude'], axis=1)

In [45]:
nearby_coffee_shops_final_df.head()

Unnamed: 0,Neighborhood,Name
0,"Adelaide, King, Richmond",18
1,Berczy Park,18
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",20
3,"Cabbagetown, St. James Town",30
4,Central Bay Street,20


Next, normalizing the numbers:

In [46]:
max_coffee_shops = nearby_coffee_shops_final_df['Name'].max()

i=-1
for index, element in nearby_coffee_shops_final_df.iterrows():
    i=i+1
    if i<len(nearby_coffee_shops_final_df):
        # Divide by max value
        new_value = round(element['Name']/max_coffee_shops,2)
        # Adding the new value
        nearby_coffee_shops_final_df.at[i,'Count']=new_value

# Removing the name count, as now we have the 'Count' column  
nearby_coffee_shops_final_df=nearby_coffee_shops_final_df.drop(['Name'], axis=1)

In [47]:
nearby_coffee_shops_final_df.head()

Unnamed: 0,Neighborhood,Count
0,"Adelaide, King, Richmond",0.6
1,Berczy Park,0.6
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.67
3,"Cabbagetown, St. James Town",1.0
4,Central Bay Street,0.67


In [49]:
# Removing 'Neighborhood' column, as we only need the numbers
nearby_coffee_shops_final_no_N_df=nearby_coffee_shops_final_df.drop(['Neighborhood'], axis=1)

## 5. Using kmeans algorithm to group the neighborhoods

The assumption is that new branch would like to have the Borough divided into 6 clusters:

In [51]:
# set number of clusters
kclusters = 6

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nearby_coffee_shops_final_no_N_df)

In [52]:
kmeans.labels_[0:32]

array([4, 4, 0, 1, 0, 5, 3, 1, 4, 4, 1, 2, 5, 2, 1, 0, 4, 1], dtype=int32)

In [61]:
# Assigning Groups to the Neighborhoods
nearby_coffee_shops_final_df['Group']=kmeans.labels_

# Updating initial dataframe df
i=-1
for index, element in df.iterrows():
    i=i+1
    if i<len(df):
        row_group = nearby_coffee_shops_final_df.loc[nearby_coffee_shops_final_df['Neighborhood'] == element['Neighborhood']]
        group = row_group['Group'].tolist()
        if group==[]:
            group=[6]
        df.at[i,'Group']=group[0]

In [62]:
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Group
0,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606,2.0
1,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789,0.0
2,Downtown Toronto,St. James Town,43.6515,-79.3754,4.0
3,Downtown Toronto,Berczy Park,43.6448,-79.3733,4.0
4,Downtown Toronto,Central Bay Street,43.658,-79.3874,0.0


Finally, visualizing clustered Neighborhoods:

In [65]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, 7))
#print (rainbow)
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
i=-1
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Group']):
    i=i+1
    if i<len(df):
        label = folium.Popup(str(poi) + '. Cluster ' + str(cluster), parse_html=True)
        cluster=int(cluster)
        folium.CircleMarker(
            [lat, lon],
            radius=10,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)
        
map_clusters