<a href="https://colab.research.google.com/github/soheil-aa/Coursera_Capstone/blob/main/CapstoneProject_Week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Week 3 Assignment**
# **Segmenting and Clustering Neighborhoods in Toronto**

## **Part 1**
## Scraping the wiki page and extracting Toronto postal codes


### Import necessary libraries

In [1]:
# for performing your HTTP requests
import requests  

# for xml & html scrapping 
from bs4 import BeautifulSoup 

# for table analysis
import pandas as pd

#Visuals
import matplotlib.pyplot as plt

### Read the wiki and extract table data

In [2]:
# URL of the Wikipedia page "List of postal codes of Canada: M", scraped below
url_toronto_postal_codes = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Request & Response
# The 10-second timeout keeps this cell from hanging if the site is unreachable.
s = requests.Session()
response = s.get(url_toronto_postal_codes, timeout=10)
# Stop here with an HTTPError on a 4xx/5xx status instead of silently
# parsing an error page in the following cells.
response.raise_for_status()
response  # <Response [200]> means a successful response

<Response [200]>

In [4]:
# parse the raw response body as HTML with the 'html.parser' backend
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
# get the sortable table: the first <table> whose class attribute is 'wikitable sortable'
# NOTE(review): find() yields None when nothing matches, which would make the
# following cells fail — confirm the page still contains such a table.
toronto_PO_table = soup.find('table', {"class":'wikitable sortable'})

In [6]:
# table shape: number of <tr> rows x number of <td> cells in the widest row
rows = toronto_PO_table.find_all("tr")

# Measure every row instead of keeping only whatever the last row happened to
# contain; default=0 keeps this well-defined for a table with no rows.
n_columns = max((len(row.find_all("td")) for row in rows), default=0)

table_shape = [len(rows), n_columns]
print("Table shape: {} x {}".format(table_shape[0], table_shape[1]))

Table shape: 181 x 3


In [7]:
# column names are the right-stripped texts of the <th> cells in the first table row
header_cells = rows[0].find_all('th')
toronto_PO_table_columns = []
for header_cell in header_cells:
    toronto_PO_table_columns.append(header_cell.text.rstrip())
print("column names: ", toronto_PO_table_columns)

column names:  ['Postal Code', 'Borough', 'Neighbourhood']


In [8]:
# extract data: one list of right-stripped <td> texts per row, skipping the header row
table_data = [
    [cell.text.rstrip() for cell in data_row.find_all('td')]
    for data_row in rows[1:]
]

### Create a data frame and clean and aggregate the data

In [9]:
# create a data frame from the scraped rows, labelled with the scraped header names
toronto_PO_df = pd.DataFrame(table_data, columns = toronto_PO_table_columns)
toronto_PO_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# drop postal codes which have no assigned borough
# The filtered, re-indexed frame is assigned back so the 0..n-1 index shown
# below is the one the frame actually carries. Filtering into a new object
# (rather than dropping in place) also makes the cell safe to re-run.
toronto_PO_df = toronto_PO_df[toronto_PO_df.Borough != 'Not assigned'].reset_index(drop=True)
toronto_PO_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [11]:
# group postal codes and join the Neighbourhoods
# Only the neighbourhood names are concatenated. The borough is taken once per
# postal code, so a code that appears on several rows does not end up with a
# repeated value such as "North York,North York".
toronto_PO_df = toronto_PO_df.groupby(['Postal Code'], sort=False).agg(
    {'Borough': 'first', 'Neighbourhood': ','.join})
toronto_PO_df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# if a cell has a borough but a Not assigned neighbourhood, then the neighbourhood will be the same as the borough.
# A boolean mask with .loc is simply a no-op when no row matches, so no
# separate length check is required.
needs_borough_name = (toronto_PO_df.Neighbourhood == 'Not assigned') & (toronto_PO_df.Borough != 'Not assigned')
idx = toronto_PO_df[needs_borough_name].index  # labels of the affected rows, kept for inspection
toronto_PO_df.loc[needs_borough_name, 'Neighbourhood'] = toronto_PO_df.loc[needs_borough_name, 'Borough']

### Results of Part 1

In [13]:
# first rows of the Part 1 result
toronto_PO_df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [14]:
# shape of the data frame as (rows, columns)
toronto_PO_df.shape

(103, 2)

# **Part 2**
# Geocoding the postal codes

### Read the geocoded postal codes from the provided CSV file

In [15]:
# URL of the provided CSV file with geocoded postal codes
postalcode_latlong_csv_url = 'https://cocl.us/Geospatial_data'

In [16]:
# read the CSV directly from the URL into a data frame
coordinates_PO_df = pd.read_csv(postalcode_latlong_csv_url)

In [17]:
# first rows of the coordinates table
coordinates_PO_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# (rows, columns) of the coordinates table
coordinates_PO_df.shape

(103, 3)

### Merge with the postal code table (created in the Part 1)

In [19]:
# join boroughs/neighbourhoods with coordinates on the postal code.
# 'Postal Code' is the index of toronto_PO_df (set by the groupby in Part 1) and a
# regular column of coordinates_PO_df; the result carries it as a regular column.
# No `how` is given, so pandas' default inner join applies: codes missing from
# either table are dropped.
toronto_PO_geocoded_df = pd.merge(toronto_PO_df, coordinates_PO_df, on="Postal Code")

### Results of Part 2

In [20]:
# first rows of the merged (geocoded) table
toronto_PO_geocoded_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [21]:
# (rows, columns) of the merged table
toronto_PO_geocoded_df.shape

(103, 5)

# **Part 3**
# Exploring and clustering the neighborhoods in Toronto

In [22]:
# Geocoder
from geopy.geocoders import Nominatim

# maps
import folium

Geographical coordinates of Toronto

In [23]:
# Look up the coordinates of Toronto; they are used to centre the map below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Map of Toronto and its boroughs

In [24]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of the geocoded table, with a
# "<neighbourhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
marker_series = [toronto_PO_geocoded_df[column] for column in marker_columns]
for lat, lng, borough, neighbourhood in zip(*marker_series):
    popup_text = '{}, {}'.format(neighbourhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [25]:
# Foursquare credentials
# Secrets are read from the environment so they are never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): any key pair that was ever committed with this notebook should
# be regenerated in the Foursquare developer console.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    # Warn rather than raise so the rest of the notebook can still be run
    # without the Foursquare calls.
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect venues around each location from the Foursquare ``venues/explore`` endpoint.

    One request is made per (name, latitude, longitude) triple, using the
    notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the locations to query. They are
        combined with ``zip``, so iteration stops at the shortest one.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500;
        presumably metres — confirm against the Foursquare API docs).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue
        is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with a 4xx/5xx status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout prevents one stalled request from hanging the whole loop;
        # raise_for_status reports API errors where they happen instead of as
        # a KeyError while indexing the JSON payload.
        response = requests.get(endpoint, params=params, timeout=10)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # A venue may come back without any category; record None then.
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor also works for an empty
    # result, where assigning `.columns` afterwards would raise.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return nearby_venues