# 1. Scraping the data from Wikipedia into a pandas DataFrame

In [0]:
# The packages required for the exercise
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [0]:
# Getting the webpage from the url using requests package
website_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of handing an HTTP error page to the parser
page_response = requests.get(website_url, timeout=30)
page_response.raise_for_status()
webpage = page_response.text

In [0]:
# Parsing the downloaded HTML into a BeautifulSoup tree using the lxml parser
soup = BeautifulSoup(webpage, 'lxml')

# Retrieving the specific table that contains the data that we require
table = soup.find('table', {
    'class': 'wikitable sortable'
})

# find() returns None when nothing matches; stop here with a clear message rather
# than letting the next cell fail with an AttributeError on None
if table is None:
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")

In [0]:
# Retrieving all the <tr> tags from the table, skipping the first (header) row
# Each of these tags will represent a row in our DataFrame
rows = table.find_all('tr')[1:]

# Populating a list with the cell values of every row
values = []
for row in rows:
  # get_text() collects the text of the whole cell, so a cell that is empty or that
  # starts with a nested tag (e.g. a link) no longer breaks the extraction
  row_for_df = [cell.get_text().strip() for cell in row.find_all('td')]

  # A row without the three expected cells cannot be mapped onto the columns; skip it
  if len(row_for_df) < 3:
    continue

  # Converting all the multiple values of Neighborhood from '/' separated to comma separated
  row_for_df[2] = row_for_df[2].replace(' /', ',')

  # Appending all the rows to a list called values
  values.append(row_for_df)

In [5]:
# Column labels for the three scraped fields
columns = ['PostalCode', 'Borough', 'Neighborhood']

# Building the DataFrame from the list of scraped rows
toronto_df = pd.DataFrame(data=values, columns=columns)

# Reporting the row count and previewing the first 5 rows
print(f'The length of the dataframe is {len(toronto_df)}')
toronto_df.head(5)

The length of the dataframe is 180


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [23]:
# Dropping all the rows that have a "Not assigned" Borough.
# reset_index(drop=True) renumbers the remaining rows 0 .. len(df) - 1 and returns a
# new DataFrame, so later column assignments on toronto_df do not operate on a
# filtered slice of the original frame (which can trigger SettingWithCopyWarning)
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)


# Printing the top 5 elements of the DataFrame
print(f'The length of the dataframe is {len(toronto_df)}')
toronto_df.head()

# The entries with multiple neighborhoods are already handled while reading the data

The length of the dataframe is 103


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Printing the shape of the toronto dataframe as a (rows, columns) tuple
print(f'The shape of the DataFrame is {toronto_df.shape}')

The shape of the DataFrame is (103, 3)


# 2. Adding the geospatial data to the DataFrame

I tried using the geocoder package, but it was unreliable and did not work as expected, so I'll use the provided CSV file instead.

In [28]:
# Reading the coordinates csv file into a pandas dataframe.
# header=0 tells pandas that the first line of the file is a header row to be replaced
# by `names`. Without it the header line is parsed as a data row, which forces the
# Latitude/Longitude columns to be read as strings instead of floats.
coordinates = pd.read_csv(
    'Geospatial_Coordinates.csv',
    header=0,
    names=['PostalCode', 'Latitude', 'Longitude'],
)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
1,M1B,43.8066863,-79.1943534
2,M1C,43.7845351,-79.1604971
3,M1E,43.7635726,-79.1887115
4,M1G,43.7709921,-79.2169174
5,M1H,43.773136,-79.2394761


In [51]:
# Adding the Latitude and Longitude values for each entry in our toronto_df dataframe.
# One left merge on PostalCode replaces the row-by-row lookup:
#   - how='left' keeps every toronto_df row in its original order; a postal code that
#     is missing from `coordinates` gets NaN instead of raising an error
#   - validate='many_to_one' raises if `coordinates` lists a postal code twice, which
#     would otherwise silently duplicate rows
#   - dropping any existing Latitude/Longitude columns first keeps the cell re-runnable
toronto_df = (
    toronto_df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(
        coordinates[['PostalCode', 'Latitude', 'Longitude']],
        on='PostalCode',
        how='left',
        validate='many_to_one',
    )
)


# Printing the top 5 elements of the DataFrame
print(f'The length of the dataframe is {len(toronto_df)}')
toronto_df.head()

The length of the dataframe is 103


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7532586,-79.3296565
1,M4A,North York,Victoria Village,43.7258823,-79.3155716
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542599,-79.3606359
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.4647633
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623015,-79.3894938


# 3. Clustering the data

In [0]:
# Reading the Foursquare client ID and secret from environment variables so that the
# credentials are never stored in the notebook itself.
# NOTE(review): the key pair that used to be hardcoded in this cell should be treated
# as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SEC = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')

# Surface a missing credential here, next to its cause, rather than in a later cell
if not CLIENT_ID or not CLIENT_SEC:
    print('FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET is not set; '
          'set both environment variables before calling the Foursquare API')

In [54]:
# Keeping only the rows whose Borough name contains "Toronto",
# then renumbering the surviving rows from 0
df = toronto_df.loc[toronto_df['Borough'].str.contains('Toronto')].reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542599,-79.3606359
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623015,-79.3894938
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6571618,-79.3789371
3,M5C,Downtown Toronto,St. James Town,43.6514939,-79.3754179
4,M4E,East Toronto,The Beaches,43.6763574,-79.2930312


In [72]:
# Reading the coordinates and the name of the first row (label 0) of df
neighborhood_latitude = df.at[0, 'Latitude']    # latitude of that neighbourhood
neighborhood_longitude = df.at[0, 'Longitude']  # longitude of that neighbourhood
neighborhood_name = df.at[0, 'Neighborhood']    # name of that neighbourhood

print(f'Latitude and longitude values of {neighborhood_name} are {neighborhood_latitude}, {neighborhood_longitude}.')

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [0]:
# Getting the top 50 venues with 500 meters radius from the neighborhood
LIMIT = 50
radius = 500

url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}&client_secret={CLIENT_SEC}&v=20200417&ll={neighborhood_latitude},{neighborhood_longitude}&radius={radius}&limit={LIMIT}'

# Getting the json result back from the API.
# The timeout prevents the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error response into an immediate, descriptive
# exception instead of a KeyError later when the payload is unpacked
api_response = requests.get(url, timeout=30)
api_response.raise_for_status()
results = api_response.json()

In [80]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the category list under the key 'categories' or, failing
        that, under 'venue.categories'. Each category is indexed by 'name'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present in the row.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem with the row and must not be silently swallowed
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

  
# Pull the venue records out of the API response and flatten the nested JSON
venues = results['response']['groups'][0]['items']
nearby_venues = pd.json_normalize(venues)

# Keep only the name, category and coordinate columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of each label, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [label.rsplit('.', 1)[-1] for label in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


In [81]:
# Printing the shape of the venues table as a (rows, columns) tuple
print(nearby_venues.shape)

(48, 4)
