# Question 1 : Extracting negihborhood data of Toronto City

In [1]:
# Importing necessary libraries
# !pip install bs4
# !pip install requests
# !pip install pandas

import requests
import pandas as pd
from bs4 import BeautifulSoup

Now the data will be extracted from Wikipedia   
Reference : https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M  
With the help of BeautifulSoup library, I'll extract the human readable data

In [2]:
# Downloading url data from wikipedia
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_data = requests.get(url).text
CN_data = BeautifulSoup(html_data, 'lxml')

In [3]:
# Creating datafram and assigning columns labels
column_names = ['Postalcode','Borough','Neighborhood']
Toronto = pd.DataFrame(columns = column_names)

In [4]:
# Assigning values to the datframe

table = CN_data.find('table').tbody


for tr in table.find_all('tr'):
   
    for td in tr.find_all('td'):
        content = td.getText(separator='|', strip=True).split('|')
        clean_content = [value for value in content if value != '(' and value != ')' and value != '/' and value != ','] 
        
#         print(clean_content)
        
        number_of_entry = len(content)
        
        if number_of_entry == 1:            
            postcode = clean_content[0]
            borough = ['']
            neighborhood = [''] 
        elif number_of_entry == 2:
            postcode = clean_content[0]
            borough = clean_content[1]
            neighborhood = ['']
        else:
            postcode = clean_content[0]
            borough = clean_content[1]
            neighborhood = ','.join([str(item) for item in clean_content[2:]])
        
        Toronto = Toronto.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood}, ignore_index=True)
        
Toronto.head(20)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,[]
1,M2A,Not assigned,[]
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
5,M6A,North York,"Lawrence Manor,Lawrence Heights"
6,M7A,Queen's Park,(Ontario Provincial Government)
7,M8A,Not assigned,[]
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern,Rouge"


Having all data downloaded from the website, now its time to clean the data and look for non null values.

In [5]:
# Cleaning Dataframe
Toronto = Toronto[Toronto.Borough != 'Not assigned']
Toronto = Toronto[Toronto.Borough != 0]
Toronto.reset_index(drop = True, inplace = True)

for j in range(0,Toronto.shape[0]):
    if Toronto.iloc[j][2] == 'Not assigned':
        Toronto.iloc[j][2] = Toronto.iloc[j][1]
        j += 1

dataframe = Toronto.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
dataframe.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
dataframe = dataframe.dropna()
null = 'Not assigned'
dataframe = dataframe[(dataframe.Postalcode != null) & (dataframe.Borough != null) & (dataframe.Neighborhood != null)]

In [7]:
dataframe.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))
                    
grp = dataframe.groupby(['Postalcode', 'Borough'])
dataframe_2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [9]:
print(dataframe_2.shape)
dataframe_2.head(10)

(103, 3)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park,Ionview,/ East,Birchmount Park"
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge"
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village,West)"
9,M1N,Scarborough,"Birch Cliff,Cliffside,West)"


In [10]:
print('The DataFrame shape is ', dataframe_2.shape)

The DataFrame shape is  (103, 3)
