# Segmenting and Clustering Neighborhoods in Toronto - Part-1

### Scraping the Wikipedia page and transforming its data into a pandas dataframe

#### Importing Libraries

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import lxml
import requests

##### Using BeautifulSoup to scrape the Wikipedia page and fetch the data

In [81]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the real article.
res = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
res.raise_for_status()

# Parse the returned HTML with the lxml parser so the table can be searched.
soup = BeautifulSoup(res.text, 'lxml')

#### Initialize an empty data frame 

In [82]:
# Column labels for the scraped table, in the order the cells appear in each row.
column_names = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]

# Start from a frame that has the columns but no rows yet.
df = pd.DataFrame(columns=column_names)

#### Loop through each of the elements in the website table and add to the empty dataframe

In [83]:
# Locate the first <table> on the page; soup.find returns None when there is none,
# so fail with a clear message rather than an AttributeError further down.
postalCodeTable = soup.find('table')
if postalCodeTable is None:
    raise ValueError('No <table> element found on the scraped page')

# Collect the rows in a plain list and build the frame once at the end.
# Appending with df.loc[len(df)] copies data on every insert and, because it
# adds to the existing frame, duplicates every row if this cell is run twice.
rows = []
for tr in postalCodeTable.find_all('tr'):
    # Stripped text of every <td> in this row; rows without <td> cells yield [].
    data = [td.text.strip() for td in tr.find_all('td')]
    # Keep only rows that have exactly one cell per expected column.
    if len(data) == len(column_names):
        rows.append(data)

df = pd.DataFrame(rows, columns=column_names)

#### Explore the dataframe

In [84]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [85]:
# (rows, columns) of the scraped table before any filtering.
df.shape

(180, 3)

#### Removing the not assigned values from Borough

In [86]:
# Boolean mask that is True for rows whose Borough is not the 'Not assigned' placeholder.
borough_assigned = df['Borough'].ne('Not assigned')

# Keep only those rows, then preview the result.
df = df[borough_assigned]
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Checking for the not assigned values in Neighborhood

In [87]:
# True when at least one remaining row still has the 'Not assigned' placeholder
# in the Neighborhood column.
has_unassigned = df['Neighborhood'].eq('Not assigned').any()

# Report the outcome of the check.
message = (
    'There are not assigned values in Neighborhood'
    if has_unassigned
    else 'There are no not assigned values in Neighborhood'
)
print(message)

There are no not assigned values in Neighborhood


In [88]:
# Show the first rows again; the check above only reads the frame and does not change it.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Resulting dataframe and shape

In [89]:
# Filtering left gaps in the row labels (they no longer start at 0), so renumber
# them; drop=True discards the old labels instead of keeping them as a column.
toronto = df.reset_index(drop = True)
# Preview the first rows of the final table.
toronto.head(12)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [90]:
# (rows, columns) of the final, filtered table.
toronto.shape

(103, 3)

In [92]:
# Save the cleaned table to toronto.csv so it can be used in the second part of
# the assignment; index=False leaves the row labels out of the file.
toronto.to_csv('toronto.csv', index = False) 