In [3]:
pip install BeautifulSoup4

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 6.8MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.2 soupsieve-2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 5.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


## All Questions in same Notebook.
#### Please follow the code description in  each cell (as comments).

In [5]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display_html
import lxml.html as lh
from urllib.request import urlopen

## Question  1 :Scraping data and creating required pandas dataframe

In [6]:
#Scraping using Beautiful soup
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050."
soup = BeautifulSoup(requests.get(url).text, 'lxml')
table=soup.find('table')

In [9]:
#Writing in tabular form and creating csv file
data=""
for i in table.find_all('tr'):
    row1=""
    for tds in i.find_all('td'):
        row1 = row1 + "," + tds.text
    data=data+row1[1:]

file=open("data.csv","wb")
file.write(bytes(data,encoding="ascii",errors="ignore"))

8709

In [10]:
#Creating a pandas data frame
df = pd.DataFrame(columns=["Postalcode","Borough","Neighbourhood"])
df = pd.read_csv('data.csv',header=None)
df.columns=["Postalcode","Borough","Neighbourhood"]
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [11]:
#Removing rows with Borough = Not assigned
noboroughs = df.index[df['Borough'] == 'Not assigned']
df.drop(df.index[noboroughs], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [12]:
#Not assigned neigbourhood = borough
noneighbor = df.index[df['Neighbourhood'] == 'Not assigned']
for i in noneighbor:
    df['Neighbourhood'][i] = df['Borough'][i]
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [24]:
#Removing Duplicates
temp=df.groupby('Postalcode')['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))
temp=temp.reset_index(drop=False)
temp.rename(columns={'Neighbourhood':'tempneighbor'},inplace=True)
df_final = pd.merge(df, temp, on='Postalcode')
df_final.drop(['Neighbourhood'],axis=1,inplace=True)
df_final.drop_duplicates(inplace=True)
df_final.rename(columns={'tempneighbor':'Neighbourhood'},inplace=True)
print('The final data frame dimension is', df_final.shape)

The final data frame dimension is (103, 3)


## Question 2

In [17]:
# reading geo location data
geodata=pd.read_csv('http://cocl.us/Geospatial_data')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
geodata.shape

(103, 3)

Since shape is same, data frames can be merged with correct tagging, and renaming Postal Code to Postalcode

In [25]:
#Merging data frames
geodata.rename(columns={'Postal Code':'Postalcode'},inplace=True)
merged= pd.merge(geodata,df_final, on='Postalcode')
merged.head()

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [26]:
#Rearranging
merged=merged[['Postalcode','Borough','Neighbourhood','Latitude','Longitude']]
merged.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Question 3

In [27]:
#Exploring Scarborough
scar_data = merged[merged['Borough'] == 'Scarborough'].reset_index(drop=True)
scar_data.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
#clustering
kclusters = 5
scar_data = scar_data.drop('Neighbourhood', 1)
scar_data = scar_data.groupby('Neighbourhood').mean().reset_index()
scar_data
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scar_dat)
kmeans.labels_[0:10]