# Install libraries

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Retrieve data from Wikipedia

In [21]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_html = requests.get(url).text
soup = BeautifulSoup(wiki_html, 'html.parser')

data = []
for tr in soup.tbody.find_all('tr'):
    data.append([ td.get_text().strip() for td in tr.find_all('td')])

# Converting in Pandas Dataframe and Cleaning the data

In [22]:
df=pd.DataFrame(data,columns=['PostalCode','Borough','Neighborhood2'])

In [23]:
# Find indexes of rows that have "Not assigned" in Borough column
indexNames = df[(df['Borough'] == "Not assigned")].index

# Drop rows that have "Not assigned" in Borough column
df.drop(indexNames,inplace=True)

# Drop the first row
df.dropna(inplace=True)

In [24]:
# Combine multiple rows into one row based on PostalCode and Borough
df=df.groupby(['PostalCode','Borough'])['Neighborhood2'].apply(', '.join).reset_index()

In [25]:
# Replace "Not assigned" in Neighborhood column with the value in Borough column
def custom_fx(data):
    if data['Neighborhood2']=='Not assigned':
        var=data['Borough']
    else:
        var=data['Neighborhood2']
    return var

# Apply the function
df['Neighborhood']=df.apply(custom_fx,axis='columns')

# Check that there is no more "Not assigned" in Neighborhood column
print("There are {} rows that have 'Not assigned' in Neighborhood column in the dataframe".format(
    len(df[df['Neighborhood']=='Not assigned'])
)
     )

# Delete Neighborhood2 column
df.drop(columns='Neighborhood2')

There are 0 rows that have 'Not assigned' in Neighborhood column in the dataframe


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [26]:
# Export the dataframe
df.to_csv(r'E:\Saurav\LAPPY\study\Coursera\IBM_Data_Science\project\IBM_DATA_SCIENCE\week_3\week_3_task_2.csv',index=None,header=True)
print("done")

done


In [27]:
# Shape of the table
df.shape

(103, 4)