# Segmenting and Clustering Part 1
## *This section obtains data from a Wikipedia page and transforms it into the specified pandas DataFrame*

In [52]:
import pandas as pd

### *Read the table and convert into dataframe*

In [53]:
# Parse the HTML tables found on the Wikipedia page into a list of DataFrames;
# header=0 makes the first row of each table its column header.
table1 = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
)

In [54]:
# Work with the first table parsed from the page.
df1 = table1[0]
# Preview the first five rows.
df1.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### *Rename the columns as specified*

In [55]:
# Rename the 'Postal Code' column to 'PostalCode'.
# rename() returns a new frame, so rebinding df1 (instead of inplace=True)
# leaves the raw table still referenced by table1[0] untouched. Re-running
# this cell is harmless: a label that is no longer present is ignored.
df1 = df1.rename(columns={'Postal Code': 'PostalCode'})
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### *Remove rows where Borough value is "Not assigned"*

In [56]:
# Collect the index labels of every row whose Borough is the placeholder
# 'Not assigned', then drop those rows.
# drop() returns a new frame; rebinding df1 (instead of inplace=True) avoids
# mutating the previously displayed frame. Remaining rows keep their original
# index labels.
unassigned_rows = df1[df1.Borough == 'Not assigned'].index
df1 = df1.drop(unassigned_rows)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [57]:
# Renumber the rows with a fresh 0-based index; drop=True discards the old
# index labels instead of inserting them as a new column.
# reset_index() returns a new frame, so df1 is rebound rather than mutated
# in place.
df1 = df1.reset_index(drop=True)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### *Grouping cells based on PostalCode*
#### (This requirement is no longer valid: the table on the Wikipedia page already groups the neighborhoods by postal code.)

In [63]:
# Original requirement: one postal code area can contain several neighborhoods.
# For example, in the table on the Wikipedia page, M5A is listed twice and has
# two neighborhoods: Harbourfront and Regent Park. Those two rows are combined
# into a single row with the neighborhoods separated by a comma, as shown in
# row 11 in the above table.

# This requirement is no longer valid: the table on the Wikipedia page already
# groups the neighborhood data by postal code.
# The code below can be used if the requirement were still valid.

# One row per (PostalCode, Borough) pair; the remaining column values of each
# group are joined with ', '. groupby sorts the result by the grouping keys.
df1 = (
    df1
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(lambda names: ', '.join(names))
)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [65]:
# Rows whose Neighborhood is the placeholder 'Not assigned' take the value of
# their Borough instead (the right-hand Series is aligned on the row index).
no_neighborhood = df1['Neighborhood'] == 'Not assigned'
df1.loc[no_neighborhood, 'Neighborhood'] = df1['Borough']
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [67]:
# Dimensions of df1 at this point, as a (rows, columns) tuple.
df1.shape

(103, 3)