# Segmenting and Clustering Neighborhoods in Toronto

For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

#### Loading common libraries

In [10]:
import numpy as np 
import pandas as pd 

#### Loading Module BeautifulSoup

In [11]:
from bs4 import BeautifulSoup
import requests

#### Loading Wiki List of postal codes of Canada: M 

In [12]:
html = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

#### Parse HTML and create a list for HTML Table

In [13]:
soup = BeautifulSoup(html.text, 'html.parser')
tablesoup=soup.find("table")

def get_string(ele):
    if ele.find("a") != None :
        return ele.find("a").string.replace('\n','')
    else:
        return ele.string.replace('\n','')
    
my_list=[]
for row_td in tablesoup.find_all("tr"):
    row=row_td.find_all("td")
    if len(row) != 0 :
        my_list.append([get_string(row[0]),get_string(row[1]),get_string(row[2])])
#print(my_list)

#### Creating dataframe with above table

In [14]:
# This dataframe is not cleaned yet
df=pd.DataFrame(my_list,columns=['PostalCode','Borough','Neighborhood'])
df.reset_index(drop=True)
df.shape

(287, 3)

#### Removing cells Borough = "Not assigned"

In [15]:
#cleaning data
df=df[df.Borough != 'Not assigned']

df.shape

(210, 3)

#### In this step we will concatenate the Neighborhood with ',' for the same PostalCode and remove duplicate PostalCode
Using lamda function with lookup

In [16]:
def concat_str(narray):
    new_series = pd.Series(narray)
    s=new_series.str.cat(sep=',')
    return s
    
df['Neighborhood']=df.apply( lambda row: row['Neighborhood'] if ((df[df['PostalCode'] == row['PostalCode']].count())[0] < 2) else concat_str(df[df['PostalCode'] == row['PostalCode']]['Neighborhood'].unique()), axis=1)

df.drop_duplicates(subset ="PostalCode",keep = 'first',inplace=True) 

df.shape

(103, 3)

#### In this step for "Not assigned" Neighborhood we will assign the Borough value.  
I used lamda function inorder to achive it

In [17]:
df['Neighborhood']=df.apply( lambda row: row['Borough'] if (row['Neighborhood'] == 'Not assigned') else row['Neighborhood'], axis=1)

df.shape


(103, 3)

#### Displaying shape

In [18]:
df.shape

(103, 3)

#### Displaying full dataframe


In [19]:
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


#### method for location lat lng

In [4]:
!conda install -c conda-forge geocoder --yes 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

    geocoder: 1.38.1-py_1 conda-forge
    ratelim:  0.1.6-py_2  conda-forge


Downloading and Extracting Packages
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
ratelim-0.1.6        | 6 KB      | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [20]:
import geocoder 

####  Note: The geocoder API didn't work it was taking very long so I had to use the csv given

In [25]:
!wget -q -O 'Geospatial_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [26]:
geo_df = pd.read_csv('Geospatial_data.csv')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Copying the Latitude and Longitude columns to df

In [36]:
df['Latitude']=geo_df['Latitude']
df['Longitude']=geo_df['Longitude']

#### Two new columns added to df 

In [38]:
df.shape

(103, 5)

In [39]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,Harbourfront,43.763573,-79.188711
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.770992,-79.216917
4,M7A,Queen's Park,Queen's Park,43.773136,-79.239476
5,M9A,Downtown Toronto,Queen's Park,43.744734,-79.239476
6,M1B,Scarborough,"Rouge,Malvern",43.727929,-79.262029
7,M3B,North York,Don Mills North,43.711112,-79.284577
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.716316,-79.239476
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.692657,-79.264848
