# Segmenting and Clustering Neighborhoods in Toronto

 # Part 1 - we will obtain the data from the following link https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M  and save it into CSV file, and then read it here and start with cleaning the data


In [7]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup



In [8]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")         # tr "table rows" in source code of wikipedia page"

res = []                                        # Create Empty list
for tr in table_rows:
    td = tr.find_all("td")                      # td "cell" in source code of wikipedia page"
    row = [tr.text for tr in td]
   
    if row != [] and row[1] != "Not assigned":  # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

        if "Not assigned" in row[2]:            # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
            row[2] = row[1]
        res.append(row)


df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


   ## Cleaning the data by removing extra Characters and Not assigned Data.

In [9]:
df["PostalCode"] = df["PostalCode"].str.replace("\n","") # removing "/n"
df["Borough"] = df["Borough"].str.replace("\n","")
df["Neighborhood"] = df["Neighborhood"].str.replace("\n","")

df.drop( df[ df['Borough'] == "Not assigned" ].index , axis = 0, inplace=True)  # removing Not Assigned on borough
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Now we combine and make sure that all neighborhood with similar PostalCode are grouped toghether

In [10]:
print("shape of DataFrame is " , df.shape)

shape of DataFrame is  (103, 3)


# Part 2 - Import Latitude and the longitude coordinates of each neighborhood

In [6]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data') # read the CSV Format and convert it into DataFrame
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## now lets Combine the two Dataframes 
 df & df_geo will be combined together 

In [13]:
df_tor =pd.merge(df , df_geo , left_on = 'PostalCode' , right_on = 'Postal Code', how = 'left' )
df_tor.drop('Postal Code', axis=1, inplace = True)
df_tor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
