# Data Science Capstone Project


### This notebook is contains the IBM Data Science Capstone Project
#### 

### 1. Importing all necessary Libraries

In [9]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from bs4 import BeautifulSoup 
import requests
from geopy.geocoders import Nominatim

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


### 2. Scraping web and making a dataframe

In [55]:
url = 'https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods'
req = requests.get(url).text
soup = BeautifulSoup(req)

tables = soup.find_all('table')

final_tables = tables[1:3] + [tables[5]]

frame = []
for table in final_tables:
    rows_with_title = table.find_all('tr')
    rows = rows_with_title[1:]
    
    for row in rows:
        row_list = []
        items = row.find_all('td')[:-1]
        for item in items:
            item_text = item.text.strip()
            
            row_list.append(item_text)
            #print(item_text)
        #print(row_list)
        #print('#########')
        #print(row_list)
        frame.append(row_list)

        
#print(frame[:3])

df = pd.DataFrame(frame, columns=["Neighbourhood", "FM", "Census", "Population", "Land Area", "Density", "Population %", "Income", "Commuting", "Renters", "2nd Language", "2nd Language %"])
df.head()

Unnamed: 0,Neighbourhood,FM,Census,Population,Land Area,Density,Population %,Income,Commuting,Renters,2nd Language,2nd Language %
0,Crescent Town,EY,0190.01,8157,0.4,20393,-10.0,23021,24.5,20.3,Bengali (18.1%),18.1% Bengali
1,Governor's Bridge/Bennington Heights,EY,0186.00,2112,1.87,1129,4.0,129904,7.1,13.3,Polish (1.4%),01.4% Polish
2,Leaside,EY,"0195.00, 0196.00",13876,2.81,4938,3.0,82670,9.7,10.5,Bulgarian (0.4%),00.4% Bulgarian
3,O'Connor–Parkview,EY,"0189.00, 0190.02, 0191.00, 0192.00, 0193.00",17740,4.94,3591,-6.1,33517,15.8,19.4,Urdu (3.2%),03.2% Urdu
4,Old East York,EY,"0180.00, 0181.01, 0181.02, 0182.00, 0183.00, 0...",52220,7.94,6577,-4.6,33172,22.0,19.1,Greek (4.3%),04.3% Greek


### 3. Cleaning a dataframe

In [56]:
# clean up the data and dropping unwanted columns
df = df[df.Neighbourhood != 'Toronto CMA Average']
df = df.drop('FM', 1)
df = df.drop('Census', 1)
df = df.drop('Renters', 1)

# change to the proper datatype
df['Population'] = df['Population'].str.replace(',','')
df['Population'] = df['Population'].apply(pd.to_numeric)
#print (df1.dtypes)

# display sample data
df.head(10)

Unnamed: 0,Neighbourhood,Population,Land Area,Density,Population %,Income,Commuting,2nd Language,2nd Language %
0,Crescent Town,8157.0,0.4,20393,-10.0,23021,24.5,Bengali (18.1%),18.1% Bengali
1,Governor's Bridge/Bennington Heights,2112.0,1.87,1129,4.0,129904,7.1,Polish (1.4%),01.4% Polish
2,Leaside,13876.0,2.81,4938,3.0,82670,9.7,Bulgarian (0.4%),00.4% Bulgarian
3,O'Connor–Parkview,17740.0,4.94,3591,-6.1,33517,15.8,Urdu (3.2%),03.2% Urdu
4,Old East York,52220.0,7.94,6577,-4.6,33172,22.0,Greek (4.3%),04.3% Greek
5,Thorncliffe Park,17949.0,3.09,5809,9.1,25340,16.7,Urdu (21.5%),21.5% Urdu
6,Alderwood,11656.0,4.94,2360,-4.0,35239,8.8,Polish (6.2%),06.2% Polish
7,Centennial,12565.0,4.94,2544,0.5,34867,11.5,Polish (2.7%),02.7% Polish
8,Clairville,8506.0,6.71,1268,-3.3,26610,13.2,Punjabi (12.0%),12.0% Punjabi
9,Eatonville,19131.0,11.26,1699,4.3,36206,12.6,Serbian (3.2%),03.2% Serbian


In [57]:
df.shape

(176, 9)

In [39]:
neighbor = df.iloc[2,0]

address = '{}, Toronto, ON, Canada'.format(neighbor)
geolocator = Nominatim(user_agent = 'to_explorer')
location = geolocator.geocode(address)
print(location.latitude, location.longitude)

43.7047983 -79.3680904


### 4. Integrating Latitudes and Longitudes of each Neighborhoods

In [54]:
T_lat = []
T_log = []
for index, row in df.iterrows():
    neighbor = row['Neighbourhood']
    address = '{}, Toronto, ON, Canada'.format(neighbor)
    geolocator = Nominatim(user_agent = 'to_explorer')
    location = geolocator.geocode(address)
    if location is None:
        T_lat.append(np.nan)
        T_log.append(np.nan)
        print('{} location not found'.format(row['Neighbourhood']))
    else:
        T_lat.append(location.latitude)
        T_log.append(location.longitude)
        print('{} location is {}, {}'.format(row['Neighbourhood'], location.latitude, location.longitude))



Crescent Town location is 43.695403, -79.293099
Governor's Bridge/Bennington Heights location not found
Leaside location is 43.7047983, -79.3680904
O'Connor–Parkview location is 43.7023902, -79.3160976
Old East York location is 43.712452, -79.31265325
Thorncliffe Park location is 43.704553, -79.3454074
Alderwood location is 43.6017173, -79.5452325
Centennial location is 43.7874914, -79.1507681
Clairville location is 43.72337025, -79.59745741095173
Eatonville location is 43.6462843, -79.5600005
Humber Bay Shores location not found
Humber Heights location is 43.6981789, -79.5232773
Humberwood location is 43.722525, -79.54602434384384
Humber Valley Village location is 43.6664717, -79.5243136
Islington – Six Points location is 43.6460556, -79.531469
Kingsview Village location is 43.6995391, -79.5563459
Long Branch location is 43.59200455, -79.54536450659592
Markland Wood location is 43.63123865, -79.58543401986114
Mimico location is 43.6166773, -79.4968048
New Toronto location is 43.600762

Westmount location is 43.6936399, -79.5210426
Weston location is 43.7001608, -79.5162474
Wexford location is 43.7453767, -79.2947155
Willowdale location is 43.7615095, -79.4109234
Wilson Heights location is 43.7405195, -79.4400172
Woburn location is 43.7598243, -79.2252908
Wychwood location is 43.682121699999996, -79.42383883756801
York Mills location is 43.7440391, -79.406657
York University Heights location is 43.7587808, -79.5194336
Yorkville location is 43.6713861, -79.3901677


In [58]:
df.head(1)

Unnamed: 0,Neighbourhood,Population,Land Area,Density,Population %,Income,Commuting,2nd Language,2nd Language %
0,Crescent Town,8157.0,0.4,20393,-10.0,23021,24.5,Bengali (18.1%),18.1% Bengali


In [59]:
df['Latitudes'] = T_lat
df['Longitudes'] = T_log
df.head(10)

Unnamed: 0,Neighbourhood,Population,Land Area,Density,Population %,Income,Commuting,2nd Language,2nd Language %,Latitudes,Longitudes
0,Crescent Town,8157.0,0.4,20393,-10.0,23021,24.5,Bengali (18.1%),18.1% Bengali,43.695403,-79.293099
1,Governor's Bridge/Bennington Heights,2112.0,1.87,1129,4.0,129904,7.1,Polish (1.4%),01.4% Polish,,
2,Leaside,13876.0,2.81,4938,3.0,82670,9.7,Bulgarian (0.4%),00.4% Bulgarian,43.704798,-79.36809
3,O'Connor–Parkview,17740.0,4.94,3591,-6.1,33517,15.8,Urdu (3.2%),03.2% Urdu,43.70239,-79.316098
4,Old East York,52220.0,7.94,6577,-4.6,33172,22.0,Greek (4.3%),04.3% Greek,43.712452,-79.312653
5,Thorncliffe Park,17949.0,3.09,5809,9.1,25340,16.7,Urdu (21.5%),21.5% Urdu,43.704553,-79.345407
6,Alderwood,11656.0,4.94,2360,-4.0,35239,8.8,Polish (6.2%),06.2% Polish,43.601717,-79.545232
7,Centennial,12565.0,4.94,2544,0.5,34867,11.5,Polish (2.7%),02.7% Polish,43.787491,-79.150768
8,Clairville,8506.0,6.71,1268,-3.3,26610,13.2,Punjabi (12.0%),12.0% Punjabi,43.72337,-79.597457
9,Eatonville,19131.0,11.26,1699,4.3,36206,12.6,Serbian (3.2%),03.2% Serbian,43.646284,-79.560001


In [60]:
df.to_csv('neighbors_loc.csv', index = False)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176 entries, 0 to 176
Data columns (total 11 columns):
Neighbourhood     176 non-null object
Population        172 non-null float64
Land Area         176 non-null object
Density           176 non-null object
Population %      176 non-null object
Income            176 non-null object
Commuting         176 non-null object
2nd Language      176 non-null object
2nd Language %    176 non-null object
Latitudes         167 non-null float64
Longitudes        167 non-null float64
dtypes: float64(3), object(8)
memory usage: 16.5+ KB


In [65]:
df.dropna(inplace = True, axis = 0)

In [66]:
df.head()

Unnamed: 0,Neighbourhood,Population,Land Area,Density,Population %,Income,Commuting,2nd Language,2nd Language %,Latitudes,Longitudes
0,Crescent Town,8157.0,0.4,20393,-10.0,23021,24.5,Bengali (18.1%),18.1% Bengali,43.695403,-79.293099
2,Leaside,13876.0,2.81,4938,3.0,82670,9.7,Bulgarian (0.4%),00.4% Bulgarian,43.704798,-79.36809
3,O'Connor–Parkview,17740.0,4.94,3591,-6.1,33517,15.8,Urdu (3.2%),03.2% Urdu,43.70239,-79.316098
4,Old East York,52220.0,7.94,6577,-4.6,33172,22.0,Greek (4.3%),04.3% Greek,43.712452,-79.312653
5,Thorncliffe Park,17949.0,3.09,5809,9.1,25340,16.7,Urdu (21.5%),21.5% Urdu,43.704553,-79.345407


In [67]:
df[['Neighbourhood', 'Income']]

Unnamed: 0,Neighbourhood,Income
0,Crescent Town,23021
2,Leaside,82670
3,O'Connor–Parkview,33517
4,Old East York,33172
5,Thorncliffe Park,25340
6,Alderwood,35239
7,Centennial,34867
8,Clairville,26610
9,Eatonville,36206
11,Humber Heights,39738


### Importing and Cleaning Hospital Admission data

In [134]:
hospital = pd.read_excel('hospital.xls')
hospital = hospital.iloc[14:,:]
hospital = hospital.iloc[1:,1:8]
hospital = hospital.reset_index().drop(['index'], axis = 1)
hospital.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
hospital.columns = ['Neighbourhood', 'Male Admission', 'Female Admission', 'Both Admission']
hospital.head()

Unnamed: 0,Neighbourhood,Male Admission,Female Admission,Both Admission
0,West Humber-Clairville,2453,3151,5604
1,Mount Olive-Silverstone-Jamestown,2256,3004,5260
2,Thistletown-Beaumond Heights,953,1153,2106
3,Rexdale-Kipling,938,1090,2028
4,Elms-Old Rexdale,674,978,1652


In [133]:
for index, row in hospital.iterrows():
    neighbor = row['Neighbourhood']
    address = '{}, Toronto, ON, Canada'.format(neighbor)
    geolocator = Nominatim(user_agent = 'to_explorer')
    location = geolocator.geocode(address)
    if location is None:
        #T_lat.append(np.nan)
        #T_log.append(np.nan)
        print('{} location not found'.format(row['Neighbourhood']))
    else:
        #T_lat.append(location.latitude)
        #T_log.append(location.longitude)
        print('{} location is {}, {}'.format(row['Neighbourhood'], location.latitude, location.longitude))



West Humber Clairville location is 43.72337025, -79.59745741095173
Mount Olive Silverstone Jamestown location not found
Thistletown Beaumond Heights location not found
Rexdale Kipling location is 43.722114149999996, -79.57229244708017
Elms Old Rexdale location is 43.72176985, -79.55217331972301
Kingsview Village The Westway location not found
Willowridge Martingrove Richview location not found
Humber Heights Westmount location is 43.6957852, -79.5208324
Edenbridge Humber Valley location is 43.670672, -79.5188545
Princess Rosethorn location not found
Eringate Centennial West Deane location not found
Markland Wood location is 43.63123865, -79.58543401986114
Etobicoke West Mall location is 43.6435491, -79.56532534553605
Islington City Centre West location is 43.6625768, -79.5317888
Kingsway South location is 43.6473811, -79.5113328
Stonegate Queensway location not found
Mimico location is 43.6166773, -79.4968048
New Toronto location is 43.6007625, -79.505264
Long Branch location is 43.592

In [132]:

neighbor = hospital.iloc[2,0]
print(neighbor)
address = '{}, Toronto, ON, Canada'.format(neighbor)
geolocator = Nominatim(user_agent = 'to_explorer')
location = geolocator.geocode(address)
print(location.latitude, location.longitude)

Thistletown Beaumond Heights


AttributeError: 'NoneType' object has no attribute 'latitude'

In [122]:
test = hospital
test['Neighbourhood'] = test['Neighbourhood'].str.replace('-', ' ')
test.head(20)

Unnamed: 0,Neighbourhood,Male Admission,Female Admission,Both Admission
0,West Humber Clairville,2453,3151,5604
1,Mount Olive Silverstone Jamestown,2256,3004,5260
2,Thistletown Beaumond Heights,953,1153,2106
3,Rexdale Kipling,938,1090,2028
4,Elms Old Rexdale,674,978,1652
5,Kingsview Village The Westway,1713,2253,3966
6,Willowridge Martingrove Richview,1689,2211,3900
7,Humber Heights Westmount,950,1335,2285
8,Edenbridge Humber Valley,1029,1269,2298
9,Princess Rosethorn,708,865,1573


In [135]:
pd.merge(df,hospital,on='Neighbourhood', how='left')

Unnamed: 0,Neighbourhood,Population,Land Area,Density,Population %,Income,Commuting,2nd Language,2nd Language %,Latitudes,Longitudes,Male Admission,Female Admission,Both Admission
0,Crescent Town,8157.0,0.4,20393,-10.0,23021,24.5,Bengali (18.1%),18.1% Bengali,43.695403,-79.293099,,,
1,Leaside,13876.0,2.81,4938,3.0,82670,9.7,Bulgarian (0.4%),00.4% Bulgarian,43.704798,-79.36809,,,
2,O'Connor–Parkview,17740.0,4.94,3591,-6.1,33517,15.8,Urdu (3.2%),03.2% Urdu,43.70239,-79.316098,,,
3,Old East York,52220.0,7.94,6577,-4.6,33172,22.0,Greek (4.3%),04.3% Greek,43.712452,-79.312653,647.0,745.0,1392.0
4,Thorncliffe Park,17949.0,3.09,5809,9.1,25340,16.7,Urdu (21.5%),21.5% Urdu,43.704553,-79.345407,887.0,1526.0,2413.0
5,Alderwood,11656.0,4.94,2360,-4.0,35239,8.8,Polish (6.2%),06.2% Polish,43.601717,-79.545232,929.0,1176.0,2105.0
6,Centennial,12565.0,4.94,2544,0.5,34867,11.5,Polish (2.7%),02.7% Polish,43.787491,-79.150768,,,
7,Clairville,8506.0,6.71,1268,-3.3,26610,13.2,Punjabi (12.0%),12.0% Punjabi,43.72337,-79.597457,,,
8,Eatonville,19131.0,11.26,1699,4.3,36206,12.6,Serbian (3.2%),03.2% Serbian,43.646284,-79.560001,,,
9,Humber Heights,4674.0,1.69,2766,8.3,39738,10.1,Spanish (4.1%),04.1% Spanish,43.698179,-79.523277,,,
