#### Importing and reading libraries

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from geopy.geocoders import Nominatim
import urllib.request
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline
from sklearn.cluster import KMeans
import os


#### Scraping data from Wikipedia into a DataFrame

In [14]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
List_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(List_url, timeout=30)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
source = response.text

# The page is HTML, so parse it with an HTML parser; the 'xml' parser is meant
# for XML documents and additionally requires lxml to be installed.
soup = BeautifulSoup(source, 'html.parser')

# Take the first <table> in the document.
# NOTE(review): assumes the first table is the postal-code table - confirm
# against the current page layout.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(List_url))

In [15]:
# The DataFrame consists of three columns: Postalcode, Borough and Neighborhood.
column_names = ['Postalcode','Borough','Neighborhood']

# Collect the stripped text of every table row that has exactly one <td> per
# column; rows with a different cell count (e.g. header rows) are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

# Build the frame once from the collected rows instead of appending to it
# row by row, which copies the frame on every insertion.
df = pd.DataFrame(rows, columns=column_names)

In [16]:
# Preview the first rows of the scraped table to sanity-check the parse.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Data cleaning

In [17]:
# Keep only rows whose borough is known; .copy() so the assignment below
# writes to an independent frame rather than a slice of the original.
df = df[df['Borough'] != 'Not assigned'].copy()

# The original second statement only *selected* rows with an unassigned
# neighborhood and discarded the result, so it had no effect.
# NOTE(review): presumably the intent was to fall back to the borough name
# for those rows - confirm.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Collapse the data

In [18]:
# One row per postal code, with all of its neighborhood names joined by ', '
# into a single 'Neighborhood_joined' column.
temp_df = (
    df.groupby('Postalcode')['Neighborhood']
      .apply(', '.join)
      .reset_index()
      .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [19]:
# Attach the joined neighborhood string to every row, drop the per-row
# neighborhood, remove the duplicates this creates and restore the
# original column name.
df_merge = (
    pd.merge(df, temp_df, on='Postalcode')
      .drop(columns=['Neighborhood'])
      .drop_duplicates()
      .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)
df_merge.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
# (rows, columns) of the collapsed table.
df_merge.shape

(103, 3)

In [21]:
from geopy.geocoders import Nominatim
def get_geocode(postal_code, geolocator=None, max_attempts=3):
    """Look up the latitude and longitude of a Toronto postal code.

    The previous implementation called ``geocoder.google``, but ``geocoder``
    is never imported in this notebook (only geopy's ``Nominatim`` is), and
    it retried forever when no result came back.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is queried as
        ``'<postal_code>, Toronto, Ontario'``.
    geolocator : object, optional
        Any object with a ``geocode(query)`` method returning either ``None``
        or an object with ``latitude`` / ``longitude`` attributes. Defaults
        to a geopy ``Nominatim`` instance.
    max_attempts : int, optional
        How many times to query before giving up.

    Returns
    -------
    tuple
        ``(latitude, longitude)``.

    Raises
    ------
    ValueError
        If no attempt returned a location.
    """
    if geolocator is None:
        # Nominatim requires an application-specific user agent.
        geolocator = Nominatim(user_agent='toronto-neighborhood-notebook')

    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        location = geolocator.geocode(query)
        if location is not None:
            return location.latitude, location.longitude

    raise ValueError(
        'Could not geocode {!r} after {} attempt(s)'.format(query, max_attempts)
    )
# Read the postal-code coordinates from the hosted CSV; get_geocode above is
# not called here. The preview below shows the columns
# 'Postal Code', 'Latitude' and 'Longitude'.
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Align the key column name with df_merge, join the coordinates onto the
# borough/neighborhood table, and put the columns in display order.
geo_df = geo_df.rename(columns={'Postal Code': 'Postalcode'})
geo_merged = geo_df.merge(df_merge, on='Postalcode')
geo_data = geo_merged.loc[
    :, ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
]
geo_data.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


#### Work only with boroughs whose name contains the word "Toronto"

In [23]:
# Boolean mask: True where the borough name contains the substring "Toronto".
in_toronto = geo_data['Borough'].str.contains("Toronto")
toronto_data = geo_data[in_toronto]
toronto_data.head(40)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


#### 40 boroughs are filtered to be looked at for the desirable location.

In [24]:
# Foursquare API credentials are read from the environment so that secrets are
# not stored in the notebook. Before starting the kernel, set:
#   FOURSQUARE_CLIENT_ID      - Foursquare ID
#   FOURSQUARE_CLIENT_SECRET  - Foursquare secret code
# NOTE(review): the credentials that were previously hard-coded here have been
# exposed in the notebook and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20201004'  # value sent as the Foursquare `v` request parameter

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=100):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables with one entry per location; they are consumed
        together with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` request parameter (previously hard-coded
        to 1000).
    limit : int, optional
        Value sent as the ``limit`` request parameter (previously hard-coded
        to 100).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood_Latitude``, ``Neighborhood_Longitude``, ``Venue``,
        ``Venue_Latitude``, ``Venue_Longitude`` and ``Venue_Category``.
        The frame is empty (but still has these columns) when there are no
        locations or no venues.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood_Latitude',
               'Neighborhood_Longitude',
               'Venue',
               'Venue_Latitude',
               'Venue_Longitude',
               'Venue_Category']
    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        response.raise_for_status()

        # Only the first group of the response is used, as before; a missing
        # or empty group list simply contributes no venues.
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # A venue without categories gets a missing category instead of
            # raising IndexError.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema even when
    # no rows were collected; assigning .columns afterwards raised on an
    # empty frame.
    return pd.DataFrame(rows, columns=columns)
# Fetch the venues around every row of toronto_data (one request per row).
toronto_venues = getNearbyVenues(
    names=toronto_data['Neighborhood'],
    latitudes=toronto_data['Latitude'],
    longitudes=toronto_data['Longitude'],
)

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

##### Venues found around each neighborhood's latitude and longitude

In [26]:
# Show up to the first 40 venue rows returned by getNearbyVenues.
toronto_venues.head(40)

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
2,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
3,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
4,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
5,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.67263,-79.287993,Ice Cream Shop
6,The Beaches,43.676357,-79.293031,Bagels On Fire,43.672864,-79.286784,Bagel Shop
7,The Beaches,43.676357,-79.293031,Mastermind Toys,43.671453,-79.293971,Toy / Game Store
8,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
9,The Beaches,43.676357,-79.293031,Veloute Bistro,43.672267,-79.289584,French Restaurant


In [27]:
# Column dtypes and non-null counts of the venue table.
toronto_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3176 entries, 0 to 3175
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Neighborhood            3176 non-null   object 
 1   Neighborhood_Latitude   3176 non-null   float64
 2   Neighborhood_Longitude  3176 non-null   float64
 3   Venue                   3176 non-null   object 
 4   Venue_Latitude          3176 non-null   float64
 5   Venue_Longitude         3176 non-null   float64
 6   Venue_Category          3176 non-null   object 
dtypes: float64(4), object(3)
memory usage: 173.8+ KB


In [28]:
# Number of distinct values in the Venue_Category column.
unique_categories = toronto_venues['Venue_Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))

There are 278 uniques categories.


In [39]:
# Rows of toronto_venues whose category is exactly "Indian Restaurant".
# NOTE(review): the result is stored as an ad-hoc attribute on `df`, not as a
# column; later cells read it back under the same attribute name.
is_indian_restaurant = toronto_venues['Venue_Category'] == "Indian Restaurant"
df.Indian_Restaurant = toronto_venues[is_indian_restaurant]


In [40]:
# Non-null count per column, i.e. how many Indian-restaurant rows were found.
df.Indian_Restaurant.count()


Neighborhood              27
Neighborhood_Latitude     27
Neighborhood_Longitude    27
Venue                     27
Venue_Latitude            27
Venue_Longitude           27
Venue_Category            27
dtype: int64

In [41]:
# Preview the first Indian-restaurant rows.
df.Indian_Restaurant.head()

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
30,The Beaches,43.676357,-79.293031,Delhi Bistro,43.672506,-79.288353,Indian Restaurant
69,The Beaches,43.676357,-79.293031,ali's tandoori curry house,43.673393,-79.28374,Indian Restaurant
111,"The Danforth West, Riverdale",43.679557,-79.352188,Sher-E-Punjab,43.677308,-79.353066,Indian Restaurant
183,"India Bazaar, The Beaches West",43.668999,-79.315572,Udupi Palace,43.67248,-79.321275,Indian Restaurant
187,"India Bazaar, The Beaches West",43.668999,-79.315572,Motimahal,43.672175,-79.322532,Indian Restaurant


In [42]:
# Rows of toronto_venues whose category is exactly "Pool".
# NOTE(review): stored as an ad-hoc attribute on `df`, not as a column; later
# cells read it back under the same attribute name.
is_pool = toronto_venues['Venue_Category'] == "Pool"
df.SwimmingPool = toronto_venues[is_pool]

In [43]:
# Non-null count per column, i.e. how many pool rows were found.
df.SwimmingPool.count()

Neighborhood              5
Neighborhood_Latitude     5
Neighborhood_Longitude    5
Venue                     5
Venue_Latitude            5
Venue_Longitude           5
Venue_Category            5
dtype: int64

In [44]:
# Preview the first pool rows.
df.SwimmingPool.head()

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
136,"The Danforth West, Riverdale",43.679557,-79.352188,Riverdale Pool,43.67238,-79.35669,Pool
225,"India Bazaar, The Beaches West",43.668999,-79.315572,Donald Dean Summerville Olympic Pools,43.665187,-79.304684,Pool
792,"St. James Town, Cabbagetown",43.667967,-79.367675,Regent Park Aquatic Centre,43.6606,-79.361392,Pool
933,"Regent Park, Harbourfront",43.65426,-79.360636,Regent Park Aquatic Centre,43.6606,-79.361392,Pool
2931,"Parkdale, Roncesvalles",43.64896,-79.456325,High Park Swimming Pool,43.649507,-79.465699,Pool


### Combining the Indian-restaurant and pool DataFrames into a single DataFrame

In [45]:
# Stack the Indian-restaurant rows on top of the pool rows, keeping their
# original index labels. DataFrame.append is deprecated and was removed in
# pandas 2.0, so pd.concat is used instead.
df.Result = pd.concat([df.Indian_Restaurant, df.SwimmingPool], sort=False)
df.Result.head(50)

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
30,The Beaches,43.676357,-79.293031,Delhi Bistro,43.672506,-79.288353,Indian Restaurant
69,The Beaches,43.676357,-79.293031,ali's tandoori curry house,43.673393,-79.28374,Indian Restaurant
111,"The Danforth West, Riverdale",43.679557,-79.352188,Sher-E-Punjab,43.677308,-79.353066,Indian Restaurant
183,"India Bazaar, The Beaches West",43.668999,-79.315572,Udupi Palace,43.67248,-79.321275,Indian Restaurant
187,"India Bazaar, The Beaches West",43.668999,-79.315572,Motimahal,43.672175,-79.322532,Indian Restaurant
210,"India Bazaar, The Beaches West",43.668999,-79.315572,Bombay Chowpatty,43.671875,-79.324114,Indian Restaurant
228,"India Bazaar, The Beaches West",43.668999,-79.315572,Regency Restaurant,43.672118,-79.322474,Indian Restaurant
236,"India Bazaar, The Beaches West",43.668999,-79.315572,Haandi 2000,43.671929,-79.323662,Indian Restaurant
238,"India Bazaar, The Beaches West",43.668999,-79.315572,Gautama,43.67215,-79.322877,Indian Restaurant
241,"India Bazaar, The Beaches West",43.668999,-79.315572,The Famous Indian Restaurant,43.672339,-79.321941,Indian Restaurant


In [46]:
# Column dtypes and non-null counts of the combined restaurant + pool table.
df.Result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 30 to 2931
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Neighborhood            32 non-null     object 
 1   Neighborhood_Latitude   32 non-null     float64
 2   Neighborhood_Longitude  32 non-null     float64
 3   Venue                   32 non-null     object 
 4   Venue_Latitude          32 non-null     float64
 5   Venue_Longitude         32 non-null     float64
 6   Venue_Category          32 non-null     object 
dtypes: float64(4), object(3)
memory usage: 2.0+ KB


In [106]:
# Keep only the neighborhood and the venue category of each combined row.
df.new_result = df.Result.loc[:, ['Neighborhood', 'Venue_Category']]
df.new_result.head(50)

Unnamed: 0,Neighborhood,Venue_Category
30,The Beaches,Indian Restaurant
69,The Beaches,Indian Restaurant
111,"The Danforth West, Riverdale",Indian Restaurant
183,"India Bazaar, The Beaches West",Indian Restaurant
187,"India Bazaar, The Beaches West",Indian Restaurant
210,"India Bazaar, The Beaches West",Indian Restaurant
228,"India Bazaar, The Beaches West",Indian Restaurant
236,"India Bazaar, The Beaches West",Indian Restaurant
238,"India Bazaar, The Beaches West",Indian Restaurant
241,"India Bazaar, The Beaches West",Indian Restaurant


In [131]:
# Combined rows that belong to the 'India Bazaar, The Beaches West' neighborhood.
in_india_bazaar = df.Result['Neighborhood'] == 'India Bazaar, The Beaches West'
df.neighbourhood = df.Result.loc[in_india_bazaar]
df.neighbourhood.head(20)

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
183,"India Bazaar, The Beaches West",43.668999,-79.315572,Udupi Palace,43.67248,-79.321275,Indian Restaurant
187,"India Bazaar, The Beaches West",43.668999,-79.315572,Motimahal,43.672175,-79.322532,Indian Restaurant
210,"India Bazaar, The Beaches West",43.668999,-79.315572,Bombay Chowpatty,43.671875,-79.324114,Indian Restaurant
228,"India Bazaar, The Beaches West",43.668999,-79.315572,Regency Restaurant,43.672118,-79.322474,Indian Restaurant
236,"India Bazaar, The Beaches West",43.668999,-79.315572,Haandi 2000,43.671929,-79.323662,Indian Restaurant
238,"India Bazaar, The Beaches West",43.668999,-79.315572,Gautama,43.67215,-79.322877,Indian Restaurant
241,"India Bazaar, The Beaches West",43.668999,-79.315572,The Famous Indian Restaurant,43.672339,-79.321941,Indian Restaurant
245,"India Bazaar, The Beaches West",43.668999,-79.315572,Lahori Taste & Burger House,43.671656,-79.324581,Indian Restaurant
225,"India Bazaar, The Beaches West",43.668999,-79.315572,Donald Dean Summerville Olympic Pools,43.665187,-79.304684,Pool


#### From the above data frame, "India Bazaar, The Beaches West" has 8 Indian restaurants and 1 pool, so we conclude that this neighbourhood is the ideal place for our new immigrant to settle in.