# All Three Sections are Contained within this Notebook

## Part 1- Scrape and Transform Toronto Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read every HTML table on the Wikipedia "M" postal-code page and keep the first one
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_link)
scrape = wiki_tables[0]

In [None]:
# Keep only the rows whose borough is something other than the 'Not assigned' placeholder
has_borough = scrape['Borough'].ne('Not assigned')
scrape = scrape[has_borough]

In [None]:
# Display any remaining rows whose neighborhood is still the 'Not assigned' placeholder
scrape.loc[scrape['Neighborhood'].eq('Not assigned')]

Since no neighborhoods are unassigned, neighborhoods can now be aggregated by Postal Code and Borough.

In [None]:
# One row per (postal code, borough); the remaining text columns are joined with ", "
toronto = scrape.groupby(by=["Postal Code", "Borough"], as_index=False).agg(", ".join)

In [None]:
toronto.shape

In [None]:
toronto.head(10)

## Part 2- Geocoding Toronto postal codes

In [None]:
# Load the previously downloaded postal-code coordinate table (path is relative to the notebook)
geo_spat = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")

In [None]:
# Attach coordinates to each postal code; a left join keeps postal codes that have no match
toronto = pd.merge(toronto, geo_spat, on="Postal Code", how="left")

In [None]:
# Spot-check: show the coordinates of a fixed list of postal codes, in the
# same order as the reference table on the Coursera assignment page.
test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# .copy() makes `test` an independent frame, so adding a column below does not
# write into a slice of `toronto` (avoids pandas' SettingWithCopyWarning).
test = toronto[toronto["Postal Code"].isin(test_list)].copy()

# An ordered categorical sorts by position in test_list instead of alphabetically.
test['cat_sort'] = pd.Categorical(
    test["Postal Code"],
    categories=test_list,
    ordered=True
)
test.sort_values('cat_sort').drop(columns='cat_sort')

The coordinates tally with the data frame shown on the assignment page.

## Part 3- Explore and Cluster Neighborhoods

In [None]:
import json
import os
from getpass import getpass

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

try:
    # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to search around.
    radius : number, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        response.raise_for_status()

        # A location with no recommendations is treated as "no venues", not an error
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category is kept, with a missing category
                categories[0]['name'] if categories else None,
            ))

    # Passing columns= keeps the schema stable even when no venue was collected
    return pd.DataFrame(rows, columns=columns)

In [None]:
def get_lat_lng(address):
    """Geocode ``'<address>, Toronto, Ontario'`` with the notebook-level ``geolocator``.

    Parameters
    ----------
    address : str
        Free-text address fragment, e.g. "<neighborhood>, <borough>".

    Returns
    -------
    tuple
        ``(latitude, longitude)``, or ``(None, None)`` when the geocoder finds
        no match or the lookup fails.
    """
    try:
        location = geolocator.geocode('{}, Toronto, Ontario'.format(address))
    except Exception:
        # Best-effort lookup: a failed request leaves this address un-geocoded.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt a long run.
        return None, None

    if location is None:
        # geocode() found no match for the query
        return None, None

    return location.latitude, location.longitude

In [None]:
# Geocode the city itself with Nominatim and keep its coordinates in latitude / longitude
geolocator = Nominatim(user_agent="toronto_explorer")
address = 'Toronto, Ontario'
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

### Map Toronto Neighborhoods

In [None]:
# Base map centred on the city coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row, with a "neighborhood, borough" popup
marker_fields = toronto[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

### Use Foursquare API for neighborhood venues

#### Define Foursquare credentials

In [None]:
# Foursquare credentials are never stored in the notebook: they are read from the
# environment, falling back to a hidden interactive prompt when a variable is unset.
# NOTE(review): the key pair that used to be hard-coded in this cell has been exposed
# and should be revoked in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180604'  # value sent as the API's `v` (version) parameter

# Confirm the values are present without echoing them into the saved output
print('Foursquare credentials loaded.')

#### To analyze the neighborhoods, they are first geocoded and then venues are obtained for each neighborhood

In [None]:
#explode neighborhoods to rows
# The postal-code coordinates are renamed so they cannot be confused with the
# per-neighborhood coordinates added later. ('Postal Logitude' is misspelled, but
# later cells use the same spelling, so it must stay in sync with them.)
toronto.rename(columns={'Latitude':'Postal_Latitude',
                        'Longitude':'Postal Logitude'},inplace=True)
# Turn the comma-joined neighborhood string into a list of trimmed names.
# NOTE: not re-runnable — on a second run the values are already lists and .split fails.
toronto.Neighborhood=toronto.Neighborhood.apply(lambda x: [y.strip() for y in x.split(',')])
# Every other column goes into the index so that only 'Neighborhood' is exploded
# (one row per name); reset_index then brings those columns back in front of
# 'Neighborhood'. Later cells rely on this column order.
toronto=toronto.set_index(list(toronto.drop(columns=['Neighborhood']).columns)).apply(pd.Series.explode).reset_index()

In [None]:
# Geocoding query text for each row: "<neighborhood>, <borough>"
toronto['Address'] = toronto['Neighborhood'].add(', ').add(toronto['Borough'])

In [None]:
# Geocode every address; each (lat, lng) pair becomes a two-column row labelled 0 and 1
neighborhood_coords = toronto['Address'].apply(lambda addr: pd.Series(get_lat_lng(addr)))
# Row-wise join on the shared index appends the two coordinate columns
toronto = toronto.merge(neighborhood_coords, left_index=True, right_index=True)

In [None]:
# Positional relabel: the last two names replace the unnamed geocoding columns
geocoded_column_names = [
    'Postal Code',
    'Borough',
    'Postal_Latitude',
    'Postal Logitude',
    'Neighborhood',
    'Address',
    'Neighborhood_lat',
    'Neighborhood_lng',
]
toronto.columns = geocoded_column_names

In [None]:
# Discard rows where geocoding returned no latitude or no longitude
toronto = toronto.dropna(subset=['Neighborhood_lat', 'Neighborhood_lng'])

In [None]:
# Fetch nearby venues for every geocoded neighborhood (default search radius)
venue_query = toronto[['Neighborhood', 'Neighborhood_lat', 'Neighborhood_lng']]
toronto_venues = getNearbyVenues(
    venue_query['Neighborhood'],
    venue_query['Neighborhood_lat'],
    venue_query['Neighborhood_lng'],
)

In [None]:
toronto_venues.shape

In [None]:
toronto_venues.head()

In [None]:
toronto_venues.groupby('Neighborhood').count()

In [None]:
# Number of distinct venue categories (a missing category counts as one value)
n_categories = toronto_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(n_categories))

### Analyze Neighborhoods

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, ignoring its first entry.

    The first element of ``row`` is skipped; the rest are ordered from largest
    to smallest and the index labels of the leading ``num_top_venues`` entries
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [None]:
# One-hot encode the venue categories; column names are the bare category names
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would collide with the key column
toronto_onehot = toronto_onehot.drop(columns=['Neighborhood'], errors='ignore')

# Attribute assignment (toronto_onehot.Neighborhood = ...) does not create a column,
# so the neighborhood key is inserted explicitly as the first column.
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

In [None]:
toronto_onehot.head()

In [None]:
toronto_onehot.shape

In [None]:
# Mean of each one-hot column per neighborhood, i.e. the share of venues in each category
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: "1st Most Common Venue", "2nd ...", "3rd ...", then "4th", "5th", ...
# The suffix is chosen with an explicit bounds check rather than a bare except.
columns = ['Neighborhood'] + [
    '{}{} Most Common Venue'.format(
        rank, indicators[rank - 1] if rank <= len(indicators) else 'th'
    )
    for rank in range(1, num_top_venues + 1)
]

# Collect the top categories of every neighborhood, then build the frame once
# instead of writing into it row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[pos, :], num_top_venues)
    for pos in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=toronto_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head(10)

In [None]:
toronto.shape

In [None]:
def calculate_WSS(points, kmax, random_state=0):
    """Within-cluster sum of squares (WSS) of k-means for k = 1 .. kmax.

    Parameters
    ----------
    points : array-like of shape (n_samples, n_features)
        Observations to cluster.
    kmax : int
        Largest number of clusters to try.
    random_state : int or None, default 0
        Seed passed to ``KMeans`` so that the curve is reproducible.

    Returns
    -------
    list of float
        ``sse[k - 1]`` is the sum of squared Euclidean distances from every
        point to the centre of the cluster it is assigned to, for ``k`` clusters.
    """
    points = np.asarray(points, dtype=float)
    sse = []
    for k in range(1, kmax + 1):
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(points)
        assigned_centers = kmeans.cluster_centers_[kmeans.predict(points)]
        # Sum over ALL feature columns; using only the first two would
        # under-state the WSS for data with more than two dimensions.
        sse.append(float(((points - assigned_centers) ** 2).sum()))
    return sse

In [None]:
# Elbow curve: WSS for k = 1 .. max_k on the category-frequency columns
max_k = 10
sse = calculate_WSS(toronto_grouped.drop(columns='Neighborhood').to_numpy(), max_k)

# Index the series by k so the x-axis reads as the number of clusters (1..max_k),
# not as a 0-based list position.
ax = pd.Series(sse, index=range(1, max_k + 1)).plot(kind='line', marker='o')
ax.set(xlabel='Number of clusters (k)',
       ylabel='Within-cluster sum of squares',
       title='Elbow curve');

From the plot above, 7 seems to be a good number of clusters.

#### Run k-means for the selected number of clusters

In [None]:
# set number of clusters
# NOTE(review): the text above this cell suggests 7 clusters while 8 is used here — confirm which is intended
kclusters = 8

# run k-means clustering on the category-frequency columns only
# (columns= replaces the positional axis argument, which pandas 2.0 no longer accepts)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped.drop(columns='Neighborhood'))

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Build one frame holding, per neighborhood, its cluster label, its top venues
# and its coordinates.
toronto_merged = toronto_grouped[["Neighborhood"]].copy()

# k-means was fitted on toronto_grouped, so labels_ lines up with its rows
toronto_merged["Cluster Labels"] = kmeans.labels_

# add the ranked venue categories
toronto_merged = toronto_merged.merge(neighborhoods_venues_sorted, on="Neighborhood")

# De-duplicate the coordinates BEFORE merging: toronto_venues repeats them once
# per venue, so merging first would create one row per venue only to drop them again.
neighborhood_coords = toronto_venues[
    ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
].drop_duplicates()
toronto_merged = toronto_merged.merge(neighborhood_coords, on='Neighborhood', how='left')

In [None]:
toronto_merged["Cluster Labels"].value_counts()

#### Cluster Visualization

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one evenly spaced rainbow colour per cluster, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one marker per neighborhood, coloured by its cluster
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'],
                                  toronto_merged['Neighborhood Longitude'],
                                  toronto_merged['Neighborhood'],
                                  toronto_merged['Cluster Labels']):
    # k-means labels are 0-based, so they index the palette directly
    # (the former `cluster-1` wrapped label 0 around to the last colour)
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Cluster Examination

In [None]:
# For every cluster, take the most frequent value in each "Nth Most Common Venue" column.
# Identifier and coordinate columns are dropped first (with columns=, since the
# positional axis argument is not accepted by pandas 2.0).
cluster_summary = (
    toronto_merged
    .drop(columns=['Neighborhood', 'Neighborhood Longitude', 'Neighborhood Latitude'])
    .groupby('Cluster Labels')
    .agg(lambda col: col.value_counts().index[0])
)
cluster_summary

By examining the most repeated venue category at each rank of the most common venues within each cluster, the type of neighborhood becomes apparent.