# Capstone project - provide use the comparison of 2 cities based on the data available #

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe
import pandas as pd
# import k-means from clustering stage
from sklearn.cluster import KMeans

## Read the Wikipedia page for City1 ##

In [2]:
city1_address = 'Saskatoon, Saskatchewan, Canada'
table=pd.read_html("https://en.wikipedia.org/wiki/Saskatoon",attrs={"class":"infobox geography vcard"})[0]
loc=table[table[table.columns[0]].str.match('^Population.*')==True].index[0]
print("Population: ",int(table.loc[[loc+1]][table.columns[1]].to_list()[0]))
print("Density: ",(table.loc[[loc+2]][table.columns[1]].to_list()[0]))

Population:  246376
Density:  1,080.0/km2 (2,797/sq mi)


## Read the Wikipedia page for City2 ##

In [3]:
city2_address = 'Fremont, California, USA'
table=pd.read_html("https://en.wikipedia.org/wiki/Fremont,_California",attrs={"class":"infobox geography vcard"})[0]
loc=table[table[table.columns[0]].str.match('^Population.*')==True].index[0]
print("Population: ",int(table.loc[[loc+1]][table.columns[1]].to_list()[0]))
print("Density: ",(table.loc[[loc+4]][table.columns[1]].to_list()[0]))

Population:  214089
Density:  2,400/sq mi (940/km2)


## Get the Neighborhoods for City1 ##

In [4]:
source = requests.get("https://canadianvisa.org/blog/cities-and-places/saskatoon/neighborhoods").text
soup = BeautifulSoup(source, 'lxml')
table = soup.find_all("h2")
for i in table:
    str(i).replace("<h2>","")
table[1:5]

[<h2>Evergreen</h2>,
 <h2>Adelaide or Churchill</h2>,
 <h2>Mayfair</h2>,
 <h2>Lakeview</h2>]

### Creating function to get venues from FourSquare API

In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=10000):
    
    venues_list=[]
    CLIENT_ID="ASX3DZRL0RJC2H3VLR1YDHFGSBVU2ITFCZ3UUYKRZSIWV0FR"
    CLIENT_SECRET="SAFPB5VK2BFQHTN3VWKHFHTE0YGCENI5SJWIOCUQWIYKM3DT"
    VERSION='20180323'
    LIMIT=100
    url = 'https://api.foursquare.com/v2/venues/explore'

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

        #print (url)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        #print(results)
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Define cities for comparison

In [6]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Define the neighborhoods for both cities (it was hard to get this publicly in a nice format)

In [7]:
city1_dist=['Evergreen','Adelaide','Mayfair','Lakeview']
city2_dist=['Centerville', 'Niles', 'Irvington', 'Warm Springs']
city_df = []
for dist in city1_dist:
    city_df.append(dist+","+city1_address)

for dist in city2_dist:
    city_df.append(dist+","+city2_address)

city_df

['Evergreen,Saskatoon, Saskatchewan, Canada',
 'Adelaide,Saskatoon, Saskatchewan, Canada',
 'Mayfair,Saskatoon, Saskatchewan, Canada',
 'Lakeview,Saskatoon, Saskatchewan, Canada',
 'Centerville,Fremont, California, USA',
 'Niles,Fremont, California, USA',
 'Irvington,Fremont, California, USA',
 'Warm Springs,Fremont, California, USA']

## Get the Coordinates for all the neighborhoods using GeoLocator API

In [8]:
#city1_df=pd.DataFrame(columns=['Neighborhood','Latitude','Longitude'])
city_df=[]
for dist in city1_dist:
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(dist+","+city1_address)
    latitude = location.latitude
    longitude = location.longitude
    city_df.append([dist+","+city1_address,latitude,longitude])

city_df=pd.DataFrame(city_df,columns=['Neighborhood','Latitude','Longitude'])

city_df2=[]
for dist in city2_dist:
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(dist+","+city2_address)
    latitude = location.latitude
    longitude = location.longitude
    print(dist,latitude,longitude)
    city_df2.append([dist+","+city2_address,latitude,longitude])

city_df=city_df.append(pd.DataFrame(city_df2,columns=['Neighborhood','Latitude','Longitude']))

## Adding San Jose coordinates manually, as GeoLocator was giving error for this
city_df=city_df.append(pd.DataFrame([['Mission San Jose,Fremont, California, USA',37.534721, -121.920418]],columns=['Neighborhood','Latitude','Longitude']))

city_df

Centerville 37.559645599999996 -122.0067569
Niles 37.5781679 -121.9809177
Irvington 37.5322543 -121.9538481
Warm Springs 37.5019913 -121.9392084


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Evergreen,Saskatoon, Saskatchewan, Canada",52.169376,-106.56908
1,"Adelaide,Saskatoon, Saskatchewan, Canada",52.098655,-106.640908
2,"Mayfair,Saskatoon, Saskatchewan, Canada",52.141689,-106.678728
3,"Lakeview,Saskatoon, Saskatchewan, Canada",52.097643,-106.595409
0,"Centerville,Fremont, California, USA",37.559646,-122.006757
1,"Niles,Fremont, California, USA",37.578168,-121.980918
2,"Irvington,Fremont, California, USA",37.532254,-121.953848
3,"Warm Springs,Fremont, California, USA",37.501991,-121.939208
0,"Mission San Jose,Fremont, California, USA",37.534721,-121.920418


In [11]:
city_list=city_df

#for city in city_list:
city_venues = getNearbyVenues(names = city_list['Neighborhood'], latitudes = city_list['Latitude'],longitudes = city_list['Longitude'])
print('There are {} uniques categories.'.format(len(city_venues['Venue Category'].unique())))
# one hot encoding
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
city_onehot['Neighborhood'] = city_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [city_onehot.columns[-1]] + list(city_onehot.columns[:-1])
city_onehot = city_onehot[fixed_columns]

#city_onehot.head()
city_grouped = city_onehot.groupby('Neighborhood').mean().reset_index()
print(city_grouped)
num_top_venues = 10

for hood in city_grouped['Neighborhood']:
    #print("----"+hood+"----")
    temp = city_grouped[city_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = city_grouped['Neighborhood']

for ind in np.arange(city_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(city_grouped.iloc[ind, :], num_top_venues)

print(neighborhoods_venues_sorted)

# set number of clusters
kclusters = 5
city_grouped_clustering = city_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(city_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

Evergreen,Saskatoon, Saskatchewan, Canada
Adelaide,Saskatoon, Saskatchewan, Canada
Mayfair,Saskatoon, Saskatchewan, Canada
Lakeview,Saskatoon, Saskatchewan, Canada
Centerville,Fremont, California, USA
Niles,Fremont, California, USA
Irvington,Fremont, California, USA
Warm Springs,Fremont, California, USA
Mission San Jose,Fremont, California, USA
There are 130 uniques categories.
                                Neighborhood  Afghan Restaurant  \
0   Adelaide,Saskatoon, Saskatchewan, Canada               0.00   
1       Centerville,Fremont, California, USA               0.01   
2  Evergreen,Saskatoon, Saskatchewan, Canada               0.00   
3         Irvington,Fremont, California, USA               0.00   
4   Lakeview,Saskatoon, Saskatchewan, Canada               0.00   
5    Mayfair,Saskatoon, Saskatchewan, Canada               0.00   
6  Mission San Jose,Fremont, California, USA               0.00   
7             Niles,Fremont, California, USA               0.01   
8      Warm Spri

array([3, 2, 0, 4, 3], dtype=int32)

## Process the data for Cosine Similarity (remove Neighborhood, group on cities)

In [12]:
def get_city(neighborhood):
    return neighborhood.split(",")[1]+","+neighborhood.split(",")[2]+","+neighborhood.split(",")[3] 

city_group2 = city_onehot.groupby('Neighborhood').sum().reset_index()
#city_group2
city_group2['City']=city_group2.apply(lambda x: get_city(x['Neighborhood']),axis=1)
city_group2_new = city_group2.drop(['Neighborhood'],axis=1).groupby('City').sum()
#city_group2_new = city_group2.drop(['Neighborhood'],axis=1).sort_values('City', ascending=False)
#city_group2_new.set_index('City',inplace=True)

## Calculate Similarity between the cities

In [20]:
## METHOD 1

from sklearn.metrics import pairwise_distances
import sklearn.metrics.pairwise as pairwise

pairwise.cosine_similarity(city_group2_new)[0][1]

0.5692370799616825

In [19]:
## METHOD 2

import scipy.spatial as spatial
spatial.distance.cosine(city_group2_new.transpose().iloc[:,0],city_group2_new.transpose().iloc[:,1])

0.343105388838693

## Find top 10 venues for these cities

In [43]:
city_vens=city_group2_new.transpose()
city1_ven=city_vens[[city_vens.columns[0]]].sort_values(city_vens.columns[0],ascending=False)
city1_ven.head(10)

City,"Fremont, California, USA"
Coffee Shop,34
Grocery Store,24
Park,19
Ice Cream Shop,19
Bakery,18
Fast Food Restaurant,17
Trail,15
Sushi Restaurant,15
Thai Restaurant,14
Mexican Restaurant,13


In [41]:
city2_ven=city_vens[[city_vens.columns[1]]].sort_values(city_vens.columns[1],ascending=False)
city2_ven.head(10)

City,"Saskatoon, Saskatchewan, Canada"
Coffee Shop,18
Pub,16
Hotel,14
Café,14
Restaurant,14
Bakery,12
Pizza Place,12
American Restaurant,10
Grocery Store,9
Asian Restaurant,9


In [49]:
from IPython.display import display
pd.options.display.max_rows = None
display(city_vens)
#print(city_vens)

City,"Fremont, California, USA","Saskatoon, Saskatchewan, Canada"
Afghan Restaurant,2,0
American Restaurant,5,10
Asian Restaurant,2,9
BBQ Joint,3,0
Bagel Shop,3,0
Bakery,18,12
Bank,0,1
Bar,0,4
Beach,1,0
Beer Garden,2,0


In [13]:
city_merged = city_list.copy(deep=True)
city_merged.reset_index()

# add clustering labels
city_merged['Cluster Labels'] = kmeans.labels_

# merge city_grouped with city_data to add latitude/longitude for each neighborhood
city_merged = city_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

city_merged.to_csv("CityComparison_data.csv")

city_merged # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Evergreen,Saskatoon, Saskatchewan, Canada",52.169376,-106.56908,3,Coffee Shop,Hotel,Restaurant,Bakery,Pub,Asian Restaurant,Café,Pizza Place,American Restaurant,Grocery Store
1,"Adelaide,Saskatoon, Saskatchewan, Canada",52.098655,-106.640908,2,Pub,Coffee Shop,Café,Grocery Store,Pizza Place,Restaurant,Mexican Restaurant,Sandwich Place,Breakfast Spot,Bookstore
2,"Mayfair,Saskatoon, Saskatchewan, Canada",52.141689,-106.678728,0,Hotel,Restaurant,Coffee Shop,Pub,Café,Bakery,Steakhouse,Pizza Place,Asian Restaurant,American Restaurant
3,"Lakeview,Saskatoon, Saskatchewan, Canada",52.097643,-106.595409,4,Pub,Coffee Shop,Café,Pizza Place,Mexican Restaurant,Restaurant,American Restaurant,Bakery,Hotel,Breakfast Spot
0,"Centerville,Fremont, California, USA",37.559646,-122.006757,3,Coffee Shop,Grocery Store,Sushi Restaurant,Bakery,Ice Cream Shop,Park,Thai Restaurant,Breakfast Spot,Trail,Mexican Restaurant
1,"Niles,Fremont, California, USA",37.578168,-121.980918,0,Park,Coffee Shop,Grocery Store,Breakfast Spot,Sushi Restaurant,Bakery,Fast Food Restaurant,Chinese Restaurant,Mexican Restaurant,Thai Restaurant
2,"Irvington,Fremont, California, USA",37.532254,-121.953848,4,Coffee Shop,Grocery Store,Ice Cream Shop,Fast Food Restaurant,Gym,Pizza Place,Trail,Bakery,Falafel Restaurant,Park
3,"Warm Springs,Fremont, California, USA",37.501991,-121.939208,2,Coffee Shop,Fast Food Restaurant,Gym,Grocery Store,Sushi Restaurant,Juice Bar,Trail,Thai Restaurant,Bakery,Mexican Restaurant
0,"Mission San Jose,Fremont, California, USA",37.534721,-121.920418,1,Coffee Shop,Ice Cream Shop,Trail,Bakery,Fast Food Restaurant,Grocery Store,Gym,Mexican Restaurant,Pizza Place,Falafel Restaurant


## Put all clusters on the map

In [14]:
# Matplotlib and associated plotting modules
!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(city_merged['Latitude'], city_merged['Longitude'], city_merged['Neighborhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

