# Comparing different San Francisco Neighborhoods

In [None]:
# Install third-party packages that may be missing from the kernel environment
# NOTE(review): on PyPI, `sklearn` is only a deprecated alias -- the maintained package name is `scikit-learn`.
!pip install lxml
!pip install sklearn
!pip install folium
!pip install seaborn

# Import all the libraries needed for the code
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Load the scraped San Francisco neighborhood data from its CSV export
df = pd.read_csv(filepath_or_buffer='sfneighborhood.csv')
df.head(n=5)

In [None]:
# Foursquare credentials
# Read the keys from environment variables instead of pasting them into the notebook,
# so they never end up in the saved file. An unset variable yields an empty string.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present: printing the values themselves
# would store the secret in the notebook output.
print('Your credentials:')
print('CLIENT_ID set: ' + str(bool(CLIENT_ID)))
print('CLIENT_SECRET set: ' + str(bool(CLIENT_SECRET)))

In [None]:
# Function to fetch venues using the foursquare API
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=50):
    """Fetch the venues around each location from the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Walked in parallel with ``zip``; one HTTP request is made per element.
    radius : int, default 500
        Sent unchanged as the ``radius`` query parameter.
    LIMIT : int, default 50
        Sent unchanged as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned at all.
        'Venue Category' is the name of the venue's first listed category, or a
        missing value when the venue lists no category.

    Raises
    ------
    requests.HTTPError
        When a request is answered with an HTTP error status.

    Notes
    -----
    Reads the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on an HTTP error rather than with a
        # KeyError while digging through an error payload
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories used to raise IndexError here
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Collect the venues around every San Francisco neighborhood listed in df
sf_venues = getNearbyVenues(df['NHOOD'], df['Latitude'], df['Longitude'])

In [None]:
# Preview the first venues that were fetched
sf_venues.head(n=5)

In [None]:
# One-hot encoding of the venue categories (one indicator column per category)
sf_onehot = pd.get_dummies(sf_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called 'Neighborhood', its indicator column would clash
# with the label column added next. The label column wins (as it did before), so the
# clashing indicator is removed explicitly instead of being silently overwritten.
sf_onehot = sf_onehot.drop(columns='Neighborhood', errors='ignore')

# Put the neighborhood label in front; rows line up because sf_onehot keeps sf_venues' index
sf_onehot.insert(0, 'Neighborhood', sf_venues['Neighborhood'])

# Show a preview only -- the full frame has one row per venue
sf_onehot.head()

In [None]:
# Average every one-hot column per neighborhood, keeping 'Neighborhood' as a regular column
sf_group = sf_onehot.groupby('Neighborhood', as_index=False).mean()
sf_group.head(n=5)

In [None]:
# Merging similar venue types into a single venue category like Restaurants, Schools, etc.

# Lumped column name -> regex that DataFrame.filter searches for in the one-hot column names.
# The patterns may overlap, so one venue column can feed more than one lumped category.
# The 'Nature' pattern used to end in '|': that empty alternative matched every column
# (the text 'Neighborhood' column included), so 'Nature' summed the whole row.
CATEGORY_PATTERNS = {
    'Restaurants': '(Restaurant|Joint|Caf|Breakfast|Brewery|Deli|Diner)',
    'Stores': '(Store|Bakery)',
    'Shops': '(Shop|Parlor|market|Supermarket|Market)',
    'Schools': 'School',
    'Bars': '(Bar|pub|Pub)',
    'Recretional': '(Gym|gym|Yoga|Sports|Tennis|Center|Court|Field|Bike|Arcade|Studio)',
    'Arts': '(Art|Museum|Gallery)',
    'Bank': '(ATM|Bank)',
    'Nature': '(Trail|Beach|Hill|Park)',
    'Transport': '(Bus|Station|Stop|Line|Train|Airport)',
    'Entertainment': '(Movie|Theatre|Concert)',
}

# All sums are taken from the same, still unmodified sf_group before any column is removed.
# The result is held in its own name: the neighborhood table `df` is still needed by later cells
# and must not be overwritten here.
lumped = pd.DataFrame({
    label: sf_group.filter(regex=pattern, axis=1).sum(axis=1)
    for label, pattern in CATEGORY_PATTERNS.items()
})

# Swap the fine-grained columns that fed any lumped category for the lumped columns.
# NOTE: sf_group is rebuilt from its own previous value, so re-run the grouping cell
# before running this cell a second time.
matched_columns = sf_group.filter(regex='|'.join(CATEGORY_PATTERNS.values()), axis=1).columns
sf_group = pd.concat([sf_group.drop(columns=matched_columns), lumped], axis=1)
sf_group.head()

In [None]:
# Build a separate frame holding the neighborhood label plus the lumped category columns only
lumped_columns = [
    'Neighborhood',
    'Restaurants',
    'Stores',
    'Shops',
    'Schools',
    'Bars',
    'Recretional',
    'Arts',
    'Bank',
    'Nature',
    'Transport',
    'Entertainment',
]
sf_cats = sf_group.loc[:, lumped_columns].copy()
sf_cats.head(n=5)

In [None]:
# The Nature category has a similar mean for every neighborhood, so the column is
# dropped to keep it from skewing the clustering.
# NOTE(review): a value that barely varies between rows may come from how 'Nature' was
# built rather than from the data -- verify the pattern that produced it.
sf_cats = sf_cats.drop(columns='Nature')
sf_cats.head(n=5)

In [None]:
# Adding and normalizing the Rent, House Price and Crime Rate columns
# Values are looked up by neighborhood name, not by row position: sf_cats comes out of a
# groupby on 'Neighborhood', so neither its row order nor its length has to match df's.
# Assumes df is still the neighborhood table read from the CSV, with one row per 'NHOOD' -- TODO confirm.
hood_stats = df.set_index('NHOOD')
for column in ['Rent', 'House Price', 'Crime Rate']:
    # Same scaling as before: divide by ten times the column maximum, so the largest value becomes 0.1
    scaled = hood_stats[column] / (hood_stats[column].max() * 10)
    sf_cats[column] = sf_cats['Neighborhood'].map(scaled)
sf_cats.head()


In [None]:
# Dropping the Neighborhood column: only the remaining columns are used as clustering features.
# The axis is given through the `columns` keyword; passing it positionally (`drop('Neighborhood', 1)`)
# is rejected by pandas 2.x.
sf_group_clustering = sf_cats.drop(columns='Neighborhood')

In [None]:
# Elbow method: fit KMeans for k = 1..8 and record the within-cluster sum of squares of each fit
k_values = range(1, 9)
wcss = []
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(sf_group_clustering)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters to look for the elbow
plt.plot(k_values, wcss)
plt.title('Elbow Graph')
plt.xlabel('Num Clusters')
plt.ylabel('WCSS')
plt.show()
      

In [None]:

# Number of clusters picked from the elbow graph
kclusters = 5

# Fit the final k-means model on the clustering features
kmeans = KMeans(
    n_clusters=kclusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
kmeans.fit(sf_group_clustering)

# Inspect the cluster label assigned to the first 31 rows
kmeans.labels_[:31]

In [None]:
# Insert the fitted cluster label of each row right after the Neighborhood column
sf_cats.insert(loc=1, column='Cluster Labels', value=kmeans.labels_)
sf_cats.head(n=5)

In [None]:
# Side-by-side bar plots of Crime Rate and House Price per cluster label,
# to show how k-means captured the relation between the two
fig, axs = plt.subplots(ncols=2)
for axis, metric in zip(axs, ('Crime Rate', 'House Price')):
    sns.barplot(data=sf_cats, x='Cluster Labels', y=metric, ax=axis)
fig.tight_layout()

In [None]:
# Using a pairplot to look for relations between the different factors, coloured by cluster.
# The facet size is given as `height`; seaborn renamed the former `size` keyword to it.
pairplot_columns = ['House Price', 'Rent', 'Crime Rate', 'Restaurants', 'Cluster Labels']
sns.pairplot(sf_cats[pairplot_columns], diag_kind='hist', hue='Cluster Labels', height=4)

In [None]:
# adding absolute values and lat lon
# Values are looked up by neighborhood name, not by row position: sf_cats is in groupby
# order and may hold fewer rows than df, so positional assignment could pair a neighborhood
# with another neighborhood's coordinates and prices.
# Assumes df is still the neighborhood table read from the CSV, with one row per 'NHOOD' -- TODO confirm.
hood_info = df.set_index('NHOOD')

# new sf_cats column -> source column in df
absolute_columns = {
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'House Price Value': 'House Price',
    'Rent Value': 'Rent',
    'Crime Rate%': 'Crime Rate',
}
for target, source in absolute_columns.items():
    sf_cats[target] = sf_cats['Neighborhood'].map(hood_info[source])
sf_cats.head()

In [None]:
# create a map of the city of San Francisco and plot the clustered neighborhoods on it
# (folium, cm and colors come from the imports cell at the top of the notebook)
map_clusters = folium.Map(location=[37.73, -122.44], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
marker_fields = zip(
    sf_cats['Latitude'],
    sf_cats['Longitude'],
    sf_cats['Neighborhood'],
    sf_cats['Cluster Labels'],
    sf_cats['House Price Value'],
    sf_cats['Rent Value'],
    sf_cats['Crime Rate%'],
)
for lat, lon, poi, cluster, hp, ren, cr in marker_fields:
    label = folium.Popup(
        '{} Cluster {} Avg House Price {} Avg Rent {} Crime Rate {}'.format(poi, cluster, hp, ren, cr),
        parse_html=True)
    # Cluster labels run from 0 to kclusters - 1, so they index the palette directly;
    # the former `cluster - 1` only worked for cluster 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters