Import libraries

In [1]:
# basic libraries
import numpy as np
import pandas as pd

# libraries for scraping wikipedia page
import requests
import urllib.request
from bs4 import BeautifulSoup

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# plotting libraries
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes
import folium

# for k-means clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.4.1               |             py_0          26 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    ------------------------------------------------------------
                       

Load wikipedia page with list of Bay Area cities

In [2]:
from urllib.request import urlopen
url = 'https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_the_San_Francisco_Bay_Area'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')

Get the table with the list of cities

In [3]:
table=soup.find('table', class_='wikitable plainrowheaders sortable')
#print(table.prettify())

Extract data into lists

In [4]:
Name=[]
County=[]
Population=[]
Area=[]

for row in table.findAll('tr')[2:]:
    cityname = row.find('th', scope='row').find(text=True)
    Name.append(cityname)

    cells=row.findAll('td')
    if len(cells)==6:
        County.append(cells[1].find(text=True))
        Population.append(str.rstrip(cells[2].find(text=True)))
        Area.append(str.rstrip(cells[3].find(text=True)))

Store lists in dataframe

In [5]:
BayAreaCities=pd.DataFrame(Name,columns=['City'])
BayAreaCities['County']=County
BayAreaCities['Population']=Population
BayAreaCities['Land Area']=Area

print(BayAreaCities.shape)
BayAreaCities.head()

(101, 4)


Unnamed: 0,City,County,Population,Land Area
0,Alameda,Alameda,73812,10.61
1,Albany,Alameda,18539,1.79
2,American Canyon,Napa,19454,4.84
3,Antioch,Contra Costa,102372,28.35
4,Atherton,San Mateo,6914,5.02


Add latitude and longitude for each city

In [6]:
BayAreaCities['Latitude']=''
BayAreaCities['Longitude']=''
geolocator = Nominatim(user_agent="ba_explorer")

count=0

for city in BayAreaCities['City']:
    city = BayAreaCities.iloc[count]['City']
    print(city)
    address=city+', California'    
    location = geolocator.geocode(address)
    BayAreaCities.loc[count,'Latitude']=location.latitude
    BayAreaCities.loc[count,'Longitude']=location.longitude
    count+=1
    
print(BayAreaCities.shape)
BayAreaCities.head()

Alameda
Albany
American Canyon
Antioch
Atherton
Belmont
Belvedere
Benicia
Berkeley
Brentwood
Brisbane
Burlingame
Calistoga
Campbell
Clayton
Cloverdale
Colma
Concord
Corte Madera
Cotati
Cupertino
Daly City
Danville
Dixon
Dublin
East Palo Alto
El Cerrito
Emeryville
Fairfax
Fairfield
Foster City
Fremont
Gilroy
Half Moon Bay
Hayward
Healdsburg
Hercules
Hillsborough
Lafayette
Larkspur
Livermore
Los Altos
Los Altos Hills
Los Gatos
Martinez
Menlo Park
Mill Valley
Millbrae
Milpitas
Monte Sereno
Moraga
Morgan Hill
Mountain View
Napa
Newark
Novato
Oakland
Oakley
Orinda
Pacifica
Palo Alto
Petaluma
Piedmont
Pinole
Pittsburg
Pleasant Hill
Pleasanton
Portola Valley
Redwood City
Richmond
Rio Vista
Rohnert Park
Ross
St. Helena
San Anselmo
San Bruno
San Carlos
San Francisco
San Jose
San Leandro
San Mateo
San Pablo
San Rafael
San Ramon
Santa Clara
Santa Rosa
Saratoga
Sausalito
Sebastopol
Sonoma
South San Francisco
Suisun City
Sunnyvale
Tiburon
Union City
Vacaville
Vallejo
Walnut Creek
Windsor
Woodside
Y

Unnamed: 0,City,County,Population,Land Area,Latitude,Longitude
0,Alameda,Alameda,73812,10.61,37.609,-121.899
1,Albany,Alameda,18539,1.79,37.8869,-122.298
2,American Canyon,Napa,19454,4.84,38.2235,-122.227
3,Antioch,Contra Costa,102372,28.35,38.0049,-121.806
4,Atherton,San Mateo,6914,5.02,37.4613,-122.198


Get latitude and longitude for San Franciscom, and map the Bay Area cities

In [32]:

address = 'San Francisco, USA'

#geolocator = Nominatim(user_agent="ba_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
#print('The geograpical coordinate of San Francisco are {}, {}.'.format(latitude, longitude))

map_ba = folium.Map(location=[latitude, longitude], zoom_start=9)

for lat, lng, city in zip(BayAreaCities['Latitude'], BayAreaCities['Longitude'], BayAreaCities['City']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ba)  
    
map_ba


Set up to use FourSquare

In [8]:
CLIENT_ID = 'SWM3UXBLTTAKPFB1ZF5ZVIFID3YVDE0SNO2YGCLGMKJEO125' # your Foursquare ID
CLIENT_SECRET = 'BX1R0O3RZYGYRKXYOFL2C1BLHMEKD2I05CAE2BIQ0VY0ZOLM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

radius = 500
LIMIT = 100

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: SWM3UXBLTTAKPFB1ZF5ZVIFID3YVDE0SNO2YGCLGMKJEO125
CLIENT_SECRET:BX1R0O3RZYGYRKXYOFL2C1BLHMEKD2I05CAE2BIQ0VY0ZOLM


Define functions to process FourSquare data

In [9]:
# extract the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# get nearby venues for a neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Get venues for Bay Area cities

In [10]:
BayArea_venues = getNearbyVenues(names=BayAreaCities['City'], latitudes=BayAreaCities['Latitude'], longitudes=BayAreaCities['Longitude'])

Alameda
Albany
American Canyon
Antioch
Atherton
Belmont
Belvedere
Benicia
Berkeley
Brentwood
Brisbane
Burlingame
Calistoga
Campbell
Clayton
Cloverdale
Colma
Concord
Corte Madera
Cotati
Cupertino
Daly City
Danville
Dixon
Dublin
East Palo Alto
El Cerrito
Emeryville
Fairfax
Fairfield
Foster City
Fremont
Gilroy
Half Moon Bay
Hayward
Healdsburg
Hercules
Hillsborough
Lafayette
Larkspur
Livermore
Los Altos
Los Altos Hills
Los Gatos
Martinez
Menlo Park
Mill Valley
Millbrae
Milpitas
Monte Sereno
Moraga
Morgan Hill
Mountain View
Napa
Newark
Novato
Oakland
Oakley
Orinda
Pacifica
Palo Alto
Petaluma
Piedmont
Pinole
Pittsburg
Pleasant Hill
Pleasanton
Portola Valley
Redwood City
Richmond
Rio Vista
Rohnert Park
Ross
St. Helena
San Anselmo
San Bruno
San Carlos
San Francisco
San Jose
San Leandro
San Mateo
San Pablo
San Rafael
San Ramon
Santa Clara
Santa Rosa
Saratoga
Sausalito
Sebastopol
Sonoma
South San Francisco
Suisun City
Sunnyvale
Tiburon
Union City
Vacaville
Vallejo
Walnut Creek
Windsor
Woodside
Y

In [11]:
BayArea_venues.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Albany,37.88687,-122.297747,Sam's Log Cabin,37.888589,-122.298258,Breakfast Spot
1,Albany,37.88687,-122.297747,Potala Organic Cafe,37.885131,-122.297013,Vegetarian / Vegan Restaurant
2,Albany,37.88687,-122.297747,Patisserie Rotha,37.884811,-122.296931,Bakery
3,Albany,37.88687,-122.297747,Sprouts Farmers Market,37.885157,-122.297564,Grocery Store
4,Albany,37.88687,-122.297747,Hal's Office,37.890522,-122.295885,Café


Transform venues dataframe to one-hot encoding and group by city with mean frequency of each venue category

In [12]:
# one hot encoding
bayarea_onehot = pd.get_dummies(BayArea_venues[['Venue Category']], prefix="", prefix_sep="")

# insert city column
bayarea_onehot['City'] = BayArea_venues['City'] 

# move city column to the first column
city_column = bayarea_onehot.pop('City')
bayarea_onehot.insert(0, 'City', city_column)

# group by city using mean frequency of venue categories
bayarea_grouped = bayarea_onehot.groupby('City').mean().reset_index()
print(bayarea_grouped.shape)
bayarea_grouped.head()

(94, 308)


Unnamed: 0,City,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Arcade,Art Gallery,...,Vietnamese Restaurant,Vineyard,Waterfront,Weight Loss Center,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Albany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017857,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0
1,American Canyon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Antioch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Atherton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Belmont,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Define function to find and sort top venues by city

In [13]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create dataframe with top 5 venues for each city

In [14]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
bayarea_venues_sorted = pd.DataFrame(columns=columns)
bayarea_venues_sorted['City'] = bayarea_grouped['City']

for ind in np.arange(bayarea_grouped.shape[0]):
    bayarea_venues_sorted.iloc[ind, 1:] = return_most_common_venues(bayarea_grouped.iloc[ind, :], num_top_venues)

print(bayarea_venues_sorted.shape)
bayarea_venues_sorted.head()

(94, 6)


Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Albany,Pizza Place,Thai Restaurant,Coffee Shop,Burger Joint,Sushi Restaurant
1,American Canyon,Winery,Yoga Studio,Fish & Chips Shop,Eye Doctor,Falafel Restaurant
2,Antioch,Fast Food Restaurant,Gym,Coffee Shop,Flower Shop,Mexican Restaurant
3,Atherton,Business Service,Spa,Mexican Restaurant,Food & Drink Shop,Baseball Field
4,Belmont,Pet Store,Sushi Restaurant,Coffee Shop,Grocery Store,Sandwich Place


Find cities with restaurants as most common venues

In [15]:
restaurant_cities = pd.DataFrame(bayarea_venues_sorted['1st Most Common Venue'].str.contains('Restaurant'))
restaurant_cities = restaurant_cities.join(bayarea_venues_sorted['2nd Most Common Venue'].str.contains('Restaurant'))
restaurant_cities = restaurant_cities.join(bayarea_venues_sorted['3rd Most Common Venue'].str.contains('Restaurant'))
restaurant_cities = restaurant_cities.join(bayarea_venues_sorted['4th Most Common Venue'].str.contains('Restaurant'))
restaurant_cities = restaurant_cities.join(bayarea_venues_sorted['5th Most Common Venue'].str.contains('Restaurant'))
restaurant_cities.head()

top5mostcommon = bayarea_venues_sorted[restaurant_cities.sum(axis=1)==5]
top4mostcommon = bayarea_venues_sorted[restaurant_cities.sum(axis=1)>=4]
top3mostcommon = bayarea_venues_sorted[restaurant_cities.sum(axis=1)>=3]
top2mostcommon = bayarea_venues_sorted[restaurant_cities.sum(axis=1)>=2]
top1mostcommon = bayarea_venues_sorted[restaurant_cities.sum(axis=1)>=1]

print(top5mostcommon.shape)
print(top4mostcommon.shape)
print(top3mostcommon.shape)
print(top2mostcommon.shape)
print(top1mostcommon.shape)

(0, 6)
(0, 6)
(22, 6)
(49, 6)
(79, 6)


Get cities with restaurants as 3 of top 5 venues

In [16]:
many_restaurants = top3mostcommon
many_restaurants

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Benicia,Mexican Restaurant,Café,Wine Bar,American Restaurant,Italian Restaurant
9,Brisbane,Mexican Restaurant,Deli / Bodega,Vietnamese Restaurant,Indian Restaurant,Paper / Office Supplies Store
27,Fairfield,Chinese Restaurant,Indian Restaurant,Bank,Thai Restaurant,Beer Bar
32,Hayward,Fast Food Restaurant,Bar,Vietnamese Restaurant,Pizza Place,Mexican Restaurant
36,Larkspur,American Restaurant,Pizza Place,Italian Restaurant,Indian Restaurant,Burrito Place
38,Los Altos,Pizza Place,Italian Restaurant,Mexican Restaurant,American Restaurant,Bakery
43,Mill Valley,Pizza Place,Italian Restaurant,American Restaurant,Coffee Shop,Indian Restaurant
44,Millbrae,Chinese Restaurant,Vietnamese Restaurant,Coffee Shop,Pharmacy,Seafood Restaurant
45,Milpitas,Indian Restaurant,Vietnamese Restaurant,Korean Restaurant,Sandwich Place,Café
48,Morgan Hill,Italian Restaurant,Brewery,Mexican Restaurant,American Restaurant,Burger Joint


Get California crime data

In [17]:
url = 'https://en.wikipedia.org/wiki/California_locations_by_crime_rate'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')

Find table with crime data by city

In [18]:
tables=soup.find_all('table', class_='wikitable sortable')
#print(tables[1].prettify())

Extract data into lists

In [19]:
Name=[]
ViolentCrime=[]
PropertyCrime=[]

for row in tables[1].findAll('tr')[1:]:    
    cells=row.findAll('td')
    if len(cells)==8:
        Name.append(cells[0].find(text=True))
        ViolentCrime.append(cells[5].find(text=True))
        PropertyCrime.append(cells[7].find(text=True))

Store lists in dataframe and merge with Bay Area cities dataframe to get only crime rates for Bay Area cities

In [20]:
CAAreaCrime=pd.DataFrame(Name,columns=['City'])
CAAreaCrime['Violent Crime per thousand']=ViolentCrime
CAAreaCrime['Property Crime per thousand']=PropertyCrime

BayAreaCrime=pd.merge(BayAreaCities, CAAreaCrime, how='inner')
print (BayAreaCrime.shape)
BayAreaCrime.head()

(93, 8)


Unnamed: 0,City,County,Population,Land Area,Latitude,Longitude,Violent Crime per thousand,Property Crime per thousand
0,Alameda,Alameda,73812,10.61,37.609,-121.899,1.88,22.36
1,Albany,Alameda,18539,1.79,37.8869,-122.298,1.6,24.7
2,American Canyon,Napa,19454,4.84,38.2235,-122.227,2.7,27.87
3,Antioch,Contra Costa,102372,28.35,38.0049,-121.806,7.84,38.72
4,Atherton,San Mateo,6914,5.02,37.4613,-122.198,0.42,10.39


Convert crime rates from string to float and drop county and coordinates columns

In [21]:
#print(type(BayAreaCrime.loc[0,'Violent Crime per thousand']))

for i in range(0,BayAreaCrime.shape[0]):
    BayAreaCrime.loc[i,'Violent Crime per thousand'] = float(BayAreaCrime.loc[i,'Violent Crime per thousand'].replace(',',' '))
    BayAreaCrime.loc[i,'Property Crime per thousand'] = float(BayAreaCrime.loc[i,'Property Crime per thousand'].replace(',',''))
    
BayAreaCrime.drop(columns=['County', 'Population', 'Land Area', 'Latitude', 'Longitude'], inplace=True)

BayAreaCrime.head()

Unnamed: 0,City,Violent Crime per thousand,Property Crime per thousand
0,Alameda,1.88,22.36
1,Albany,1.6,24.7
2,American Canyon,2.7,27.87
3,Antioch,7.84,38.72
4,Atherton,0.42,10.39


Find cities with low violent crime

In [22]:
low_violent_crime = BayAreaCrime.sort_values(by=['Violent Crime per thousand','Property Crime per thousand'])
low_violent_crime.head(10)

Unnamed: 0,City,Violent Crime per thousand,Property Crime per thousand
45,Monte Sereno,0.0,8.54
35,Hillsborough,0.09,9.05
86,Tiburon,0.11,11.59
54,Orinda,0.21,9.89
38,Los Altos,0.23,10.64
39,Los Altos Hills,0.24,8.68
76,San Ramon,0.31,9.97
14,Clayton,0.34,9.53
21,Danville,0.39,10.05
4,Atherton,0.42,10.39


Find cities with low property crime

In [23]:
low_property_crime = BayAreaCrime.sort_values(by=['Property Crime per thousand','Violent Crime per thousand'])
low_property_crime.head(10)

Unnamed: 0,City,Violent Crime per thousand,Property Crime per thousand
67,Ross,1.62,8.48
45,Monte Sereno,0.0,8.54
39,Los Altos Hills,0.24,8.68
46,Moraga,0.47,8.85
35,Hillsborough,0.09,9.05
79,Saratoga,0.61,9.25
91,Windsor,3.14,9.29
68,St. Helena,1.0,9.37
18,Cotati,4.45,9.43
14,Clayton,0.34,9.53


Find cities on both top 10 low violent crime and low property crime lists

In [24]:
topX = 10
city = []

for i in range(0,topX):
    if (low_property_crime.iloc[0:topX,0] == low_violent_crime.iloc[i,0]).any():
        city.append(low_violent_crime.iloc[i,0])
        
print(city)

['Monte Sereno', 'Hillsborough', 'Los Altos Hills', 'Clayton']


Merge crime rate dataframe with dataframe of cities with many restaurants, and sort by low violent and property crime

In [25]:
restaurant_cities_with_crime_rates = pd.merge(many_restaurants, BayAreaCrime, how='inner')
violent_crime_sort = restaurant_cities_with_crime_rates.sort_values(by=['Violent Crime per thousand','Property Crime per thousand'])
many_restaurants_low_violent_crime = violent_crime_sort.head(10)['City'].values

property_crime_sort = restaurant_cities_with_crime_rates.sort_values(by=['Property Crime per thousand','Violent Crime per thousand'])
many_restaurants_low_property_crime = property_crime_sort.head(10)['City'].values

print(many_restaurants_low_violent_crime)
print(many_restaurants_low_property_crime)

good_city_list_1 = []

for city in many_restaurants_low_violent_crime:
    if city in many_restaurants_low_property_crime:
        good_city_list_1.append(city)

good_city_list_1

['Tiburon' 'Orinda' 'Los Altos' 'Mill Valley' 'Pleasanton' 'Benicia'
 'Morgan Hill' 'Milpitas' 'Pleasant Hill' 'South San Francisco']
['Orinda' 'Los Altos' 'Tiburon' 'Mill Valley' 'Morgan Hill' 'Napa'
 'Pleasanton' 'Benicia' 'South San Francisco' 'Newark']


['Tiburon',
 'Orinda',
 'Los Altos',
 'Mill Valley',
 'Pleasanton',
 'Benicia',
 'Morgan Hill',
 'South San Francisco']

Merge dataframe of cities with many restaurants with crime rate dataframe, and sort by low violent and property crime

In [26]:
low_violent_crime_cities_with_restaurants = pd.merge(low_violent_crime.head(10), many_restaurants, how='inner')
violent_crime_sorted = low_violent_crime_cities_with_restaurants.sort_values(by=['Violent Crime per thousand','Property Crime per thousand'])
many_restaurants_low_violent_crime = violent_crime_sorted['City'].values

low_property_crime_cities_with_restaurants = pd.merge(low_property_crime.head(10), many_restaurants, how='inner')
property_crime_sorted = low_property_crime_cities_with_restaurants.sort_values(by=['Property Crime per thousand','Violent Crime per thousand'])
many_restaurants_low_property_crime = property_crime_sorted['City'].values

print(many_restaurants_low_violent_crime)
print(many_restaurants_low_property_crime)

good_city_list_2 = many_restaurants_low_violent_crime
good_city_list_2

['Tiburon' 'Orinda' 'Los Altos']
[]


array(['Tiburon', 'Orinda', 'Los Altos'], dtype=object)

Normalize data as preparation for clustering

In [27]:
BayAreaCrime_for_clustering = BayAreaCrime.copy()
X = BayAreaCrime_for_clustering.values[:,1:]
normalized = StandardScaler().fit_transform(X)
BayAreaCrime_for_clustering.values[:,1:] = normalized
#BayAreaCrime_for_clustering.head()

bayarea_grouped_for_clustering = bayarea_grouped.copy()
X = bayarea_grouped_for_clustering.values[:,1:]
normalized = StandardScaler().fit_transform(X)
bayarea_grouped_for_clustering.values[:,1:] = normalized
#bayarea_grouped_for_clustering.head()



Add crime data to grouped dataframe for clustering

In [28]:
combined_grouped_df = pd.merge(bayarea_grouped_for_clustering, BayAreaCrime_for_clustering, how='inner')
combined_grouped_df.head()

Unnamed: 0,City,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Arcade,Art Gallery,...,Waterfront,Weight Loss Center,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Violent Crime per thousand,Property Crime per thousand
0,Albany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,-0.382588,-0.00511835
1,American Canyon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.045345,0.130456
2,Antioch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.04496,0.594487
3,Atherton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.841644,-0.617127
4,Belmont,0.017241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.483736,-0.471288


Run k-means to group cities into kclusters clusters

In [29]:
# set number of clusters
kclusters = 6

ba_clustering = combined_grouped_df.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ba_clustering)

# check cluster labels generated for each row in the dataframe
len(kmeans.labels_)

86

Store clustering results in new dataframe

In [30]:
# create new combined dataframe
combined_df = pd.merge(bayarea_venues_sorted, BayAreaCrime, how='inner')

# add clustering labels
combined_df.insert(0, 'Cluster Label', kmeans.labels_)

ba_merged = BayAreaCities

# merge to add latitude/longitude for each neighborhood
ba_merged = ba_merged.join(combined_df.set_index('City'), on='City')

# drop neighborhoods with no venues and therefore no cluster assigned
ba_merged.dropna(subset=['Cluster Label'], inplace=True)

print(ba_merged.shape)
ba_merged.head()

(86, 14)


Unnamed: 0,City,County,Population,Land Area,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
1,Albany,Alameda,18539,1.79,37.8869,-122.298,4.0,Pizza Place,Thai Restaurant,Coffee Shop,Burger Joint,Sushi Restaurant,1.6,24.7
2,American Canyon,Napa,19454,4.84,38.2235,-122.227,4.0,Winery,Yoga Studio,Fish & Chips Shop,Eye Doctor,Falafel Restaurant,2.7,27.87
3,Antioch,Contra Costa,102372,28.35,38.0049,-121.806,0.0,Fast Food Restaurant,Gym,Coffee Shop,Flower Shop,Mexican Restaurant,7.84,38.72
4,Atherton,San Mateo,6914,5.02,37.4613,-122.198,1.0,Business Service,Spa,Mexican Restaurant,Food & Drink Shop,Baseball Field,0.42,10.39
5,Belmont,San Mateo,25835,4.62,37.5202,-122.276,1.0,Pet Store,Sushi Restaurant,Coffee Shop,Grocery Store,Sandwich Place,1.34,13.8


Visualize the resulting clusters

In [33]:

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=9)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ba_merged['Latitude'], ba_merged['Longitude'], ba_merged['City'], ba_merged['Cluster Label'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


Examine each cluster

In [34]:
cluster0 = ba_merged.loc[ba_merged['Cluster Label'] == 0, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster0.shape)
print(cluster0['Violent Crime per thousand'].min(), cluster0['Violent Crime per thousand'].max())
print(cluster0['Property Crime per thousand'].min(), cluster0['Property Crime per thousand'].max())
cluster0

(5, 9)
7.77 8.65
38.72 53.03


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
3,Antioch,0.0,Fast Food Restaurant,Gym,Coffee Shop,Flower Shop,Mexican Restaurant,7.84,38.72
69,Richmond,0.0,Convenience Store,Food Truck,Art Gallery,Grocery Store,Food,7.77,39.47
77,San Francisco,0.0,Coffee Shop,Hotel,Café,Cocktail Bar,Wine Bar,7.95,53.03
81,San Pablo,0.0,Pizza Place,Chinese Restaurant,Supermarket,Mexican Restaurant,Pharmacy,8.08,38.95
96,Vallejo,0.0,Chinese Restaurant,Yoga Studio,Park,Breakfast Spot,Music Venue,8.65,40.81


Cluster 0 ...

In [35]:
cluster1 = ba_merged.loc[ba_merged['Cluster Label'] == 1, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster1.shape)
print(cluster1['Violent Crime per thousand'].min(), cluster1['Violent Crime per thousand'].max())
print(cluster1['Property Crime per thousand'].min(), cluster1['Property Crime per thousand'].max())
cluster1

(34, 9)
0.0 1.62
8.48 20.77


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
4,Atherton,1.0,Business Service,Spa,Mexican Restaurant,Food & Drink Shop,Baseball Field,0.42,10.39
5,Belmont,1.0,Pet Store,Sushi Restaurant,Coffee Shop,Grocery Store,Sandwich Place,1.34,13.8
6,Belvedere,1.0,Bakery,Deli / Bodega,Bay,Harbor / Marina,Chinese Restaurant,0.47,16.86
7,Benicia,1.0,Mexican Restaurant,Café,Wine Bar,American Restaurant,Italian Restaurant,0.94,17.43
12,Calistoga,1.0,Hotel,Bed & Breakfast,Wine Bar,American Restaurant,Bakery,0.57,13.83
14,Clayton,1.0,Sandwich Place,Gym,Steakhouse,Liquor Store,Bar,0.34,9.53
20,Cupertino,1.0,Chinese Restaurant,Coffee Shop,Hotel,Furniture / Home Store,Bank,0.66,16.94
22,Danville,1.0,Pizza Place,Sandwich Place,American Restaurant,Coffee Shop,Juice Bar,0.39,10.05
24,Dublin,1.0,Furniture / Home Store,Korean Restaurant,Thrift / Vintage Store,Men's Store,American Restaurant,1.3,14.83
30,Foster City,1.0,Fast Food Restaurant,Food Truck,Lake,Coffee Shop,Asian Restaurant,0.43,11.15


Cluster 1 ...

In [36]:
cluster2 = ba_merged.loc[ba_merged['Cluster Label'] == 2, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster2.shape)
print(cluster2['Violent Crime per thousand'].min(), cluster2['Violent Crime per thousand'].max())
print(cluster2['Property Crime per thousand'].min(), cluster2['Property Crime per thousand'].max())
cluster2

(2, 9)
7.98 10.66
146.1 180.31


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
16,Colma,2.0,Flower Shop,Electronics Store,Hardware Store,Automotive Shop,Rental Car Location,7.98,180.31
27,Emeryville,2.0,Pet Store,Mobile Phone Shop,Bakery,Furniture / Home Store,Cupcake Shop,10.66,146.1


Cluster 2 ...

In [37]:
cluster3 = ba_merged.loc[ba_merged['Cluster Label'] == 3, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster3.shape)
print(cluster3['Violent Crime per thousand'].min(), cluster3['Violent Crime per thousand'].max())
print(cluster3['Property Crime per thousand'].min(), cluster3['Property Crime per thousand'].max())
cluster3

(1, 9)
16.85 16.85
59.43 59.43


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
56,Oakland,3.0,Bar,Chinese Restaurant,Japanese Restaurant,Sandwich Place,Vietnamese Restaurant,16.85,59.43


Cluster 3 ...

In [38]:
cluster4 = ba_merged.loc[ba_merged['Cluster Label'] == 4, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster4.shape)
print(cluster4['Violent Crime per thousand'].min(), cluster4['Violent Crime per thousand'].max())
print(cluster4['Property Crime per thousand'].min(), cluster4['Property Crime per thousand'].max())
cluster4

(30, 9)
1.56 4.86
9.29 30.2


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
1,Albany,4.0,Pizza Place,Thai Restaurant,Coffee Shop,Burger Joint,Sushi Restaurant,1.6,24.7
2,American Canyon,4.0,Winery,Yoga Studio,Fish & Chips Shop,Eye Doctor,Falafel Restaurant,2.7,27.87
9,Brentwood,4.0,Pizza Place,Mexican Restaurant,American Restaurant,Bar,Sandwich Place,1.83,22.39
11,Burlingame,4.0,Japanese Restaurant,Italian Restaurant,Sandwich Place,Coffee Shop,Breakfast Spot,1.56,24.87
15,Cloverdale,4.0,Airport,Recreation Center,Skydiving Drop Zone,Yoga Studio,Financial or Legal Service,1.71,19.51
19,Cotati,4.0,Pizza Place,Music Store,Park,Bar,Karaoke Bar,4.45,9.43
21,Daly City,4.0,Sandwich Place,Fast Food Restaurant,Mexican Restaurant,Pizza Place,Gym / Fitness Center,1.84,15.95
23,Dixon,4.0,Mexican Restaurant,Sushi Restaurant,Bistro,Bakery,Auto Workshop,2.77,22.4
25,East Palo Alto,4.0,Mexican Restaurant,Bagel Shop,Gym / Fitness Center,Grocery Store,Market,4.22,19.54
28,Fairfax,4.0,Coffee Shop,Indian Restaurant,Italian Restaurant,Bar,Park,2.09,13.6


Cluster 4 ...

In [39]:
cluster5 = ba_merged.loc[ba_merged['Cluster Label'] == 5, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster5.shape)
print(cluster5['Violent Crime per thousand'].min(), cluster5['Violent Crime per thousand'].max())
print(cluster5['Property Crime per thousand'].min(), cluster5['Property Crime per thousand'].max())
cluster5

(14, 9)
1.1 4.71
28.06 50.58


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand
8,Berkeley,5.0,Sushi Restaurant,Theater,Brewery,Music Venue,Electronics Store,3.66,43.33
10,Brisbane,5.0,Mexican Restaurant,Deli / Bodega,Vietnamese Restaurant,Indian Restaurant,Paper / Office Supplies Store,2.45,29.67
13,Campbell,5.0,Yoga Studio,Mexican Restaurant,Sandwich Place,Italian Restaurant,Cosmetics Shop,1.98,33.96
17,Concord,5.0,Mexican Restaurant,Indian Restaurant,Café,Coffee Shop,Food Truck,3.67,41.0
29,Fairfield,5.0,Chinese Restaurant,Indian Restaurant,Bank,Thai Restaurant,Beer Bar,4.71,35.11
32,Gilroy,5.0,American Restaurant,Tapas Restaurant,Theater,Shipping Store,Train Station,3.76,28.35
34,Hayward,5.0,Fast Food Restaurant,Bar,Vietnamese Restaurant,Pizza Place,Mexican Restaurant,3.95,31.78
63,Pinole,5.0,Liquor Store,Spa,Sporting Goods Shop,Comic Shop,Chinese Restaurant,3.63,33.2
64,Pittsburg,5.0,Mexican Restaurant,Fast Food Restaurant,Park,Supermarket,Fried Chicken Joint,2.59,34.99
65,Pleasant Hill,5.0,Sushi Restaurant,Burger Joint,American Restaurant,Pizza Place,Chinese Restaurant,1.75,50.58


Cluster 5 ...

In [40]:
cluster6 = ba_merged.loc[ba_merged['Cluster Label'] == 6, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster6.shape)
print(cluster6['Violent Crime per thousand'].min(), cluster6['Violent Crime per thousand'].max())
print(cluster6['Property Crime per thousand'].min(), cluster6['Property Crime per thousand'].max())
cluster6

(0, 9)
nan nan
nan nan


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand


Cluster 6 ...

In [41]:
cluster7 = ba_merged.loc[ba_merged['Cluster Label'] == 7, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster7.shape)
print(cluster7['Violent Crime per thousand'].min(), cluster7['Violent Crime per thousand'].max())
print(cluster7['Property Crime per thousand'].min(), cluster7['Property Crime per thousand'].max())
cluster7

(0, 9)
nan nan
nan nan


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand


Cluster 7 ...

In [42]:
cluster8 = ba_merged.loc[ba_merged['Cluster Label'] == 8, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster8.shape)
print(cluster8['Violent Crime per thousand'].min(), cluster8['Violent Crime per thousand'].max())
print(cluster8['Property Crime per thousand'].min(), cluster8['Property Crime per thousand'].max())
cluster8

(0, 9)
nan nan
nan nan


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand


Cluster 8 ...

In [43]:
cluster9 = ba_merged.loc[ba_merged['Cluster Label'] == 9, ba_merged.columns[[0] + list(range(6, ba_merged.shape[1]))]]
print(cluster9.shape)
print(cluster9['Violent Crime per thousand'].min(), cluster9['Violent Crime per thousand'].max())
print(cluster9['Property Crime per thousand'].min(), cluster9['Property Crime per thousand'].max())
cluster9

(0, 9)
nan nan
nan nan


Unnamed: 0,City,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Violent Crime per thousand,Property Crime per thousand


Cluster 9 ...