# Washington D.C. Battle of the Neighborhoods

## Part 1: Getting the Data as a DataFrame

### Zip codes in Washington D.C.
We first obtain a list of all zip codes in Washington D.C.

In [1]:
import pandas as pd
# Whitespace-separated zip codes of Washington D.C., kept as one literal so the
# list can be pasted verbatim from its source.
zips="""20515 20222 20060 20224 20226 20064 20229 20071 20530 20533 20532 20250 20091 20543 20542 20546 20262
20268 20549 20303 20317 20319 20374 20373 20593 20390 20402 20401 20001 20003 20002 20005 20004 20007 20006
20009 20008 20011 20010 20422 20012 20016 20015 20018 20017 20020 20019 20433 20431 20024 20032 20030 20037
20036 20202 20203 20044 20052 20057 20059 20220 20510"""
# Parse every token to an integer; `zips` ends up as a plain list of ints.
zips = list(map(int, zips.split()))
# Single-column frame named 'ZIP', one row per zip code, in the order listed above.
dc_zip_df = pd.DataFrame({'ZIP': zips})
dc_zip_df.head()

Unnamed: 0,ZIP
0,20515
1,20222
2,20060
3,20224
4,20226


### GPS Coordinates of all zip codes in the USA
We obtain the GPS coordinates of all zip codes in the USA

In [2]:
# Load the zip-code coordinate table from the local file 'zip.csv'.
# The preview printed below shows three columns: ZIP, LAT and LNG.
zip_df=pd.read_csv('zip.csv')
zip_df.head()

Unnamed: 0,ZIP,LAT,LNG
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.455183,-67.119887
3,606,18.158345,-66.932911
4,610,18.295366,-67.125135


### Merging the two DataFrames

In [3]:
# Keep only the D.C. zip codes that also appear in the coordinate table.
# No join key is given, so pandas joins on the columns the two frames share.
dc_df = dc_zip_df.merge(zip_df, how='inner')
dc_df.head()

Unnamed: 0,ZIP,LAT,LNG
0,20064,38.936354,-76.999167
1,20317,38.934841,-77.014387
2,20319,38.864838,-77.017003
3,20373,38.858625,-77.007865
4,20593,38.866713,-77.010187


### Using Foursquare
We use Foursquare to obtain data on the different venues around each zip code.

In [4]:
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import os
import warnings

# Foursquare credentials are read from the environment instead of being
# committed in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. Any key that was ever
# committed to the notebook should be considered leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    # Warn instead of raising so the rest of the configuration is still defined;
    # the Foursquare requests will be rejected until both values are provided.
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare API requests will fail until they are provided."
    )
VERSION = '20190605' # Foursquare API version

# Placeholders, in order: client id, client secret, API version,
# latitude, longitude, search radius, maximum number of results.
url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
LIMIT = 100   # value substituted into the `limit` query parameter
radius = 500  # value substituted into the `radius` query parameter

In [5]:
import requests

# Query the Foursquare "explore" endpoint once per zip code and flatten the
# results into rows of
#   [zip, zip latitude, zip longitude, venue name, venue latitude,
#    venue longitude, first venue category name].
v_list = []
# Iterate by column name rather than mixing index labels with positional
# `iloc`, so the loop does not depend on dc_df having a 0..n-1 index.
for zip_code, lat, lon in dc_df[['ZIP', 'LAT', 'LNG']].itertuples(index=False, name=None):
    url = url_template.format(CLIENT_ID,
                              CLIENT_SECRET,
                              VERSION,
                              lat,
                              lon,
                              radius,
                              LIMIT)
    # A timeout keeps one stalled connection from hanging the whole notebook;
    # raise_for_status surfaces HTTP errors (bad credentials, quota) directly
    # instead of as a confusing KeyError while parsing the payload.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    groups = response.json()['response'].get('groups') or []
    results = groups[0]['items'] if groups else []
    for v in results:
        venue = v['venue']
        categories = venue.get('categories') or []
        if not categories:
            # The analysis below is driven entirely by the venue category, so
            # a venue without one cannot contribute; skip it rather than crash.
            continue
        v_list.append([zip_code, lat, lon,
                       venue['name'],
                       venue['location']['lat'],
                       venue['location']['lng'],
                       categories[0]['name']])
v_list[:2]


[[20064,
  38.936353999999994,
  -76.999167,
  '&pizza',
  38.93258208733208,
  -76.99669629335403,
  'Pizza Place'],
 [20064,
  38.936353999999994,
  -76.999167,
  'Chick-fil-A',
  38.935476,
  -76.998198,
  'Food Service']]

In [6]:
# Build the venue table, one row per venue collected in `v_list`.
# The column names are passed to the constructor so that an empty `v_list`
# still yields an (empty) frame with the expected columns instead of raising
# a length-mismatch error on a later `.columns` assignment.
# Note: the 'Neighborhood' columns hold the zip code and its coordinates.
venues_df = pd.DataFrame(
    v_list,
    columns=['Neighborhood',
             'Neighborhood Latitude',
             'Neighborhood Longitude',
             'Venue',
             'Venue Latitude',
             'Venue Longitude',
             'Venue Category'],
)
venues_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,20064,38.936354,-76.999167,&pizza,38.932582,-76.996696,Pizza Place
1,20064,38.936354,-76.999167,Chick-fil-A,38.935476,-76.998198,Food Service
2,20064,38.936354,-76.999167,Busboys and Poets,38.932117,-76.99764,American Restaurant
3,20064,38.936354,-76.999167,Starbucks Reserve,38.932484,-76.997172,Coffee Shop
4,20064,38.936354,-76.999167,BGR Burgers Grilled Right,38.932647,-76.99674,Burger Joint


In [7]:
# Persist the venue table as CSV. Note that the target file name 'venues_df'
# carries no '.csv' extension.
venues_df.to_csv('venues_df')

## Part 2: Exploratory Data Analysis

Let us first see which types of venues are the most common in Washington D.C.

In [8]:
# One indicator column per venue category, named exactly after the category
# (empty prefix and separator).
venues_onehot1 = pd.get_dummies(venues_df['Venue Category'], prefix="", prefix_sep="")
# Put the zip-code key in front of the indicator columns.
venues_onehot = pd.concat([venues_df['Neighborhood'], venues_onehot1], axis=1)
# If a category is itself called 'Neighborhood' the label appears twice;
# keep only the first occurrence of every column label (the key column).
is_first_occurrence = ~venues_onehot.columns.duplicated()
df1 = venues_onehot.loc[:, is_first_occurrence]
# Per-zip mean of each indicator = share of that zip's venues in the category.
category_share_by_zip = df1.groupby('Neighborhood').mean()
venues_grouped = category_share_by_zip.reset_index()
venues_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,...,Volleyball Court,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio
0,20001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,...,0.0,0.0,0.0,0.0,0.030303,0.030303,0.0,0.0,0.0,0.0
1,20002,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20003,0.018182,0.0,0.0,0.036364,0.0,0.0,0.0,0.018182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0
3,20004,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
4,20005,0.04,0.01,0.01,0.01,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Sum each category's per-zip share over all zip codes and keep the 20 largest.
# The key column is dropped by name instead of by position so the result does
# not depend on 'Neighborhood' being the first column.
category_totals = venues_grouped.drop(columns='Neighborhood').sum()
df_most_common = (
    category_totals
    .sort_values(ascending=False)
    .head(20)
    .to_frame('Frequency')
)
# The original wrote to "df_most_common.cvs"; the extension was a typo for csv.
df_most_common.to_csv("df_most_common.csv")
df_most_common

Unnamed: 0,Frequency
Park,1.332066
Coffee Shop,1.071133
Sandwich Place,0.972823
American Restaurant,0.844335
Convenience Store,0.841558
Harbor / Marina,0.752828
Hotel,0.734662
Pizza Place,0.72681
Gym,0.680204
Boat or Ferry,0.622205


We now figure out which venues are the most popular in each neighborhood.

In [10]:
# We create the columns
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']
 
new_columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        new_columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        new_columns.append('{}th Most Common Venue'.format(ind+1))
        
neighborhoods_venues_sorted = pd.DataFrame(columns=new_columns)
neighborhoods_venues_sorted['Neighborhood'] = venues_grouped['Neighborhood']


neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,20001,,,,,,,,,,
1,20002,,,,,,,,,,
2,20003,,,,,,,,,,
3,20004,,,,,,,,,,
4,20005,,,,,,,,,,


In [11]:
# For every zip code, rank its categories by share (highest first) and store
# the names of the top `num_top_venues` in the rank columns of the same row.
for i in venues_grouped.index:
    ranked_categories = venues_grouped.iloc[i, 1:].sort_values(ascending=False)
    top_names = ranked_categories.index[:num_top_venues]
    neighborhoods_venues_sorted.iloc[i, 1:] = top_names
neighborhoods_venues_sorted.to_csv("neighborhoods_venues_sorted.csv")
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,20001,Thai Restaurant,Liquor Store,BBQ Joint,Grocery Store,Bookstore,Market,Building,Spanish Restaurant,Middle Eastern Restaurant,Gas Station
1,20002,American Restaurant,Bar,Gym,Moving Target,New American Restaurant,Park,Pharmacy,Diner,Convenience Store,Sandwich Place
2,20003,Pizza Place,Bar,Coffee Shop,Art Gallery,Gym / Fitness Center,Bakery,Sandwich Place,Spa,Mobile Phone Shop,Pet Store
3,20004,Hotel,Science Museum,History Museum,Exhibit,American Restaurant,Food Truck,Museum,Coffee Shop,Bakery,Sandwich Place
4,20005,Hotel,Hotel Bar,Coffee Shop,American Restaurant,Salon / Barbershop,Latin American Restaurant,Sandwich Place,Sushi Restaurant,New American Restaurant,Deli / Bodega


## Part 3: Clustering

In [12]:
# Obtain the clusters
kclusters = 4
venues_grouped_clustering = venues_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_grouped_clustering)
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 0, 0, 2, 0, 0, 0], dtype=int32)

In [13]:
# add clustering labels
cluster_venues_sorted=neighborhoods_venues_sorted.copy()
cluster_venues_sorted.insert(1, 'Cluster Labels', kmeans.labels_)
cluster_venues_sorted.head()

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,20001,0,Thai Restaurant,Liquor Store,BBQ Joint,Grocery Store,Bookstore,Market,Building,Spanish Restaurant,Middle Eastern Restaurant,Gas Station
1,20002,2,American Restaurant,Bar,Gym,Moving Target,New American Restaurant,Park,Pharmacy,Diner,Convenience Store,Sandwich Place
2,20003,0,Pizza Place,Bar,Coffee Shop,Art Gallery,Gym / Fitness Center,Bakery,Sandwich Place,Spa,Mobile Phone Shop,Pet Store
3,20004,0,Hotel,Science Museum,History Museum,Exhibit,American Restaurant,Food Truck,Museum,Coffee Shop,Bakery,Sandwich Place
4,20005,0,Hotel,Hotel Bar,Coffee Shop,American Restaurant,Salon / Barbershop,Latin American Restaurant,Sandwich Place,Sushi Restaurant,New American Restaurant,Deli / Bodega


In [14]:
# give label to all zip codes
vg=dc_df.copy()
vg.insert(1,'Cluster Labels', kmeans.labels_)
vg.head()

Unnamed: 0,ZIP,Cluster Labels,LAT,LNG
0,20064,0,38.936354,-76.999167
1,20317,2,38.934841,-77.014387
2,20319,0,38.864838,-77.017003
3,20373,0,38.858625,-77.007865
4,20593,0,38.866713,-77.010187


## Map

In [15]:
# create map
map_clusters = folium.Map(location=[38.92,-77.014387], zoom_start=11.5)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
# (the unused helper variables of the earlier version have been removed).
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(vg['LAT'], vg['LNG'],
                                  vg['ZIP'], vg['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index -1 wraps to the last color for cluster 0; the cluster-to-color
    # mapping is still one-to-one and matches the previously rendered map.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters.save('map.html')
map_clusters

We print all the clusters

In [16]:
# Collect the zip codes of every cluster. The buckets are derived from
# `kclusters` (in ascending order) instead of a hard-coded {0, 1, 3, 2} dict,
# so changing the number of clusters cannot cause a KeyError.
my_clusters = {label: [] for label in range(kclusters)}
for label, zip_code in zip(vg['Cluster Labels'], vg['ZIP']):
    my_clusters[label].append(zip_code)
# Clusters are displayed 1-based; the label was previously misspelled "Custer".
for label, members in my_clusters.items():
    print("Cluster No " + str(label + 1) + ": " + ", ".join(str(z) for z in members))

Custer No 1: 20064, 20319, 20373, 20593, 20390, 20003, 20002, 20005, 20004, 20007, 20009, 20008, 20016, 20015, 20018, 20017, 20020, 20019, 20024, 20032, 20052, 20057, 20510
Custer No 2: 20036
Custer No 4: 20006
Custer No 3: 20317, 20001, 20011, 20010, 20012, 20037, 20202


Next, we explore the characteristics of one neighborhood in each cluster to see if anything stands out.

In [17]:
# Ranked venue categories recorded for zip code 20036.
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood'].eq(20036)]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,20036,Mediterranean Restaurant,Café,Coffee Shop,Salad Place,Hotel,Hotel Bar,Sandwich Place,Pizza Place,Yoga Studio,Steakhouse


In [18]:
# Ranked venue categories recorded for zip code 20006.
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood'].eq(20006)]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,20006,Coffee Shop,Sandwich Place,Hotel,American Restaurant,Garden,History Museum,Bakery,Indian Restaurant,Salad Place,Mexican Restaurant


In [19]:
# Ranked venue categories recorded for zip code 20317.
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood'].eq(20317)]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
26,20317,Bus Station,Intersection,Cafeteria,Golf Course,Park,Bus Stop,Filipino Restaurant,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant


In [20]:
# Ranked venue categories recorded for zip code 20319.
neighborhoods_venues_sorted.loc[neighborhoods_venues_sorted['Neighborhood'].eq(20319)]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,20319,Boat or Ferry,Harbor / Marina,Yoga Studio,Farmers Market,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Fountain,Food Truck,Food Service


## Part 4: The final result

Here we build a recommendation system. The user enters the zip code they are moving from and the zip code they are planning to move to. We then recommend the zip code nearest to the target zip code that is also of the same type (in the same cluster) as the original zip code.

In [21]:
yourZip=input("Enter your current zip in Washington D.C.: ")
tagZip=input("Enter your target zip in Washington D.C.: ")

from geopy.distance import distance

# int() raises ValueError on non-numeric input, as before.
current_zip, target_zip = int(yourZip), int(tagZip)

# (miles from the target, zip) for every candidate, nearest first.
dist = []

known_zips = set(vg['ZIP'])
unknown_zips = [z for z in (current_zip, target_zip) if z not in known_zips]

if unknown_zips:
    # Previously an unknown zip surfaced as an opaque IndexError from `.iloc[0]`.
    print("Unknown zip code(s): " + ", ".join(str(z) for z in unknown_zips))
else:
    # Cluster of the zip the user is leaving, coordinates of the zip they target.
    a_cluster = vg.loc[vg['ZIP'] == current_zip, 'Cluster Labels'].iloc[0]
    target_row = vg.loc[vg['ZIP'] == target_zip].iloc[0]
    a_lat, a_lon = target_row['LAT'], target_row['LNG']

    # Candidates share the current zip's cluster. The current and the target
    # zip are excluded explicitly. The earlier `dist = dist[1:]` assumed the
    # closest entry was always the target itself, which only holds when the
    # target belongs to the same cluster; otherwise it discarded the best
    # genuine candidate.
    same_cluster = vg['Cluster Labels'] == a_cluster
    not_endpoint = ~vg['ZIP'].isin([current_zip, target_zip])
    candidates = vg[same_cluster & not_endpoint]

    dist = sorted(
        (distance((a_lat, a_lon), (b_lat, b_lon)).miles, zip_code)
        for zip_code, b_lat, b_lon in zip(candidates['ZIP'], candidates['LAT'], candidates['LNG'])
    )

    if len(dist)==0:
        print("Sorry your zip code is quite unique in its cluster!")
    else:
        print("You may want to consider the zip code "+str(dist[0][1])+' which is '+str(dist[0][0])+' miles away.')
        

Enter your current zip in Washington D.C.: 20006
Enter your target zip in Washington D.C.: 20036
Sorry your zip code is quite unique in its cluster!
