In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
ny_data = pd.read_csv('New-York-data.csv')

In [3]:
ny = ny_data.drop(['Population(In thousands)','Median income(in dollars)'],axis=1)

In [4]:
ny

Unnamed: 0,Neighbourhood
0,Melrose
1,Mott Haven
2,Port Morris
3,Hunts Point
4,Longwood
...,...
120,Dongan Hills
121,Midland Beach
122,New Dorp
123,South Beach


In [6]:
import geocoder

# Function that retrieves the geographical coordinates for a given neighborhood
def get_coordinates(row):
    # initialize variable to None
    lat_lng_coords = None

    # loop until we get the coordinates
    while(lat_lng_coords is None):
       g = geocoder.arcgis(f'{row["Neighbourhood"]},New York ')
       lat_lng_coords = g.latlng
    
    # return pair lat,long
    return pd.Series([lat_lng_coords[0], lat_lng_coords[1]])
# Fill coordinates for each row
ny[['Latitude','Longitude']] = ny.apply(get_coordinates, axis=1)
ny.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Melrose,42.84501,-73.61879
1,Mott Haven,40.80899,-73.92291
2,Port Morris,40.80134,-73.90996
3,Hunts Point,40.8126,-73.88402
4,Longwood,40.81748,-73.89816


In [8]:
import folium
map_ny = folium.Map(location=[42.84501,-73.61879], zoom_start=11)

for lat, lng, label in zip(ny['Latitude'], ny['Longitude'], ny['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

In [9]:
CLIENT_ID = '2GQBW5PR0QFXTOGCHKTRFWJBTGOFOHXW1TRTNRAFURQ5FE1X'
CLIENT_SECRET = '3QH40WMZIIDSQN1RFAVAEQHUIMOQUJPKYPABQVNTSDQJN2YD'
VERSION = 20202808
radius = 500
LIMIT = 100

In [10]:
import requests
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
# Part 1: creating the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
# Part 2: making the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
    
    
    
        
# Part 3 returning only relevant information for each nearby venue and append to the list 
        venues_list.append([(
            name, 
            lat, 
            lng,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    return (venues_list)

In [11]:
ny_venues = getNearbyVenues(names = ny['Neighbourhood'],
                              latitudes = ny['Latitude'],
                            longitudes = ny['Longitude'])

Melrose
Mott Haven
Port Morris
Hunts Point
Longwood
Morrisania
Highbridge
Tremont
Fordham
Morris Heights
University Heights
Bathgate
East Termont
West Farms
Bedford Park
Norwood
Fieldston
Marble Hill
Riverdale
Spuyten Duyvil
Van Cortlandt Village
Castle Hill
Clason Point
Parkchester
Soundview
Co-op City
Locust Point
Pelham Bay
Throggs Neck
Morris Park
Pelham Gardens
Pelham Parkway
Van Nest
Baychester
Williamsbridge
Wakefield
Greenpoint
Williamsburg
Brooklyn Heights
Clinton Hill
Fort Greene
Vinegar Hill
Bedford–Stuyvesant
Bushwick
East New York
Carroll Gardens
Park Slope
Windsor Terrace
Crown Heights
Bensonhurst
Brownsville
East Flatbush
Bergen beach
Canarsie
Mill Basin
Battery Park City
Financial District
Tribeca
Chinatown
Greenwich Village
Alphabet City
Lower East Side
Chelsea
SoHo
Hell's Kitchen
Midtown Manhattan
Kips Bay
Murray Hill
Stuyvesant Town–Peter Cooper Village
Turtle Bay
Upper West Side
Upper East Side
Yorkville
Hamilton Heights
Manhattanville
Morningside Heights
East Harle

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng,
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results if v['categories']])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# Search for venues within 1Km of radius from the neighbourhood centre point
radius = 1000

# Call the above function
ny_venues = getNearbyVenues(names=ny['Neighbourhood'],
                                   latitudes=ny['Latitude'],
                                   longitudes=ny['Longitude'],
                                   radius=radius)

print(ny_venues.shape)
ny_venues

(5407, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Melrose,42.84501,-73.61879,Stewart's Shops,42.804973,-73.666879,Convenience Store
1,Melrose,42.84501,-73.61879,Deepkill Reservoir,42.826098,-73.612660,Lake
2,Melrose,42.84501,-73.61879,Haughney Hill,42.805929,-73.650755,Comedy Club
3,Melrose,42.84501,-73.61879,US Post Office,42.829723,-73.626620,Post Office
4,Melrose,42.84501,-73.61879,United States Postal Service,42.844713,-73.623573,Post Office
...,...,...,...,...,...,...,...
5402,Tottenville,40.51128,-74.25057,Antique Raiders & Traders,40.512563,-74.250004,Antique Shop
5403,Tottenville,40.51128,-74.25057,Broken Records Magazine,40.509340,-74.247456,Art Gallery
5404,Tottenville,40.51128,-74.25057,Masonic Temple Main Street,40.509280,-74.247576,Temple
5405,Tottenville,40.51128,-74.25057,Bedell Pizzo Funeral Home,40.509898,-74.244849,Funeral Home


In [31]:
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
ny_onehot['Neighbourhood'] = ny_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

# group rows by neighbourhood and find frequency of each category
ny_grouped = ny_onehot.groupby('Neighbourhood').mean().reset_index()
print(f'Size: {ny_grouped.shape}')
ny_grouped

Size: (124, 442)


Unnamed: 0,Neighbourhood,ATM,Accessories Store,Acupuncturist,Adult Education Center,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Gate,...,Warehouse,Waste Facility,Watch Shop,Wedding Hall,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Alphabet City,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.020833,0.000000,0.0,0.0,0.020833,0.020833
1,Arverne,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,Astoria,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.020408,...,0.0,0.0,0.0,0.00000,0.020408,0.000000,0.0,0.0,0.000000,0.000000
3,Bathgate,0.0,0.022222,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
4,Battery Park City,0.0,0.000000,0.0,0.0,0.021277,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Williamsburg,0.0,0.000000,0.0,0.0,0.023256,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.046512,0.000000,0.0,0.0,0.000000,0.000000
120,Windsor Terrace,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.020833,0.0,0.0,0.000000,0.000000
121,Woodhaven,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
122,Woodside,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [32]:
df_ny = pd.read_csv('New-York-data.csv')

In [33]:
df_ny_population = pd.read_csv('NY_Population.csv')

In [34]:
df_ny_population.head()

Unnamed: 0,Neighbourhood,Population
0,Melrose,24913
1,Mott Haven,52413
2,Port Morris,3523
3,Hunts Point,12281
4,Longwood,26196


In [35]:
df_ny_income = pd.read_csv('NY_income.csv')

In [36]:
df_ny_income.head()

Unnamed: 0,Neighbourhood,Median income
0,Melrose,24467.0
1,Mott Haven,25325.0
2,Port Morris,38834.0
3,Hunts Point,25678.0
4,Longwood,26300.0


## Neighbourhood segmentation

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = ny_grouped['Neighbourhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alphabet City,Residential Building (Apartment / Condo),Art Gallery,Building,Church,Gym,Bar,Salon / Barbershop,Café,Park,Event Space
1,Arverne,Medical Center,School,Bus Line,Farm,Bus Stop,Housing Development,Building,Non-Profit,Nature Preserve,Gas Station
2,Astoria,Automotive Shop,Hardware Store,Building,Laundry Service,Deli / Bodega,Bus Station,Bus Stop,Church,Medical Center,Sandwich Place
3,Bathgate,Bus Line,School,Building,Non-Profit,Automotive Shop,Laundry Service,Outdoors & Recreation,Office,Bus Station,Art Gallery
4,Battery Park City,Office,Park,Building,Residential Building (Apartment / Condo),Bank,Grocery Store,Coffee Shop,Monument / Landmark,Breakfast Spot,Bike Rental / Bike Share


In [46]:
ny_grouped_clustering = ny_grouped.drop('Neighbourhood', 1)
sum_of_squared_distances = []
K = range(1,10)

for k in K:
    print(k, end=' ')
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=20).fit(ny_grouped_clustering)
    sum_of_squared_distances.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.title('Elbow Method For Optimal k')

1 

  ny_grouped_clustering = ny_grouped.drop('Neighbourhood', 1)


AttributeError: 'NoneType' object has no attribute 'split'

In [43]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=100).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 1, 0, 3, 2, 0, 2, 1, 4])

In [26]:
# add clustering labels
neighbourhoods_venues_sorted.drop(['Cluster Labels'], axis=1, inplace=True)
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ny_merged = ny

# merge ny_grouped with df_madrid to add latitude/longitude for each neighbourhood
ny_merged = ny_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

ny_merged.head()

KeyError: "['Cluster Labels'] not found in axis"

In [None]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[0] + list(range(4, ny_merged.shape[1]))]]

In [None]:
residential_cluster_id = 2
ny_residential = ny[ny['Neighbourhood'].isin(ny_merged.loc[ny_merged['Cluster Labels'] == residential_cluster_id, 'Neighbourhood'])]

# Category ID in Foursquare API
supermarket_category = '52f2ab2ebcbc57f1066b8b46'

# Function to count the number of market-like venues in a given neighbourhood
def getMarketVenues(row):
    
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        row['Latitude'], 
        row['Longitude'],
        supermarket_category,
        radius, 
        LIMIT)

    # make the GET request and get get length of venues
    count = len(requests.get(url).json()["response"]['venues'])

    # return count
    return pd.Series(count)
    
# Apply the above function on each residential neighbourhood
ny_residential['Number of Markets'] =ny_residential.apply(getMarketVenues, axis=1)
ny_residential.head()

In [None]:
ny_residential = ny_residential.merge(df_ny_income, how='left', on='Neighbourhood')
ny
# Left join with population dataframe
ny_residential = ny_residential.merge(df_ny_population, how='left', on='Neighbourhood')
ny_residential['Population'] = ny_residential['Population'].astype(int)

# Calculate how many people per one market live in each residential neighbourhood
ny_residential['People per Market'] = ny_residential.apply(lambda x: (x['Population'] // x['Number of Markets']) if x['Number of Markets'] > 0 else x['Population'], axis=1)

print(ny_residential.shape)
ny_residential.head()

In [None]:
import seaborn as sns

def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
    # Create the scatter plot
    p1 = sns.scatterplot(x, y, data=data, size=8, legend=False)
    # Add text besides each point
    texts = [p1.text(data[x][line], data[y][line], 
             data[text_column][line], horizontalalignment='left', 
             size='small', color='blue') for line in range(0,data.shape[0])]
   
    
    plt.title(title, size=18)
    plt.xlabel(xlabel, size=14)
    plt.ylabel(ylabel, size=14)
    return p1

plt.figure(figsize=(20,10))


avg_people_per_market = ny_residential['People per Market'].median()
max_people_per_market = ny_residential['People per Market'].max()
avg_income = ny_residential['Median income'].median()

plt.axhline(y=avg_income, color='r', linestyle='--')
plt.axvline(x=avg_people_per_market, color='r', linestyle='--')


plt.fill_between(x=np.arange(avg_people_per_market, max_people_per_market), y1=0, y2=avg_income, color='lightgreen')


scatter_text('People per Market', 'Median income', 'Neighbourhood',
             data = ny_residential, 
             title = 'Madrid residential neighbourhoods', 
             xlabel = 'People per market',
             ylabel = 'Household income (€)')

