# Part One - Dataframe and analysis of social life

Importing libraries and creating dataframe for social life

In [1]:
import pandas as pd
import geocoder
import folium
from geopy.geocoders import Nominatim
import requests
import json
from pandas import json_normalize

In [2]:
# Cities under comparison, labelled "<city> (<country>)".
city_labels = [
    "Lower Manhattan, New York City (USA)",
    "Warsaw (Poland)",
    "Prague (Czech Republic)",
    "Bratislava (Slovakia)",
    "Budapest (Hungary)",
    "Bucharest (Romania)",
    "Sofia (Bulgaria)",
    "Zagreb (Croatia)",
]

# 'Index' is a 1-based row identifier kept as an ordinary column.
df = pd.DataFrame({'Index': list(range(1, len(city_labels) + 1)), 'City': city_labels})
df

Unnamed: 0,Index,City
0,1,"Lower Manhattan, New York City (USA)"
1,2,Warsaw (Poland)
2,3,Prague (Czech Republic)
3,4,Bratislava (Slovakia)
4,5,Budapest (Hungary)
5,6,Bucharest (Romania)
6,7,Sofia (Bulgaria)
7,8,Zagreb (Croatia)


In [3]:
def get_latilong(city, max_attempts=10, retry_delay=1.0):
    """Geocode ``city`` with the ArcGIS provider of ``geocoder``.

    The lookup is retried while the provider reports no coordinates
    (``latlng`` is None). Unlike an unbounded loop, this gives up after
    ``max_attempts`` tries so a permanently failing query (bad name, no
    network) cannot hang the notebook.

    Parameters
    ----------
    city : str
        Free-text place name passed to ``geocoder.arcgis``.
    max_attempts : int, optional
        Maximum number of lookups before giving up.
    retry_delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    The ``latlng`` value reported by geocoder (used downstream as a
    latitude/longitude pair).

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` tries.
    """
    import time  # local import: only needed for the pause between retries

    for attempt in range(1, max_attempts + 1):
        lati_long_coords = geocoder.arcgis(city).latlng
        if lati_long_coords is not None:
            return lati_long_coords
        if attempt < max_attempts:
            # Pause instead of hammering the geocoding service in a tight loop.
            time.sleep(retry_delay)

    raise RuntimeError(
        "Could not geocode {!r} after {} attempts".format(city, max_attempts))

In [4]:
# Geocode every city label, preserving the row order of df.
city = df['City']
coords = list(map(get_latilong, city.tolist()))

In [5]:
# Turn the list of coordinate pairs into two named columns and copy them onto df.
# Both frames use the default 0..n-1 index, so rows line up one to one.
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
for coord_column in df_coords.columns:
    df[coord_column] = df_coords[coord_column]

In [6]:
df

Unnamed: 0,Index,City,Latitude,Longitude
0,1,"Lower Manhattan, New York City (USA)",40.71913,-73.98763
1,2,Warsaw (Poland),52.2356,21.01037
2,3,Prague (Czech Republic),50.07913,14.43302
3,4,Bratislava (Slovakia),48.14924,17.10699
4,5,Budapest (Hungary),47.49972,19.05508
5,6,Bucharest (Romania),44.43429,26.10298
6,7,Sofia (Bulgaria),42.69718,23.32433
7,8,Zagreb (Croatia),45.80724,15.96757


In [7]:
df.dtypes

Index          int64
City          object
Latitude     float64
Longitude    float64
dtype: object

Creating a map of cities

In [8]:
# Geocode the place used as the centre of the overview maps.
address = 'Warsaw'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError("Could not geocode map centre: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude

In [9]:
# World-level map centred on the coordinates geocoded in the previous cell.
map_cities = folium.Map(location=[latitude, longitude], tiles='Stamen Toner', zoom_start=2)

# Styling shared by every city circle.
circle_style = dict(
    radius=5000,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False)

# One circle per city, with the city label as its popup text.
for lat, lng, city in zip(df['Latitude'], df['Longitude'], df['City']):
    label = folium.Popup('{}'.format(city), parse_html=True)
    folium.Circle([lat, lng], popup=label, **circle_style).add_to(map_cities)

map_cities

Defining Foursquare credentials and version

In [10]:
import os
import warnings

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE: the key pair that used to be hard-coded here has been exposed in the
# notebook history and should be revoked and replaced.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare API requests will be rejected.")

VERSION = '20180605'  # Foursquare API version date sent as the `v` query parameter
LIMIT = 100           # maximum number of venues requested per call
radius = 5000         # search radius sent as the `radius` query parameter

Exploring the data for the first city in my dataframe to get a sense of what the API returns

In [11]:
# Coordinates of the first row of df.
c1_lat = df.at[0, 'Latitude']
c2_long = df.at[0, 'Longitude']

# Values substituted, in order, into the placeholders of the explore URL.
query_values = (CLIENT_ID, CLIENT_SECRET, VERSION, c1_lat, c2_long, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(*query_values)

# Decoded JSON payload of the response.
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fc297c7b4e1ed177c5b933c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'},
    {'name': '$-$$$$', 'key': 'price'}]},
  'headerLocation': 'New York',
  'headerFullLocation': 'New York',
  'headerLocationGranularity': 'city',
  'totalResults': 231,
  'suggestedBounds': {'ne': {'lat': 40.76413004500011,
    'lng': -73.92836750789549},
   'sw': {'lat': 40.67412995500002, 'lng': -74.04689249210445}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5c362485237dee002ce062fb',
       'name': 'Regal Essex 14 & RPX',
       'location': {'address': '129 Delancey St',
        'crossStreet': 'Norfolk',
        'lat': 40.71813306291876,
        'lng': -73.98789549033421,
        '

Extracting relevant data and designing functions for automation

In [12]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    The category list is read from ``row['categories']``; when that key is
    absent it is read from ``row['venue.categories']`` instead.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (e.g. a dict or a pandas Series) that holds
        a list of category dicts, each with a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to surface.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [13]:
# Recommended venues of the first (and only requested) group in the response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON and keep only the columns of interest.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Replace each venue's list of category dicts by the first category's name.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of each column name ('venue.location.lat' -> 'lat').
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Regal Essex 14 & RPX,Movie Theater,40.718133,-73.987895
1,Blue Bottle Coffee,Coffee Shop,40.71914,-73.985224
2,MooShoes NYC,Shoe Store,40.717861,-73.990377
3,Pause Cafe,Juice Bar,40.721264,-73.98398
4,Wayla,Thai Restaurant,40.718291,-73.992584


In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Relies on the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``
    and ``LIMIT`` settings. The name of each location is printed before its
    request is sent, as a simple progress indicator.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; element i of each describes one location.
    radius : int, optional
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with columns 'City', 'City Latitude',
        'City Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. 'Venue Category' is None for a venue that has no
        categories. The frame is empty (but has these columns) when no
        locations are given or no venues are returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['City',
                    'City Latitude',
                    'City Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status reports HTTP errors directly instead of a KeyError
        # on the missing payload further down.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues carry an empty category list.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning .columns afterwards would raise a length mismatch.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

In [15]:
# Fetch venues for every city in df (one API request per row; default radius).
venues_cities = getNearbyVenues(df['City'], df['Latitude'], df['Longitude'])

Lower Manhattan, New York City (USA)
Warsaw (Poland)
Prague (Czech Republic)
Bratislava (Slovakia)
Budapest (Hungary)
Bucharest (Romania)
Sofia (Bulgaria)
Zagreb (Croatia)


In [16]:
venues_cities.loc[venues_cities["City"] == "Warsaw (Poland)"]

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
100,Warsaw (Poland),52.2356,21.01037,Filharmonia Narodowa,52.234495,21.011277,Concert Hall
101,Warsaw (Poland),52.2356,21.01037,Big Szef,52.236011,21.007992,Turkish Restaurant
102,Warsaw (Poland),52.2356,21.01037,Bułkę przez Bibułkę,52.232723,21.014007,Breakfast Spot
103,Warsaw (Poland),52.2356,21.01037,Teatr Kwadrat,52.235960,21.009052,Theater
104,Warsaw (Poland),52.2356,21.01037,Lukullus,52.231927,21.012567,Dessert Shop
...,...,...,...,...,...,...,...
195,Warsaw (Poland),52.2356,21.01037,Teatr Ateneum,52.237345,21.032777,Theater
196,Warsaw (Poland),52.2356,21.01037,Park Ujazdowski,52.221506,21.025517,Park
197,Warsaw (Poland),52.2356,21.01037,Boscaiola,52.242447,21.026500,Pizza Place
198,Warsaw (Poland),52.2356,21.01037,Ave! Pizza,52.238476,21.025395,Pizza Place


In [17]:
venues_cities.groupby('City').count()

Unnamed: 0_level_0,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bratislava (Slovakia),100,100,100,100,100,100
Bucharest (Romania),100,100,100,100,100,100
Budapest (Hungary),100,100,100,100,100,100
"Lower Manhattan, New York City (USA)",100,100,100,100,100,100
Prague (Czech Republic),100,100,100,100,100,100
Sofia (Bulgaria),100,100,100,100,100,100
Warsaw (Poland),100,100,100,100,100,100
Zagreb (Croatia),100,100,100,100,100,100


Having all relevant data in the dataframe called "venues_cities", I am going to produce top-10 categories for each city now.

In [18]:
# One indicator column per venue category (column name = category name).
cities_onehot = pd.get_dummies(venues_cities[['Venue Category']], prefix="", prefix_sep="")

# Append the city each venue belongs to ...
cities_onehot['City'] = venues_cities['City']

# ... then rotate that last column to the front.
column_order = cities_onehot.columns.tolist()
fixed_columns = column_order[-1:] + column_order[:-1]
cities_onehot = cities_onehot[fixed_columns]

cities_onehot.head()

Unnamed: 0,City,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Turkish Restaurant,Udon Restaurant,Used Bookstore,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Volleyball Court,Waterfront,Wine Bar,Wine Shop,Yoga Studio
0,"Lower Manhattan, New York City (USA)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Lower Manhattan, New York City (USA)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Lower Manhattan, New York City (USA)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lower Manhattan, New York City (USA)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Lower Manhattan, New York City (USA)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Mean of each indicator column per city = share of that city's venues in each category.
category_share_by_city = cities_onehot.groupby('City').mean()
cities_grouped = category_share_by_city.reset_index()

# Show only the cities with at least one venue in the 'Gym' category.
has_gym = cities_grouped['Gym'] > 0
cities_grouped.loc[has_gym]

Unnamed: 0,City,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Turkish Restaurant,Udon Restaurant,Used Bookstore,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Volleyball Court,Waterfront,Wine Bar,Wine Shop,Yoga Studio
1,Bucharest (Romania),0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Prague (Czech Republic),0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.02,...,0.0,0.0,0.0,0.02,0.01,0.0,0.01,0.03,0.02,0.03
6,Warsaw (Poland),0.0,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.02,...,0.01,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.01,0.0
7,Zagreb (Croatia),0.0,0.01,0.01,0.0,0.0,0.0,0.05,0.0,0.02,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0


In [20]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the city name); the remaining entries are ranked in descending
    order and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [21]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Header row: 'City' followed by '1st Most Common Venue', '2nd ...', and so on.
columns = ['City']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # Ranks ending in 1, 2 or 3 take st/nd/rd, except 11, 12 and 13, which take 'th'.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        columns.append('{}{} Most Common Venue'.format(rank, indicators[rank % 10 - 1]))
    else:
        columns.append('{}th Most Common Venue'.format(rank))

cities_venues_sorted = pd.DataFrame(columns=columns)
cities_venues_sorted['City'] = cities_grouped['City']

# Fill each city's row with its most frequent venue categories, best first.
for ind in np.arange(cities_grouped.shape[0]):
    cities_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cities_grouped.iloc[ind, :], num_top_venues)

cities_venues_sorted.head(8)

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bratislava (Slovakia),Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
1,Bucharest (Romania),Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
2,Budapest (Hungary),Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
3,"Lower Manhattan, New York City (USA)",Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
4,Prague (Czech Republic),Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
5,Sofia (Bulgaria),Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
6,Warsaw (Poland),Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
7,Zagreb (Croatia),Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


Time to cluster the cities (k-means) - 4 baskets first (3 and 2 baskets later)

In [22]:
from sklearn.cluster import KMeans

kclusters = 4

# Feature matrix: the per-city category shares without the city name.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by pandas 2.x.
cities_grouped_clustering = cities_grouped.drop(columns='City')

# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version; random_state makes the run repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(cities_grouped_clustering)

kmeans.labels_[0:10]

array([2, 1, 1, 0, 2, 0, 2, 3])

In [23]:
# Attach the current k-means labels to the top-venue table. The labels follow
# the row order of cities_grouped, which is also the row order of this table.
# Overwrite the column when it already exists so the cell can be re-run
# (insert() raises if the column is already present).
# NOTE: `kmeans` is reassigned by later cells; run this right after the
# 4-cluster fit.
if 'Cluster Labels' in cities_venues_sorted.columns:
    cities_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    cities_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join() returns a new frame, so df itself is left untouched.
df_citiesANDvenues = df.join(cities_venues_sorted.set_index('City'), on='City')

df_citiesANDvenues.head(8)

Unnamed: 0,Index,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,"Lower Manhattan, New York City (USA)",40.71913,-73.98763,0,Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
1,2,Warsaw (Poland),52.2356,21.01037,2,Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
2,3,Prague (Czech Republic),50.07913,14.43302,2,Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
3,4,Bratislava (Slovakia),48.14924,17.10699,2,Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
4,5,Budapest (Hungary),47.49972,19.05508,1,Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
5,6,Bucharest (Romania),44.43429,26.10298,1,Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
6,7,Sofia (Bulgaria),42.69718,23.32433,0,Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
7,8,Zagreb (Croatia),45.80724,15.96757,3,Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


Visualizing on the map

In [24]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# NOTE(review): the built-in 'Stamen Toner' tiles may be unavailable in newer
# folium releases - confirm against the installed version.
map_clusters = folium.Map(location=[latitude, longitude], tiles='Stamen Toner', zoom_start=2)

# One hex colour per cluster, evenly spaced over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

df_citiesANDvenues_nonan = df_citiesANDvenues.dropna(subset=['Cluster Labels'])

for lat, lon, poi, cluster in zip(df_citiesANDvenues_nonan['Latitude'], df_citiesANDvenues_nonan['Longitude'], df_citiesANDvenues_nonan['City'], df_citiesANDvenues_nonan['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 would send cluster 0 to the last colour via index -1).
    cluster_colour = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Let's try 3 baskets instead of 4 to cluster the cities (k-means)

In [25]:
kclusters = 3

# Feature matrix: the per-city category shares without the city name.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by pandas 2.x.
cities_grouped_clustering3 = cities_grouped.drop(columns='City')

# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version; random_state makes the run repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(cities_grouped_clustering3)

kmeans.labels_[0:10]

array([0, 1, 1, 2, 0, 2, 0, 0])

In [26]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Header row: 'City' followed by '1st Most Common Venue', '2nd ...', and so on.
columns = ['City']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # Ranks ending in 1, 2 or 3 take st/nd/rd, except 11, 12 and 13, which take 'th'.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        columns.append('{}{} Most Common Venue'.format(rank, indicators[rank % 10 - 1]))
    else:
        columns.append('{}th Most Common Venue'.format(rank))

cities_venues_sorted3 = pd.DataFrame(columns=columns)
cities_venues_sorted3['City'] = cities_grouped['City']

# Fill each city's row with its most frequent venue categories, best first.
for ind in np.arange(cities_grouped.shape[0]):
    cities_venues_sorted3.iloc[ind, 1:] = return_most_common_venues(cities_grouped.iloc[ind, :], num_top_venues)

cities_venues_sorted3.head(8)

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bratislava (Slovakia),Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
1,Bucharest (Romania),Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
2,Budapest (Hungary),Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
3,"Lower Manhattan, New York City (USA)",Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
4,Prague (Czech Republic),Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
5,Sofia (Bulgaria),Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
6,Warsaw (Poland),Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
7,Zagreb (Croatia),Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


In [27]:
# Attach the current k-means labels to the top-venue table. The labels follow
# the row order of cities_grouped, which is also the row order of this table.
# Overwrite the column when it already exists so the cell can be re-run
# (insert() raises if the column is already present).
# NOTE: `kmeans` is reassigned by other cells; run this right after the
# 3-cluster fit.
if 'Cluster Labels' in cities_venues_sorted3.columns:
    cities_venues_sorted3['Cluster Labels'] = kmeans.labels_
else:
    cities_venues_sorted3.insert(0, 'Cluster Labels', kmeans.labels_)

# join() returns a new frame, so df itself is left untouched.
df_citiesANDvenues3 = df.join(cities_venues_sorted3.set_index('City'), on='City')

df_citiesANDvenues3.head(8)

Unnamed: 0,Index,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,"Lower Manhattan, New York City (USA)",40.71913,-73.98763,2,Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
1,2,Warsaw (Poland),52.2356,21.01037,0,Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
2,3,Prague (Czech Republic),50.07913,14.43302,0,Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
3,4,Bratislava (Slovakia),48.14924,17.10699,0,Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
4,5,Budapest (Hungary),47.49972,19.05508,1,Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
5,6,Bucharest (Romania),44.43429,26.10298,1,Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
6,7,Sofia (Bulgaria),42.69718,23.32433,2,Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
7,8,Zagreb (Croatia),45.80724,15.96757,0,Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


In [28]:
# NOTE(review): the built-in 'Stamen Toner' tiles may be unavailable in newer
# folium releases - confirm against the installed version.
map_clusters3 = folium.Map(location=[latitude, longitude], tiles='Stamen Toner', zoom_start=2)

# One hex colour per cluster, evenly spaced over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

df_citiesANDvenues_nonan3 = df_citiesANDvenues3.dropna(subset=['Cluster Labels'])

for lat, lon, poi, cluster in zip(df_citiesANDvenues_nonan3['Latitude'], df_citiesANDvenues_nonan3['Longitude'], df_citiesANDvenues_nonan3['City'], df_citiesANDvenues_nonan3['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 would send cluster 0 to the last colour via index -1).
    cluster_colour = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters3)

map_clusters3

Let's try 2 baskets instead of 4 to cluster the cities (k-means)

In [29]:
kclusters = 2

# Feature matrix: the per-city category shares without the city name.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by pandas 2.x.
cities_grouped_clustering2 = cities_grouped.drop(columns='City')

# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version; random_state makes the run repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(cities_grouped_clustering2)

kmeans.labels_[0:10]

array([0, 1, 1, 0, 0, 0, 0, 0])

In [30]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Header row: 'City' followed by '1st Most Common Venue', '2nd ...', and so on.
columns = ['City']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # Ranks ending in 1, 2 or 3 take st/nd/rd, except 11, 12 and 13, which take 'th'.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        columns.append('{}{} Most Common Venue'.format(rank, indicators[rank % 10 - 1]))
    else:
        columns.append('{}th Most Common Venue'.format(rank))

cities_venues_sorted2 = pd.DataFrame(columns=columns)
cities_venues_sorted2['City'] = cities_grouped['City']

# Fill each city's row with its most frequent venue categories, best first.
for ind in np.arange(cities_grouped.shape[0]):
    cities_venues_sorted2.iloc[ind, 1:] = return_most_common_venues(cities_grouped.iloc[ind, :], num_top_venues)

cities_venues_sorted2.head(8)

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bratislava (Slovakia),Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
1,Bucharest (Romania),Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
2,Budapest (Hungary),Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
3,"Lower Manhattan, New York City (USA)",Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
4,Prague (Czech Republic),Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
5,Sofia (Bulgaria),Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
6,Warsaw (Poland),Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
7,Zagreb (Croatia),Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


In [31]:
# Attach the current k-means labels to the top-venue table. The labels follow
# the row order of cities_grouped, which is also the row order of this table.
# Overwrite the column when it already exists so the cell can be re-run
# (insert() raises if the column is already present).
# NOTE: `kmeans` is reassigned by other cells; run this right after the
# 2-cluster fit.
if 'Cluster Labels' in cities_venues_sorted2.columns:
    cities_venues_sorted2['Cluster Labels'] = kmeans.labels_
else:
    cities_venues_sorted2.insert(0, 'Cluster Labels', kmeans.labels_)

# join() returns a new frame, so df itself is left untouched.
df_citiesANDvenues2 = df.join(cities_venues_sorted2.set_index('City'), on='City')

df_citiesANDvenues2.head(8)

Unnamed: 0,Index,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,"Lower Manhattan, New York City (USA)",40.71913,-73.98763,0,Park,Ice Cream Shop,Italian Restaurant,Yoga Studio,Gourmet Shop,Bookstore,Memorial Site,Sandwich Place,Salad Place,Movie Theater
1,2,Warsaw (Poland),52.2356,21.01037,0,Cocktail Bar,Hotel,Coffee Shop,Park,Café,Pizza Place,Vegetarian / Vegan Restaurant,Plaza,Beer Bar,Sushi Restaurant
2,3,Prague (Czech Republic),50.07913,14.43302,0,Café,Burger Joint,Cocktail Bar,Theater,Coffee Shop,Hotel,Ice Cream Shop,Bistro,Beer Bar,Yoga Studio
3,4,Bratislava (Slovakia),48.14924,17.10699,0,Café,Coffee Shop,Wine Bar,Beer Bar,Ice Cream Shop,Italian Restaurant,Pub,Hotel,Creperie,Pizza Place
4,5,Budapest (Hungary),47.49972,19.05508,1,Coffee Shop,Hotel,Dessert Shop,Restaurant,Pizza Place,Italian Restaurant,Ice Cream Shop,Cocktail Bar,Soup Place,Indie Movie Theater
5,6,Bucharest (Romania),44.43429,26.10298,1,Coffee Shop,Dessert Shop,Plaza,Hotel,Italian Restaurant,Park,Romanian Restaurant,Bar,Pub,Pizza Place
6,7,Sofia (Bulgaria),42.69718,23.32433,0,Park,Coffee Shop,Italian Restaurant,Bakery,Bar,Restaurant,Café,Beer Store,Cocktail Bar,Ice Cream Shop
7,8,Zagreb (Croatia),45.80724,15.96757,0,Café,Bar,Restaurant,Plaza,BBQ Joint,Dessert Shop,Park,Bistro,Burger Joint,Pub


In [32]:
# NOTE(review): the built-in 'Stamen Toner' tiles may be unavailable in newer
# folium releases - confirm against the installed version.
map_clusters2 = folium.Map(location=[latitude, longitude], tiles='Stamen Toner', zoom_start=2)

# One hex colour per cluster, evenly spaced over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

df_citiesANDvenues_nonan2 = df_citiesANDvenues2.dropna(subset=['Cluster Labels'])

for lat, lon, poi, cluster in zip(df_citiesANDvenues_nonan2['Latitude'], df_citiesANDvenues_nonan2['Longitude'], df_citiesANDvenues_nonan2['City'], df_citiesANDvenues_nonan2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 would send cluster 0 to the last colour via index -1).
    cluster_colour = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters2)

map_clusters2

# Part Two - Dataframe and analysis of flight network, cost of living and international schools

Creating dataframe for flight network, cost of living and international schools (all cities apart from NYC)

In [33]:
# (city, state) pairs for the seven European cities; New York is not included here.
city_state_pairs = [
    ("Warsaw", "Poland"),
    ("Prague", "Czech Republic"),
    ("Bratislava", "Slovakia"),
    ("Budapest", "Hungary"),
    ("Bucharest", "Romania"),
    ("Sofia", "Bulgaria"),
    ("Zagreb", "Croatia"),
]
df2 = pd.DataFrame(city_state_pairs, columns=['City', 'State'])

# 1-based row identifier, placed as the first column.
df2.insert(0, 'Index', list(range(1, len(df2) + 1)))
df2

Unnamed: 0,Index,City,State
0,1,Warsaw,Poland
1,2,Prague,Czech Republic
2,3,Bratislava,Slovakia
3,4,Budapest,Hungary
4,5,Bucharest,Romania
5,6,Sofia,Bulgaria
6,7,Zagreb,Croatia


In [34]:
# Hard-coded 0/1 flag per city, listed in df2 row order
# (Warsaw, Prague, Bratislava, Budapest, Bucharest, Sofia, Zagreb).
# Presumably 1 means a direct flight to NYC exists - data source is not recorded in this notebook.
df2['Direct to NYC'] = [1, 0, 0, 1, 0, 0, 0]
df2

Unnamed: 0,Index,City,State,Direct to NYC
0,1,Warsaw,Poland,1
1,2,Prague,Czech Republic,0
2,3,Bratislava,Slovakia,0
3,4,Budapest,Hungary,1
4,5,Bucharest,Romania,0
5,6,Sofia,Bulgaria,0
6,7,Zagreb,Croatia,0


In [35]:
# Hard-coded 0/1 flag per city, listed in df2 row order
# (Warsaw, Prague, Bratislava, Budapest, Bucharest, Sofia, Zagreb).
# Presumably 1 means direct flights exist to at least 5 of the other 6 cities - TODO confirm and cite the source.
df2['Direct to 5 of 6 CEE'] = [1, 1, 0, 1, 1, 1, 0]
# If flight to Vienna is available it accounts for Bratislava because of close proximity
df2

Unnamed: 0,Index,City,State,Direct to NYC,Direct to 5 of 6 CEE
0,1,Warsaw,Poland,1,1
1,2,Prague,Czech Republic,0,1
2,3,Bratislava,Slovakia,0,0
3,4,Budapest,Hungary,1,1
4,5,Bucharest,Romania,0,1
5,6,Sofia,Bulgaria,0,1
6,7,Zagreb,Croatia,0,0


In [36]:
# Hard-coded distance in km per city, listed in df2 row order
# (Warsaw, Prague, Bratislava, Budapest, Bucharest, Sofia, Zagreb).
# Which airport and which starting point were measured is not recorded here - TODO document the source.
df2['Distance to airport (km)'] = [15.77, 18.50, 12.7, 22.1, 20.2, 10.1, 15]
df2

Unnamed: 0,Index,City,State,Direct to NYC,Direct to 5 of 6 CEE,Distance to airport (km)
0,1,Warsaw,Poland,1,1,15.77
1,2,Prague,Czech Republic,0,1,18.5
2,3,Bratislava,Slovakia,0,0,12.7
3,4,Budapest,Hungary,1,1,22.1
4,5,Bucharest,Romania,0,1,20.2
5,6,Sofia,Bulgaria,0,1,10.1
6,7,Zagreb,Croatia,0,0,15.0


In [37]:
# Hard-coded monthly rent in USD per city, listed in df2 row order
# (Warsaw, Prague, Bratislava, Budapest, Bucharest, Sofia, Zagreb).
# Source and date of these figures are not recorded here - TODO document.
df2['Monthly rent 85m2 downtown (USD)'] = [1276, 1350, 1130, 1061, 868, 650, 972]
df2

Unnamed: 0,Index,City,State,Direct to NYC,Direct to 5 of 6 CEE,Distance to airport (km),Monthly rent 85m2 downtown (USD)
0,1,Warsaw,Poland,1,1,15.77,1276
1,2,Prague,Czech Republic,0,1,18.5,1350
2,3,Bratislava,Slovakia,0,0,12.7,1130
3,4,Budapest,Hungary,1,1,22.1,1061
4,5,Bucharest,Romania,0,1,20.2,868
5,6,Sofia,Bulgaria,0,1,10.1,650
6,7,Zagreb,Croatia,0,0,15.0,972


In [38]:
# Hard-coded count of international schools per city, listed in df2 row order
# (Warsaw, Prague, Bratislava, Budapest, Bucharest, Sofia, Zagreb).
# Source of these counts is not recorded here - TODO document.
df2['International schools'] = [19, 15, 10, 12, 15, 10, 3]
df2

Unnamed: 0,Index,City,State,Direct to NYC,Direct to 5 of 6 CEE,Distance to airport (km),Monthly rent 85m2 downtown (USD),International schools
0,1,Warsaw,Poland,1,1,15.77,1276,19
1,2,Prague,Czech Republic,0,1,18.5,1350,15
2,3,Bratislava,Slovakia,0,0,12.7,1130,10
3,4,Budapest,Hungary,1,1,22.1,1061,12
4,5,Bucharest,Romania,0,1,20.2,868,15
5,6,Sofia,Bulgaria,0,1,10.1,650,10
6,7,Zagreb,Croatia,0,0,15.0,972,3


In [39]:
from sklearn.cluster import KMeans

kclusters = 3

# Cluster on the measured attributes only. Selecting the columns by name keeps
# the row identifier 'Index' out of the features (it carries no information
# about a city) and also keeps a previously added 'Cluster Labels' column out
# when this cell is re-run.
feature_columns = [
    'Direct to NYC',
    'Direct to 5 of 6 CEE',
    'Distance to airport (km)',
    'Monthly rent 85m2 downtown (USD)',
    'International schools',
]
df2_clustering = df2[feature_columns]

# NOTE(review): the features are on very different scales (0/1 flags next to
# rents of 650-1350), so the rent column dominates the distances k-means uses.
# Consider standardising the columns before fitting.
# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version; random_state makes the run repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df2_clustering)

kmeans.labels_[0:10]

array([1, 1, 2, 2, 0, 0, 2])

In [40]:
# Store the k-means label of each city as the first column of df2.
# Overwrite the column when it already exists so the cell can be re-run
# (insert() raises if the column is already present).
if 'Cluster Labels' in df2.columns:
    df2['Cluster Labels'] = kmeans.labels_
else:
    df2.insert(0, 'Cluster Labels', kmeans.labels_)
df2

Unnamed: 0,Cluster Labels,Index,City,State,Direct to NYC,Direct to 5 of 6 CEE,Distance to airport (km),Monthly rent 85m2 downtown (USD),International schools
0,1,1,Warsaw,Poland,1,1,15.77,1276,19
1,1,2,Prague,Czech Republic,0,1,18.5,1350,15
2,2,3,Bratislava,Slovakia,0,0,12.7,1130,10
3,2,4,Budapest,Hungary,1,1,22.1,1061,12
4,0,5,Bucharest,Romania,0,1,20.2,868,15
5,0,6,Sofia,Bulgaria,0,1,10.1,650,10
6,2,7,Zagreb,Croatia,0,0,15.0,972,3


In [41]:
# Geocode the map centre again (same lookup as for the earlier maps).
address = 'Warsaw'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_df2 = folium.Map(location=[latitude, longitude], zoom_start=4)

# Shade each country by the cluster label of its city: rows of df2 are matched
# to GeoJSON features by comparing df2['State'] with the feature property
# 'sovereignt'.
# NOTE(review): geo_data is a relative path to a local file that is not part of
# this notebook - confirm it is available next to the notebook.
# NOTE(review): 'Cluster Labels' is a category id, but it is drawn on the
# sequential 'PuRd' colour scale; the shading order has no meaning.
folium.Choropleth(
    geo_data="otherdata/custom.geo.json",
    name='choropleth',
    data=df2,
    columns=['State', 'Cluster Labels'],
    key_on='feature.properties.sovereignt',
    fill_color='PuRd',
    fill_opacity=0.9,
    line_opacity=0.2,
    legend_name='K-mean cluster'
).add_to(map_df2)

folium.LayerControl().add_to(map_df2)

# NOTE(review): this adds the whole 2-cluster Map object as a child of map_df2
# rather than adding its markers - confirm this renders as intended.
map_clusters2.add_to(map_df2)
map_df2

# The end