In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import folium
import json
import requests
import os
from shapely.geometry import Point, Polygon
from sklearn.cluster import KMeans
import seaborn as sn
import matplotlib.pyplot as plt

In [2]:
# geojson files exported from .shp with QGIS (EPSG:4326 WGS 84)
warsaw_geofile = 'geodata/Warsaw_districts/warsaw_districts.geojson'
cracow_geofile = 'geodata/Cracow_districts/cracow_districts.geojson'

In [3]:
with open(warsaw_geofile) as w:
    warsaw_geojson = json.load(w)

In [4]:
with open(cracow_geofile) as c:
    cracow_geojson = json.load(c)

In [5]:
warsaw_districts_names = []
for x in warsaw_geojson['features']:
    warsaw_districts_names.append(x['properties']['nazwa_dzie'])

In [6]:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
folium.GeoJson(
    warsaw_geojson,
    name='geojson'
).add_to(warsaw_map)
warsaw_map

In [7]:
def get_district_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa_dzie']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers

In [8]:
warsaw_district_centers = get_district_centers(warsaw_geojson)

In [70]:
warsaw_district_centers.head()

Unnamed: 0,District_center
Żoliborz,"[52.2688536216614, 20.985135391429377]"
Praga-Południe,"[52.235168690871, 21.071138918572952]"
Mokotów,"[52.18866196405862, 21.052814696946914]"
Wola,"[52.22969762335711, 20.94634273758449]"
Wilanów,"[52.15030830502675, 21.091139416474352]"


In [9]:
for center in warsaw_district_centers.index:
    folium.Marker(warsaw_district_centers.loc[center, 'District_center']).add_to(warsaw_map)
warsaw_map

In [12]:
def create_districts_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa_dzie']]=district['geometry']['coordinates'][0][0]
    return polygons

In [13]:
warsaw_districts_polygons = create_districts_polygons(warsaw_geojson)

In [72]:
warsaw_districts_polygons

{'Żoliborz': [[20.957550244360345, 52.266927972075955],
  [20.957595033280743, 52.26712068328835],
  [20.957577637116344, 52.26723607840504],
  [20.957537312295642, 52.26727740707875],
  [20.957761257322645, 52.267475395339645],
  [20.957815144778245, 52.267498512850246],
  [20.95787439818555, 52.26757897342375],
  [20.95791009640075, 52.26762769918745],
  [20.957907153084747, 52.26764189861664],
  [20.95793322389554, 52.26764721061744],
  [20.957923228478144, 52.26768297788135],
  [20.957920876497745, 52.26769151530553],
  [20.95793214246805, 52.26770634893295],
  [20.958270099249148, 52.26802283392484],
  [20.958467694144947, 52.26820795917454],
  [20.95868797775235, 52.26841439274213],
  [20.958872967069347, 52.268452385171436],
  [20.95894444363255, 52.26846705990273],
  [20.95915945950805, 52.268511173916025],
  [20.961263616641954, 52.27015734038829],
  [20.96171409893366, 52.27050980280468],
  [20.96176841477846, 52.27055251169258],
  [20.961853359918262, 52.27058354652189],
  [

In [14]:
def calculate_radius(district_name):
    center_coords = warsaw_district_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in warsaw_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [16]:
warsaw_districts_radiuses = {}
for district in warsaw_district_centers.index:
    warsaw_districts_radiuses[district] = calculate_radius(district)

In [73]:
warsaw_districts_radiuses

{'Żoliborz': 2735,
 'Praga-Południe': 4228,
 'Mokotów': 4866,
 'Wola': 3791,
 'Wilanów': 5010,
 'Wesoła': 3668,
 'Wawer': 7006,
 'Włochy': 4599,
 'Ursynów': 4896,
 'Śródmieście': 4339,
 'Praga-Północ': 3133,
 'Ursus': 2402,
 'Targówek': 3617,
 'Rembertów': 3461,
 'Ochota': 2640,
 'Bielany': 4385,
 'Białołęka': 7513,
 'Bemowo': 3478}

In [17]:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20200605'
LIMIT = 100
OFFSET=LIMIT

In [18]:
def get_venues(district_center, radius):
    RADIUS = radius
    lat, lng = district_center
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
    result = requests.get(url).json()
    
    venues = result['response']['groups'][0]['items']
    
    total_results = result['response']['totalResults']
    print('\tTotal results: ', total_results, '\n')
    
    #checking if there is more results -if true, next request with offset is send
    requests_to_perform = total_results//100
    
    for _ in range(requests_to_perform):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT,
            OFFSET
        )
        result = requests.get(url).json()
        venues.extend(result['response']['groups'][0]['items'])
        
    return venues

In [19]:
warsaw_districts_venues_foursquare = {}
for district in warsaw_district_centers.index:
    print(district)
    warsaw_districts_venues_foursquare[district] = get_venues(
        warsaw_district_centers.loc[district, 'District_center'],
        warsaw_districts_radiuses[district])

Żoliborz
	Total results:  159 

Praga-Południe
	Total results:  237 

Mokotów
	Total results:  234 

Wola
	Total results:  223 

Wilanów
	Total results:  150 

Wesoła
	Total results:  20 

Wawer
	Total results:  98 

Włochy
	Total results:  225 

Ursynów
	Total results:  142 

Śródmieście
	Total results:  239 

Praga-Północ
	Total results:  160 

Ursus
	Total results:  47 

Targówek
	Total results:  145 

Rembertów
	Total results:  35 

Ochota
	Total results:  166 

Bielany
	Total results:  95 

Białołęka
	Total results:  122 

Bemowo
	Total results:  67 



In [41]:
def extract_data(district,district_venues_foursquare):
    district_venues=[]
    for item in district_venues_foursquare:
        x={}
        x['District']=district
        x['Name']=item['venue']['name']
        x['Category']=item['venue']['categories'][0]['name']
        x['Lat'] = item['venue']['location']['lat']
        x['Lon'] = item['venue']['location']['lng']
        x['VenueId'] = item['venue']['id']
        district_venues.append(x)
    district_df = pd.DataFrame(district_venues)
    return district_df

In [74]:
def get_districts_venues(city_districts_venues):
    all_city_venues= pd.DataFrame()
    for district in city_districts_venues.keys():
        district_df = extract_data(district, city_districts_venues[district])
        district_df['District'] = district
        #print(district_df.head())
        all_city_venues = all_city_venues.append(district_df, ignore_index=True)
    return all_city_venues

In [75]:
warsaw_districts_venues = get_districts_venues(warsaw_districts_venues_foursquare)

In [68]:
warsaw_districts_venues.head()

Unnamed: 0,District,Name,Category,Lat,Lon,VenueId,Inside
0,Żoliborz,Park Żeromskiego,Park,52.268377,20.988747,4baf7aa5f964a52031033ce3,True
1,Żoliborz,Galeria Wypieków,Bakery,52.268523,20.986111,55508b67498e2dcf9038f190,True
2,Żoliborz,Plac zabaw w Parku Żeromskiego,Playground,52.267248,20.988827,4db2f35a4b226b343d6d0581,True
3,Żoliborz,Kino Wisła,Indie Movie Theater,52.269609,20.986743,4c14d3afa9c220a11e18589d,True
4,Żoliborz,Plac Wilsona,Plaza,52.268914,20.985587,4bb771276edc76b0a92e321c,True


In [61]:
# for key in warsaw_districts_venues.keys():    
#     for item in warsaw_districts_venues[key]:
#         #print(item['venue']['name'], ' ', item['venue']['categories'][0]['name'])
#         folium.Marker([item['venue']['location']['lat'], item['venue']['location']['lng']]).add_to(warsaw_map)

In [62]:
def pin_venues(venues_df,city_map):
    for i in venues_df.index:
        folium.Marker([venues_df.loc[i,'Lat'], venues_df.loc[i,'Lon']]).add_to(city_map)

In [64]:
#warsaw_districts_venues.reset_index(inplace=True)

In [65]:
def check_if_inside_district(venue_coords, district_shape):
    p = Point(venue_coords)
    poly = Polygon(district_shape)
    return p.within(poly)

In [66]:
for i in warsaw_districts_venues.index:
    warsaw_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [warsaw_districts_venues.loc[i,'Lon'], warsaw_districts_venues.loc[i,'Lat']],
                                        warsaw_districts_polygons[warsaw_districts_venues.loc[i,'District']])

In [67]:
warsaw_districts_venues = warsaw_districts_venues.loc[warsaw_districts_venues['Inside']==True]

In [69]:
warsaw_districts_venues.head()

Unnamed: 0,District,Name,Category,Lat,Lon,VenueId,Inside
0,Żoliborz,Park Żeromskiego,Park,52.268377,20.988747,4baf7aa5f964a52031033ce3,True
1,Żoliborz,Galeria Wypieków,Bakery,52.268523,20.986111,55508b67498e2dcf9038f190,True
2,Żoliborz,Plac zabaw w Parku Żeromskiego,Playground,52.267248,20.988827,4db2f35a4b226b343d6d0581,True
3,Żoliborz,Kino Wisła,Indie Movie Theater,52.269609,20.986743,4c14d3afa9c220a11e18589d,True
4,Żoliborz,Plac Wilsona,Plaza,52.268914,20.985587,4bb771276edc76b0a92e321c,True


In [76]:
warsaw_top5 = warsaw_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [77]:
warsaw_top5

Unnamed: 0_level_0,District,Name,Lat,Lon,VenueId
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Café,179,179,179,179,179
Park,140,140,140,140,140
Coffee Shop,138,138,138,138,138
Italian Restaurant,117,117,117,117,117
Supermarket,84,84,84,84,84


In [31]:
cracow_districts_names = []
for x in cracow_geojson['features']:
    cracow_districts_names.append(x['properties']['nazwa'])

In [32]:
cracow_coordinates = [50.06143, 19.93658]
cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)

folium.GeoJson(
    cracow_geojson,
    name='geojson'
).add_to(cracow_map)
cracow_map

In [79]:
# def create_cracow_map():    
#     cracow_coordinates = [50.06143, 19.93658]
#     cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)
#     cracow_map.choropleth(
#         geo_data=cracow_geojson,
#         name='choropleth',
#         data=df,
#         columns=['nazwa', 'Density'],
#         key_on='feature.properties.nazwa',
#         fill_color='YlGn',
#         fill_opacity=0.7,
#         line_opacity=0.9,
#         legend_name='Density'
#     )
#     return cracow_map

# cracow_map = create_cracow_map()

In [80]:
def get_cracow_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers

In [81]:
cracow_districts_centers = get_cracow_centers(cracow_geojson)

In [82]:
def create_cracow_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa']]=district['geometry']['coordinates'][0][0]
    return polygons

In [83]:
cracow_districts_polygons = create_cracow_polygons(cracow_geojson)

In [84]:
def calculate_cracow_radius(district_name):
    center_coords = cracow_districts_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in cracow_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [85]:
cracow_districts_radiuses = {}
for district in cracow_districts_centers.index:
    cracow_districts_radiuses[district] = calculate_cracow_radius(district)

In [86]:
cracow_districts_venues_foursquare = {}
for district in cracow_districts_centers.index:
    print(district)
    cracow_districts_venues_foursquare[district] = get_venues(
        cracow_districts_centers.loc[district, 'District_center'],
        cracow_districts_radiuses[district])

Stare Miasto
	Total results:  228 

Grzegórzki
	Total results:  159 

Prądnik Czerwony
	Total results:  63 

Prądnik Biały
	Total results:  125 

Krowodrza
	Total results:  206 

Bronowice
	Total results:  93 

Zwierzyniec
	Total results:  124 

Dębniki
	Total results:  49 

Łagiewniki-Borek Fałęcki
	Total results:  33 

Swoszowice
	Total results:  50 

Podgórze Duchackie
	Total results:  52 

Bieżanów-Prokocim
	Total results:  28 

Podgórze
	Total results:  220 

Czyżyny
	Total results:  79 

Mistrzejowice
	Total results:  25 

Bieńczyce
	Total results:  38 

Wzgórza Krzesławickie
	Total results:  18 

Nowa Huta
	Total results:  34 



In [91]:
cracow_districts_venues = get_districts_venues(cracow_districts_venues_foursquare)
#cracow_districts_venues.reset_index(inplace=True)

In [92]:
for i in cracow_districts_venues.index:
    cracow_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [cracow_districts_venues.iloc[i]['Lon'], cracow_districts_venues.loc[i]['Lat']],
                                        cracow_districts_polygons[cracow_districts_venues.iloc[i]['District']])

In [93]:
cracow_districts_venues = cracow_districts_venues.loc[cracow_districts_venues['Inside']==True]
cracow_top5 = cracow_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [94]:
cracow_top5

Unnamed: 0_level_0,District,Name,Lat,Lon,VenueId,Inside
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hotel,52,52,52,52,52,52
Supermarket,35,35,35,35,35,35
Italian Restaurant,31,31,31,31,31,31
Café,26,26,26,26,26,26
Park,24,24,24,24,24,24


In [96]:
#cracow_map = create_cracow_map()
for i in cracow_districts_venues.index:    
    folium.Marker([cracow_districts_venues.loc[i,'Lat'], cracow_districts_venues.loc[i,'Lon']]).add_to(cracow_map)

In [97]:
top_categories = warsaw_top5.add(cracow_top5, fill_value=0)

In [98]:
top_5_categories = top_categories.head().index
top_5_categories

Index(['Café', 'Coffee Shop', 'Hotel', 'Italian Restaurant', 'Park'], dtype='object', name='Category')

In [99]:
warsaw_districts_area = {'Mokotów':35.42, 'Praga-Południe':22.38, 'Ursynów':43.79, 'Wola':19.26, 'Bielany':32.34,
                            'Targówek':24.22, 'Bemowo':24.95, 'Śródmieście':15.57, 'Białołęka':73.04, 'Ochota':9.72,
                            'Wawer':79.70, 'Praga-Północ':11.42, 'Ursus':9.36, 'Żoliborz':8.47, 'Włochy':28.63,
                            'Wilanów':36.73, 'Wesoła':22.94, 'Rembertów':19.30}

In [100]:
warsaw_districts_venues_top5 = warsaw_districts_venues[warsaw_districts_venues.Category.isin(top_5_categories)]
warsaw_top5 = warsaw_districts_venues_top5.groupby(['District', 'Category']).count()

In [101]:
warsaw_top5 = warsaw_districts_venues_top5.groupby(['District', 'Category']).count()

In [102]:
warsaw_top5.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Lat,Lon,VenueId
District,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bemowo,Café,5,5,5,5
Bemowo,Coffee Shop,3,3,3,3
Bemowo,Italian Restaurant,4,4,4,4
Bemowo,Park,8,8,8,8
Białołęka,Café,8,8,8,8


In [103]:
warsaw_categories_density={}
for District, Category in warsaw_top5.index:
    warsaw_categories_density[(Category, District)]=warsaw_top5.loc[District, Category].VenueId/warsaw_districts_area[District]

In [104]:
warsaw_categories_density = pd.DataFrame(warsaw_categories_density, index=[1]).T
warsaw_categories_density.reset_index(inplace=True)
warsaw_categories_density.columns=['Category', 'District', 'Category_density']

In [106]:
warsaw_categories_density.head(10)

Unnamed: 0,Category,District,Category_density
0,Café,Bemowo,0.200401
1,Coffee Shop,Bemowo,0.12024
2,Italian Restaurant,Bemowo,0.160321
3,Park,Bemowo,0.320641
4,Café,Białołęka,0.109529
5,Coffee Shop,Białołęka,0.068456
6,Hotel,Białołęka,0.027382
7,Italian Restaurant,Białołęka,0.027382
8,Park,Białołęka,0.136911
9,Café,Bielany,0.278293


In [None]:
warsaw_maps = {}
for category in warsaw_categories_density.Category.unique():
    category_density = warsaw_categories_density.loc[warsaw_categories_density.Category==category]
    warsaw_category_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
    warsaw_category_map.choropleth(
    geo_data=warsaw_geojson,
    name='choropleth',
    data=category_density,
    columns=['District', 'Category_density'],
    key_on='feature.properties.nazwa_dzie',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.9,
    legend_name='Density'
)
    warsaw_maps[category]=warsaw_category_map

In [None]:
top_5_categories

In [None]:
warsaw_maps['Café']

In [None]:
warsaw_maps['Coffee Shop']

In [None]:
warsaw_maps['Hotel']

In [None]:
warsaw_maps['Italian Restaurant']

In [None]:
warsaw_maps['Park']

In [None]:
cracow_districts_area = {'Stare Miasto': 5.56, 'Grzegórzki': 5.84, 'Prądnik Czerwony': 6.43, 'Prądnik Biały': 23.41,
                        'Krowodrza': 5.61, 'Bronowice': 13.54, 'Zwierzyniec': 28.73, 'Dębniki': 46.18,
                        'Łagiewniki-Borek Fałęcki':54.15, 'Swoszowice': 25.6, 'Podgórze Duchackie': 95.40,
                        'Bieżanów-Prokocim': 18.47, 'Podgórze': 25.66, 'Czyżyny': 12.25, 'Mistrzejowice': 55.90,
                        'Bieńczyce': 36.99, 'Wzgórze Krzesławickie': 23.81, 'Nowa Huta': 65.40}

In [None]:
cracow_districts_venues_top5 = cracow_districts_venues[cracow_districts_venues.Category.isin(top_5_categories)]
cracow_top5 = cracow_districts_venues_top5.groupby(['District', 'Category']).count()

In [None]:
cracow_categories_density={}
for District, Category in cracow_top5.index:
    cracow_categories_density[(Category, District)]=cracow_top5.loc[District, Category].VenueId/cracow_districts_area[District]

In [None]:
cracow_categories_density = pd.DataFrame(cracow_categories_density, index=[1]).T
cracow_categories_density.reset_index(inplace=True)
cracow_categories_density.columns=['Category', 'District', 'Category_density']

In [None]:
cracow_maps = {}
for category in cracow_categories_density.Category.unique():
    category_density = cracow_categories_density.loc[cracow_categories_density.Category==category]
    cracow_category_map = folium.Map(location=cracow_coordinates, zoom_start=11)
    cracow_category_map.choropleth(
    geo_data=cracow_geojson,
    name='choropleth',
    data=category_density,
    columns=['District', 'Category_density'],
    key_on='feature.properties.nazwa',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.9,
    legend_name='Density'
)
    cracow_maps[category]=cracow_category_map

In [None]:
cracow_maps['Café']

In [None]:
cracow_maps['Coffee Shop']

In [None]:
cracow_maps['Hotel']

In [None]:
cracow_maps['Italian Restaurant']

In [None]:
cracow_maps['Park']

In [None]:
warsaw_districts_venues

In [None]:
both_cities_venues = pd.concat([warsaw_districts_venues, cracow_districts_venues], axis=0)
both_cities_venues.drop(['index'], axis=1, inplace=True)

In [None]:
both_cities_venues.head()

In [None]:
# one hot encoding
both_cities_onehot = pd.get_dummies(both_cities_venues[['Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
both_cities_onehot['District'] = both_cities_venues['District'] 

# move neighborhood column to the first column
fixed_columns = [both_cities_onehot.columns[-1]] + list(both_cities_onehot.columns[:-1])
both_cities_onehot = both_cities_onehot[fixed_columns]

both_cities_onehot.head()

In [None]:
both_cities_onehot.shape

In [None]:
both_cities_grouped = both_cities_onehot.groupby('District').mean().reset_index()
both_cities_grouped

In [None]:
both_cities_grouped.shape

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['District'] = both_cities_grouped['District']

for ind in np.arange(both_cities_grouped.shape[0]):
    districts_venues_sorted.iloc[ind, 1:] = return_most_common_venues(both_cities_grouped.iloc[ind, :], num_top_venues)

#districts_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 3

both_cities_grouped_clustering = both_cities_grouped.drop('District', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(both_cities_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
districts_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
districts_venues_sorted.groupby('Cluster Labels').count()

In [None]:
districts_venues_sorted.sort_values(by='Cluster Labels')

In [None]:
cracow_clusters_labels = districts_venues_sorted[districts_venues_sorted.District.isin(cracow_districts_names)]

In [None]:
cracow_clusters_labels.groupby('Cluster Labels').count()

In [None]:
cracow_clusters_map = folium.Map(location=cracow_coordinates, zoom_start=11)
cracow_clusters_map.choropleth(
geo_data=cracow_geojson,
name='choropleth',
data=cracow_clusters_labels,
columns=['District', 'Cluster Labels'],
key_on='feature.properties.nazwa',
fill_color='BuPu',
fill_opacity=0.9,
line_opacity=0.9,
legend_name='Cluster_label'
)

In [None]:
cracow_clusters_map

In [None]:
warsaw_clusters_labels = districts_venues_sorted[districts_venues_sorted.District.isin(warsaw_districts_names)]

In [None]:
warsaw_clusters_labels.groupby('Cluster Labels').count()

In [None]:
warsaw_clusters_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
warsaw_clusters_map.choropleth(
geo_data=warsaw_geojson,
name='choropleth',
data=warsaw_clusters_labels,
columns=['District', 'Cluster Labels'],
key_on='feature.properties.nazwa_dzie',
fill_color='BuPu',
fill_opacity=0.9,
line_opacity=0.9,
legend_name='Density'
)

In [None]:
warsaw_clusters_map

In [None]:
districts_venues_sorted.loc[districts_venues_sorted['Cluster Labels']==1].iloc[:,2:].stack().value_counts().head()

In [None]:
districts_venues_sorted.loc[districts_venues_sorted['Cluster Labels']==2].iloc[:,2:].stack().value_counts().head()

In [None]:
both_cities_grouped.head()

In [None]:
corr_matrix = both_cities_grouped.set_index('District').corr()

In [None]:
sn.heatmap(corr_matrix, annot=True)

In [None]:
plt.show()

In [None]:
correlation_abs = corr_matrix.abs()

In [None]:
correlation_pairs=correlation_abs.unstack()

In [None]:
correlations = pd.DataFrame(s)
correlations.columns=['Correlation']

In [None]:
correlations.loc[correlations.Correlation>0.5].loc[top_5_categories].loc[correlations.Correlation<1]

In [None]:
correlation_abs