# Display the clustered cities on a map

In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# clustering
from sklearn.cluster import KMeans

# displaying on a map
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML, display
from IPython.display import Image 
from IPython.core.display import HTML 

## Get latitudes and longitudes

**Dashes (-) have been removed from the city names in the list below.**

In [2]:
# European capitals to look up, in alphabetical order (one line per initial letter).
city_list = [
    'Amsterdam', 'Andorra la Vella', 'Athens',
    'Belgrade', 'Berlin', 'Bern', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest',
    'Chisinau', 'Copenhagen',
    'Dublin',
    'Helsinki',
    'Kyiv',
    'Lisbon', 'Ljubljana', 'London', 'Luxembourg',
    'Madrid', 'Minsk', 'Monaco', 'Moscow',
    'Nicosia', 'Nuuk',
    'Oslo',
    'Paris', 'Podgorica', 'Prague', 'Pristina',
    'Reykjavik', 'Riga', 'Rome',
    'San Marino', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm',
    'Tallinn', 'Tirana',
    'Vaduz', 'Valletta', 'Vatican City', 'Vienna', 'Vilnius',
    'Warsaw',
    'Zagreb',
]

In [3]:
# Sanity check: number of capitals in the hand-written list.
len(city_list)

47

Source of the country-capitals.csv: http://techslides.com/list-of-countries-and-capitals

In [4]:
# Malformed rows (wrong number of fields) are skipped and reported instead of
# aborting the load. `error_bad_lines=False` was deprecated in pandas 1.3 and
# removed in 2.0; `on_bad_lines='warn'` is the equivalent replacement.
capitals = pd.read_csv('../data/country-capitals.csv', on_bad_lines='warn')
capitals.head()

b'Skipping line 229: expected 6 fields, saw 7\nSkipping line 240: expected 6 fields, saw 7\n'


Unnamed: 0,CountryName,CapitalName,CapitalLatitude,CapitalLongitude,CountryCode,ContinentName
0,Somaliland,Hargeisa,9.55,44.05,,Africa
1,South Georgia and South Sandwich Islands,King Edward Point,-54.283333,-36.5,GS,Antarctica
2,French Southern and Antarctic Lands,Port-aux-Français,-49.35,70.216667,TF,Antarctica
3,Palestine,Jerusalem,31.766667,35.233333,PS,Asia
4,Aland Islands,Mariehamn,60.116667,19.9,AX,Europe


Only lines 229 (Washington) and 240 (Hong Kong) were skipped, so they are missing from the table; all the European capital cities were loaded.

In [5]:
# (rows, columns) of the table that was loaded after the malformed lines were skipped.
capitals.shape

(243, 6)

In [28]:
# Keep only the capitals named in `city_list`, then rename "Kyiv" to "Kiev",
# which is the spelling used in the socio-economic data merged in later.
european_capitals = (
    capitals
    .loc[capitals['CapitalName'].isin(city_list)]
    .reset_index(drop=True)
    .assign(CapitalName=lambda frame: frame['CapitalName'].str.replace('Kyiv', 'Kiev'))
)
european_capitals.head()

Unnamed: 0,CountryName,CapitalName,CapitalLatitude,CapitalLongitude,CountryCode,ContinentName
0,Albania,Tirana,41.316667,19.816667,AL,Europe
1,Andorra,Andorra la Vella,42.5,1.516667,AD,Europe
2,Austria,Vienna,48.2,16.366667,AT,Europe
3,Belarus,Minsk,53.9,27.566667,BY,Europe
4,Belgium,Brussels,50.833333,4.333333,BE,Europe


### Which cities are not in the list?

In [7]:
# Every row of `capitals` whose capital is NOT in the hand-written list.
not_in = capitals.loc[~capitals['CapitalName'].isin(city_list)]

These are not European cities, so the list is fine.

In [8]:
# Of the capitals left out, show those the dataset tags as being in Europe.
not_in.loc[not_in['ContinentName'].eq('Europe')]

Unnamed: 0,CountryName,CapitalName,CapitalLatitude,CapitalLongitude,CountryCode,ContinentName
4,Aland Islands,Mariehamn,60.116667,19.9,AX,Europe
18,Armenia,Yerevan,40.166667,44.5,AM,Europe
22,Azerbaijan,Baku,40.383333,49.866667,AZ,Europe
78,Faroe Islands,Torshavn,62.0,-6.766667,FO,Europe
85,Georgia,Tbilisi,41.683333,44.833333,GE,Europe
88,Gibraltar,Gibraltar,36.133333,-5.35,GI,Europe
94,Guernsey,Saint Peter Port,49.45,-2.533333,GG,Europe
108,Isle of Man,Douglas,54.15,-4.483333,IM,Europe
113,Jersey,Saint Helier,49.183333,-2.1,JE,Europe
205,Svalbard,Longyearbyen,78.216667,15.633333,SJ,Europe


In [9]:
# Keep only the name and coordinate columns, then order alphabetically by capital.
european_capitals = european_capitals.loc[
    :, ['CapitalName', 'CountryName', 'CapitalLatitude', 'CapitalLongitude']
]
european_capitals = european_capitals.sort_values(by='CapitalName').reset_index(drop=True)

european_capitals.head()

Unnamed: 0,CapitalName,CountryName,CapitalLatitude,CapitalLongitude
0,Amsterdam,Netherlands,52.35,4.916667
1,Andorra la Vella,Andorra,42.5,1.516667
2,Athens,Greece,37.983333,23.733333
3,Belgrade,Serbia,44.833333,20.5
4,Berlin,Germany,52.516667,13.4


## Merge with the Socio-Economics data

In [10]:
# Load the socio-economic indicators from the project's data folder.
soceco = pd.read_json('../data/Socio_economic_data.json')

In [11]:
# Put `city` first, followed by the indicator columns used in this notebook.
soceco = soceco.loc[:, [
    'city',
    'climate',
    'cost_of_living',
    'health_care',
    'pollution',
    'property_income_ratio',
    'purchasing_power',
    'safety',
    'traffic_time',
    'quality_of_life',
]]

In [27]:
# Preview the selected indicator columns.
soceco.head()

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life
0,Amsterdam,87.45,84.18,69.45,30.79,10.98,81.63,67.32,29.88,168.38
1,Andorra-La-Vella,,66.69,69.44,64.08,7.44,82.46,87.16,5.0,
2,Athens,95.18,59.28,56.17,57.3,12.75,40.69,50.49,37.98,119.84
3,Belgrade,84.14,40.49,53.69,63.57,22.22,34.87,62.02,35.89,107.89
4,Berlin,83.35,67.41,69.68,39.45,9.63,98.54,58.92,34.06,164.83


In [26]:
# Attach country and coordinates to every city. The join is a left join, so a
# city with no matching CapitalName keeps NaN in country/lat/lng.
soceco_lat_lng = (
    soceco
    .merge(european_capitals, how='left', left_on='city', right_on='CapitalName')
    .drop(columns=['CapitalName'])  # duplicate of `city` after the join
    .rename(columns={
        'CountryName': 'country',
        'CapitalLatitude': 'lat',
        'CapitalLongitude': 'lng',
    })
)

soceco_lat_lng.head()

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,country,lat,lng
0,Amsterdam,87.45,84.18,69.45,30.79,10.98,81.63,67.32,29.88,168.38,Netherlands,52.35,4.916667
1,Andorra-La-Vella,,66.69,69.44,64.08,7.44,82.46,87.16,5.0,,,,
2,Athens,95.18,59.28,56.17,57.3,12.75,40.69,50.49,37.98,119.84,Greece,37.983333,23.733333
3,Belgrade,84.14,40.49,53.69,63.57,22.22,34.87,62.02,35.89,107.89,Serbia,44.833333,20.5
4,Berlin,83.35,67.41,69.68,39.45,9.63,98.54,58.92,34.06,164.83,Germany,52.516667,13.4


**Drop Andorra la Vella, Monaco and Nuuk because of missing soceco data**

In [23]:
# Confirm that no missing values remain once the rows labelled 1, 20 and 23 are removed.
soceco_lat_lng.drop(index=[1, 20, 23]).reset_index(drop=True).isna().sum()

city                     0
climate                  0
cost_of_living           0
health_care              0
pollution                0
property_income_ratio    0
purchasing_power         0
safety                   0
traffic_time             0
quality_of_life          0
country                  0
lat                      0
lng                      0
dtype: int64

In [29]:
# Remove the cities with missing values. Dropping by "has a NaN" instead of by
# the hard-coded row labels [1, 20, 23] keeps this cell correct if the input
# order changes, and makes it safe to re-run (a second run removes nothing).
soceco_lat_lng = soceco_lat_lng.dropna().reset_index(drop=True)
soceco_lat_lng.head()

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,country,lat,lng
0,Amsterdam,87.45,84.18,69.45,30.79,10.98,81.63,67.32,29.88,168.38,Netherlands,52.35,4.916667
1,Athens,95.18,59.28,56.17,57.3,12.75,40.69,50.49,37.98,119.84,Greece,37.983333,23.733333
2,Belgrade,84.14,40.49,53.69,63.57,22.22,34.87,62.02,35.89,107.89,Serbia,44.833333,20.5
3,Berlin,83.35,67.41,69.68,39.45,9.63,98.54,58.92,34.06,164.83,Germany,52.516667,13.4
4,Bratislava,80.72,50.81,57.17,41.12,13.37,61.82,68.68,30.89,147.54,Slovakia,48.15,17.116667


## Try out the original clustering function

In [49]:
def cluster_one_list(df, col, cluster_no=4, random_state=None):
    """Cluster the cities on a single column with K-Means and print each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``city`` column and the column named by ``col``.
    col : str
        Name of the column used as the only clustering feature.
    cluster_no : int, default 4
        Number of K-Means clusters. Every cluster from 0 to ``cluster_no - 1``
        is printed.
    random_state : int or None, default None
        Forwarded to ``KMeans``. Pass an integer to get the same clusters on
        every run; ``None`` keeps the library's default (unseeded) behaviour.

    Returns
    -------
    None
        The result is printed: for each cluster a ``{city: value}`` dict and
        the rounded mean of ``col``.
    """
    # scikit-learn expects a 2-D feature matrix, hence the double brackets.
    features = df[[col]]
    model = KMeans(n_clusters=cluster_no, random_state=random_state)
    labels = model.fit_predict(features)

    clustered = df.assign(cluster=labels)[["city", "cluster", col]]

    separator = "--------------------------------------------------------------------------------------------"

    print(f'Clustering based on: {col}\n')
    for cluster_id in range(cluster_no):
        # The separator goes between clusters, not before the first one.
        if cluster_id > 0:
            print(separator)
        members = clustered.loc[clustered["cluster"] == cluster_id]
        print(dict(zip(members["city"], members[col])))
        print(f"\nThe average {col} of cluster {cluster_id} is {round(members[col].mean())}")

In [51]:
# Cluster the cities on the pollution column alone, using four clusters.
cluster_one_list(soceco_lat_lng, 'pollution', cluster_no=4)

Clustering based on: pollution

{'Copenhagen': 21.47, 'Helsinki': 13.19, 'Ljubljana': 23.24, 'Luxembourg': 21.59, 'Oslo': 25.6, 'Reykjavik': 15.33, 'Stockholm': 20.05, 'Tallinn': 22.13, 'Vienna': 18.15, 'Vilnius': 23.34}

The average pollution of cluster 0 is 20.0
--------------------------------------------------------------------------------------------
{'Athens': 57.3, 'Belgrade': 63.57, 'Brussels': 62.36, 'Budapest': 54.38, 'Chisinau': 62.24, 'Kiev': 66.08, 'London': 58.57, 'Madrid': 52.06, 'Moscow': 58.57, 'Nicosia': 61.04, 'Paris': 64.23, 'Rome': 66.4, 'Sarajevo': 69.24, 'Sofia': 69.24, 'Warsaw': 65.58}

The average pollution of cluster 1 is 62.0
--------------------------------------------------------------------------------------------
{'Amsterdam': 30.79, 'Berlin': 39.45, 'Bratislava': 41.12, 'Dublin': 40.42, 'Lisbon': 34.91, 'Minsk': 41.56, 'Podgorica': 48.6, 'Prague': 34.89, 'Riga': 38.65, 'Zagreb': 31.37}

The average pollution of cluster 2 is 38.0
-------------------------

## Create a function that displays the cities on a map

In [61]:
# Quick look at what np.arange returns for 4: the integers 0 to 3.
np.arange(4)

array([0, 1, 2, 3])

In [158]:
def map_cluster_one_list(df, col, cluster_no=4, random_state=None):
    """Cluster the cities on one column, print the clusters and draw them on a map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``city``, ``lat``, ``lng`` and the column named by ``col``.
    col : str
        Name of the column used as the only clustering feature.
    cluster_no : int, default 4
        Number of K-Means clusters. Every cluster from 0 to ``cluster_no - 1``
        is printed and gets its own marker colour.
    random_state : int or None, default None
        Forwarded to ``KMeans``. Pass an integer to get the same clusters on
        every run; ``None`` keeps the library's default (unseeded) behaviour.

    Returns
    -------
    folium.Map
        Map centred on Europe with one circle marker per city, coloured by
        cluster. The popup of each marker shows the city name and cluster id.
    """
    # scikit-learn expects a 2-D feature matrix, hence the double brackets.
    features = df[[col]]
    model = KMeans(n_clusters=cluster_no, random_state=random_state)
    labels = model.fit_predict(features)

    clustered = df.assign(cluster=labels)[["city", "lat", "lng", "cluster", col]]

    separator = "--------------------------------------------------------------------------------------------"

    print(f'Clustering based on: {col}\n')
    for cluster_id in range(cluster_no):
        # The separator goes between clusters, not before the first one.
        if cluster_id > 0:
            print(separator)
        members = clustered.loc[clustered["cluster"] == cluster_id]
        print(dict(zip(members["city"], members[col])))
        print(f"\nThe average {col} of cluster {cluster_id} is {round(members[col].mean())}")

    # Map of Europe (54.5260° N, 15.2551° E)
    map_europe = folium.Map(location=[54.5260, 15.2551], zoom_start=3, tiles='OpenStreetMap')

    # One hex colour per cluster, evenly spaced along the rainbow colormap.
    palette = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, cluster_no))]

    for lat, lng, city, cluster in zip(clustered['lat'], clustered['lng'],
                                       clustered['city'], clustered['cluster']):
        label = folium.Popup(str(city) + ' cluster ' + str(cluster), parse_html=True)
        # `cluster - 1` is kept from the original colour assignment: cluster 0
        # wraps around to the last palette entry, so every cluster still gets
        # a distinct colour.
        marker_color = palette[cluster - 1]
        folium.CircleMarker(location=[lat, lng],
                            radius=4,
                            popup=label,
                            color=marker_color,
                            fill=True,
                            fill_color=marker_color,
                            fill_opacity=1).add_to(map_europe)

    return map_europe

In [159]:
# Columns worth trying here: pollution, health_care, purchasing_power, safety
map_cluster_one_list(soceco_lat_lng, 'health_care', cluster_no=4)

Clustering based on: health_care

{'Athens': 56.17, 'Belgrade': 53.69, 'Bratislava': 57.17, 'Bucharest': 54.34, 'Kiev': 55.55, 'Minsk': 62.59, 'Moscow': 62.04, 'Riga': 60.73, 'Rome': 59.35, 'Sarajevo': 60.13, 'Skopje': 55.93, 'Sofia': 57.24, 'Valletta': 58.86, 'Warsaw': 54.65}

The average health_care of cluster 0 is 58.0
--------------------------------------------------------------------------------------------
{'Brussels': 74.5, 'Copenhagen': 78.15, 'Helsinki': 77.06, 'Luxembourg': 73.71, 'Madrid': 78.97, 'Oslo': 75.07, 'Paris': 78.58, 'Prague': 74.38, 'Vienna': 78.83}

The average health_care of cluster 1 is 77.0
--------------------------------------------------------------------------------------------
{'Amsterdam': 69.45, 'Berlin': 69.68, 'Lisbon': 71.38, 'Ljubljana': 66.24, 'London': 70.28, 'Reykjavik': 66.63, 'Stockholm': 66.9, 'Tallinn': 71.0, 'Vilnius': 71.09, 'Zagreb': 65.16}

The average health_care of cluster 2 is 69.0
-----------------------------------------------------