In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

from datetime import datetime, timedelta

from statistics import mean

from sklearn.decomposition import PCA

import requests
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup

**Aim: For each location in the dataset, find the 5 most similar locations/tourist destinations based on the selected variables.**

In [2]:
random.seed(3888)

# Preparing the data for nearest neighbours analysis

## Data import

In [3]:
df = pd.read_csv("data/data.txt")
df = df.drop(columns='Unnamed: 0')

df = df.rename(columns = {'tourist_service_index': 'tourist_service_infrastructure'})

# An Australian wouldn't travel to Australia!

df = df[df["iso_code"] != "AUS"]

In [4]:
df_without_covid = df.drop(columns=['new_cases_per_million', 
                                    'new_cases_smoothed_per_million', 
                                    'stringency_index', 
                                    'positive_rate', 
                                    'human_development_index', 
                                    'international_travel_controls',
                                    'cost_living_index',
                                    'date'])

In [5]:
non_covid_colnames = ['iso_code', 'location', 'continent', 'tourist_service_infrastructure', 'art_gallery']
covid_colnames = ['iso_code', 'new_cases_smoothed_per_million', 'date']

In [6]:
df_without_covid = df_without_covid.drop_duplicates()
df_without_covid = df_without_covid.reset_index()
df_without_covid = df_without_covid.drop(columns=['index'])

df_without_covid

Unnamed: 0,iso_code,location,continent,tourist_service_infrastructure
0,ALB,Albania,Europe,4.01
1,DZA,Algeria,Africa,1.84
2,ARG,Argentina,South America,4.53
3,ARM,Armenia,Asia,4.33
4,AUT,Austria,Europe,6.66
...,...,...,...,...
110,VEN,Venezuela,South America,2.96
111,VNM,Vietnam,Asia,2.85
112,YEM,Yemen,Asia,1.87
113,ZMB,Zambia,Africa,2.52


In [7]:
full_tourism = pd.read_csv("data/full_tourism.csv")
full_tourism = full_tourism[full_tourism["Country ISO3"] != "AUS"]

In [8]:
indicators = {
                'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
}

full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

inds = pd.DataFrame()

for ind in indicators.keys():
    inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]
    
non_covid = pd.merge(inds, df_without_covid, on='iso_code')

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


In [9]:
covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

covid = covid[covid["iso_code"] != "AUS"]

In [10]:
covid_needed_cols = covid[covid_colnames]

In [11]:
covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')


In [12]:
covid_needed_cols = covid_needed_cols[covid_needed_cols['date'] >= datetime.now() - timedelta(days = 30)]

In [13]:
poi = pd.read_json("data/poi_types.json")

In [14]:
poi = poi.fillna(0)
poi = poi.replace(0, np.nan)
poi = poi.dropna(how='all', axis=0)
poi = poi.replace(np.nan, 0)

In [15]:
poi = poi.transpose()

In [16]:
poi = poi[poi.index != "Australia"]

In [17]:
iso_location = df[["iso_code", "location"]].drop_duplicates()

iso_location.to_csv("iso_location.csv")

In [18]:
def iso_code_to_loc(iso_code):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

In [19]:
def loc_to_iso_code(loc):
    return iso_location[iso_location["location"] == loc]["iso_code"].iloc[0]

In [20]:
poi = poi.set_index(loc_to_iso_code(loc) for loc in poi.index)

In [21]:
poi.columns

Index(['art_gallery', 'food', 'museum', 'park', 'restaurant', 'church',
       'mosque', 'place_of_worship', 'zoo', 'travel_agency', 'amusement_park',
       'aquarium', 'cafe', 'store', 'cemetery', 'hindu_temple',
       'natural_feature', 'library', 'campground', 'lodging', 'casino',
       'local_government_office', 'liquor_store', 'bar', 'shopping_mall',
       'spa', 'transit_station', 'grocery_or_supermarket', 'synagogue',
       'movie_theater', 'general_contractor', 'parking', 'book_store',
       'night_club', 'city_hall', 'clothing_store', 'department_store',
       'health', 'hospital'],
      dtype='object')

In [22]:
# removing less relevant columns

poi = poi.drop(columns=['travel_agency',
                        'store',
                        'cemetery',
                        'library',
                        'campground',
                        'lodging',
                        'local_government_office',
                        'liquor_store',
                        'transit_station',
                        'grocery_or_supermarket',
                        'movie_theater',
                        'general_contractor',
                        'parking',
                        'book_store',
                        'city_hall',
                        'health',
                        'hospital'
                       ])

In [23]:
poi['iso_code'] = poi.index

In [24]:
covid_merged = pd.merge(covid_needed_cols, non_covid, on="iso_code")
covid_merged = pd.merge(covid_merged, poi, on="iso_code")

In [25]:
# https://practicaldatascience.co.uk/data-science/how-to-read-an-rss-feed-in-python

def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)


def get_feed(url):
    """Return a Pandas dataframe containing the RSS feed contents.

    Args: 
        url (string): URL of the RSS feed to read.

    Returns:
        df (dataframe): Pandas dataframe containing the RSS feed contents.
    """
    
    response = get_source(url)
    
    df = pd.DataFrame(columns = ['title', 'pubDate', 'guid', 'description'])

    with response as r:
        items = r.html.find("item", first=False)

        for item in items:        

            title = item.find('title', first=True).text
            pubDate = item.find('pubDate', first=True).text
            guid = item.find('guid', first=True).text
            description = item.find('description', first=True).text

            row = {'title': [title], 'pubDate': [pubDate], 'guid': [guid], 'description': [description]}
            df = pd.concat([df, pd.DataFrame.from_dict(row)])

    return df

In [26]:
travel_advice = get_feed("https://www.smartraveller.gov.au/countries/documents/index.rss")

In [27]:
travel_advice = travel_advice[travel_advice["title"] != "No travel advice"]

In [28]:
travel_advice = travel_advice.drop(columns=['guid'])

In [29]:
travel_advice

Unnamed: 0,title,pubDate,description
0,Mali,07 May 2022 22:00:00 AEST,"On 6 May 2022, the US government issued an ale..."
0,Sri Lanka,07 May 2022 22:00:00 AEST,A Public Emergency has been declared in Sri La...
0,Timor-Leste,06 May 2022 22:00:00 AEST,The Presidential inauguration will be held on ...
0,Papua New Guinea,06 May 2022 22:00:00 AEST,"If you&#039;re not a PNG citizen, you&#039;ll ..."
0,Hong Kong,06 May 2022 22:00:00 AEST,Fully vaccinated travellers can enter Hong Kon...
...,...,...,...
0,Costa Rica,28 Oct 2021 23:00:00 AEDT,If you&#039;re not fully vaccinated against CO...
0,Marshall Islands,28 Oct 2021 23:00:00 AEDT,"Due to COVID-19, Marshall Islands has closed i..."
0,Niger,28 Oct 2021 23:00:00 AEDT,The US Government has issued a security alert ...
0,North Korea (Democratic People's Republic of K...,28 Oct 2021 23:00:00 AEDT,North Korea’s borders remain closed due to COV...


In [30]:
replacements = {
    "United States of America": "United States",
    "Israel and the Palestinian Territories": "Israel",
    "South Korea (Republic of Korea)": "South Korea"
}

In [31]:
for replacement in replacements:
    travel_advice.replace(replacement, replacements[replacement], inplace = True)

In [32]:
set(covid_merged["location"]).difference(set(travel_advice["title"]))

{'Barbados', 'Luxembourg', 'Suriname'}

In [33]:
travel_advice.rename(columns={"title": "location", "description": "advice"}, inplace = True)

In [34]:
travel_advice["advice"] = [BeautifulSoup(s, "lxml").text for s in travel_advice["advice"]]

In [35]:
covid_merged = pd.merge(covid_merged, travel_advice, on="location")

In [36]:
covid_merged

Unnamed: 0,iso_code,new_cases_smoothed_per_million,date,infrastructure,natural_cultural_resources,safety_security,health_hygiene,price_competitiveness,air_transport,ground_port,...,casino,bar,shopping_mall,spa,synagogue,night_club,clothing_store,department_store,pubDate,advice
0,ALB,17.354,2022-04-10,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13 Apr 2022 22:00:00 AEST,Face coverings must be worn at all times in in...
1,ALB,15.216,2022-04-11,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13 Apr 2022 22:00:00 AEST,Face coverings must be worn at all times in in...
2,ALB,13.525,2022-04-12,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13 Apr 2022 22:00:00 AEST,Face coverings must be worn at all times in in...
3,ALB,13.177,2022-04-13,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13 Apr 2022 22:00:00 AEST,Face coverings must be worn at all times in in...
4,ALB,13.326,2022-04-14,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13 Apr 2022 22:00:00 AEST,Face coverings must be worn at all times in in...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3131,ZWE,3.057,2022-05-03,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,11 Apr 2022 22:00:00 AEST,We now advise you exercise a high degree of ca...
3132,ZWE,3.038,2022-05-04,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,11 Apr 2022 22:00:00 AEST,We now advise you exercise a high degree of ca...
3133,ZWE,2.518,2022-05-05,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,11 Apr 2022 22:00:00 AEST,We now advise you exercise a high degree of ca...
3134,ZWE,3.521,2022-05-06,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,11 Apr 2022 22:00:00 AEST,We now advise you exercise a high degree of ca...


In [37]:
covid_colnames.append("advice")

covid_merged = covid_merged[list(set(non_covid_colnames).union(set(covid_colnames)))]

covid_colnames.remove("advice")

covid_merged

Unnamed: 0,art_gallery,location,tourist_service_infrastructure,iso_code,advice,date,new_cases_smoothed_per_million,continent
0,3.0,Albania,4.01,ALB,Face coverings must be worn at all times in in...,2022-04-10,17.354,Europe
1,3.0,Albania,4.01,ALB,Face coverings must be worn at all times in in...,2022-04-11,15.216,Europe
2,3.0,Albania,4.01,ALB,Face coverings must be worn at all times in in...,2022-04-12,13.525,Europe
3,3.0,Albania,4.01,ALB,Face coverings must be worn at all times in in...,2022-04-13,13.177,Europe
4,3.0,Albania,4.01,ALB,Face coverings must be worn at all times in in...,2022-04-14,13.326,Europe
...,...,...,...,...,...,...,...,...
3131,0.0,Zimbabwe,2.95,ZWE,We now advise you exercise a high degree of ca...,2022-05-03,3.057,Africa
3132,0.0,Zimbabwe,2.95,ZWE,We now advise you exercise a high degree of ca...,2022-05-04,3.038,Africa
3133,0.0,Zimbabwe,2.95,ZWE,We now advise you exercise a high degree of ca...,2022-05-05,2.518,Africa
3134,0.0,Zimbabwe,2.95,ZWE,We now advise you exercise a high degree of ca...,2022-05-06,3.521,Africa


In [38]:
descriptions = pd.read_csv("country_descriptions_cleaned_2.csv")

In [39]:
covid_merged = pd.merge(descriptions, covid_merged, on="iso_code")

In [40]:
covid_merged

Unnamed: 0,iso_code,description,art_gallery,location,tourist_service_infrastructure,advice,date,new_cases_smoothed_per_million,continent
0,ALB,Albania (Albanian: Shqipëria) is a country in ...,3.0,Albania,4.01,Face coverings must be worn at all times in in...,2022-04-10,17.354,Europe
1,ALB,Albania (Albanian: Shqipëria) is a country in ...,3.0,Albania,4.01,Face coverings must be worn at all times in in...,2022-04-11,15.216,Europe
2,ALB,Albania (Albanian: Shqipëria) is a country in ...,3.0,Albania,4.01,Face coverings must be worn at all times in in...,2022-04-12,13.525,Europe
3,ALB,Albania (Albanian: Shqipëria) is a country in ...,3.0,Albania,4.01,Face coverings must be worn at all times in in...,2022-04-13,13.177,Europe
4,ALB,Albania (Albanian: Shqipëria) is a country in ...,3.0,Albania,4.01,Face coverings must be worn at all times in in...,2022-04-14,13.326,Europe
...,...,...,...,...,...,...,...,...,...
3075,GBR,The United Kingdom of Great Britain and Northe...,6.0,United Kingdom,6.10,All COVID-19 travel restrictions in the UK hav...,2022-05-03,166.836,Europe
3076,GBR,The United Kingdom of Great Britain and Northe...,6.0,United Kingdom,6.10,All COVID-19 travel restrictions in the UK hav...,2022-05-04,165.496,Europe
3077,GBR,The United Kingdom of Great Britain and Northe...,6.0,United Kingdom,6.10,All COVID-19 travel restrictions in the UK hav...,2022-05-05,162.132,Europe
3078,GBR,The United Kingdom of Great Britain and Northe...,6.0,United Kingdom,6.10,All COVID-19 travel restrictions in the UK hav...,2022-05-06,159.615,Europe


## Computing medians of quantitative columns

In [41]:
covid_merged_no_quant = list(set(covid_merged.columns).difference(set(covid_merged.select_dtypes(include=[np.number]).columns)))
covid_merged_no_quant.remove("date")

In [42]:
medians = covid_merged.groupby(["iso_code"]).median()

In [43]:
medians = medians.fillna(covid_merged.median())

  medians = medians.fillna(covid_merged.median())
  medians = medians.fillna(covid_merged.median())


In [44]:
medians

Unnamed: 0_level_0,art_gallery,tourist_service_infrastructure,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,3.0,4.01,15.0915
ARE,0.0,5.63,23.8930
ARG,0.0,4.53,32.2895
ARM,0.0,4.33,3.3210
AUT,0.0,6.66,828.4320
...,...,...,...
VNM,0.0,2.85,194.8225
YEM,0.0,1.87,0.0050
ZAF,0.0,4.30,48.7420
ZMB,0.0,2.52,4.6060


In [45]:
all_data_w_medians = pd.merge(medians, covid_merged[covid_merged_no_quant], on="iso_code").drop_duplicates()

In [46]:
all_data_w_medians = all_data_w_medians.set_index(all_data_w_medians["iso_code"]).drop(columns=["iso_code"])

In [47]:
all_data_w_medians

Unnamed: 0_level_0,art_gallery,tourist_service_infrastructure,new_cases_smoothed_per_million,location,advice,description,continent
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ALB,3.0,4.01,15.0915,Albania,Face coverings must be worn at all times in in...,Albania (Albanian: Shqipëria) is a country in ...,Europe
ARE,0.0,5.63,23.8930,United Arab Emirates,You no longer need a COVID-19 test to travel t...,The United Arab Emirates (Arabic: دولة الإمارا...,Asia
ARG,0.0,4.53,32.2895,Argentina,You no longer need to have a COVID-19 test or ...,Argentina is a large country in the southern p...,South America
ARM,0.0,4.33,3.3210,Armenia,You can enter Armenia with a negative COVID-19...,Armenia (Armenian: Հայաստան Hayastan) is a lan...,Asia
AUT,0.0,6.66,828.4320,Austria,Most COVID-19 measures across Austria have bee...,"Austria (German: Österreich, literally ""the Ea...",Europe
...,...,...,...,...,...,...,...
VNM,0.0,2.85,194.8225,Vietnam,We now advise you exercise a high degree of ca...,Vietnam (Vietnamese: Việt Nam) is a country in...,Asia
YEM,0.0,1.87,0.0050,Yemen,We've reviewed our travel advice for Yemen. We...,Yemen (Arabic: ٱلْيَمَن) is a country in the M...,Asia
ZAF,0.0,4.30,48.7420,South Africa,Heavy rains and flooding in the city of Durban...,"SA redirects here. For the state, see South Au...",Africa
ZMB,0.0,2.52,4.6060,Zambia,The new COVID-19 ‘Omicron’ variant is of globa...,Zambia offers travelers some of the world's be...,Africa


## Min-max scaling

In [48]:
iso_code = medians.index

In [49]:
scaler = MinMaxScaler()

In [50]:
medians_scaled = scaler.fit_transform(medians)

In [51]:
cols = list(set(non_covid_colnames).union(set(covid_colnames)))
to_remove = ['iso_code', 'continent', 'location', 'date']

for col in to_remove:
    cols.remove(col)
    
medians_scaled = pd.DataFrame(medians_scaled, 
                              columns = cols, 
                              index = iso_code)
medians_scaled

Unnamed: 0_level_0,art_gallery,tourist_service_infrastructure,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,0.5,0.458753,0.009300
ARE,0.0,0.784708,0.014725
ARG,0.0,0.563380,0.019899
ARM,0.0,0.523139,0.002047
AUT,0.0,0.991952,0.510540
...,...,...,...
VNM,0.0,0.225352,0.120064
YEM,0.0,0.028169,0.000003
ZAF,0.0,0.517103,0.030038
ZMB,0.0,0.158954,0.002839


In [52]:
# perform PCA if > 2 attributes selected

if len(medians_scaled.columns) > 2:
    pca = PCA(n_components=2)
    pc = pca.fit_transform(medians_scaled)
    medians_scaled = pd.DataFrame(data = pc, columns = ['PC1', 'PC2'], index = medians_scaled.index)

In [53]:
medians_scaled

Unnamed: 0_level_0,PC1,PC2
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ALB,-0.072664,0.433681
ARE,0.153815,-0.071691
ARG,-0.024459,-0.066682
ARM,-0.067538,-0.063599
AUT,0.604888,-0.134246
...,...,...
VNM,-0.244370,-0.069695
YEM,-0.473951,-0.050823
ZAF,-0.056594,-0.066682
ZMB,-0.365263,-0.054464


# Finding 5 nearest neighbours for each location

In [54]:
num_neighbours = 5

In [55]:
dist_metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine', 'cityblock', 'braycurtis', 'canberra',
               'correlation', 'minkowski']

In [56]:
location_neighbours = {}

for metric in dist_metrics:
    nbrs = NearestNeighbors(metric = metric, 
                            n_neighbors = num_neighbours + 1, 
                            algorithm='auto').fit(medians_scaled)
    
    nbr_indices = list(list(x) for x in nbrs.kneighbors(medians_scaled)[1])
    
    iso_location = df[["iso_code", "location"]].drop_duplicates()

    for i in range(len(nbr_indices)):
        current_iso_code = list(medians_scaled.index)[i]
        current_location = iso_code_to_loc(current_iso_code)

        neighbours = []
        for j in range(1, num_neighbours + 1):
            iso_code = medians_scaled.index[nbr_indices[i][j]]
            neighbours.append(iso_code_to_loc(iso_code))

        if not current_location in location_neighbours:
            location_neighbours[current_location] = {metric: neighbours}
        else:
            location_neighbours[current_location][metric] = neighbours

In [57]:
location_neighbours_df = pd.DataFrame(location_neighbours).transpose()

In [58]:
location_neighbours_df

Unnamed: 0,euclidean,manhattan,chebyshev,cosine,cityblock,braycurtis,canberra,correlation,minkowski
Albania,"[Serbia, Morocco, Colombia, Russia, Kazakhstan]","[Morocco, Serbia, Colombia, Russia, Kazakhstan]","[Serbia, Morocco, Colombia, Russia, Kazakhstan]","[Serbia, Morocco, Colombia, Russia, Kazakhstan]","[Morocco, Serbia, Colombia, Russia, Kazakhstan]","[Morocco, Serbia, Colombia, Russia, Kazakhstan]","[Morocco, Serbia, Colombia, Russia, Kazakhstan]","[Mongolia, Moldova, Sri Lanka, Nicaragua, Kenya]","[Serbia, Morocco, Colombia, Russia, Kazakhstan]"
United Arab Emirates,"[Montenegro, Costa Rica, Mauritius, Denmark, P...","[Montenegro, Costa Rica, Mauritius, Peru, Denm...","[Montenegro, Costa Rica, Mauritius, Denmark, P...","[Montenegro, Finland, Estonia, Belgium, Israel]","[Montenegro, Costa Rica, Mauritius, Peru, Denm...","[Montenegro, Costa Rica, Mauritius, Peru, Denm...","[Montenegro, Costa Rica, Mauritius, Peru, Bulg...","[New Zealand, Turkey, Poland, Taiwan, Denmark]","[Montenegro, Costa Rica, Mauritius, Denmark, P..."
Argentina,"[Brazil, Poland, Chile, Romania, Myanmar]","[Brazil, Poland, Chile, Romania, Myanmar]","[Brazil, Poland, Chile, Romania, Myanmar]","[Chile, Brazil, Poland, Romania, Myanmar]","[Brazil, Poland, Chile, Romania, Myanmar]","[Brazil, Poland, Chile, Romania, South Africa]","[Brazil, Poland, Chile, Romania, Myanmar]","[Greece, Jamaica, Italy, Israel, United Arab E...","[Brazil, Poland, Chile, Romania, Myanmar]"
Armenia,"[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Ukraine, South Africa, Myanmar, Tunisia, Oman]","[Netherlands, Mongolia, Moldova, Nicaragua, Ne...","[Ukraine, South Africa, Myanmar, Tunisia, Oman]"
Austria,"[Portugal, Germany, Cyprus, Italy, South Korea]","[Portugal, Cyprus, Germany, Italy, South Korea]","[Germany, Portugal, Cyprus, Italy, South Korea]","[Portugal, Spain, United States, New Zealand, ...","[Portugal, Cyprus, Germany, Italy, South Korea]","[Portugal, Cyprus, Germany, Italy, South Korea]","[Portugal, Italy, Germany, Cyprus, South Korea]","[Romania, Italy, Japan, Israel, Jamaica]","[Portugal, Germany, Cyprus, Italy, South Korea]"
...,...,...,...,...,...,...,...,...,...
Vietnam,"[Sri Lanka, Paraguay, Cambodia, Egypt, Bolivia]","[Sri Lanka, Paraguay, Cambodia, Egypt, Bolivia]","[Paraguay, Sri Lanka, Egypt, Cambodia, Azerbai...","[Honduras, Nicaragua, China, Bolivia, Philippi...","[Sri Lanka, Paraguay, Cambodia, Egypt, Bolivia]","[Sri Lanka, Paraguay, Cambodia, Egypt, Azerbai...","[Sri Lanka, Paraguay, Cambodia, Egypt, Bolivia]","[Netherlands, Mongolia, Moldova, Nicaragua, Ne...","[Sri Lanka, Paraguay, Cambodia, Egypt, Bolivia]"
Yemen,"[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Mongolia, Moldova, Lebanon, Nicaragua, Kazakh...","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]"
South Africa,"[Myanmar, Armenia, Ukraine, Poland, Chile]","[Myanmar, Armenia, Ukraine, Poland, Argentina]","[Myanmar, Armenia, Ukraine, Poland, Chile]","[Myanmar, Armenia, Ukraine, Poland, Tunisia]","[Myanmar, Armenia, Ukraine, Poland, Argentina]","[Myanmar, Armenia, Ukraine, Poland, Chile]","[Myanmar, Armenia, Ukraine, Tunisia, Poland]","[South Korea, Italy, Saudi Arabia, Hungary, Ja...","[Myanmar, Armenia, Ukraine, Poland, Chile]"
Zambia,"[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, Nepal]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Nigeria, Moldova, Lebanon, Mongolia, Nicaragua]","[Tanzania, Pakistan, Ghana, Uganda, India]"


In [59]:
prop_sim = []

for location in location_neighbours.keys():
    ind_pairs = [[0, 1], [0, 2], [1, 2]]
    
    for pair in ind_pairs:
        loc_intersect = set(location_neighbours[location][dist_metrics[0]]).intersection(location_neighbours[location][dist_metrics[1]])
        prop_sim.append(len(loc_intersect)/num_neighbours)

In [60]:
mean(prop_sim)

0.9527272727272728

In [61]:
def find_top_neighbours(country):
    d = {}
    lists = location_neighbours_df.loc[country].tolist()
    for ls in lists:
        for c in ls:
            if c in d:
                d[c] += 1
            else:
                d[c] = 1
    top = []

    for k,v in sorted(d.items(), key=lambda p:p[1], reverse=True)[:num_neighbours]:
        top.append(k)
    return top

In [62]:
locations = location_neighbours_df.index.tolist()
top_neighbours = {}
for location in locations:
    top_neighbours[location] = find_top_neighbours(location)
top_neighbours_df = pd.DataFrame(top_neighbours).transpose()
top_neighbours_df

Unnamed: 0,0,1,2,3,4
Albania,Serbia,Morocco,Colombia,Russia,Kazakhstan
United Arab Emirates,Montenegro,Costa Rica,Mauritius,Denmark,Peru
Argentina,Brazil,Poland,Chile,Romania,Myanmar
Armenia,Ukraine,South Africa,Myanmar,Tunisia,Oman
Austria,Portugal,Germany,Italy,Cyprus,South Korea
...,...,...,...,...,...
Vietnam,Sri Lanka,Paraguay,Cambodia,Egypt,Bolivia
Yemen,Algeria,Bangladesh,Ethiopia,Cameroon,Nepal
South Africa,Myanmar,Armenia,Ukraine,Poland,Chile
Zambia,Tanzania,Pakistan,Ghana,Uganda,India


In [63]:
top_neighbours_df.loc["United Kingdom"]

0        Croatia
1    Netherlands
2        Ireland
3       Slovenia
4       Slovakia
Name: United Kingdom, dtype: object