In [130]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

from datetime import datetime, timedelta

from statistics import mean

from sklearn.decomposition import PCA

**Aim: For each location in the dataset, find the 5 most similar locations/tourist destinations based on the selected variables.**

In [2]:
random.seed(3888)

# Preparing the data for nearest neighbours analysis

## Data import

In [3]:
df = pd.read_csv("data/data.txt")
df = df.drop(columns='Unnamed: 0')

df = df.rename(columns = {'tourist_service_index': 'tourist_service_infrastructure'})

In [4]:
df_without_covid = df.drop(columns=['new_cases_per_million', 
                                    'new_cases_smoothed_per_million', 
                                    'stringency_index', 
                                    'positive_rate', 
                                    'human_development_index', 
                                    'international_travel_controls',
                                    'cost_living_index',
                                    'date'])

In [88]:
non_covid_colnames = ['iso_code', 'location', 'continent', 'tourist_service_infrastructure', 'art_gallery']
covid_colnames = ['iso_code', 'new_cases_smoothed_per_million', 'date']

In [89]:
df_without_covid = df_without_covid.drop_duplicates()
df_without_covid = df_without_covid.reset_index()
df_without_covid = df_without_covid.drop(columns=['index'])

df_without_covid

Unnamed: 0,iso_code,location,continent,tourist_service_infrastructure
0,ALB,Albania,Europe,4.01
1,DZA,Algeria,Africa,1.84
2,ARG,Argentina,South America,4.53
3,ARM,Armenia,Asia,4.33
4,AUS,Australia,Oceania,6.06
...,...,...,...,...
111,VEN,Venezuela,South America,2.96
112,VNM,Vietnam,Asia,2.85
113,YEM,Yemen,Asia,1.87
114,ZMB,Zambia,Africa,2.52


In [90]:
full_tourism = pd.read_csv("data/full_tourism.csv")

full_tourism

Unnamed: 0,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2015,2017,2019
0,AGO,Angola,3524,"WEF Travel & Tourism Competitiveness Index, 1-...",Value,2.60,,2.74
1,AGO,Angola,3525,"WEF Travel & Tourism Competitiveness Index, 1-...",Rank,139.00,,134.00
2,AGO,Angola,3526,"WEF Enabling environment subindex, 1-7 (best)",Value,3.23,,3.44
3,AGO,Angola,3527,"WEF Enabling environment subindex, 1-7 (best)",Rank,136.00,,133.00
4,AGO,Angola,3528,WEF Travel and Tourism policy and enabling con...,Value,3.21,,3.66
...,...,...,...,...,...,...,...,...
5618,ZWE,Zimbabwe,3561,"WEF Tourist service infrastructure pillar, 1-7...",Rank,112.00,106.00,102.00
5619,ZWE,Zimbabwe,3562,"WEF Natural resources pillar, 1-7 (best)",Value,3.51,3.61,3.60
5620,ZWE,Zimbabwe,3563,"WEF Natural resources pillar, 1-7 (best)",Rank,44.00,48.00,44.00
5621,ZWE,Zimbabwe,3564,"WEF Cultural resources and business travel, 1-...",Value,1.46,1.45,1.28


In [91]:
indicators = {
                'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
}

full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

inds = pd.DataFrame()

for ind in indicators.keys():
    inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]
    
non_covid = pd.merge(inds, df_without_covid, on='iso_code')

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


In [92]:
covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

In [93]:
covid_needed_cols = covid[covid_colnames]

In [94]:
covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')


In [95]:
covid_needed_cols = covid_needed_cols[covid_needed_cols['date'] >= datetime.now() - timedelta(days = 30)]

In [96]:
poi = pd.read_json("data/poi_types.json")

In [97]:
poi = poi.fillna(0)
poi = poi.replace(0, np.nan)
poi = poi.dropna(how='all', axis=0)
poi = poi.replace(np.nan, 0)

In [98]:
poi = poi.transpose()

In [99]:
iso_location = df[["iso_code", "location"]].drop_duplicates()

In [100]:
def iso_code_to_loc(iso_code):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

In [101]:
def loc_to_iso_code(loc):
    return iso_location[iso_location["location"] == loc]["iso_code"].iloc[0]

In [102]:
poi = poi.set_index(loc_to_iso_code(iso_code) for iso_code in poi.index)

In [103]:
poi.columns

Index(['art_gallery', 'food', 'museum', 'park', 'restaurant', 'church',
       'mosque', 'place_of_worship', 'zoo', 'travel_agency', 'amusement_park',
       'aquarium', 'cafe', 'store', 'cemetery', 'hindu_temple',
       'natural_feature', 'library', 'campground', 'lodging', 'casino',
       'local_government_office', 'liquor_store', 'bar', 'shopping_mall',
       'spa', 'transit_station', 'grocery_or_supermarket', 'synagogue',
       'movie_theater', 'general_contractor', 'parking', 'book_store',
       'night_club', 'city_hall', 'clothing_store', 'department_store',
       'health', 'hospital'],
      dtype='object')

In [104]:
# removing less relevant columns

poi = poi.drop(columns=['travel_agency',
                        'store',
                        'cemetery',
                        'library',
                        'campground',
                        'lodging',
                        'local_government_office',
                        'liquor_store',
                        'transit_station',
                        'grocery_or_supermarket',
                        'movie_theater',
                        'general_contractor',
                        'parking',
                        'book_store',
                        'city_hall',
                        'health',
                        'hospital'
                       ])

In [105]:
poi['iso_code'] = poi.index

In [106]:
covid_merged = pd.merge(covid_needed_cols, non_covid, on="iso_code")
covid_merged = pd.merge(covid_merged, poi, on="iso_code")

In [107]:
covid_merged

Unnamed: 0,iso_code,new_cases_smoothed_per_million,date,infrastructure,natural_cultural_resources,safety_security,health_hygiene,price_competitiveness,air_transport,ground_port,...,hindu_temple,natural_feature,casino,bar,shopping_mall,spa,synagogue,night_club,clothing_store,department_store
0,ALB,22.227,2022-04-06,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ALB,21.432,2022-04-07,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ALB,18.746,2022-04-08,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ALB,18.299,2022-04-09,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ALB,17.354,2022-04-10,3.08,2.04,5.77,5.28,5.26,2.10,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,ZWE,3.351,2022-04-29,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3244,ZWE,3.322,2022-04-30,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3245,ZWE,3.663,2022-05-01,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3246,ZWE,3.190,2022-05-02,2.34,2.44,5.39,2.96,5.32,1.79,2.27,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
covid_merged = covid_merged[list(set(non_covid_colnames).union(set(covid_colnames)))]
covid_merged

Unnamed: 0,location,iso_code,date,new_cases_smoothed_per_million,continent,tourist_service_infrastructure,art_gallery
0,Albania,ALB,2022-04-06,22.227,Europe,4.01,3.0
1,Albania,ALB,2022-04-07,21.432,Europe,4.01,3.0
2,Albania,ALB,2022-04-08,18.746,Europe,4.01,3.0
3,Albania,ALB,2022-04-09,18.299,Europe,4.01,3.0
4,Albania,ALB,2022-04-10,17.354,Europe,4.01,3.0
...,...,...,...,...,...,...,...
3243,Zimbabwe,ZWE,2022-04-29,3.351,Africa,2.95,0.0
3244,Zimbabwe,ZWE,2022-04-30,3.322,Africa,2.95,0.0
3245,Zimbabwe,ZWE,2022-05-01,3.663,Africa,2.95,0.0
3246,Zimbabwe,ZWE,2022-05-02,3.190,Africa,2.95,0.0


## Computing medians of quantitative columns

In [109]:
medians = covid_merged.groupby(["iso_code"]).median()

In [110]:
medians = medians.fillna(covid_merged.median())

  medians = medians.fillna(covid_merged.median())
  medians = medians.fillna(covid_merged.median())


In [111]:
medians

Unnamed: 0_level_0,new_cases_smoothed_per_million,tourist_service_infrastructure,art_gallery
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,15.7630,4.01,3.0
ARE,23.9785,5.63,0.0
ARG,34.3075,4.53,0.0
ARM,4.0910,4.33,0.0
AUS,1686.4350,6.06,6.0
...,...,...,...
VNM,312.0865,2.85,0.0
YEM,0.0140,1.87,0.0
ZAF,27.8460,4.30,0.0
ZMB,5.0095,2.52,0.0


## Min-max scaling

In [141]:
iso_code = medians.index

In [142]:
scaler = MinMaxScaler()

In [143]:
medians_scaled = scaler.fit_transform(medians)

In [144]:
cols = list(set(non_covid_colnames).union(set(covid_colnames)))
to_remove = ['iso_code', 'continent', 'location', 'date']

for col in to_remove:
    cols.remove(col)

medians_scaled = pd.DataFrame(medians_scaled, 
                              columns = cols, 
                              index = iso_code)
medians_scaled

Unnamed: 0_level_0,new_cases_smoothed_per_million,tourist_service_infrastructure,art_gallery
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,0.007822,0.458753,0.5
ARE,0.011898,0.784708,0.0
ARG,0.017023,0.563380,0.0
ARM,0.002030,0.523139,0.0
AUS,0.836801,0.871227,1.0
...,...,...,...
VNM,0.154856,0.225352,0.0
YEM,0.000007,0.028169,0.0
ZAF,0.013817,0.517103,0.0
ZMB,0.002486,0.158954,0.0


In [145]:
# perform PCA if > 2 attributes selected

if len(medians_scaled.columns) > 2:
    pca = PCA(n_components=2)
    pc = pca.fit_transform(medians_scaled)
    medians_scaled = pd.DataFrame(data = pc, columns = ['PC1', 'PC2'], index = medians_scaled.index)

In [162]:
medians_scaled

Unnamed: 0_level_0,PC1,PC2
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ALB,-0.011781,0.412264
ARE,0.123845,-0.160410
ARG,-0.049126,-0.096759
ARM,-0.089385,-0.084853
AUS,0.899975,0.754801
...,...,...
VNM,-0.241383,-0.002266
YEM,-0.483669,0.057774
ZAF,-0.087659,-0.083361
ZMB,-0.378413,0.020047


# Finding 5 nearest neighbours for each location

In [147]:
num_neighbours = 5

In [148]:
dist_metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine', 'cityblock', 'braycurtis', 'canberra',
               'correlation', 'minkowski']

In [149]:
location_neighbours = {}

for metric in dist_metrics:
    nbrs = NearestNeighbors(metric = metric, 
                            n_neighbors = num_neighbours + 1, 
                            algorithm='auto').fit(medians_scaled)
    
    nbr_indices = list(list(x) for x in nbrs.kneighbors(medians_scaled)[1])
    
    iso_location = df[["iso_code", "location"]].drop_duplicates()

    for i in range(len(nbr_indices)):
        current_iso_code = list(medians_scaled.index)[i]
        current_location = iso_code_to_loc(current_iso_code)

        neighbours = []
        for j in range(1, num_neighbours + 1):
            iso_code = medians_scaled.index[nbr_indices[i][j]]
            neighbours.append(iso_code_to_loc(iso_code))

        if not current_location in location_neighbours:
            location_neighbours[current_location] = {metric: neighbours}
        else:
            location_neighbours[current_location][metric] = neighbours

In [150]:
location_neighbours_df = pd.DataFrame(location_neighbours).transpose()

In [151]:
location_neighbours_df

Unnamed: 0,euclidean,manhattan,chebyshev,cosine,cityblock,braycurtis,canberra,correlation,minkowski
Albania,"[Serbia, Morocco, Colombia, Russia, Hong Kong]","[Serbia, Morocco, Russia, Colombia, Hong Kong]","[Serbia, Morocco, Colombia, Russia, Hong Kong]","[Serbia, Morocco, Colombia, Russia, Hong Kong]","[Serbia, Morocco, Russia, Colombia, Hong Kong]","[Serbia, Morocco, Colombia, Russia, Hong Kong]","[Morocco, Colombia, Kazakhstan, Mexico, Serbia]","[Nicaragua, North Macedonia, Mongolia, Morocco...","[Serbia, Morocco, Colombia, Russia, Hong Kong]"
United Arab Emirates,"[Montenegro, Costa Rica, Estonia, Peru, Maurit...","[Montenegro, Costa Rica, Estonia, Peru, Norway]","[Montenegro, Costa Rica, Mauritius, Estonia, P...","[Montenegro, Costa Rica, Mauritius, Denmark, B...","[Montenegro, Costa Rica, Estonia, Peru, Norway]","[Montenegro, Costa Rica, Estonia, Peru, Bulgaria]","[Montenegro, Costa Rica, Estonia, Israel, Bulg...","[Hungary, Seychelles, Croatia, Greece, Myanmar]","[Montenegro, Costa Rica, Estonia, Peru, Maurit..."
Argentina,"[Brazil, Poland, Romania, Chile, Myanmar]","[Brazil, Poland, Romania, Chile, Myanmar]","[Brazil, Poland, Romania, Chile, Myanmar]","[Brazil, Poland, Romania, Chile, Myanmar]","[Brazil, Poland, Romania, Chile, Myanmar]","[Brazil, Poland, Romania, Myanmar, Chile]","[Brazil, Poland, Romania, Chile, Myanmar]","[Seychelles, Israel, Ireland, Hungary, Japan]","[Brazil, Poland, Romania, Chile, Myanmar]"
Armenia,"[South Africa, Ukraine, Myanmar, Chile, Tunisia]","[South Africa, Ukraine, Myanmar, Chile, Tunisia]","[South Africa, Ukraine, Myanmar, Tunisia, Chile]","[South Africa, Ukraine, Myanmar, Tunisia, Chile]","[South Africa, Ukraine, Myanmar, Chile, Tunisia]","[South Africa, Ukraine, Myanmar, Chile, Tunisia]","[South Africa, Ukraine, Myanmar, Chile, Tunisia]","[Mongolia, Morocco, Lebanon, North Macedonia, ...","[South Africa, Ukraine, Myanmar, Chile, Tunisia]"
Australia,"[United Kingdom, France, Barbados, Croatia, Sl...","[United Kingdom, France, Barbados, Croatia, Sl...","[United Kingdom, France, Croatia, Barbados, Sl...","[Ireland, Slovenia, Barbados, Croatia, United ...","[United Kingdom, France, Barbados, Croatia, Sl...","[United Kingdom, France, Barbados, Croatia, Sl...","[United Kingdom, France, Barbados, Slovenia, C...","[Greece, Israel, Iceland, Hungary, Japan]","[United Kingdom, France, Barbados, Croatia, Sl..."
...,...,...,...,...,...,...,...,...,...
Vietnam,"[Bolivia, Sri Lanka, Cambodia, Paraguay, Egypt]","[Bolivia, Nicaragua, Sri Lanka, Honduras, Camb...","[Sri Lanka, Cambodia, Paraguay, Bolivia, Egypt]","[Moldova, Kenya, Mongolia, Zimbabwe, Venezuela]","[Bolivia, Nicaragua, Sri Lanka, Honduras, Camb...","[Bolivia, Sri Lanka, Nicaragua, Honduras, Camb...","[Moldova, Mongolia, Kenya, Zimbabwe, Venezuela]","[Albania, Oman, Ecuador, Philippines, Russia]","[Bolivia, Sri Lanka, Cambodia, Paraguay, Egypt]"
Yemen,"[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]","[Nicaragua, North Macedonia, Morocco, Nigeria,...","[Algeria, Bangladesh, Ethiopia, Cameroon, Nepal]"
South Africa,"[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Armenia, Ukraine, Myanmar, Chile, Tunisia]","[Netherlands, North Macedonia, Morocco, Nicara...","[Armenia, Ukraine, Myanmar, Chile, Tunisia]"
Zambia,"[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Ghana, Pakistan, Uganda, Nepal]","[Tanzania, Pakistan, Ghana, Uganda, India]","[Tanzania, Pakistan, Ghana, Uganda, Nepal]","[Tanzania, Ghana, Uganda, Nepal, Cameroon]","[Netherlands, Moldova, Lebanon, Nicaragua, Nepal]","[Tanzania, Pakistan, Ghana, Uganda, India]"


In [152]:
prop_sim = []

for location in location_neighbours.keys():
    ind_pairs = [[0, 1], [0, 2], [1, 2]]
    
    for pair in ind_pairs:
        loc_intersect = set(location_neighbours[location][dist_metrics[0]]).intersection(location_neighbours[location][dist_metrics[1]])
        prop_sim.append(len(loc_intersect)/num_neighbours)

In [153]:
mean(prop_sim)

0.9586206896551724

In [154]:
def find_top_neighbours(country):
    d = {}
    lists = location_neighbours_df.loc[country].tolist()
    for ls in lists:
        for c in ls:
            if c in d:
                d[c] += 1
            else:
                d[c] = 1
    top = []

    for k,v in sorted(d.items(), key=lambda p:p[1], reverse=True)[:num_neighbours]:
        top.append(k)
    return top

In [155]:
locations = location_neighbours_df.index.tolist()
top_neighbours = {}
for location in locations:
    top_neighbours[location] = find_top_neighbours(location)
top_neighbours_df = pd.DataFrame(top_neighbours).transpose()
top_neighbours_df

Unnamed: 0,0,1,2,3,4
Albania,Morocco,Serbia,Colombia,Russia,Hong Kong
United Arab Emirates,Montenegro,Costa Rica,Estonia,Peru,Mauritius
Argentina,Brazil,Poland,Romania,Chile,Myanmar
Armenia,South Africa,Ukraine,Myanmar,Chile,Tunisia
Australia,United Kingdom,Barbados,Croatia,Slovenia,France
...,...,...,...,...,...
Vietnam,Bolivia,Sri Lanka,Cambodia,Paraguay,Egypt
Yemen,Algeria,Bangladesh,Ethiopia,Cameroon,Nepal
South Africa,Armenia,Ukraine,Myanmar,Chile,Tunisia
Zambia,Tanzania,Ghana,Uganda,Pakistan,India


In [161]:
top_neighbours_df.loc["Australia"]

0    United Kingdom
1          Barbados
2           Croatia
3          Slovenia
4            France
Name: Australia, dtype: object