In [250]:
import pandas as pd
import random
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

from datetime import datetime, timedelta

from statistics import mean

**Aim: For each location in the dataset, find the 5 most locations/tourist destinations based on tourist experience and COVID situation (cases) over the last 90 days.**

In [251]:
random.seed(3888)

# Preparing the data for nearest neighbours analysis

## Data import

In [252]:
df = pd.read_csv("data/data.txt")
df = df.drop(columns='Unnamed: 0')

df = df.rename(columns = {'tourist_service_index': 'tourist_service_infrastructure'})

In [253]:
non_covid_colnames = ['iso_code', 'location', 'continent', 'tourist_service_infrastructure']
covid_colnames = ['iso_code', 'new_cases_smoothed_per_million', 'date', 'stringency_index']

In [254]:
df_without_covid = df_without_covid.drop_duplicates()
df_without_covid = df_without_covid.reset_index()
df_without_covid = df_without_covid.drop(columns=['index'])

df_without_covid

Unnamed: 0,iso_code,location,continent,tourist_service_infrastructure
0,ALB,Albania,Europe,4.01
1,DZA,Algeria,Africa,1.84
2,ARG,Argentina,South America,4.53
3,ARM,Armenia,Asia,4.33
4,AUS,Australia,Oceania,6.06
...,...,...,...,...
111,VEN,Venezuela,South America,2.96
112,VNM,Vietnam,Asia,2.85
113,YEM,Yemen,Asia,1.87
114,ZMB,Zambia,Africa,2.52


In [255]:
full_tourism = pd.read_csv("data/full_tourism.csv")

full_tourism

Unnamed: 0,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2015,2017,2019
0,AGO,Angola,3524,"WEF Travel & Tourism Competitiveness Index, 1-...",Value,2.60,,2.74
1,AGO,Angola,3525,"WEF Travel & Tourism Competitiveness Index, 1-...",Rank,139.00,,134.00
2,AGO,Angola,3526,"WEF Enabling environment subindex, 1-7 (best)",Value,3.23,,3.44
3,AGO,Angola,3527,"WEF Enabling environment subindex, 1-7 (best)",Rank,136.00,,133.00
4,AGO,Angola,3528,WEF Travel and Tourism policy and enabling con...,Value,3.21,,3.66
...,...,...,...,...,...,...,...,...
5618,ZWE,Zimbabwe,3561,"WEF Tourist service infrastructure pillar, 1-7...",Rank,112.00,106.00,102.00
5619,ZWE,Zimbabwe,3562,"WEF Natural resources pillar, 1-7 (best)",Value,3.51,3.61,3.60
5620,ZWE,Zimbabwe,3563,"WEF Natural resources pillar, 1-7 (best)",Rank,44.00,48.00,44.00
5621,ZWE,Zimbabwe,3564,"WEF Cultural resources and business travel, 1-...",Value,1.46,1.45,1.28


In [256]:
indicators = {
                'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
}

full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

inds = pd.DataFrame()

for ind in indicators.keys():
    inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]
    
non_covid = pd.merge(inds, df_without_covid, on='iso_code')

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


In [257]:
covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

In [258]:
covid_needed_cols = covid[covid_colnames]

In [259]:
covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')


In [260]:
covid_needed_cols = covid_needed_cols[covid_needed_cols['date'] >= datetime.now() - timedelta(days = 90)]

In [261]:
covid_merged = pd.merge(covid_needed_cols, non_covid, on="iso_code")

In [262]:
covid_merged = covid_merged[list(set(non_covid_colnames).union(set(covid_colnames)))]
covid_merged

Unnamed: 0,tourist_service_infrastructure,iso_code,date,location,continent,stringency_index,new_cases_smoothed_per_million
0,4.01,ALB,2022-02-02,Albania,Europe,61.11,508.440
1,4.01,ALB,2022-02-03,Albania,Europe,61.11,430.769
2,4.01,ALB,2022-02-04,Albania,Europe,61.11,449.814
3,4.01,ALB,2022-02-05,Albania,Europe,61.11,449.814
4,4.01,ALB,2022-02-06,Albania,Europe,61.11,441.709
...,...,...,...,...,...,...,...
10203,2.95,ZWE,2022-04-26,Zimbabwe,Africa,,3.502
10204,2.95,ZWE,2022-04-27,Zimbabwe,Africa,,3.720
10205,2.95,ZWE,2022-04-28,Zimbabwe,Africa,,3.796
10206,2.95,ZWE,2022-04-29,Zimbabwe,Africa,,3.351


## Computing medians of quantitative columns

In [263]:
medians = covid_merged.groupby(["iso_code"]).median()

In [264]:
medians = medians.fillna(covid_merged.median())

  medians = medians.fillna(covid_merged.median())
  medians = medians.fillna(covid_merged.median())


In [265]:
medians

Unnamed: 0_level_0,tourist_service_infrastructure,stringency_index,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,4.01,46.30,22.0035
ARE,5.63,46.30,34.5235
ARG,4.53,45.37,112.3540
ARM,4.33,43.52,21.6590
AUS,6.06,48.61,1534.3550
...,...,...,...
VNM,2.85,63.89,606.3320
YEM,1.87,26.85,0.0420
ZAF,4.30,37.96,26.3995
ZMB,2.52,27.78,7.6110


## Min-max scaling

In [266]:
iso_code = medians.index

In [267]:
scaler = MinMaxScaler()

In [268]:
medians_scaled = scaler.fit_transform(medians)

In [269]:
cols = list(set(non_covid_colnames).union(set(covid_colnames)))
to_remove = ['iso_code', 'continent', 'location', 'date']

for col in to_remove:
    cols.remove(col)

medians_scaled = pd.DataFrame(medians_scaled, 
                              columns = cols, 
                              index = iso_code)
medians_scaled

Unnamed: 0_level_0,tourist_service_infrastructure,stringency_index,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALB,0.458753,0.566298,0.005579
ARE,0.784708,0.566298,0.008753
ARG,0.563380,0.554196,0.028486
ARM,0.523139,0.530124,0.005491
AUS,0.871227,0.596357,0.389020
...,...,...,...
VNM,0.225352,0.795185,0.153729
YEM,0.028169,0.313208,0.000011
ZAF,0.517103,0.457775,0.006693
ZMB,0.158954,0.325309,0.001930


# Finding 5 nearest neighbours for each location

In [270]:
def iso_code_to_loc(iso_code):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

In [271]:
num_neighbours = 5

In [272]:
dist_metrics = ['euclidean', 'manhattan', 'chebyshev']

In [273]:
location_neighbours = {}

for metric in dist_metrics:
    nbrs = NearestNeighbors(metric = metric, 
                            n_neighbors = num_neighbours + 1, 
                            algorithm='ball_tree').fit(medians_scaled)
    
    nbr_indices = list(list(x) for x in nbrs.kneighbors(medians_scaled)[1])
    
    iso_location = df[["iso_code", "location"]].drop_duplicates()

    for i in range(len(nbr_indices)):
        current_iso_code = list(medians_scaled.index)[i]
        current_location = iso_code_to_loc(current_iso_code)

        neighbours = []
        for j in range(1, num_neighbours + 1):
            iso_code = medians_scaled.index[nbr_indices[i][j]]
            neighbours.append(iso_code_to_loc(iso_code))

        if not current_location in location_neighbours:
            location_neighbours[current_location] = {metric: neighbours}
        else:
            location_neighbours[current_location][metric] = neighbours

In [274]:
location_neighbours_df = pd.DataFrame(location_neighbours).transpose()

In [275]:
location_neighbours_df

Unnamed: 0,euclidean,manhattan,chebyshev
Albania,"[North Macedonia, Guatemala, Armenia, Oman, Le...","[North Macedonia, Armenia, Guatemala, Oman, Le...","[North Macedonia, Guatemala, Armenia, Russia, ..."
United Arab Emirates,"[Montenegro, Costa Rica, Bulgaria, Japan, Thai...","[Montenegro, Costa Rica, Japan, Qatar, Bulgaria]","[Montenegro, Costa Rica, Bulgaria, Thailand, J..."
Argentina,"[Armenia, Suriname, Georgia, Qatar, Albania]","[Armenia, Suriname, Georgia, Albania, Qatar]","[Armenia, Suriname, Georgia, Russia, South Afr..."
Armenia,"[Argentina, South Africa, Albania, Russia, Sur...","[South Africa, Argentina, Albania, North Maced...","[Argentina, Russia, Albania, South Africa, Sur..."
Australia,"[Germany, New Zealand, Estonia, Greece, Maurit...","[Germany, New Zealand, Estonia, Greece, Maurit...","[New Zealand, Estonia, Germany, Greece, Singap..."
...,...,...,...
Vietnam,"[Iran, Pakistan, Indonesia, Kenya, India]","[Iran, Pakistan, Indonesia, Kenya, China]","[Indonesia, Iran, Zimbabwe, Kazakhstan, Pakistan]"
Yemen,"[Cameroon, Zambia, Bangladesh, Uganda, Nigeria]","[Cameroon, Zambia, Bangladesh, Ethiopia, Uganda]","[Cameroon, Nigeria, Zambia, Bangladesh, Uganda]"
South Africa,"[Armenia, Russia, Panama, Serbia, Mexico]","[Armenia, Russia, Mexico, Panama, Argentina]","[Russia, Armenia, Serbia, North Macedonia, Pan..."
Zambia,"[Venezuela, Cameroon, Nigeria, El Salvador, Ye...","[Venezuela, Nigeria, El Salvador, Cameroon, Ye...","[Cameroon, Venezuela, Nigeria, El Salvador, Ba..."


In [276]:
prop_sim = []

for location in location_neighbours.keys():
    ind_pairs = [[0, 1], [0, 2], [1, 2]]
    
    for pair in ind_pairs:
        loc_intersect = set(location_neighbours[location][dist_metrics[0]]).intersection(location_neighbours[location][dist_metrics[1]])
        prop_sim.append(len(loc_intersect)/num_neighbours)

In [277]:
mean(prop_sim)

0.8758620689655172

In [278]:
location_neighbours["Australia"]

{'euclidean': ['Germany', 'New Zealand', 'Estonia', 'Greece', 'Mauritius'],
 'manhattan': ['Germany', 'New Zealand', 'Estonia', 'Greece', 'Mauritius'],
 'chebyshev': ['New Zealand', 'Estonia', 'Germany', 'Greece', 'Singapore']}