In [20]:
import pandas as pd
import random
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

from datetime import datetime, timedelta

from statistics import mean

**Aim: For each location in the dataset, find the 5 most locations/tourist destinations based on tourist experience and COVID situation (cases) over the last 90 days.**

In [4]:
random.seed(3888)

# Preparing the data for nearest neighbours analysis

## Data import

In [11]:
df = pd.read_csv("data/data.txt")
df = df.drop(columns='Unnamed: 0')

df = df.rename(columns = {'tourist_service_index': 'tourist_service_infrastructure'})

In [174]:
non_covid_colnames = ['iso_code', 'location', 'continent', 'price_competitiveness', 'tourist_service_infrastructure']
covid_colnames = ['iso_code', 'new_cases_smoothed_per_million', 'date', 'stringency_index']

In [175]:
df_without_covid = df_without_covid.drop_duplicates()
df_without_covid = df_without_covid.reset_index()
df_without_covid = df_without_covid.drop(columns=['index'])

df_without_covid

Unnamed: 0,iso_code,location,continent,tourist_service_infrastructure
0,ALB,Albania,Europe,4.01
1,DZA,Algeria,Africa,1.84
2,ARG,Argentina,South America,4.53
3,ARM,Armenia,Asia,4.33
4,AUS,Australia,Oceania,6.06
...,...,...,...,...
111,VEN,Venezuela,South America,2.96
112,VNM,Vietnam,Asia,2.85
113,YEM,Yemen,Asia,1.87
114,ZMB,Zambia,Africa,2.52


In [176]:
full_tourism = pd.read_csv("data/full_tourism.csv")

full_tourism

Unnamed: 0,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2015,2017,2019
0,AGO,Angola,3524,"WEF Travel & Tourism Competitiveness Index, 1-...",Value,2.60,,2.74
1,AGO,Angola,3525,"WEF Travel & Tourism Competitiveness Index, 1-...",Rank,139.00,,134.00
2,AGO,Angola,3526,"WEF Enabling environment subindex, 1-7 (best)",Value,3.23,,3.44
3,AGO,Angola,3527,"WEF Enabling environment subindex, 1-7 (best)",Rank,136.00,,133.00
4,AGO,Angola,3528,WEF Travel and Tourism policy and enabling con...,Value,3.21,,3.66
...,...,...,...,...,...,...,...,...
5618,ZWE,Zimbabwe,3561,"WEF Tourist service infrastructure pillar, 1-7...",Rank,112.00,106.00,102.00
5619,ZWE,Zimbabwe,3562,"WEF Natural resources pillar, 1-7 (best)",Value,3.51,3.61,3.60
5620,ZWE,Zimbabwe,3563,"WEF Natural resources pillar, 1-7 (best)",Rank,44.00,48.00,44.00
5621,ZWE,Zimbabwe,3564,"WEF Cultural resources and business travel, 1-...",Value,1.46,1.45,1.28


In [177]:
indicators = {
                'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
}

full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

inds = pd.DataFrame()

for ind in indicators.keys():
    inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]
    
non_covid_merged = pd.merge(inds, df_without_covid, on='iso_code')

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


In [178]:
covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

In [179]:
covid_needed_cols = covid[covid_colnames]

In [180]:
covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')


In [181]:
covid_needed_cols = covid_needed_cols[covid_needed_cols['date'] >= datetime.now() - timedelta(days = 90)]

In [182]:
covid_merged = pd.merge(covid_needed_cols, non_covid_merged, on="iso_code")

In [183]:
covid_merged = covid_merged[list(set(non_covid_colnames).union(set(covid_colnames)))]
covid_merged

Unnamed: 0,iso_code,tourist_service_infrastructure,date,price_competitiveness,location,continent,stringency_index,new_cases_smoothed_per_million
0,ALB,4.01,2022-02-02,5.26,Albania,Europe,61.11,508.440
1,ALB,4.01,2022-02-03,5.26,Albania,Europe,61.11,430.769
2,ALB,4.01,2022-02-04,5.26,Albania,Europe,61.11,449.814
3,ALB,4.01,2022-02-05,5.26,Albania,Europe,61.11,449.814
4,ALB,4.01,2022-02-06,5.26,Albania,Europe,61.11,441.709
...,...,...,...,...,...,...,...,...
10203,ZWE,2.95,2022-04-26,5.32,Zimbabwe,Africa,,3.502
10204,ZWE,2.95,2022-04-27,5.32,Zimbabwe,Africa,,3.720
10205,ZWE,2.95,2022-04-28,5.32,Zimbabwe,Africa,,3.796
10206,ZWE,2.95,2022-04-29,5.32,Zimbabwe,Africa,,3.351


## Computing medians of quantitative columns

In [184]:
medians = covid_merged.groupby(["iso_code"]).median()

In [185]:
medians = medians.fillna(covid_merged.median())

  medians = medians.fillna(covid_merged.median())
  medians = medians.fillna(covid_merged.median())


In [186]:
medians

Unnamed: 0_level_0,tourist_service_infrastructure,price_competitiveness,stringency_index,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALB,4.01,5.26,46.30,22.0035
ARE,5.63,5.53,46.30,34.5235
ARG,4.53,4.90,45.37,112.3540
ARM,4.33,5.75,43.52,21.6590
AUS,6.06,4.43,48.61,1534.3550
...,...,...,...,...
VNM,2.85,5.87,63.89,606.3320
YEM,1.87,5.97,26.85,0.0420
ZAF,4.30,5.59,37.96,26.3995
ZMB,2.52,5.14,27.78,7.6110


## Min-max scaling

In [187]:
iso_code = medians.index

In [188]:
scaler = MinMaxScaler()

In [189]:
medians_scaled = scaler.fit_transform(medians)

In [190]:
cols = list(set(non_covid_colnames).union(set(covid_colnames)))
to_remove = ['iso_code', 'continent', 'location', 'date']

for col in to_remove:
    cols.remove(col)

medians_scaled = pd.DataFrame(medians_scaled, 
                              columns = cols, 
                              index = iso_code)
medians_scaled

Unnamed: 0_level_0,tourist_service_infrastructure,price_competitiveness,stringency_index,new_cases_smoothed_per_million
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALB,0.458753,0.584746,0.566298,0.005579
ARE,0.784708,0.661017,0.566298,0.008753
ARG,0.563380,0.483051,0.554196,0.028486
ARM,0.523139,0.723164,0.530124,0.005491
AUS,0.871227,0.350282,0.596357,0.389020
...,...,...,...,...
VNM,0.225352,0.757062,0.795185,0.153729
YEM,0.028169,0.785311,0.313208,0.000011
ZAF,0.517103,0.677966,0.457775,0.006693
ZMB,0.158954,0.550847,0.325309,0.001930


# Finding 5 nearest neighbours for each location

In [191]:
def iso_code_to_loc(iso_code):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

In [192]:
num_neighbours = 5

In [193]:
dist_metrics = ['euclidean', 'manhattan', 'chebyshev']

In [194]:
location_neighbours = {}

for metric in dist_metrics:
    nbrs = NearestNeighbors(metric = metric, 
                            n_neighbors = num_neighbours + 1, 
                            algorithm='ball_tree').fit(medians_scaled)
    
    nbr_indices = list(list(x) for x in nbrs.kneighbors(medians_scaled)[1])
    
    iso_location = df[["iso_code", "location"]].drop_duplicates()

    for i in range(len(nbr_indices)):
        current_iso_code = list(medians_scaled.index)[i]
        current_location = iso_code_to_loc(current_iso_code)

        neighbours = []
        for j in range(1, num_neighbours + 1):
            iso_code = medians_scaled.index[nbr_indices[i][j]]
            neighbours.append(iso_code_to_loc(iso_code))

        if not current_location in location_neighbours:
            location_neighbours[current_location] = {metric: neighbours}
        else:
            location_neighbours[current_location][metric] = neighbours

In [195]:
location_neighbours_df = pd.DataFrame(location_neighbours).transpose()

In [196]:
location_neighbours_df

Unnamed: 0,euclidean,manhattan,chebyshev
Albania,"[Lebanon, Suriname, Guatemala, Argentina, Sout...","[Lebanon, Suriname, Guatemala, Armenia, Oman]","[Suriname, Lebanon, Argentina, South Africa, S..."
United Arab Emirates,"[Montenegro, Bulgaria, Thailand, Costa Rica, Q...","[Montenegro, Bulgaria, Costa Rica, Qatar, Thai...","[Montenegro, Bulgaria, Thailand, Qatar, Costa ..."
Argentina,"[Albania, Suriname, Costa Rica, Mexico, South ...","[Suriname, Albania, Costa Rica, Mexico, Armenia]","[Albania, Mexico, Suriname, Serbia, Lebanon]"
Armenia,"[South Africa, Russia, North Macedonia, Surina...","[South Africa, North Macedonia, Russia, Oman, ...","[Russia, South Africa, Suriname, North Macedon..."
Australia,"[Germany, New Zealand, Greece, Italy, Mauritius]","[Germany, New Zealand, Italy, Mauritius, Greece]","[New Zealand, Germany, Greece, Singapore, Italy]"
...,...,...,...
Vietnam,"[Pakistan, Indonesia, India, China, Kazakhstan]","[Pakistan, Indonesia, China, India, Azerbaijan]","[Indonesia, Kazakhstan, Pakistan, India, China]"
Yemen,"[Uganda, Ethiopia, Nigeria, Bangladesh, Tanzania]","[Uganda, Ethiopia, El Salvador, Bangladesh, Ne...","[Nigeria, Uganda, Tanzania, Ethiopia, Bangladesh]"
South Africa,"[Armenia, Russia, Panama, Serbia, North Macedo...","[Armenia, Panama, Russia, Serbia, Bolivia]","[Russia, Armenia, Serbia, North Macedonia, Pan..."
Zambia,"[Venezuela, Cameroon, Nigeria, Senegal, Paraguay]","[Venezuela, Cameroon, Nigeria, Senegal, Paraguay]","[Cameroon, Venezuela, Nigeria, Bangladesh, Sen..."


In [197]:
prop_sim = []

for location in location_neighbours.keys():
    ind_pairs = [[0, 1], [0, 2], [1, 2]]
    
    for pair in ind_pairs:
        loc_intersect = set(location_neighbours[location][dist_metrics[0]]).intersection(location_neighbours[location][dist_metrics[1]])
        prop_sim.append(len(loc_intersect)/num_neighbours)

In [198]:
mean(prop_sim)

0.8344827586206897