In [21]:
import itertools
import more_itertools
import random

from statistics import mean

from analytics_clustering import *
from analytics_helper_clustering import *
from analytics import *
from analytics_helper import *
from common import *

In [22]:
# hides warnings - these do not affect code functionality

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [23]:
random.seed(3888)

In [24]:
countries_data = integrate_all_data()

# Clustering

Mean silhouette score across

- 100 randomly-chosen interest combinations, and
- all possible region combinations.

In [25]:
scores = []

In [26]:
interest_combs = list(itertools.product([False, True], repeat=15))

In [27]:
interests = ["covid", 
             "infrastructure_quality_and_availability",
             "health_and_safety",
             "cost",
             "fun",
             "nature",
             "food",
             "museums",
             "showstheatresandmusic",
             "wellness",
             "wildlife"]

In [28]:
interest_combs_chosen = random.sample(interest_combs, 100)

while [False] * 15 in interest_combs_chosen:
    interest_combs_chosen = random.sample(interest_combs, 100)

In [29]:
region_combs = list(more_itertools.powerset(['Asia-Pacific', 'Europe and Africa', 'Americas']))

In [30]:
for interest_comb in interest_combs_chosen:  
    for regions in region_combs:
        interested = {}
        for i, interest in enumerate(interests):
            interested[interest] = interest_comb[i]

        if list(interested.values()).count(True) != 0 and len(regions) != 0:
            weightings = convert_interests_to_col_weightings(interested)
            medians_scaled_pca, medians_scaled = prepare_data_for_clustering(countries_data, regions, weightings)

            scores.append(compute_silhouette_score(medians_scaled_pca))

In [31]:
mean(scores)

0.3633255640966533

**Observation:** Our silhouette score is average; there is _some_ clustering structure in our data, though it is not very well-defined.

However, for our purposes, it doesn't really matter whether our clusters are _extremely_ well-defined or not; either way, each cluster will still be a group of fairly-similar countries.

# k-NN

In [32]:
num_neighbours = 10

In [33]:
iso_location = read_iso_loc_data()

In [34]:
interest_combs_10NN = list(more_itertools.powerset(interests))

In [35]:
interest_combs_10NN_chosen = random.sample(interest_combs_10NN, 3)

while [False] * 15 in interest_combs_chosen:
    interest_combs_chosen = random.sample(interest_combs, 3)

In [36]:
print("chosen_interests:")
interest_combs_10NN_chosen

chosen_interests:


[('covid',
  'infrastructure_quality_and_availability',
  'health_and_safety',
  'nature',
  'museums',
  'showstheatresandmusic',
  'wellness',
  'wildlife'),
 ('covid',),
 ('fun', 'nature', 'food', 'wellness')]

In [37]:
countries_10NN_chosen = random.sample(list(iso_location['location'].drop_duplicates()), 5)

In [38]:
print("chosen_countries:")
countries_10NN_chosen

chosen_countries:


['Slovenia', 'Lithuania', 'Colombia', 'Vietnam', 'Norway']

In [39]:
all_metrics = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
               'cosine', 'euclidean', 'manhattan', 'minkowski']

## Single model

1. We randomly choose 3 interest combinations, $\text{chosen_interests}$.
2. We randomly choose 5 countries, $\text{chosen_countries}$.
3. For each pair of distance metrics $d_1, d_2$:
    - For each $i$ in $\text{chosen_interests}$:
        - For each $c$ in $\text{chosen_countries}$:
            - we compute proportion of similarity between sets of countries produced by 
                - 10-NN for $c$ when $d_1$ is used as distance metric (with columns in $i$ weighted more), and 
                - 10-NN for $c$ when $d_2$ is used as distance metric (with columns in $i$ weighted more).
5. We compute the average $a$ of all proportion of similarity values.

In [40]:
metrics_pairs = list(itertools.combinations(all_metrics, r=2))

In [41]:
prop_similarity = []

for pair in metrics_pairs:
    regions = ['Asia-Pacific', 'Europe and Africa', 'Americas']
    
    for interests in interest_combs_10NN_chosen:  
    
        if len(interests) > 0 and len(regions) > 0:
            continents = convert_regions_to_continents(regions)

            cols_of_interest = convert_interests_to_cols(interests)
            weightings = generate_feature_weightings_dict(cols_of_interest)

            for country in countries_10NN_chosen:  
                medians_scaled, medians, data_no_quant = prepare_data_for_nn(countries_data, country, continents, weightings)

                if not loc_to_iso_code(country, iso_location) in medians.index:
                    continue

                neighbours_all_metrics = list(generate_final_df_w_nn(country,
                                                                 medians_scaled,
                                                                 medians,
                                                                 data_no_quant,
                                                                 num_neighbours = num_neighbours,
                                                                 dist_metrics=[pair[0]])['5NN'])[0]

                neighbours_metric_missing = list(generate_final_df_w_nn(country,
                                                                 medians_scaled,
                                                                 medians,
                                                                 data_no_quant,
                                                                 num_neighbours = num_neighbours,
                                                                 dist_metrics=[pair[1]])['5NN'])[0]

                neighbours_intersect = set(neighbours_all_metrics).intersection(neighbours_metric_missing)    
                sim = len(neighbours_intersect)/num_neighbours

                prop_similarity.append(sim)

In [42]:
mean(prop_similarity)

0.625

**Observation:** Changing our distance metric leads to a _non-negligible change in result_ (on average) - our model is not very stable, not robust to change in distance metric.

**Solution:** Use an ensemble. Each model in ensemble is a 10-NN model using a *different distance metric*.

## Ensemble

1. We randomly choose 3 interest combinations, $\text{chosen_interests}$.
2. We randomly choose 5 countries, $\text{chosen_countries}$.
3. For each distance metric $d$:
    - For each $i$ in $\text{chosen_interests}$:
        - For each $c$ in $\text{chosen_countries}$:
            - we compute the proportion of similarity between sets of countries returned by
                - 10-NN for $c$ with full ensemble (and with columns in $i$ weighted more), and
                - 10-NN for $c$ with full ensemble **minus $d$** (and with columns in $i$ weighted more).
    - We compute the average $a$ of all the proportion of similarity values corresponding to $d$. $a$ becomes the overall "score" for $d$.

In [43]:
prop_similarity = {}

for metric in all_metrics:
    dist_metrics = list(set(all_metrics) - set([metric]))
    regions = ['Asia-Pacific', 'Europe and Africa', 'Americas']
    
    for interests in interest_combs_10NN_chosen:  
    
        if len(interests) > 0 and len(regions) > 0:
            continents = convert_regions_to_continents(regions)

            cols_of_interest = convert_interests_to_cols(interests)
            weightings = generate_feature_weightings_dict(cols_of_interest)

            for country in countries_10NN_chosen:  
                medians_scaled, medians, data_no_quant = prepare_data_for_nn(countries_data, country, continents, weightings)

                if not loc_to_iso_code(country, iso_location) in medians.index:
                    continue

                neighbours_all_metrics = list(generate_final_df_w_nn(country,
                                                                 medians_scaled,
                                                                 medians,
                                                                 data_no_quant,
                                                                 num_neighbours = num_neighbours,
                                                                 dist_metrics=all_metrics)['5NN'])[0]

                neighbours_metric_missing = list(generate_final_df_w_nn(country,
                                                                 medians_scaled,
                                                                 medians,
                                                                 data_no_quant,
                                                                 num_neighbours = num_neighbours,
                                                                 dist_metrics=dist_metrics)['5NN'])[0]

                neighbours_intersect = set(neighbours_all_metrics).intersection(neighbours_metric_missing)    
                sim = len(neighbours_intersect)/num_neighbours

                if not metric in prop_similarity:
                    prop_similarity[metric] = [sim]
                else:
                    prop_similarity[metric].append(sim)

In [44]:
avg_prop_similarity = {}

for metric in prop_similarity.keys():
    avg_prop_similarity[metric] = mean(prop_similarity[metric])

In [45]:
avg_prop_similarity

{'braycurtis': 0.9533333333333334,
 'canberra': 0.98,
 'chebyshev': 0.98,
 'cityblock': 0.9533333333333334,
 'correlation': 0.98,
 'cosine': 0.98,
 'euclidean': 0.98,
 'manhattan': 0.9533333333333334,
 'minkowski': 0.98}

**Observation:** Removing any single model from ensemble leads to only a slight change in result. Thus, we ensure that *no one distance metric completely determines 10 nearest neighbours*, and so our model is very robust to removal/addition of distance metrics. Whereas when we used just 1 model/distance metric, changing the single distance metric used led to considerably more variability in result.