In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import requests
from requests_html import HTML
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import random
from k_means_constrained import KMeansConstrained
from statistics import mean

In [89]:
random.seed(3888)

def get_variable_groups():
    variable_groups = {}
    variable_groups["covid"] = ['new_cases_smoothed_per_million', 'stringency_index', 'positive_rate', 'new_vaccinations_smoothed_per_million']
    variable_groups["infrastructure_quality_and_availability"] = ['tourist_service', 'air_transport', 'ground_port']
    variable_groups["health_and_safety"] = ['safety_security', 'health_hygiene']
    variable_groups["cost"] = ['price_competitiveness']
    
    # POIs
    variable_groups["fun"] = ['amusementparks', 'nightlife']
    variable_groups["nature"] = ['beaches', 'camping', 'exploringnature']
    variable_groups["food"] = ['eatingout']
    variable_groups["museums"] = ['museums']
    variable_groups["showstheatresandmusic"] = ['showstheatresandmusic']
    variable_groups["wellness"] = ['wellness']
    variable_groups["wildlife"] = ['zoos']

    return variable_groups

def convert_interest_level_to_weighting(interested):
    interested_mapping = {
        True: 100,
        False: 1
    }
    
    return interested_mapping[interested]

def convert_interests_to_col_weightings(interests):
    variable_groups = get_variable_groups()
    
    col_weightings = {}
    
    for interest in interests:
        cols = variable_groups[interest]
        weighting = convert_interest_level_to_weighting(interests[interest])
        for col in cols:
            col_weightings[col] = weighting
            
    return col_weightings

def get_all_features():
    variable_groups = get_variable_groups()

    all_features = []
    for group in variable_groups.values():
        for col in group:
            all_features.append(col)

    return all_features

def get_all_cols():
    cols = get_all_features()
    cols.append("iso_code")
    cols.append("location")
    cols.append("advice")
    cols.append("description")
    cols.append("continent")
    cols.append("date")

    return cols

def read_original_data():
    df = pd.read_csv("data/data.txt")
    df = df.drop(columns='Unnamed: 0')

    df = df.rename(columns = {'tourist_service_index': 'tourist_service'})

    df_without_covid = df.drop(columns=['new_cases_per_million', 
                                    'new_cases_smoothed_per_million', 
                                    'stringency_index', 
                                    'positive_rate', 
                                    'human_development_index', 
                                    'international_travel_controls',
                                    'cost_living_index',
                                    'date',
                                    'location',
                                    'continent'])

    df_without_covid = df_without_covid.drop_duplicates()
    df_without_covid = df_without_covid.reset_index()
    df_without_covid = df_without_covid.drop(columns=['index'])

    return df_without_covid

def read_tourism_data():
    full_tourism = pd.read_csv("data/full_tourism.csv")
    full_tourism = full_tourism[full_tourism["Country ISO3"] != "AUS"]

    indicators = {
                    'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                    'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                    'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                    'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                    'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                    'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                    'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
    }

    full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
    full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
    full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
    full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
    full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
    full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

    inds = pd.DataFrame()

    for ind in indicators.keys():
        inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]

    return inds

def read_live_covid_data():
    covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

    covid['date'] = pd.to_datetime(covid['date'], format='%Y-%m-%d')

    covid = covid[covid['date'] >= datetime.now() - timedelta(days = 30)]

    return covid

def read_poi_data():
    poi = pd.read_csv("data/triposo_poi.csv", index_col = 0)
    poi = poi.apply(pd.to_numeric)
    
    return poi

def read_iso_loc_data():
    df = pd.read_csv("data/data.txt")
    df = df.drop(columns='Unnamed: 0')

    iso_location = df[["iso_code", "location"]].drop_duplicates()

    return iso_location

def iso_code_to_loc(iso_code, iso_location):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

def loc_to_iso_code(loc, iso_location):
    return iso_location[iso_location["location"] == loc]["iso_code"].iloc[0]

def read_smartraveller_data():
    # https://practicaldatascience.co.uk/data-science/how-to-read-an-rss-feed-in-python

    def get_source(url):
        """Return the source code for the provided URL. 

        Args: 
            url (string): URL of the page to scrape.

        Returns:
            response (object): HTTP response object from requests_html. 
        """

        try:
            session = HTMLSession()
            response = session.get(url)
            return response

        except requests.exceptions.RequestException as e:
            print(e)


    def get_feed(url):
        """Return a Pandas dataframe containing the RSS feed contents.

        Args: 
            url (string): URL of the RSS feed to read.

        Returns:
            df (dataframe): Pandas dataframe containing the RSS feed contents.
        """
        
        response = get_source(url)
        
        df = pd.DataFrame(columns = ['title', 'pubDate', 'guid', 'description'])

        with response as r:
            items = r.html.find("item", first=False)

            for item in items:        

                title = item.find('title', first=True).text
                pubDate = item.find('pubDate', first=True).text
                guid = item.find('guid', first=True).text
                description = item.find('description', first=True).text

                row = {'title': [title], 'pubDate': [pubDate], 'guid': [guid], 'description': [description]}
                df = pd.concat([df, pd.DataFrame.from_dict(row)])

        return df

    travel_advice = get_feed("https://www.smartraveller.gov.au/countries/documents/index.rss")

    travel_advice = travel_advice[travel_advice["title"] != "No travel advice"]

    travel_advice = travel_advice.drop(columns=['guid'])

    replacements = {
        "United States of America": "United States",
        "Israel and the Palestinian Territories": "Israel",
        "South Korea (Republic of Korea)": "South Korea"
    }

    for replacement in replacements:
        travel_advice.replace(replacement, replacements[replacement], inplace = True)

    travel_advice.rename(columns={"title": "location", "description": "advice"}, inplace = True)

    travel_advice["advice"] = [BeautifulSoup(s, "lxml").text for s in travel_advice["advice"]]

    return travel_advice

def read_triposo_data():
    descriptions = pd.read_csv("country_descriptions_cleaned_2.csv")

    return descriptions

def integrate_all_data():
    # reading in all data
    original = read_original_data()
    tourism = read_tourism_data()
    covid = read_live_covid_data()
    poi = read_poi_data()
    smartraveller = read_smartraveller_data()
    triposo = read_triposo_data()

    # merging
    full = pd.merge(original, tourism, on="iso_code")
    full = pd.merge(full, covid, on="iso_code")
    full = pd.merge(full, poi, on="iso_code")

    full = pd.merge(full, smartraveller, on="location")
    full = pd.merge(full, triposo, on="iso_code")

    full = full[full["iso_code"] != "AUS"]
    
    return full[get_all_cols()]

def prepare_data_for_clustering(data, continents, weightings):
    print(data.columns)
    
    # continents filtering
    data = data[data["continent"].isin(continents)]
    data_no_quant = list(set(data.columns).difference(set(data.select_dtypes(include=[np.number]).columns)))
    data_no_quant.remove("date")

    medians = data.groupby(["iso_code"]).median()
    medians = medians.fillna(data.median())

    if medians.shape[0] == 0:
        return medians, medians, data[data_no_quant]

    iso_code = medians.index

    scaler = MinMaxScaler()

    medians_scaled = scaler.fit_transform(medians)

    cols = list(data.columns)

    to_remove = ['iso_code', 'continent', 'location', 'date', 'advice', 'description']

    for col in to_remove:
        cols.remove(col)
                
    medians_scaled = pd.DataFrame(medians_scaled, 
                                columns = cols, 
                                index = iso_code)

    for col in medians_scaled.columns:
        medians_scaled[col] = medians_scaled[col].apply(lambda x: x * weightings[col])

    if len(medians_scaled.columns) > 2:
        pca = PCA(n_components=2)
        pc = pca.fit_transform(medians_scaled)
        medians_scaled = pd.DataFrame(data = pc, columns = ['PC1', 'PC2'], index = medians_scaled.index)

    return medians_scaled, medians, data[data_no_quant]

def generate_best_cluster(scaled_data, interested):
    clf = KMeansConstrained(
            n_clusters=scaled_data.shape[0]//10,
            size_min=10,
            size_max=12,
            random_state=3888
    )
    
    labels = clf.fit_predict(scaled_data)
    
    clusters = {}
    iso_location = read_iso_loc_data()

    for i, label in enumerate(labels):
        if label in clusters:
            clusters[label].append(list(medians_scaled.index)[i])
        else:
            clusters[label] = [list(medians_scaled.index)[i]]
            
    col_weightings = convert_interests_to_col_weightings(interested)
        
    cols_of_interest = [col for col in col_weightings if col_weightings[col] == 100]
    
    cluster_rating = {}

    for cluster_label in clusters:
        all_ratings = []
        
        for col in cols_of_interest:
            cluster_col_vals = list(medians.loc[clusters[cluster_label]][col])
            for val in cluster_col_vals:
                all_ratings.append(val)
        
        cluster_rating[cluster_label] = mean(all_ratings)
        
    best_cluster = sorted(cluster_rating, key=lambda x: cluster_rating[x], reverse = True)[0]
        
    return clusters[best_cluster]

In [98]:
interested = {}

interested["covid"] = False
interested["infrastructure_quality_and_availability"] = False
interested["health_and_safety"] = True
interested["cost"] = False
interested["fun"] = True
interested["nature"] = False
interested["food"] = False
interested["museums"] = False
interested["showstheatresandmusic"] = False
interested["wellness"] = False
interested["wildlife"] = False

weightings = convert_interests_to_col_weightings(interested)

In [95]:
countries_data = integrate_all_data()

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


In [99]:
medians_scaled, medians, data_no_quant = prepare_data_for_clustering(countries_data, ["Asia", "Europe"], weightings)

Index(['new_cases_smoothed_per_million', 'stringency_index', 'positive_rate',
       'new_vaccinations_smoothed_per_million', 'tourist_service',
       'air_transport', 'ground_port', 'safety_security', 'health_hygiene',
       'price_competitiveness', 'amusementparks', 'nightlife', 'beaches',
       'camping', 'exploringnature', 'eatingout', 'museums',
       'showstheatresandmusic', 'wellness', 'zoos', 'iso_code', 'location',
       'advice', 'description', 'continent', 'date'],
      dtype='object')


  medians = medians.fillna(data.median())
  medians = medians.fillna(data.median())


In [100]:
generate_best_cluster(medians_scaled, interested)

['BEL', 'CHN', 'DEU', 'DNK', 'ESP', 'FRA', 'GBR', 'ITA', 'JPN', 'NLD', 'TWN']