In [475]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

from datetime import datetime, timedelta

from statistics import mean

from sklearn.decomposition import PCA

import requests
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

from bs4 import BeautifulSoup

import pprint

In [476]:
random.seed(3888)

# Inputs from UI

## Country name

In [477]:
country = 'China'

## Variable groups

### Contents of each variable group

In [557]:
variable_groups = {}
variable_groups["covid"] = possible_covid_colnames
variable_groups["infrastructure quality_and_availability"] = ['tourist_service', 'air_transport', 'ground_port']
variable_groups["health_and_safety"] = ['safety_security', 'health_hygiene']
variable_groups["cost"] = ['price_competitiveness']
variable_groups["food"] = ['food', 'restaurant', 'cafe']
variable_groups["places_of_worship"] = ['church', 'mosque', 'place_of_worship', 'hindu_temple', 'synagogue']
variable_groups["indoor_attractions"] = ['art_gallery', 'museum', 'aquarium']
variable_groups["outdoor_attractions"] = ['amusement_park', 'zoo']
variable_groups["nature"] = ['park', 'natural_cultural_resources', 'natural_feature']
variable_groups["nightlife"] = ['casino', 'bar', 'night_club']
variable_groups["shopping"] = ['shopping_mall', 'clothing_store', 'department_store']
variable_groups["relaxation"] = ['spa']

In [558]:
pp = pprint.PrettyPrinter()
pp.pprint(variable_groups)

{'cost': ['price_competitiveness'],
 'covid': ['new_cases_smoothed_per_million',
           'stringency_index',
           'positive_rate',
           'new_vaccinations_smoothed_per_million'],
 'food': ['food', 'restaurant', 'cafe'],
 'health_and_safety': ['safety_security', 'health_hygiene'],
 'indoor_attractions': ['art_gallery', 'museum', 'aquarium'],
 'infrastructure quality_and_availability': ['tourist_service',
                                             'air_transport',
                                             'ground_port'],
 'nature': ['park', 'natural_cultural_resources', 'natural_feature'],
 'nightlife': ['casino', 'bar', 'night_club'],
 'outdoor_attractions': ['amusement_park', 'zoo'],
 'places_of_worship': ['church',
                       'mosque',
                       'place_of_worship',
                       'hindu_temple',
                       'synagogue'],
 'relaxation': ['spa'],
 'shopping': ['shopping_mall', 'clothing_store', 'department_store']}


### User-specified variable groups

In [560]:
possible_variable_group_names = list(variable_groups.keys())
possible_variable_group_names

['covid',
 'infrastructure quality_and_availability',
 'health_and_safety',
 'cost',
 'food',
 'places_of_worship',
 'indoor_attractions',
 'outdoor_attractions',
 'nature',
 'nightlife',
 'shopping',
 'relaxation']

`ui_variable_group_names` should be a subset of `possible_variable_group_names`.

In [561]:
ui_variable_group_names = possible_variable_group_names

## Converting variable groups to column names

In [None]:
ui_covid_colnames = ['new_cases_smoothed_per_million']

if 'covid' in ui_variable_group_names:
    

In [478]:
possible_covid_colnames = ['new_cases_smoothed_per_million', 
                           'stringency_index', 
                           'positive_rate', 
                           'new_vaccinations_smoothed_per_million']

In [479]:
possible_non_covid_colnames = ['tourist_service',
                               'natural_cultural_resources',
                               'safety_security',
                               'health_hygiene',
                               'price_competitiveness',
                               'air_transport',
                               'ground_port',
                               'art_gallery',
                               'food',
                               'museum',
                               'park',
                               'restaurant',
                               'church',
                               'mosque',
                               'place_of_worship',
                               'zoo',
                               'amusement_park',
                               'aquarium',
                               'cafe',
                               'hindu_temple',
                               'natural_feature',
                               'casino',
                               'bar',
                               'shopping_mall',
                               'spa',
                               'synagogue',
                               'night_club',
                               'clothing_store',
                               'department_store'
                              ]

`ui_covid_colnames` and `ui_non_covid_colnames` should be a subset of `possible_covid_colnames` and `possible_non_covid_colnames` (respectively).

In [480]:
ui_covid_colnames = ['new_cases_smoothed_per_million']
ui_non_covid_colnames = ['tourist_service_infrastructure', 'art_gallery']

# Integrating the data

## CSV file we created initially

In [481]:
df = pd.read_csv("data/data.txt")
df = df.drop(columns='Unnamed: 0')

df = df.rename(columns = {'tourist_service_index': 'tourist_service'})

# An Australian wouldn't travel to Australia!

df = df[df["iso_code"] != "AUS"]

In [482]:
df_without_covid = df.drop(columns=['new_cases_per_million', 
                                    'new_cases_smoothed_per_million', 
                                    'stringency_index', 
                                    'positive_rate', 
                                    'human_development_index', 
                                    'international_travel_controls',
                                    'cost_living_index',
                                    'date'])

In [483]:
covid_colnames = []

if len(ui_covid_colnames) > 0:
    covid_colnames = ['iso_code', 'date'] + ui_covid_colnames

non_covid_colnames = ['iso_code', 'location', 'continent'] + ui_non_covid_colnames

In [484]:
df_without_covid = df_without_covid.drop_duplicates()
df_without_covid = df_without_covid.reset_index()
df_without_covid = df_without_covid.drop(columns=['index'])

## Integrating Open Trade and Competitiveness Data

In [485]:
full_tourism = pd.read_csv("data/full_tourism.csv")
full_tourism = full_tourism[full_tourism["Country ISO3"] != "AUS"]

In [486]:
indicators = {
                'WEF Infrastructure subindex, 1-7 (best)': 'infrastructure', 
                'WEF Natural and cultural resources subindex, 1-7 (best)': 'natural_cultural_resources',
                'WEF Safety and security pillar, 1-7 (best)': 'safety_security',
                'WEF Health and hygiene, 1-7 (best)': 'health_hygiene',
                'WEF Price competitiveness in the Travel and Tourism Industry pillar, 1-7 (best)': 'price_competitiveness',
                'WEF Air transport infrastructure, 1-7 (best)': 'air_transport',
                'WEF Ground and port infrastructure, 1-7 (best)': 'ground_port'
}

full_tourism_req_indicators = full_tourism[full_tourism["Indicator"].isin(indicators)]
full_tourism_req_indicators = full_tourism_req_indicators[['Country ISO3', 'Indicator', 'Subindicator Type', '2019']]
full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]
full_tourism_req_indicators = full_tourism_req_indicators.drop(columns = ['Subindicator Type'])
full_tourism_req_indicators = full_tourism_req_indicators.rename(columns = {'Country ISO3': 'iso_code'})
full_tourism_req_indicators = full_tourism_req_indicators.set_index('iso_code')

inds = pd.DataFrame()

for ind in indicators.keys():
    inds[indicators[ind]] = full_tourism_req_indicators[full_tourism_req_indicators["Indicator"] == ind].drop(columns = ["Indicator"]).rename(columns = {'2019': indicators[ind]})[indicators[ind]]
    
non_covid = pd.merge(inds, df_without_covid, on='iso_code')

  full_tourism_req_indicators = full_tourism_req_indicators[full_tourism["Subindicator Type"] == "Value"]


## Integrating live COVID data

In [487]:
covid = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

covid = covid[covid["iso_code"] != "AUS"]

In [488]:
covid_needed_cols = covid[covid_colnames]

In [489]:
covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_needed_cols['date'] = pd.to_datetime(covid_needed_cols['date'], format='%Y-%m-%d')


In [490]:
covid_needed_cols = covid_needed_cols[covid_needed_cols['date'] >= datetime.now() - timedelta(days = 30)]

## Integrating Google Maps POI data

In [491]:
poi = pd.read_json("data/poi_types.json")

In [492]:
poi = poi.fillna(0)
poi = poi.replace(0, np.nan)
poi = poi.dropna(how='all', axis=0)
poi = poi.replace(np.nan, 0)

In [493]:
poi = poi.transpose()

In [494]:
poi = poi[poi.index != "Australia"]

In [495]:
iso_location = df[["iso_code", "location"]].drop_duplicates()

iso_location.to_csv("iso_location.csv")

In [496]:
def iso_code_to_loc(iso_code):
    return iso_location[iso_location["iso_code"] == iso_code]["location"].iloc[0]

In [497]:
def loc_to_iso_code(loc):
    return iso_location[iso_location["location"] == loc]["iso_code"].iloc[0]

In [498]:
poi = poi.set_index(loc_to_iso_code(loc) for loc in poi.index)

In [499]:
poi.columns

Index(['art_gallery', 'food', 'museum', 'park', 'restaurant', 'church',
       'mosque', 'place_of_worship', 'zoo', 'travel_agency', 'amusement_park',
       'aquarium', 'cafe', 'store', 'cemetery', 'hindu_temple',
       'natural_feature', 'library', 'campground', 'lodging', 'casino',
       'local_government_office', 'liquor_store', 'bar', 'shopping_mall',
       'spa', 'transit_station', 'grocery_or_supermarket', 'synagogue',
       'movie_theater', 'general_contractor', 'parking', 'book_store',
       'night_club', 'city_hall', 'clothing_store', 'department_store',
       'health', 'hospital'],
      dtype='object')

In [500]:
# removing less relevant columns
poi = poi.drop(columns=['travel_agency',
                        'store',
                        'cemetery',
                        'library',
                        'campground',
                        'lodging',
                        'local_government_office',
                        'liquor_store',
                        'transit_station',
                        'grocery_or_supermarket',
                        'movie_theater',
                        'general_contractor',
                        'parking',
                        'book_store',
                        'city_hall',
                        'health',
                        'hospital'
                       ])

In [501]:
poi.columns

Index(['art_gallery', 'food', 'museum', 'park', 'restaurant', 'church',
       'mosque', 'place_of_worship', 'zoo', 'amusement_park', 'aquarium',
       'cafe', 'hindu_temple', 'natural_feature', 'casino', 'bar',
       'shopping_mall', 'spa', 'synagogue', 'night_club', 'clothing_store',
       'department_store'],
      dtype='object')

In [502]:
poi['iso_code'] = poi.index

In [503]:
covid_merged = pd.merge(covid_needed_cols, non_covid, on="iso_code")
covid_merged = pd.merge(covid_merged, poi, on="iso_code")

## Integrating Smartraveller advice

In [504]:
# https://practicaldatascience.co.uk/data-science/how-to-read-an-rss-feed-in-python

def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)


def get_feed(url):
    """Return a Pandas dataframe containing the RSS feed contents.

    Args: 
        url (string): URL of the RSS feed to read.

    Returns:
        df (dataframe): Pandas dataframe containing the RSS feed contents.
    """
    
    response = get_source(url)
    
    df = pd.DataFrame(columns = ['title', 'pubDate', 'guid', 'description'])

    with response as r:
        items = r.html.find("item", first=False)

        for item in items:        

            title = item.find('title', first=True).text
            pubDate = item.find('pubDate', first=True).text
            guid = item.find('guid', first=True).text
            description = item.find('description', first=True).text

            row = {'title': [title], 'pubDate': [pubDate], 'guid': [guid], 'description': [description]}
            df = pd.concat([df, pd.DataFrame.from_dict(row)])

    return df

In [505]:
travel_advice = get_feed("https://www.smartraveller.gov.au/countries/documents/index.rss")

In [506]:
travel_advice = travel_advice[travel_advice["title"] != "No travel advice"]

In [507]:
travel_advice = travel_advice.drop(columns=['guid'])

In [508]:
travel_advice

Unnamed: 0,title,pubDate,description
0,Mali,07 May 2022 22:00:00 AEST,"On 6 May 2022, the US government issued an ale..."
0,Sri Lanka,07 May 2022 22:00:00 AEST,A Public Emergency has been declared in Sri La...
0,Timor-Leste,06 May 2022 22:00:00 AEST,The Presidential inauguration will be held on ...
0,Papua New Guinea,06 May 2022 22:00:00 AEST,"If you're not a PNG citizen, you'll only be al..."
0,Hong Kong,06 May 2022 22:00:00 AEST,Fully vaccinated travellers can enter Hong Kon...
...,...,...,...
0,Costa Rica,28 Oct 2021 23:00:00 AEDT,If you're not fully vaccinated against COVID-1...
0,Marshall Islands,28 Oct 2021 23:00:00 AEDT,"Due to COVID-19, Marshall Islands has closed i..."
0,Niger,28 Oct 2021 23:00:00 AEDT,The US Government has issued a security alert ...
0,North Korea (Democratic People's Republic of K...,28 Oct 2021 23:00:00 AEDT,North Korea’s borders remain closed due to COV...


In [509]:
replacements = {
    "United States of America": "United States",
    "Israel and the Palestinian Territories": "Israel",
    "South Korea (Republic of Korea)": "South Korea"
}

In [510]:
for replacement in replacements:
    travel_advice.replace(replacement, replacements[replacement], inplace = True)

In [511]:
travel_advice.rename(columns={"title": "location", "description": "advice"}, inplace = True)

In [512]:
travel_advice["advice"] = [BeautifulSoup(s, "lxml").text for s in travel_advice["advice"]]

In [513]:
covid_merged = pd.merge(covid_merged, travel_advice, on="location")

In [515]:
covid_colnames.append("advice")

covid_merged = covid_merged[list(set(non_covid_colnames).union(set(covid_colnames)))]

covid_colnames.remove("advice")

## Integrating country descriptions (taken from Triposo API)

In [516]:
descriptions = pd.read_csv("country_descriptions_cleaned_2.csv")

In [517]:
covid_merged = pd.merge(descriptions, covid_merged, on="iso_code")

# Preparing data for NN

## Computing medians of quantitative columns

In [518]:
covid_merged_no_quant = list(set(covid_merged.columns).difference(set(covid_merged.select_dtypes(include=[np.number]).columns)))
covid_merged_no_quant.remove("date")

In [519]:
medians = covid_merged.groupby(["iso_code"]).median()

In [520]:
medians = medians.fillna(covid_merged.median())

  medians = medians.fillna(covid_merged.median())
  medians = medians.fillna(covid_merged.median())


## Filtering (hard limits)

**This is where any filtering by value should take place.**

## Min-max scaling

In [521]:
iso_code = medians.index

In [522]:
scaler = MinMaxScaler()

In [523]:
medians_scaled = scaler.fit_transform(medians)

In [524]:
cols = list(set(non_covid_colnames).union(set(covid_colnames)))
to_remove = ['iso_code', 'continent', 'location', 'date']

for col in to_remove:
    cols.remove(col)
    
medians_scaled = pd.DataFrame(medians_scaled, 
                              columns = cols, 
                              index = iso_code)

## Performing PCA (if more than 2 features selected)

In [525]:
if len(medians_scaled.columns) > 2:
    pca = PCA(n_components=2)
    pc = pca.fit_transform(medians_scaled)
    medians_scaled = pd.DataFrame(data = pc, columns = ['PC1', 'PC2'], index = medians_scaled.index)

# Finding 5 nearest neighbours for each location

In [526]:
medians_scaled

Unnamed: 0_level_0,PC1,PC2
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ALB,-0.072664,0.433681
ARE,0.153815,-0.071691
ARG,-0.024459,-0.066682
ARM,-0.067538,-0.063599
AUT,0.604888,-0.134246
...,...,...
VNM,-0.244370,-0.069695
YEM,-0.473951,-0.050823
ZAF,-0.056594,-0.066682
ZMB,-0.365263,-0.054464


In [527]:
num_neighbours = 5

In [528]:
dist_metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine', 'cityblock', 'braycurtis', 'canberra',
               'correlation', 'minkowski']

In [529]:
location_neighbours = {}

for metric in dist_metrics:
    nbrs = NearestNeighbors(metric = metric, 
                            n_neighbors = num_neighbours + 1, 
                            algorithm='auto').fit(medians_scaled)
    
    nbr_indices = list(list(x) for x in nbrs.kneighbors(medians_scaled)[1])
    
    iso_location = df[["iso_code", "location"]].drop_duplicates()

    for i in range(len(nbr_indices)):
        current_iso_code = list(medians_scaled.index)[i]
        current_location = iso_code_to_loc(current_iso_code)

        neighbours = []
        for j in range(1, num_neighbours + 1):
            iso_code = medians_scaled.index[nbr_indices[i][j]]
            neighbours.append(iso_code_to_loc(iso_code))

        if not current_location in location_neighbours:
            location_neighbours[current_location] = {metric: neighbours}
        else:
            location_neighbours[current_location][metric] = neighbours

In [530]:
location_neighbours_df = pd.DataFrame(location_neighbours).transpose()

In [531]:
def find_top_neighbours(country):
    d = {}
    lists = location_neighbours_df.loc[country].tolist()
    for ls in lists:
        for c in ls:
            if c in d:
                d[c] += 1
            else:
                d[c] = 1
    top = []

    for k,v in sorted(d.items(), key=lambda p:p[1], reverse=True)[:num_neighbours]:
        top.append(k)
    return top

In [532]:
locations = location_neighbours_df.index.tolist()
top_neighbours = {}
for location in locations:
    top_neighbours[location] = find_top_neighbours(location)

In [533]:
final_df = pd.merge(medians, covid_merged[covid_merged_no_quant], on="iso_code").drop_duplicates()

In [534]:
final_df = final_df.set_index(final_df["iso_code"]).drop(columns=["iso_code"])

In [535]:
final_df['5NN'] = [top_neighbours[iso_code_to_loc(iso_code)] for iso_code in final_df.index]

# Output to UI

In [536]:
output_df = final_df.loc[loc_to_iso_code("United States")].to_frame().transpose()

In [537]:
output_df

Unnamed: 0,tourist_service_infrastructure,new_cases_smoothed_per_million,art_gallery,advice,description,location,continent,5NN
USA,6.56,149.613,0.0,If you're aged 18 and over you must be fully v...,The United States of America is a vast country...,United States,North America,"[Spain, Seychelles, Canada, Greece, Thailand]"
