In [27]:
import pandas as pd
import pycountry

In [28]:
# Load the raw hotel reservation export and preview the first rows.
reservations_path = "../data/raw/export_df.parquet"
df = pd.read_parquet(reservations_path)
df.head()

Unnamed: 0,hotel_id,datum_dolaska,datum_kreiranja_rezervacije,datum_odjave,datum_otkazivanja_rezervacije,broj_odraslih_gostiju,broj_djece_gostiju,zemlja_gosta,kanal_prodaje_id,tip_sobe_id,cijena_nocenja,status_rezervacije,rezervacija_id,gost_id
0,0,2015-07-01,2015-01-21,2015-07-01,,2,0.0,PRT,0,0,100.0,Check-Out,1313223,1077152
1,0,2015-07-01,2015-06-10,2015-07-01,,2,0.0,PRT,0,0,100.0,Check-Out,1313224,1017906
2,0,2015-07-01,2015-05-13,2015-07-02,,1,0.0,GBR,0,1,64.991345,Check-Out,1313225,1039896
3,0,2015-07-01,2014-05-30,2015-07-02,,1,0.0,GBR,1,1,74.368897,Check-Out,1313226,1008245
4,0,2015-07-01,2014-07-06,2015-07-03,,2,0.0,GBR,2,1,130.973278,Check-Out,1313227,1093703


In [29]:
# Load the raw GDP table (one column per year) and preview the first rows.
gdp_path = "../data/raw/GDP.csv"
df_gdp = pd.read_csv(gdp_path)
df_gdp.head()

Unnamed: 0,Country,Country Code,2014,2015,2016,2017,2018
0,Aruba,ABW,38223.37226,38249.05487,38390.27165,39454.62983,
1,Afghanistan,AFG,1897.525938,1886.692977,1896.99252,1934.636754,1955.006208
2,Angola,AGO,7199.245478,7096.600615,6756.935074,6650.58494,6452.355165
3,Albania,ALB,11259.22589,11662.03048,11868.17897,12930.14003,13364.1554
4,Arab World,ARB,16153.24486,16501.79259,16935.3833,17099.88939,17570.1376


### Country codes
Check if we can account for the GDP of each country present in the hotel occupancy dataset.

In [30]:
# Distinct country codes on each side; the GDP table must cover every guest code.
country_codes_guests = set(df["zemlja_gosta"].values)
country_codes_gdp = set(df_gdp["Country Code"].values)

# Guest codes that have no matching row in the GDP table.
missing_codes = country_codes_guests - country_codes_gdp

if not missing_codes:
    print("All country codes are present in the GDP dataset.")
else:
    print("Country codes not present in the GDP dataset.")
    diff_code_to_name = {}
    # Iterate in sorted order so the printed mapping is stable across kernel
    # restarts (set iteration order of strings is not). Sorting by str() keeps
    # the sort from failing should a non-string value ever appear in the column.
    for code in sorted(missing_codes, key=str):
        # One alpha-3 lookup per code; codes pycountry does not know
        # (e.g. "CN", "0") come back as None.
        country = pycountry.countries.get(alpha_3=code)
        diff_code_to_name[code] = (
            country.name if country is not None else "Country code not found."
        )
    print(diff_code_to_name)

Country codes not present in the GDP dataset.
{'TMP': 'Country code not found.', 'ATA': 'Antarctica', 'GLP': 'Guadeloupe', 'GGY': 'Guernsey', 'CN': 'Country code not found.', 'MYT': 'Mayotte', 'TWN': 'Taiwan, Province of China', '0': 'Country code not found.', 'ASM': 'American Samoa', 'UMI': 'United States Minor Outlying Islands', 'AND': 'Andorra', 'CUB': 'Cuba', 'JEY': 'Jersey', 'ATF': 'French Southern Territories'}


Inspect the `CN` country code. It looks like an ISO 3166-1 alpha-2 code, so it should be converted to the corresponding alpha-3 code (`CN` -> `CHN`).

In [31]:
# Look the code up as a two-letter (alpha-2) code to see which country it denotes.
cn_country = pycountry.countries.get(alpha_2="CN")
cn_country.name

'China'

Inspect the "0" country code.

In [32]:
# Show the reservations whose guest country is recorded as the literal string "0".
is_zero_country = df["zemlja_gosta"] == "0"
df[is_zero_country]

Unnamed: 0,hotel_id,datum_dolaska,datum_kreiranja_rezervacije,datum_odjave,datum_otkazivanja_rezervacije,broj_odraslih_gostiju,broj_djece_gostiju,zemlja_gosta,kanal_prodaje_id,tip_sobe_id,cijena_nocenja,status_rezervacije,rezervacija_id,gost_id
30,0,2015-07-01,2015-03-25,2015-07-15,,1,0.0,0,0,1,108.777471,Check-Out,1313253,1025267
7086,0,2016-07-21,2016-02-16,2016-07-22,2016-07-20,1,0.0,0,1,1,149.505993,Canceled,1320309,1018737
7854,0,2016-08-30,2016-04-14,2016-09-04,2016-07-22,2,0.0,0,0,1,139.266985,Canceled,1321077,1103989
8773,0,2016-10-13,2015-08-14,2016-10-14,2016-10-13,1,0.0,0,1,1,50.101986,Canceled,1321996,1050748
9603,0,2016-12-20,2016-12-20,2016-12-21,2016-12-19,1,0.0,0,1,1,139.024443,Canceled,1322826,1088682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116656,1,2017-07-23,2017-01-09,2017-07-26,,2,0.0,0,2,2,117.535924,Check-Out,1429879,1013418
117569,1,2017-08-02,2017-06-12,2017-08-09,,2,0.0,0,2,2,74.297247,Check-Out,1430792,1077720
118006,1,2017-10-15,2017-06-21,2017-10-16,,2,2.0,0,2,5,355.084326,Check-Out,1431229,1072520
118701,1,2017-12-25,2017-06-24,2017-12-27,,1,0.0,0,2,1,149.546725,Check-Out,1431924,1021807


We assume "0" was used if the guest's country of origin was not known.

### Corresponding years

Explore which years from the GDP dataset do not appear in the hotel occupancy dataset. We focus on the dates in the reservation-creation column (`datum_kreiranja_rezervacije`).

In [33]:
# Distinct calendar years in which reservations were created.
reservation_created_at = pd.to_datetime(df["datum_kreiranja_rezervacije"])
reservation_years = set(reservation_created_at.dt.year)
reservation_years

{2013, 2014, 2015, 2016, 2017}

In [34]:
# Everything after the first two columns ("Country", "Country Code") is a year column.
df_gdp.columns[2:].tolist()

['2014', '2015', '2016', '2017', '2018']

Years that do not appear in the `reservation_years` set (here, 2018) should be removed from the GDP dataset. Note that 2013 appears among the reservation years but has no GDP column.