In [1]:
import pandas as pd
import altair as alt

# Turn off Altair's max-rows check so larger frames can be passed to charts.
alt.data_transformers.disable_max_rows()

# Read the three cleaned inbound-tourism exports (paths are relative to the notebook).
arrivals_df = pd.read_csv("../data/export/df_cleaned_Inbound_Tourism_Arrivals.csv")
transport_df = pd.read_csv("../data/export/df_cleaned_Inbound_Tourism_Transport.csv")
regions_df = pd.read_csv("../data/export/df_cleaned_Inbound_Tourism_Regions.csv")

# Print a heading followed by the first rows of each frame.
for heading, frame in (
    ("Arrivals DataFrame:", arrivals_df),
    ("\nTransport DataFrame:", transport_df),
    ("\nRegions DataFrame:", regions_df),
):
    print(heading)
    print(frame.head())


Arrivals DataFrame:
          Country   1995   1996   1997   1998    1999     2000     2001  \
0         ALBANIA  304.0  287.0  119.0  184.0   371.0    317.0    354.0   
1         ALGERIA  520.0  605.0  635.0  678.0   749.0    866.0    901.0   
2  AMERICAN SAMOA   34.0   35.0   26.0   36.0    41.0     44.0     36.0   
3         ANDORRA    0.0    0.0    0.0    0.0  9422.0  10991.0  11351.0   
4          ANGOLA    9.0   21.0   45.0   52.0    45.0     51.0     67.0   

      2002     2003  ...    2013    2014    2015    2016    2017    2018  \
0    470.0    557.0  ...  3256.0  3673.0  4131.0  4736.0  5118.0  5927.0   
1    988.0   1166.0  ...  2733.0  2301.0  1710.0  2039.0  2451.0  2657.0   
2      0.0      0.0  ...    49.3    51.6    47.1    38.3    42.3    51.8   
3  11507.0  11601.0  ...  7676.0  7797.0  7850.0  8025.0  8152.0  8328.0   
4     91.0    107.0  ...   650.0   595.0   592.0   397.0   261.0   218.0   

     2019    2020    2021    2022  
0  6406.0  2658.0  5689.0  7543.8  


In [2]:
# Reshape the wide arrivals table (one column per year) into long form:
# one row per (Country, Year) with the value stored in "Tourist Arrivals".
arrivals_long = arrivals_df.melt(
    id_vars=["Country"],
    var_name="Year",
    value_name="Tourist Arrivals",
).astype({"Year": int})  # year labels come from column headers, so cast them to integers

# Show the first rows of the reshaped frame
arrivals_long.head()


Unnamed: 0,Country,Year,Tourist Arrivals
0,ALBANIA,1995,304.0
1,ALGERIA,1995,520.0
2,AMERICAN SAMOA,1995,34.0
3,ANDORRA,1995,0.0
4,ANGOLA,1995,9.0


In [4]:
# Reshape the transport table to long form: one row per
# (Country, mode of transport, Year) with the value stored in "Arrivals".
transport_long = transport_df.melt(
    id_vars=["Country", "Arrival by mode of transport"],
    var_name="Year",
    value_name="Arrivals",
).astype({"Year": int})  # year labels come from column headers, so cast them to integers

# Keep only the rows whose transport mode is "Air"
air_mask = transport_long["Arrival by mode of transport"].eq("Air")
air_travel = transport_long.loc[air_mask]

# Per-country air arrivals for the baseline year (2010) and the comparison year (2022)
air_2010 = air_travel.loc[air_travel["Year"].eq(2010), ["Country", "Arrivals"]].rename(
    columns={"Arrivals": "Air_2010"}
)
air_2022 = air_travel.loc[air_travel["Year"].eq(2022), ["Country", "Arrivals"]].rename(
    columns={"Arrivals": "Air_2022"}
)

# Inner join on Country so each row carries both years side by side
growth_df = pd.merge(air_2010, air_2022, on="Country", how="inner")

# Define categories based on a growth threshold (default: 2022 must exceed 1.5x the 2010 value)
def classify_growth(row, threshold=1.5):
    """Label a row by how its 2022 air arrivals compare with its 2010 air arrivals.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "Air_2010" and "Air_2022".
    threshold : float, default 1.5
        Multiplier applied to the 2010 value. The row is labelled "High" only
        when the 2022 value is strictly greater than ``Air_2010 * threshold``.

    Returns
    -------
    str
        "High Increase in Direct Flights" or "Low Increase in Direct Flights".

    Notes
    -----
    - If either value is NaN the comparison is False, so the row is labelled
      "Low" rather than being reported as missing.
    - With a 2010 value of 0, any positive 2022 value is labelled "High".
    """
    if row["Air_2022"] > row["Air_2010"] * threshold:
        return "High Increase in Direct Flights"
    return "Low Increase in Direct Flights"

# Label every country in growth_df, one row at a time
growth_labels = growth_df.apply(classify_growth, axis=1)
growth_df["Flight Growth Category"] = growth_labels

# Attach the label to each (Country, Year) arrivals row. The left join keeps
# every arrivals row; countries absent from growth_df get NaN as their category.
# NOTE(review): assumes growth_df has one row per Country; duplicate countries
# would multiply the matching arrivals rows.
category_lookup = growth_df[["Country", "Flight Growth Category"]]
merged_df = pd.merge(arrivals_long, category_lookup, on="Country", how="left")

# Show the first rows of the joined frame
merged_df.head()


Unnamed: 0,Country,Year,Tourist Arrivals,Flight Growth Category
0,ALBANIA,1995,304.0,High Increase in Direct Flights
1,ALGERIA,1995,520.0,Low Increase in Direct Flights
2,AMERICAN SAMOA,1995,34.0,Low Increase in Direct Flights
3,ANDORRA,1995,0.0,
4,ANGOLA,1995,9.0,Low Increase in Direct Flights


In [5]:
category_col = "Flight Growth Category"

# Inspect the distinct labels (including NaN) before filling
print(merged_df[category_col].unique())

# Rows with no label are assigned the "Low" label.
# NOTE(review): this folds countries with no matching air-arrivals row into the
# "Low" group; confirm that is intended rather than keeping them separate.
merged_df[category_col] = merged_df[category_col].fillna("Low Increase in Direct Flights")


['High Increase in Direct Flights' 'Low Increase in Direct Flights' nan]


In [6]:
# Mean tourist arrivals per (Year, Flight Growth Category).
# NOTE(review): the arrivals preview shows 0.0 entries next to much larger values
# for the same country; if those zeros stand for missing data they pull the
# means down. Confirm before interpreting the levels.
agg_df = (
    merged_df
    .groupby(["Year", "Flight Growth Category"])["Tourist Arrivals"]
    .mean()
    .reset_index()
    .astype({"Year": int})  # ensure Year is an integer
)

# Dashed red vertical rule marking 2020 on the same ordinal Year axis
covid_rule = (
    alt.Chart(pd.DataFrame({"Year": [2020]}))
    .mark_rule(color="red", strokeDash=[5, 5])
    .encode(x="Year:O")
)

# Colour encoding with an explicit scheme. It is passed to the chart below;
# previously it was built but never used, so the scheme was not applied.
color = alt.Color("Flight Growth Category:N", scale=alt.Scale(scheme="category10"))

# One line per growth category: average arrivals by year
chart = alt.Chart(agg_df).mark_line().encode(
    x=alt.X("Year:O", title="Year", axis=alt.Axis(labelAngle=45)),
    y=alt.Y("Tourist Arrivals:Q", title="Average Tourist Arrivals"),
    color=color,
).properties(
    title="Tourism Growth Over Time Based on Direct Flight Increase",
    width=700,
    height=400,
)

# Layer the 2020 rule on top of the line chart and display it
chart + covid_rule


In [7]:
# Count the aggregated rows per category. agg_df has one row per
# (Year, category), so each count is the number of years with data.
category_counts = agg_df["Flight Growth Category"].value_counts()
print(category_counts)


Flight Growth Category
High Increase in Direct Flights    28
Low Increase in Direct Flights     28
Name: count, dtype: int64
