<div class="alert alert-block alert-info">A notebook to preprocess the data: import the CSV files, rename columns, group, enrich the dataset (e.g., add tags), and finally save the results (Parquet for the schedule datasets, CSV for the lookup tables) for later analysis. </div>

<div class="alert alert-block alert-warning"> <b>Warning:</b> Be careful with the granularity of the output datasets: they are grouped at a higher level than the initial dataset, which is broken down by, e.g., AC type, OPE_AL... </div>

<div class="alert alert-block alert-warning"> <b>TO DO:</b> old part / data error code</div>

# Import

In [None]:
import numpy as np
import polars as pl
import pandas as pd
from polars import col as d
import glob
import os

# csv

In [None]:
## Root folder of the raw schedule data.
## Set the SCHEDULE_DATA_DIR environment variable to run the notebook on another
## machine; when it is unset or empty, the original local path is used.
folder_path = (
    os.environ.get('SCHEDULE_DATA_DIR')
    or '/home/sara/Desktop/ATSLab/data_processing/OneDrive_2025-07-22/Schedule_Data/'
)

## schedules

In [None]:
## One CSV per year; the year is the 5th underscore-separated token of the file
## name: schedules_processed_carrier_hours_<YEAR>_incsmall_novia_allairports.csv
file_pattern = os.path.join(folder_path, 'schedules_processed_carrier_hours_*_incsmall_novia_allairports.csv')

## glob.glob returns matches in arbitrary order: sort so that every run loads
## (and later concatenates) the yearly files in the same order
all_files = sorted(glob.glob(file_pattern))

## fail here with a clear message instead of later when concatenating nothing
if not all_files:
    raise FileNotFoundError(f"No schedule file matches {file_pattern!r}")

list_of_df = []

for file_path in all_files:
    filename = os.path.basename(file_path)
    year = int(filename.split('_')[4])
    print(year)
    df = (
        pl.scan_csv(file_path)
        ## the year is not a column of the file, it only appears in the file name
        .with_columns(pl.lit(year).alias('YEAR'))
        ## explicit casts so that every yearly frame ends up with the same dtypes
        .with_columns(
            Departures = d.Departures.cast(pl.Float64),
            Arrivals = d.Arrivals.cast(pl.Float64),
            DepartureSeats = d.DepartureSeats.cast(pl.Float64),
            ArrivalSeats = d.ArrivalSeats.cast(pl.Float64),
            DepartureFlightHours = d.DepartureFlightHours.cast(pl.Float64),
            ArrivalFlightHours = d.ArrivalFlightHours.cast(pl.Float64),
            EquipSACode = d.EquipSACode.cast(pl.Int64),
            OriginAirport = d.OriginAirport.cast(pl.Utf8),
            DestinationAirport = d.DestinationAirport.cast(pl.Utf8),
        )
        .collect()
    )
    ## presumably only the 2019 file carries this extra column, which would
    ## prevent stacking the yearly frames -- TODO confirm
    if year == 2019:
        df = df.drop("EquipATIBin")
        print('drop EquipATIBin column for 2019')

    list_of_df.append(df)

print("Import done")

## other tables

In [None]:
## locations of the lookup / metrics tables
airports_metrics_path = os.path.join(folder_path, 'airports_v8_updatedmetrics_psos.csv')
airports_lookup_path = os.path.join(folder_path, 'all_airports_lookup.csv')
cities_metrics_path = os.path.join(folder_path, 'cities_v8_updatedmetrics_psos.csv')
fleet_lookup_path = os.path.join(folder_path, 'fleet_lookups.csv')

## dtype overrides for columns that must be read as floats
schema_dict_cities_metrics = {"Pop10": pl.Float64, "Pop15": pl.Float64}
schema_dict_airports_metrics = {"Elev_ft": pl.Float64}

## reader options shared by the four tables: lossy UTF-8 decoding and
## the literal "NA" read as null
csv_read_options = dict(encoding='utf8-lossy', null_values=["NA"])

## read each table eagerly (scan, then collect)
df_airports_lookup = pl.scan_csv(airports_lookup_path, **csv_read_options).collect()
df_cities_metrics = pl.scan_csv(
    cities_metrics_path,
    schema_overrides=schema_dict_cities_metrics,
    **csv_read_options,
).collect()
df_airports_metrics = pl.scan_csv(
    airports_metrics_path,
    schema_overrides=schema_dict_airports_metrics,
    **csv_read_options,
).collect()
df_fleet_lookup = pl.scan_csv(fleet_lookup_path, **csv_read_options).collect()

In [None]:
## subsidiary -> airline group mapping, with the columns renamed to the
## notebook's naming convention (AL_GROUP / OPE_AL)
airline_mapping_path = os.path.join(folder_path, 'subsidiariesGlobal_2019.csv')
df_airline_mapping = (
    pl.scan_csv(airline_mapping_path, encoding='utf8-lossy', null_values=["NA"])
    .collect()
    .rename({'Group': 'AL_GROUP', 'Subsidiary': 'OPE_AL'})
)

# Renaming columns

## schedules

In [None]:
## raw column name -> notebook column name (the order of this mapping is also
## the column order of the final frame, after YEAR)
schedule_column_renames = {
    'OriginAirport': 'APT_CODE_A',
    'DestinationAirport': 'APT_CODE_B',
    'TimeBin': 'TIME_BIN',
    'Equip': 'AC_TYPE',
    'Carrier': 'OPE_AL',
    'EquipSACode': 'AIM_SIZE_CAT',
    'Departures': 'FLT_DEP',
    'Arrivals': 'FLT_ARR',
    'DepartureSeats': 'SEATS_DEP',
    'ArrivalSeats': 'SEATS_ARR',
    'DepartureFlightHours': 'FLT_HOURS_DEP',
    'ArrivalFlightHours': 'FLT_HOURS_ARR',
}

## stack the yearly frames, rename, and keep only YEAR + the renamed columns
schedules_final_df = (
    pl.concat(list_of_df)
    .rename(schedule_column_renames)
    .select(['YEAR', *schedule_column_renames.values()])
)

print("Shape of the final schedule df :", schedules_final_df.shape)

## df_airports_lookup

In [None]:
## raw column name -> notebook column name for the airports lookup table
## NOTE: the APT_ prefix is tentative (the author was not sure about it)
airports_lookup_renames = {
    'Number': 'APT_ID',
    'Airport': 'APT_CODE',
    'Name': 'APT_NAME',
    'City': 'APT_CITY_NAME',
    'CityName': 'APT_CITY_FULL_NAME',
    'Country': 'APT_COUNTRY_CODE',
    'CountryName': 'APT_COUNTRY_NAME',
    'Region': 'APT_REGION',  ## _29 (original note, meaning unclear)
    'Latitude': 'LATITUDE',
    'Longitude': 'LONGITUDE',
}

df_airports_lookup_modif = df_airports_lookup.rename(airports_lookup_renames)

## df_cities_metrics

In [None]:
## two-digit year suffixes used by the Pop / Inc columns (10 .. 15)
cities_short_years = range(10, 16)

## highest index of the numbered A<i> / C<i> columns
cities_max_slot = 13

## metrics that exist once per snapshot year, as <raw stem>_<year> -> <new stem>_<year>
cities_yearly_stems = {
    'Deps': 'FLT_DEP',
    'Seats': 'SEATS_DEP',
    'Links_to_other_cities': 'LINKS_TO_CITY',
    'Domestic_links_to_other_cities': 'DOMESTIC_LINKS_TO_CITY',
    'Daily_links_to_other_cities': 'DAILY_LINKS_TO_CITY',
    'Links_to_other_cities_over_1000mi': 'LINKS_TO_CITY_OV_1000MI',
    'Links_to_other_cities_over_3000mi': 'LINKS_TO_CITY_OV_3000MI',
    'In_set_proportion_of_global_RPK': 'RATIO_GLOBAL_RPK',
}

## full raw column name -> notebook column name mapping for the cities table
cities_metrics_renames = {
    'Number': 'METRO_ID',
    'City': 'METRO_CITY',
    'Country': 'METRO_COUNTRY',
    'N_airports_2015': 'NB_APT_2015',
    'Mid_Long': 'METRO_AVG_LONG',
    'Mid_Lat': 'METRO_AVG_LAT',

    ## Pop10..Pop15 -> POPU_2010..POPU_2015, same pattern for the Inc columns
    **{f'Pop{yy}': f'POPU_20{yy}' for yy in cities_short_years},
    **{f'Inc{yy}_LocalCurrency': f'INC_20{yy}_LC' for yy in cities_short_years},
    **{f'Inc{yy}_USD2015_MER': f'INC_20{yy}_USD15' for yy in cities_short_years},

    'Spec': 'IS_SPECIAL',
    'Access_Road': 'HAS_ACCESS_ROAD',
    'Access_Rail': 'HAS_ACCESS_RAIL',
    'Access_Sea': 'HAS_ACCESS_SEA',

    ## A1..A13 -> APT_ID_1..APT_ID_13 and C1..C13 -> CITY_CODE_1..CITY_CODE_13
    **{f'A{i}': f'APT_ID_{i}' for i in range(1, cities_max_slot + 1)},
    **{f'C{i}': f'CITY_CODE_{i}' for i in range(1, cities_max_slot + 1)},

    'ICAO.Region.Code': 'REGION_ID',
    'Country.Code': 'COUNTRY_ID',
    'Optional.Region.Code': 'OPT_REGION_ID',
    'N_Runways_2015': 'NB_RUNWAYS_2015',
    'Capital_city': 'IS_CAPITAL',
    'nom_single': 'MAIN_CITY_NAME',
    'global_hub': 'IS_GLOBAL_HUB',
    'domestic_hub': 'IS_DOMESTIC_HUB',
    'GaWC_index': 'GAWC_INDEX',

    ## yearly metrics for the 2015, 2019 and 2023 snapshots
    **{
        f'{raw_stem}_{year}': f'{new_stem}_{year}'
        for year in (2015, 2019, 2023)
        for raw_stem, new_stem in cities_yearly_stems.items()
    },
}

df_cities_metrics_modif = df_cities_metrics.rename(cities_metrics_renames)

## df_airports_metrics

In [None]:
## scheduled-traffic metrics that exist once per snapshot year,
## as <raw stem>_<year> -> <new stem>_<year>
airports_yearly_stems = {
    'Scheduled_departures': 'FLT_SCHEDULED_DEP',
    ## open question from the author: is it DEP SEATS ? or DEP & ARR SEATS ?
    'Scheduled_seats': 'SEATS_SCHEDULED_DEP',
    'Cumulative_scheduled_seats': 'CUMU_SEATS_SCHEDULED_DEP',
    'Cumulative_scheduled_departures': 'CUMU_FLT_SCHEDULED_DEP',
}

## full raw column name -> notebook column name mapping for the airports metrics table
airports_metrics_renames = {
    'Number': 'OLD_ID',
    'Number_2005': 'ID_2005',
    'Code': 'APT_CODE',
    'ICAO.Code': 'APT_ICAO_CODE',
    'Name': 'APT_CITY_LONG_NAME',
    'Country': 'APT_COUNTRY_LONG_NAME',
    'Country.Code': 'APT_COUNTRY_CODE',
    'Latitude': 'LATITUDE',
    'Longitude': 'LONGITUDE',
    'Timezone..19.Apr.2016.': 'TIME_ZONE_2016',
    'World.Region': 'REGION_ID',
    'City': 'APT_CITY_FULL_NAME_LOWER',
    'Island': 'IS_ISLAND',
    'Elev_ft': 'ELEV_FT',
    'Nrunways': 'NB_RUNWAYS',
    'Longest_runway_ft': 'LONGEST_RUNWAY_FT',
    'opened_year': 'OPENING_YEAR',
    'build_year': 'BUILDING_YEAR',
    'closed_year': 'CLOSING_YEAR',
    'wikipedia_reference': 'WIKI_LINK',
    'airport_city_greatcircle_dist_km': 'APT_CITY_GC_DIST_KM',
    'airport_city_drive_dist_km': 'APT_CITY_DRIVE_DIST_KM',
    'airport_city_drive_time_h': 'APT_CITY_DRIVE_TIME_H',

    ## scheduled metrics for the 2015, 2019 and 2023 snapshots
    **{
        f'{raw_stem}_{year}': f'{new_stem}_{year}'
        for year in (2015, 2019, 2023)
        for raw_stem, new_stem in airports_yearly_stems.items()
    },

    ## leg passengers are only renamed for 2015 and 2019
    **{
        f'Departing_leg_passengers_{year}': f'LEG_PAX_DEP_{year}'
        for year in (2015, 2019)
    },
}

df_airports_metrics_modif = df_airports_metrics.rename(airports_metrics_renames)

## df_fleet_lookup

In [None]:
## raw column name -> notebook column name for the fleet lookup table
fleet_lookup_renames = {
    'Name': 'AC_FULL_NAME',
    'ICAOcode': 'AC_ICAO_CODE',
    'IATAcode': 'AC_IATA_CODE',
    'estSeats': 'SEATS_ESTIM',
    'Manufacturer_Country': 'MANUF_COUNTRY_NAME',
    'Military.Utility': 'IS_MILITARY_UTIL',
    'AIM_SizeCat': 'AIM_SIZE_CAT',
    'ATI_SizeCat': 'ATI_SIZE_CAT',
    'TypicalSeats': 'SEATS_TYPICAL',
    'BuiltFrom': 'BUILT_FROM',
    'BuiltTo': 'BUILT_TO',
    'MaxFreightVolFreighterm3': 'MAX_VOL_FREIGHTER_M3',
    'MaxFreightWeightFreighterTonne': 'MAX_WEIGHT_FREIGHTER_T',
    'btscode': 'BTS_CODE',
    'FlightGlobalname': 'AC_GLOBAL_NAME',
    'NewFlightGlobalName': 'AC_NEW_GLOBAL_NAME',
    'exFleetTotalP': 'FLEET_TOTAL_P',
    'exFleetTotalF': 'FLEET_TOTAL_F',
    'AvgSeatsFG': 'AVG_SEATS_FG',  ## open question from the author: what is FG ?
    'AvgMaxPayloadP': 'AVG_MAX_PAYLOAD_P',
    'AvgMaxPayloadF': 'AVG_MAX_PAYLOAD_F',
    'AvgMaxCargoPayloadP': 'AVG_MAX_CARGO_PAYLOAD_P',
    'EngineType': 'ENGINE_TYPE',
}

df_fleet_lookup_modif = df_fleet_lookup.rename(fleet_lookup_renames)

# Helper functions 

In [None]:
def consecutive_sequences(years):
    """Split a collection of years into runs of consecutive years.

    Args:
        years: iterable of integer years, in any order. Duplicates are ignored.

    Returns:
        A list of lists, each inner list being an ascending run of consecutive
        years, e.g. ``[2003, 2001, 2002, 2005]`` -> ``[[2001, 2002, 2003], [2005]]``.
        An empty input returns ``[]``.
    """
    # Materialise first: the emptiness check must run on a plain list, not on
    # whatever iterable the caller passed in.
    years = sorted(set(years))
    if not years:
        return []

    groups = [[years[0]]]
    for y in years[1:]:
        if y == groups[-1][-1] + 1:
            # y directly follows the last year of the current run
            groups[-1].append(y)
        else:
            # gap: start a new run
            groups.append([y])
    return groups


def find_sequence_length(year: int, sequences: list[list[int]]) -> int:
    """Return the length of the first sequence that contains ``year``.

    Args:
        year: the year to look for.
        sequences: list of year sequences, searched in order.

    Returns:
        The number of years in the first sequence containing ``year``,
        or 0 when no sequence contains it.
    """
    return next((len(run) for run in sequences if year in run), 0)


# Enhanced schedules dataset

In [None]:
df_enhanced = (
    schedules_final_df

    ## discard records whose airport A and airport B are identical
    .filter(d.APT_CODE_A != d.APT_CODE_B)

    ## aggregate to one row per (YEAR, APT_CODE_A, APT_CODE_B)
    .group_by(['YEAR', 'APT_CODE_A', 'APT_CODE_B'])
    .agg(
        d.FLT_DEP.sum().alias('FLT'),
        d.FLT_ARR.sum(),
        d.SEATS_DEP.sum().alias('SEATS'),
        d.SEATS_ARR.sum(),
        d.FLT_HOURS_DEP.sum().alias('FLT_HOURS'),
        d.FLT_HOURS_ARR.sum(),
        d.OPE_AL.unique().count().alias('NB_OPE_AL'),
        d.OPE_AL.unique().alias('LIST_OPE_AL'),
    )

    ## attach the REGION_ID of each end of the route
    .join(
        df_airports_metrics_modif
        .select('APT_CODE', 'REGION_ID')
        .rename({'APT_CODE':'APT_CODE_A', 'REGION_ID':'REGION_ID_A'})
        .unique(),
        on = 'APT_CODE_A',
        how = 'left',
    )
    .join(
        df_airports_metrics_modif
        .select('APT_CODE', 'REGION_ID')
        .rename({'APT_CODE':'APT_CODE_B', 'REGION_ID':'REGION_ID_B'})
        .unique(),
        on = 'APT_CODE_B',
        how = 'left',
    )

    ## per airport pair: the years in which it exists, how many, first and last.
    ## Done as a self-join because, per the author, a window (`over`) expression
    ## did not work on their polars version.
    .pipe(
        lambda routes: routes.join(
            routes
            .group_by(["APT_CODE_A", "APT_CODE_B"])
            .agg(
                d.YEAR.unique().alias("LIST_EXISTING_YEAR"),
                d.YEAR.unique().count().alias('NB_EXISTING_YEAR'),
                d.YEAR.max().alias('LAST_EXISTING_YEAR'),
                d.YEAR.min().alias('FIRST_EXISTING_YEAR'),
            ),
            on=["APT_CODE_A", "APT_CODE_B"],
            how="left",
        )
    )

    ## tags: does the route open, end, reopen or pause in this year?
    ## 2000 and 2023 are excluded from openings / endings -- presumably the first
    ## and last year covered by the data (TODO confirm)
    .with_columns(
        IS_OPENING = (d.YEAR == d.FIRST_EXISTING_YEAR) & (d.YEAR != 2000),
        IS_END = (d.YEAR == d.LAST_EXISTING_YEAR) & (d.YEAR != 2023),
        IS_REOPENING = (~d.LIST_EXISTING_YEAR.list.contains(d.YEAR - 1)) & (d.YEAR != d.FIRST_EXISTING_YEAR),
        IS_PAUSE = (~d.LIST_EXISTING_YEAR.list.contains(d.YEAR + 1)) & (d.YEAR != d.LAST_EXISTING_YEAR),
    )

    ## list of lists of the consecutive years during which the route was open
    .with_columns(
        LIST_CONSEC_YEAR = d.LIST_EXISTING_YEAR.map_elements(
            consecutive_sequences,
            return_dtype=pl.List(pl.List(pl.Int32)),
        )
    )

    .with_columns(
        ## length of the consecutive run that contains the row's YEAR
        CONSEC_YEAR_OPEN_DURATION = pl.struct(["YEAR", "LIST_CONSEC_YEAR"]).map_elements(
            lambda row: find_sequence_length(row["YEAR"], row["LIST_CONSEC_YEAR"]),
            return_dtype=pl.Int32,
        ),
        ## how many breaks the route had, in total
        TOTAL_BREAKS = d.LIST_CONSEC_YEAR.list.len() - 1,
        ## the duration of the first opening
        DURATION_FIRST_OPENING = d.LIST_CONSEC_YEAR.list.first().list.len().cast(pl.Int32),
    )

    # ## the duration of the first break (currently disabled)
    # .pipe(
    #     lambda df: df.join(
    #                         df.filter(d.TOTAL_BREAKS != 0)
    #                         .with_columns(DURATION_FIRST_BREAK = d.LIST_CONSEC_YEAR.list.get(1).list.first() - d.LIST_CONSEC_YEAR.list.first().list.last() - 1)
    #                         .select(['YEAR', 'APT_CODE_A', 'APT_CODE_B', 'DURATION_FIRST_BREAK'])
    #         ,
    #         on=['YEAR', 'APT_CODE_A', 'APT_CODE_B',],
    #         how="left"
    #     )
    # )

    ## 0/1 counters, meant to be summed when grouping later on.
    ## The year exclusions are already carried by the IS_* tags above;
    ## an opening counts as "short" when its first run lasts at most 3 years.
    .with_columns(
        NB_OPENING_RTE = pl.when(d.IS_OPENING).then(1).otherwise(0),
        NB_SHORT_OPENING_RTE = pl.when(d.IS_OPENING & (d.DURATION_FIRST_OPENING <= 3)).then(1).otherwise(0),
        NB_LONG_OPENING_RTE = pl.when(d.IS_OPENING & (d.DURATION_FIRST_OPENING > 3)).then(1).otherwise(0),
        NB_ENDING_RTE = pl.when(d.IS_END).then(1).otherwise(0),
        NB_REOPENING_RTE = pl.when(d.IS_REOPENING).then(1).otherwise(0),
        NB_PAUSE_RTE = pl.when(d.IS_PAUSE).then(1).otherwise(0),
    )
)

# Same but without covid

In [None]:
## Last year kept in the "without covid" dataset; later years are dropped.
## It is also the year in which a route cannot be observed ending, so it is
## used both for the filter and for the IS_END tag.
LAST_YEAR_WO_COVID = 2019

df_enhanced_wo_covid = (
    schedules_final_df

    ## keep the years up to LAST_YEAR_WO_COVID only (same as YEAR < 2020)
    .filter(d.YEAR <= LAST_YEAR_WO_COVID)
    ## discard records whose airport A and airport B are identical
    .filter(d.APT_CODE_A != d.APT_CODE_B)

    ## aggregate to one row per (YEAR, APT_CODE_A, APT_CODE_B)
    .group_by(['YEAR', 'APT_CODE_A', 'APT_CODE_B'])
    .agg(
        d.FLT_DEP.sum().alias('FLT'),
        d.FLT_ARR.sum(),
        d.SEATS_DEP.sum().alias('SEATS'),
        d.SEATS_ARR.sum(),
        d.FLT_HOURS_DEP.sum().alias('FLT_HOURS'),
        d.FLT_HOURS_ARR.sum(),
        d.OPE_AL.unique().count().alias('NB_OPE_AL'),
    )

    ## attach the REGION_ID of each end of the route
    .join(
        df_airports_metrics_modif
        .select('APT_CODE', 'REGION_ID')
        .rename({'APT_CODE':'APT_CODE_A', 'REGION_ID':'REGION_ID_A'})
        .unique(),
        on = 'APT_CODE_A',
        how = 'left',
    )
    .join(
        df_airports_metrics_modif
        .select('APT_CODE', 'REGION_ID')
        .rename({'APT_CODE':'APT_CODE_B', 'REGION_ID':'REGION_ID_B'})
        .unique(),
        on = 'APT_CODE_B',
        how = 'left',
    )

    ## per airport pair: the years in which it exists, how many, first and last.
    ## Done as a self-join because, per the author, a window (`over`) expression
    ## did not work on their polars version.
    .pipe(
        lambda routes: routes.join(
            routes
            .group_by(["APT_CODE_A", "APT_CODE_B"])
            .agg(
                d.YEAR.unique().alias("LIST_EXISTING_YEAR"),
                d.YEAR.unique().count().alias('NB_EXISTING_YEAR'),
                d.YEAR.max().alias('LAST_EXISTING_YEAR'),
                d.YEAR.min().alias('FIRST_EXISTING_YEAR'),
            ),
            on=["APT_CODE_A", "APT_CODE_B"],
            how="left",
        )
    )

    ## tags: does the route open, end, reopen or pause in this year?
    ## 2000 is excluded from openings -- presumably the first year covered by
    ## the data (TODO confirm).
    ## IS_END excludes LAST_YEAR_WO_COVID: every route present in the last
    ## retained year has LAST_EXISTING_YEAR == that year, so excluding 2023
    ## (a year that no longer exists here) tagged all of them as ending.
    .with_columns(
        IS_OPENING = (d.YEAR == d.FIRST_EXISTING_YEAR) & (d.YEAR != 2000),
        IS_END = (d.YEAR == d.LAST_EXISTING_YEAR) & (d.YEAR != LAST_YEAR_WO_COVID),
        IS_REOPENING = (~d.LIST_EXISTING_YEAR.list.contains(d.YEAR - 1)) & (d.YEAR != d.FIRST_EXISTING_YEAR),
        IS_PAUSE = (~d.LIST_EXISTING_YEAR.list.contains(d.YEAR + 1)) & (d.YEAR != d.LAST_EXISTING_YEAR),
    )

    ## list of lists of the consecutive years during which the route was open
    .with_columns(
        LIST_CONSEC_YEAR = d.LIST_EXISTING_YEAR.map_elements(
            consecutive_sequences,
            return_dtype=pl.List(pl.List(pl.Int32)),
        )
    )

    .with_columns(
        ## length of the consecutive run that contains the row's YEAR
        CONSEC_YEAR_OPEN_DURATION = pl.struct(["YEAR", "LIST_CONSEC_YEAR"]).map_elements(
            lambda row: find_sequence_length(row["YEAR"], row["LIST_CONSEC_YEAR"]),
            return_dtype=pl.Int32,
        ),
        ## how many breaks the route had, in total
        TOTAL_BREAKS = d.LIST_CONSEC_YEAR.list.len() - 1,
        ## the duration of the first opening
        DURATION_FIRST_OPENING = d.LIST_CONSEC_YEAR.list.first().list.len().cast(pl.Int32),
    )

    ## 0/1 counters, meant to be summed when grouping later on.
    ## The year exclusions are already carried by the IS_* tags above;
    ## an opening counts as "short" when its first run lasts at most 3 years.
    .with_columns(
        NB_OPENING_RTE = pl.when(d.IS_OPENING).then(1).otherwise(0),
        NB_SHORT_OPENING_RTE = pl.when(d.IS_OPENING & (d.DURATION_FIRST_OPENING <= 3)).then(1).otherwise(0),
        NB_LONG_OPENING_RTE = pl.when(d.IS_OPENING & (d.DURATION_FIRST_OPENING > 3)).then(1).otherwise(0),
        NB_ENDING_RTE = pl.when(d.IS_END).then(1).otherwise(0),
        NB_REOPENING_RTE = pl.when(d.IS_REOPENING).then(1).otherwise(0),
        NB_PAUSE_RTE = pl.when(d.IS_PAUSE).then(1).otherwise(0),
    )
)

# Filter on Transatlantic

In [None]:
df_enhanced_filtered = (
    df_enhanced

    ## keep the routes whose two ends are both in region 10 or 13
    ## (labelled US and EUR respectively by the tags below)
    .filter(d.REGION_ID_A.is_in([10,13]) & d.REGION_ID_B.is_in([10,13]))

    ## market type tag, for future plots
    .with_columns(
        MKT_TYPE = pl.when((d.REGION_ID_A == 10) & (d.REGION_ID_B == 10))
                     .then(pl.lit('INTRA_US'))
                     .when((d.REGION_ID_A == 13) & (d.REGION_ID_B == 13))
                     .then(pl.lit('INTRA_EUR'))
                     .otherwise(pl.lit('INTER'))
    )

    ## direction tag: splits INTER by direction, otherwise repeats MKT_TYPE
    .with_columns(
        DIRECTION = pl.when((d.REGION_ID_A == 10) & (d.REGION_ID_B == 13))
                      .then(pl.lit('US_to_EUR'))
                      .when((d.REGION_ID_A == 13) & (d.REGION_ID_B == 10))
                      .then(pl.lit('EUR_to_US'))
                      .otherwise(d.MKT_TYPE)
    )
)

In [None]:
df_enhanced_wo_covid_filtered = (
    df_enhanced_wo_covid

    ## keep the routes whose two ends are both in region 10 or 13
    ## (labelled US and EUR respectively by the tags below)
    .filter(d.REGION_ID_A.is_in([10,13]) & d.REGION_ID_B.is_in([10,13]))

    ## market type tag, for future plots
    .with_columns(
        MKT_TYPE = pl.when((d.REGION_ID_A == 10) & (d.REGION_ID_B == 10))
                     .then(pl.lit('INTRA_US'))
                     .when((d.REGION_ID_A == 13) & (d.REGION_ID_B == 13))
                     .then(pl.lit('INTRA_EUR'))
                     .otherwise(pl.lit('INTER'))
    )

    ## direction tag: splits INTER by direction, otherwise repeats MKT_TYPE
    .with_columns(
        DIRECTION = pl.when((d.REGION_ID_A == 10) & (d.REGION_ID_B == 13))
                      .then(pl.lit('US_to_EUR'))
                      .when((d.REGION_ID_A == 13) & (d.REGION_ID_B == 10))
                      .then(pl.lit('EUR_to_US'))
                      .otherwise(d.MKT_TYPE)
    )
)

# Save outputs (Parquet and CSV)

## scheduled dataset

### Transatlantic data

In [None]:
## Write the filtered, enhanced dataset (all years) as a Parquet file;
## the path is relative, so the file lands in the current working directory.
df_enhanced_filtered.write_parquet("scheduled_dataset_transatlantic_enhanced.parquet")

In [None]:
## Write the filtered, enhanced "without covid" dataset as a Parquet file;
## the path is relative, so the file lands in the current working directory.
df_enhanced_wo_covid_filtered.write_parquet("scheduled_dataset_wo_covid_transatlantic_enhanced.parquet")

## other dataset

In [None]:
## Export every renamed lookup / metrics table as a CSV file,
## written to the current working directory (relative paths).
csv_exports = {
    "df_fleet_lookup_modif.csv": df_fleet_lookup_modif,
    "df_airports_lookup_modif.csv": df_airports_lookup_modif,
    "df_airports_metrics_modif.csv": df_airports_metrics_modif,
    "df_cities_metrics_modif.csv": df_cities_metrics_modif,
    "df_airline_mapping.csv": df_airline_mapping,
}

for csv_name, table in csv_exports.items():
    table.write_csv(csv_name)

# OLD - data error

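- sometimes airport A and airport B are the same (APT_CODE_A == APT_CODE_B)
- departure seats != arrival seats (SEATS_DEP != SEATS_ARR) --> expected to some extent, but also suspicious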

In [None]:
# df_error_look = (
#     df_filtered
#     .filter(d.APT_CODE_A != d.APT_CODE_B)

#     .group_by(['YEAR', 'MKT_TYPE'])
#     .agg(d.SEATS_DEP.sum(), d.SEATS_ARR.sum())
#     .with_columns(DIFF = d.SEATS_DEP - d.SEATS_ARR)
#     # .select('SEATS_DEP', 'SEATS_ARR')
#     # .sum()
#     .sort('YEAR', 'MKT_TYPE')
#     .to_pandas()

# )

In [None]:
# (
#     px.line(df_error_look,
#     x = 'YEAR', y = ['SEATS_DEP', 'SEATS_ARR'],
#     facet_col = 'MKT_TYPE'
#     )
#     .update_yaxes(matches=None, showticklabels=True)

# )

In [None]:
# (
#     px.line(df_error_look,
#     x = 'YEAR', y = 'DIFF',
#     facet_col = 'MKT_TYPE'
#     )
#     .update_yaxes(matches=None, showticklabels=True)

# )

In [None]:
# graph_check_filter = (
#     df_filtered
#     .filter(d.APT_CODE_A == d.APT_CODE_B)
#     .group_by(['YEAR', 'MKT_TYPE'])
#     .agg(d.SEATS_DEP.sum().alias('SEATS'))
#     .sort('YEAR')
#     .to_pandas()
# )

In [None]:
# (
#     px.line(graph_check_filter,
#     x = 'YEAR', y = 'SEATS',
#     markers = True, 
#     facet_col = 'MKT_TYPE'
#     )
#     .update_yaxes(matches=None, showticklabels=True)
# )

In [None]:
# (
#     df_filtered
#     .filter(d.APT_CODE_A != d.APT_CODE_B)
#     .group_by(['YEAR', 'APT_CODE_A', 'APT_CODE_B', 'REGION_ID_A', 'REGION_ID_B', 'MKT_TYPE'])
#     .agg(d.FLT_DEP.sum(), d.FLT_ARR.sum(), d.SEATS_DEP.sum(), d.SEATS_ARR.sum())

#     .filter(d.SEATS_DEP != d.SEATS_ARR)

#     .with_columns(IS_MORE_THAN_10_FLT_DIFF=abs(d.FLT_DEP - d.FLT_ARR)>10)

#     .group_by(['YEAR', 'MKT_TYPE', 'IS_MORE_THAN_10_FLT_DIFF'])
#     .agg(d.SEATS_DEP.sum(), d.SEATS_ARR.sum())

#     .sort('YEAR', 'MKT_TYPE')
    

# )