<div class="alert alert-block alert-info"> Notebook to filter routes from the full set of possible routes, to be used alongside the other analysis notebooks. </div>

# Import

## lib

In [None]:
import numpy as np
import polars as pl
import pandas as pd
from polars import col as d
import glob
import os

## csv

In [None]:
# Data directory used by the loading cells.
# Defaults to the original local path, but can be overridden with the
# ATSLAB_DATA_DIR environment variable so the notebook can run on another machine.
# os.path.join(..., '') guarantees a trailing separator, which the
# `folder_path + "<file name>"` concatenations in this notebook rely on.
folder_path = os.path.join(
    os.environ.get('ATSLAB_DATA_DIR', '/home/sara/Desktop/ATSLab/data/'),
    '',
)

In [None]:
# Load the input tables stored under folder_path.
df_airports_lookup_modif = pl.read_csv(f"{folder_path}df_airports_lookup_modif.csv")
df_airports_metrics_modif = pl.read_csv(f"{folder_path}df_airports_metrics_modif.csv")
df_scheduled = pl.read_parquet(
    f"{folder_path}scheduled_dataset_transatlantic_enhanced.parquet"
)  ## change name

# The airport ratings file is read from its own absolute path, not from folder_path.
df_airports_ratings = pl.read_csv(
    "/home/sara/Desktop/ATSLab/data_scrapping/csv_output/20250819_output_airport_ratings.csv"
)

In [None]:
# Full set of candidate routes; every filter in this notebook starts from this frame.
df_route_combinaison_enhanced = pl.read_parquet(
    f"{folder_path}df_route_combinaison_enhanced.parquet"
)

# Filtering based on XLR performance / airports feasibility

## range

In [None]:
(
    ## keep routes whose DIST_GC_KM lies in [7400, 8700] (both bounds inclusive)
    ## row count noted by the author: from 283_404 to 80_880
    df_route_combinaison_enhanced
    .filter(
        (pl.col("DIST_GC_KM") >= 7400)
        & (pl.col("DIST_GC_KM") <= 8700)
    )
    .head(2)
)

## landing/take off

In [None]:
(
    ## keep only the routes whose IS_FEASIBLE flag is true
    ## row count noted by the author: from 283_404 to 97_044
    df_route_combinaison_enhanced
    .filter(pl.col("IS_FEASIBLE"))
    .head(2)
)

## width runway

We don't have runway width data yet, but it would be a useful additional filter.

## heliport / aerodrome / industrial

In [None]:
# Airport codes whose GOOGLE_NAME contains 'Heliport' or 'Aerodrome'.
list_heli_aero = (
    df_airports_ratings
    .filter(
        pl.col("GOOGLE_NAME").str.contains('Heliport')
        | pl.col("GOOGLE_NAME").str.contains('Aerodrome')
    )
    .get_column("APT_CODE")
    .to_list()
)

In [None]:
# Industrial / non-commercial airports to exclude from the routes (a single entry for now).
list_industrial_aiport = ["YMX"]

In [None]:
(
    ## Every filter in this cell is commented out, so it only previews the
    ## first two rows of the unfiltered routes.
    df_route_combinaison_enhanced

    ## heliport / aerodrome exclusion -- row count noted by the author: from 283_404 to 282_048
    ## NOTE(review): both lines test APT_CODE_A; the second presumably should test APT_CODE_B
    # .filter(~d.APT_CODE_A.is_in(list_heli_aero))
    # .filter(~d.APT_CODE_A.is_in(list_heli_aero))

    ## industrial / non-commercial airport exclusion -- from 283_404 to 282_568
    ## (only 1 airport listed for the moment, can be completed with the Wikipedia data)
    # .filter(~d.APT_CODE_A.is_in(list_industrial_aiport))
    # .filter(~d.APT_CODE_B.is_in(list_industrial_aiport))

    .head(n=2)
)

## closed airport

In [None]:
(
    ## routes where at least one of the two airports has a closing year
    ## row count noted by the author: 5_424
    df_route_combinaison_enhanced
    .filter(
        ~(
            pl.col("CLOSING_YEAR_A").is_null()
            & pl.col("CLOSING_YEAR_B").is_null()
        )
    )
    .head(2)
)

## combining every criterion

In [None]:
# Routes that pass every performance / airport-feasibility criterion of this section.
# Row count noted by the author before the APT_CODE_B fix below: from 283_404 to 29_602
# (re-check this figure after re-running).
df_combinaison_filtered_perfo = (
    df_route_combinaison_enhanced
    .filter(d.IS_FEASIBLE)

    ## distance window; rows with a null DIST_GC_KM are dropped by these comparisons too
    .filter(d.DIST_GC_KM <= 8700)
    .filter(d.DIST_GC_KM >= 7400)

    ## keep only routes where neither airport has a closing year
    .filter(d.CLOSING_YEAR_A.is_null() & d.CLOSING_YEAR_B.is_null())

    ## exclude industrial / non-commercial airports on either end
    .filter(~d.APT_CODE_A.is_in(list_industrial_aiport))
    .filter(~d.APT_CODE_B.is_in(list_industrial_aiport))

    ## exclude heliports / aerodromes on either end
    ## (fix: the second filter used to test APT_CODE_A again, so APT_CODE_B was never checked;
    ##  the author's earlier note was "doesn't change anything")
    .filter(~d.APT_CODE_A.is_in(list_heli_aero))
    .filter(~d.APT_CODE_B.is_in(list_heli_aero))

    # .filter(d.DIST_GC_KM.is_null()) ## 7_458
)

# Filtering based on airport metrics

## total drive distance

In [None]:
(
    df_combinaison_filtered_perfo

    ## upper bound (disabled) -- row count noted by the author: from 29_602 to 25_682
    # .filter(d.ROUTE_DRIVE_DIST_KM <= 100)

    ## lower bound -- row count noted by the author: from 29_602 to 25_930
    .filter(pl.col("ROUTE_DRIVE_DIST_KM") >= 20)

    .head(2)
)

## drive time

In [None]:
(
    df_combinaison_filtered_perfo

    ## per-airport upper bound (disabled) -- row count noted by the author: from 29_602 to 28_298
    # .filter(d.APT_CITY_DRIVE_TIME_H_A <= 1.5)
    # .filter(d.APT_CITY_DRIVE_TIME_H_B <= 1.5)

    ## route-level lower bound -- row count noted by the author: from 29_602 to 27_994
    .filter(pl.col("ROUTE_DRIVE_TIME_H") >= 0.4)

    .head(2)
)

## number of reviews

In [None]:
(
    df_combinaison_filtered_perfo

    ## per-airport threshold (disabled) -- row count noted by the author: from 29_602 to 22_862
    # .filter((d.NB_REVIEW_LOG_A > 5) & (d.NB_REVIEW_LOG_B > 5))

    ## threshold on the sum of both airports -- row count noted by the author: from 29_602 to 24_551
    .filter(
        (pl.col("NB_REVIEW_LOG_A") + pl.col("NB_REVIEW_LOG_B")) >= 13
    )
)

## number of runways

In [None]:
(
    ## drop routes where both airports have exactly one runway,
    ## i.e. keep those where at least one side differs from 1
    ## row count noted by the author: from 29_602 to 27_186
    df_combinaison_filtered_perfo
    .filter(
        (pl.col("NB_RUNWAYS_A") != 1)
        | (pl.col("NB_RUNWAYS_B") != 1)
    )
    .head(2)
)

## ratings

In [None]:
(
    ## Every rating filter in this cell is commented out, so it only previews
    ## the first two rows of df_combinaison_filtered_perfo.
    df_combinaison_filtered_perfo

    ## route-level rating cap -- row count noted by the author: from 29_602 to 28_891
    # .filter(d.ROUTE_RATING <= 9)

    ## both airports rated above 3 -- row count noted by the author: from 29_602 to 28_919
    # .filter((d.RATING_A > 3) & (d.RATING_B > 3))

    ## drop routes where both airports are rated below 3.5 (author's note: 14)
    # .filter(~((d.RATING_A < 3.5) & (d.RATING_B < 3.5))) ## 14

    .head(n=2)
)

## is island

In [None]:
(
    ## drop routes where both airports are flagged as islands
    ## row count noted by the author: from 29_602 to 29_537
    df_combinaison_filtered_perfo
    .filter(
        ~(
            (pl.col("IS_ISLAND_A") == 1)
            & (pl.col("IS_ISLAND_B") == 1)
        )
    )
)

## elevation

In [None]:
(
    ## drop routes matching any of the elevation combinations below
    ## (all exclusions merged into one predicate; a row must pass every one of them)
    ## row count noted by the author: from 29_602 to 28_609
    df_combinaison_filtered_perfo
    .filter(
        ## both airports high
        ~((pl.col("ELEV_FT_A") >= 1500) & (pl.col("ELEV_FT_B") >= 1500))
        ## one airport very high, the other moderately high
        & ~((pl.col("ELEV_FT_A") >= 500) & (pl.col("ELEV_FT_B") >= 2500))
        & ~((pl.col("ELEV_FT_A") >= 2500) & (pl.col("ELEV_FT_B") >= 500))
        ## one airport at negative elevation, the other at 1000 ft or more
        & ~((pl.col("ELEV_FT_A") < 0) & (pl.col("ELEV_FT_B") >= 1000))
        & ~((pl.col("ELEV_FT_B") < 0) & (pl.col("ELEV_FT_A") >= 1000))
    )
    .head(2)
)

## time zone

In [None]:
(
    ## keep routes whose TIME_ZONE_2016_A is above 6
    ## (the resulting row count was left blank by the author: "from 29_602 to ")
    ## NOTE(review): only the A side is filtered here -- confirm this is intended
    df_combinaison_filtered_perfo
    .filter(pl.col("TIME_ZONE_2016_A") > 6)
)

## combining filter

In [None]:
(
    ## All airport-metric criteria applied together, one .filter() per criterion.
    ## row count noted by the author: from 29_602 to 15_034
    df_combinaison_filtered_perfo

    ## runways: drop routes where both airports have exactly one runway
    .filter(
        (pl.col("NB_RUNWAYS_A") != 1)
        | (pl.col("NB_RUNWAYS_B") != 1)
    )

    ## reviews: per-airport threshold plus a threshold on the sum
    .filter(
        (pl.col("NB_REVIEW_LOG_A") > 5)
        & (pl.col("NB_REVIEW_LOG_B") > 5)
        & ((pl.col("NB_REVIEW_LOG_A") + pl.col("NB_REVIEW_LOG_B")) >= 13)
    )

    ## drive time: per-airport upper bound, route-level lower bound
    .filter(
        (pl.col("APT_CITY_DRIVE_TIME_H_A") <= 1.5)
        & (pl.col("APT_CITY_DRIVE_TIME_H_B") <= 1.5)
        & (pl.col("ROUTE_DRIVE_TIME_H") >= 0.4)
    )

    ## drive distance: between 20 and 100 (inclusive)
    .filter(
        (pl.col("ROUTE_DRIVE_DIST_KM") <= 100)
        & (pl.col("ROUTE_DRIVE_DIST_KM") >= 20)
    )

    ## ratings
    .filter(
        (pl.col("ROUTE_RATING") <= 9)
        & (pl.col("RATING_A") > 3)
        & (pl.col("RATING_B") > 3)
        & ~((pl.col("RATING_A") < 3.5) & (pl.col("RATING_B") < 3.5))
    )

    ## islands: drop routes where both airports are flagged as islands
    .filter(
        ~((pl.col("IS_ISLAND_A") == 1) & (pl.col("IS_ISLAND_B") == 1))
    )

    ## elevation (the author marked this criterion as "not sure")
    .filter(
        ~((pl.col("ELEV_FT_A") >= 1500) & (pl.col("ELEV_FT_B") >= 1500))
        & ~((pl.col("ELEV_FT_A") >= 500) & (pl.col("ELEV_FT_B") >= 2500))
        & ~((pl.col("ELEV_FT_A") >= 2500) & (pl.col("ELEV_FT_B") >= 500))
        & ~((pl.col("ELEV_FT_A") < 0) & (pl.col("ELEV_FT_B") >= 1000))
        & ~((pl.col("ELEV_FT_B") < 0) & (pl.col("ELEV_FT_A") >= 1000))
    )

    # .filter(d.HAS_EXISTED) #313
)