<div class="alert alert-block alert-info">Notebook with an initial look at how the routes were opened. </div>

# Import

## lib

In [None]:
import numpy as np
import scipy
import polars as pl
import pandas as pd
from polars import col as d
import plotly
import plotly.express as px
import glob
import os
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import math

## csv

In [None]:
# Root folder of the input files. Machine-specific absolute path; the file names
# are appended to it by plain string concatenation, so the trailing slash matters.
folder_path = '/home/sara/Desktop/ATSLab/data/' 

In [None]:
# Scheduled transatlantic dataset.  TODO: change the file name.
scheduled_file = "scheduled_dataset_transatlantic_enhanced.parquet"
df_scheduled = pl.read_parquet(folder_path + scheduled_file)

# Variant of the same dataset, presumably without the COVID period (judging by
# the file name).  TODO: change the file name.
scheduled_wo_covid_file = "scheduled_dataset_wo_covid_transatlantic_enhanced.parquet"
df_scheduled_wo_covid = pl.read_parquet(folder_path + scheduled_wo_covid_file)

In [None]:
# Lookup tables: airline code -> airline group, and airport code -> coordinates.
airline_mapping_file = 'df_airline_mapping.csv'
airports_lookup_file = 'df_airports_lookup_modif.csv'

df_airline_mapping = pl.read_csv(folder_path + airline_mapping_file)
df_airports_lookup = pl.read_csv(folder_path + airports_lookup_file)

# how many seats by market type

In [None]:
# Total seats and flights per year and market type, as a pandas frame for plotly.
year_market = ['YEAR', 'MKT_TYPE']

df_metrics_by_year = (
    df_scheduled
    .group_by(year_market)
    .agg(d('SEATS', 'FLT').sum())
    .sort(year_market)
    .to_pandas()
)

In [None]:
# Seats per year, one panel per market type; y axes are not shared between panels.
fig = px.line(
    df_metrics_by_year,
    x='YEAR',
    y='SEATS',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.05,
    markers=True,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

# how many directional/undirectional routes?

In [None]:
# Route identifiers: the directional code keeps the A-B order of the row, the
# undirectional code always puts the lexicographically smaller airport first.
a_to_b = d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B
b_to_a = d.APT_CODE_B + pl.lit('-') + d.APT_CODE_A
undirected = pl.when(d.APT_CODE_A < d.APT_CODE_B).then(a_to_b).otherwise(b_to_a)

# Number of distinct directional / undirectional routes per year and market type.
df_nb_rte = (
    df_scheduled
    .with_columns(DIR_APT_PAIR_CODE=a_to_b, UNDIR_APT_PAIR_CODE=undirected)
    .group_by('YEAR', 'MKT_TYPE')
    .agg(
        d.DIR_APT_PAIR_CODE.unique().count().alias('NB_DIR_APT_PAIR_CODE'),
        d.UNDIR_APT_PAIR_CODE.unique().count().alias('NB_UNDIR_APT_PAIR_CODE'),
    )
    .sort('YEAR', 'MKT_TYPE')
    .to_pandas()
)

In [None]:
# Directional routes per year, one panel per market type.
fig = px.line(
    df_nb_rte,
    x='YEAR',
    y='NB_DIR_APT_PAIR_CODE',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    markers=True,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(dtick=1, range=[2000, 2023], autotickangles=[45, 45, 45])
fig

In [None]:
# Undirectional routes per year, one panel per market type.
# Bar-chart alternative kept for reference:
# px.bar(df_nb_rte,  x = 'YEAR', y ='NB_UNDIR_APT_PAIR_CODE', facet_col = 'MKT_TYPE', orientation='v', facet_col_spacing=0.05).update_yaxes(matches=None, showticklabels=True)
fig = px.line(
    df_nb_rte,
    x='YEAR',
    y='NB_UNDIR_APT_PAIR_CODE',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    markers=True,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

## directional view for the international market

In [None]:
# Number of distinct directional routes per year and DIRECTION, restricted to
# the international ('INTER') market.
# The undirectional pair code is not needed for this count, so it is not
# computed, and the market filter runs first so the pair code is only built
# for the rows that are kept.
df_zoom_direction_inter = (
    df_scheduled
    .filter(d.MKT_TYPE == 'INTER')
    .with_columns(DIR_APT_PAIR_CODE = d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B)
    .group_by('YEAR', 'DIRECTION')
    .agg(d.DIR_APT_PAIR_CODE.unique().count().alias('NB_APT_PAIR_CODE'))
    .sort('YEAR', 'DIRECTION')
    .to_pandas()
)

In [None]:
# International directional routes per year, one line per DIRECTION.
fig = px.line(
    df_zoom_direction_inter,
    x='YEAR',
    y='NB_APT_PAIR_CODE',
    color='DIRECTION',
    markers=True,
)
fig.update_xaxes(dtick=1, range=[1999.5, 2023.5])
fig

In [None]:
# Routes of the international market that are NOT flown in both directions in
# a given year, counted per year and per DIRECTION.
dir_code = d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B
undir_code = (
    pl.when(d.APT_CODE_A < d.APT_CODE_B)
    .then(d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B)
    .otherwise(d.APT_CODE_B + pl.lit('-') + d.APT_CODE_A)
)
# How many distinct directional codes share the same undirectional route in a year.
nb_directions = d.DIR_APT_PAIR_CODE.unique().count().over('YEAR', 'UNDIR_APT_PAIR_CODE')
# Two directional codes -> the route exists both ways; otherwise keep its DIRECTION.
direction_tag = (
    pl.when(d.COUNT == 2)
    .then(pl.lit('both direction'))
    .otherwise(d.DIRECTION)
)

inter_routes_tagged = (
    df_scheduled
    .with_columns(DIR_APT_PAIR_CODE=dir_code, UNDIR_APT_PAIR_CODE=undir_code)
    .filter(d.MKT_TYPE == 'INTER')
    .with_columns(COUNT=nb_directions)
    .with_columns(TAG=direction_tag)
)

df_search_asymetry = (
    inter_routes_tagged
    .group_by('YEAR', 'TAG')
    .agg(d.DIR_APT_PAIR_CODE.unique().count().alias('NB_DIR_APT_PAIR_CODE'))
    .sort('YEAR', 'TAG')
    # only the one-way routes are of interest here
    .filter(d.TAG != 'both direction')
    .rename({'TAG': 'DIRECTION'})
    .to_pandas()
)

In [None]:
# One-way international routes per year, one line per DIRECTION.
fig = px.line(
    df_search_asymetry,
    x='YEAR',
    y='NB_DIR_APT_PAIR_CODE',
    color='DIRECTION',
    markers=True,
)
fig.update_xaxes(dtick=1, range=[1999.5, 2023.5])
fig

# how many routes open/close/reopen/take a break: is it dynamic?

## global waterfall

In [None]:
# Waterfall data per year and market type: the yearly row count (NB_DIR_RTE)
# followed by the signed route changes, plus the BASE at which each bar starts.
df_waterfall = (    
    df_scheduled
    .with_columns(DIR_APT_PAIR_CODE = d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B)
    .group_by(['YEAR', 'MKT_TYPE'])
    # count() counts rows: NB_DIR_RTE is a number of routes only if df_scheduled
    # holds one row per directional route and year — TODO confirm
    .agg(d.DIR_APT_PAIR_CODE.count().alias('NB_DIR_RTE'), d.NB_OPENING_RTE.sum(), d.NB_REOPENING_RTE.sum(), d.NB_ENDING_RTE.sum(), d.NB_PAUSE_RTE.sum())
    # endings and pauses are stored as negative values so their bars go downwards
    .with_columns(NB_ENDING_RTE = -d.NB_ENDING_RTE, NB_PAUSE_RTE = -d.NB_PAUSE_RTE)
    # NOTE(review): looks redundant, the frame is fully re-sorted after the unpivot
    .sort('YEAR')
    # long format: one row per (YEAR, MKT_TYPE, metric)
    .unpivot(index=['YEAR', 'MKT_TYPE'])
    .rename({'variable':'RTE_TYPE', 'value':'NB_RTE'})
    # ORDER fixes the position of each metric inside a (YEAR, MKT_TYPE) group
    .with_columns(ORDER = pl.when(d.RTE_TYPE == 'NB_DIR_RTE')
                            .then(0)
                            .when(d.RTE_TYPE == 'NB_ENDING_RTE')
                            .then(1)
                            .when(d.RTE_TYPE == 'NB_PAUSE_RTE')
                            .then(2)
                            .when(d.RTE_TYPE == 'NB_REOPENING_RTE')
                            .then(3)
                            .when(d.RTE_TYPE == 'NB_OPENING_RTE')
                            .then(4)
                )
    .sort('YEAR', 'MKT_TYPE', 'ORDER')
    # Reopenings and openings are moved to the previous year's group, so the group
    # labelled Y holds: count(Y), -endings(Y), -pauses(Y), +reopenings(Y+1), +openings(Y+1).
    # This must happen after the sort above: the row order is what the cumulative sum uses.
    .with_columns(YEAR = pl.when(d.RTE_TYPE.is_in(['NB_REOPENING_RTE', 'NB_OPENING_RTE']))
                           .then(d.YEAR - 1)
                           .otherwise(d.YEAR)
    )
    # BASE = sum of the preceding rows of the same (YEAR, MKT_TYPE) group,
    # i.e. the level at which the current bar starts
    .with_columns(BASE = d.NB_RTE.cum_sum().shift(1).over('YEAR', 'MKT_TYPE'))
    # the route-count bar always starts from zero
    .with_columns(BASE = pl.when(d.RTE_TYPE == 'NB_DIR_RTE')
                           .then(0)
                           .otherwise(d.BASE)
    )
    .to_pandas()


)

In [None]:
# Waterfall chart: one row of panels per market type, bars start at BASE.
waterfall_colors = {
    "NB_OPENING_RTE": "green",
    "NB_REOPENING_RTE": "dodgerblue",
    "NB_ENDING_RTE": "red",
    "NB_PAUSE_RTE": "orange",
    "NB_DIR_RTE": "gray",
}

fig = px.bar(
    df_waterfall,
    x='YEAR',
    y='NB_RTE',
    base='BASE',
    color='RTE_TYPE',
    facet_row='MKT_TYPE',
    barmode='group',
    height=1500,
    color_discrete_map=waterfall_colors,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(
    showticklabels=True,
    dtick=1,
    autotickangles=[45, 45, 45],
    range=[1999.1, 2023.1],
)
fig.update_layout(bargap=0)
fig

## zoom on the 4 categories

In [None]:
# Openings, reopenings, endings and pauses per year and market type.
# Endings and pauses are negated so they are drawn below the x axis.
# The directional pair code is not used by any of these sums, so it is not computed.
df_plot_nb_rte_breakdown = (
    df_scheduled
    .group_by(['YEAR', 'MKT_TYPE'])
    .agg(d.NB_OPENING_RTE.sum(), d.NB_REOPENING_RTE.sum(), d.NB_ENDING_RTE.sum(), d.NB_PAUSE_RTE.sum())
    .with_columns(NB_ENDING_RTE = -d.NB_ENDING_RTE, NB_PAUSE_RTE = -d.NB_PAUSE_RTE)
    .sort('YEAR', 'MKT_TYPE')
)

In [None]:
# Stacked bars of the four route-change categories, one panel per market type.
breakdown_colors = {
    "NB_OPENING_RTE": "green",
    "NB_REOPENING_RTE": "dodgerblue",
    "NB_ENDING_RTE": "red",
    "NB_PAUSE_RTE": "orange",
}
breakdown_metrics = ['NB_OPENING_RTE', 'NB_REOPENING_RTE', 'NB_ENDING_RTE', 'NB_PAUSE_RTE']

fig = px.bar(
    df_plot_nb_rte_breakdown.to_pandas(),
    x='YEAR',
    y=breakdown_metrics,
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    height=600,
    color_discrete_map=breakdown_colors,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

# how many seats do the previous 4 categories represent

In [None]:
# Seats carried by the rows flagged as opening / reopening / pause / end, per
# year and market type, next to the total seats. Pause and end are negated so
# they are drawn below the x axis.
flag_by_seat_column = {
    'SEATS_OPENING': 'IS_OPENING',
    'SEATS_REOPENING': 'IS_REOPENING',
    'SEATS_PAUSE': 'IS_PAUSE',
    'SEATS_END': 'IS_END',
}
seats_per_category = [
    d.SEATS.filter(d(flag)).sum().alias(seat_column)
    for seat_column, flag in flag_by_seat_column.items()
]

df_4_cat_seats = (
    df_scheduled
    .group_by(['YEAR', 'MKT_TYPE'])
    .agg(*seats_per_category, d.SEATS.sum().alias('SEATS'))
    .with_columns(SEATS_PAUSE=-d.SEATS_PAUSE, SEATS_END=-d.SEATS_END)
    .sort('YEAR', 'MKT_TYPE')
    .to_pandas()
)

In [None]:
# Stacked bars of the seats of the four categories, one panel per market type.
seat_colors = {
    "SEATS_OPENING": "green",
    "SEATS_REOPENING": "dodgerblue",
    "SEATS_END": "red",
    "SEATS_PAUSE": "orange",
}
seat_metrics = ['SEATS_OPENING', 'SEATS_REOPENING', 'SEATS_END', 'SEATS_PAUSE']

fig = px.bar(
    df_4_cat_seats,
    x='YEAR',
    y=seat_metrics,
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    height=600,
    color_discrete_map=seat_colors,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

# what's the duration of the first opening/launch

In [None]:
# Openings per year and market type, split by the duration of the first opening
# binned into '1', '2' and '3+' (anything that is not 1 or 2 lands in '3+'),
# with the share of each bin inside its year/market. Years from 2020 on are dropped.
duration_bin = (
    pl.when(d.DURATION_FIRST_OPENING.is_in([1, 2]))
    .then(d.DURATION_FIRST_OPENING.cast(pl.Utf8))
    .otherwise(pl.lit('3+'))
)
bin_keys = ['YEAR', 'MKT_TYPE', 'BIN_DURATION_FIRST_OPENING']
bin_share = d.NB_OPENING_RTE / d.NB_OPENING_RTE.sum().over(['YEAR', 'MKT_TYPE'])

df_duration_first_opening = (
    df_scheduled
    .filter(d.IS_OPENING)
    .with_columns(BIN_DURATION_FIRST_OPENING=duration_bin)
    .group_by(bin_keys)
    .agg(d.NB_OPENING_RTE.sum())
    # bins in descending order inside each year/market: '3+', '2', '1'
    .sort(bin_keys, descending=[False, False, True])
    .filter(d.YEAR < 2020)
    .with_columns(PCT=bin_share)
    .rename({'BIN_DURATION_FIRST_OPENING': 'DURATION_FIRST_OPENING'})
    .to_pandas()
)

In [None]:
# (
#     px.bar(df_duration_first_opening, 
#     x = 'YEAR', y = 'PCT', 
#     color = 'DURATION_FIRST_OPENING', 
#     facet_col='MKT_TYPE',
#     height=400, 
#     color_discrete_sequence = px.colors.qualitative.D3
#     )

#     .update_yaxes(matches=None, showticklabels=True)

#     .add_hline(
#     y=0.582475,
#     line_dash="dash",
#     line_color="black",
#     row=1,
#     col=1
# )

#     .add_hline(
#     y=0.531819,
#     line_dash="dash",
#     line_color="black",
#     row=1,
#     col=2
# )

#     .add_hline(
#     y=0.57262,
#     line_dash="dash",
#     line_color="black",
#     row=1,
#     col=3
# )

# )

In [None]:
# Distribution (and cumulative distribution) of the first-opening duration per
# market type, over all opening rows.  Warning: maybe a COVID effect.
openings_per_duration = (
    df_scheduled
    .filter(d.IS_OPENING)
    .with_columns(COUNT=1)
    .group_by("DURATION_FIRST_OPENING", "MKT_TYPE")
    .agg(d.COUNT.sum())
)
df_density_opening_duration = (
    openings_per_duration
    .with_columns(DISTRIBUTION=d.COUNT / d.COUNT.sum().over('MKT_TYPE'))
    .sort("MKT_TYPE", "DURATION_FIRST_OPENING")
    # the running sum relies on the sort just above
    .with_columns(CDF=d.DISTRIBUTION.cum_sum().over('MKT_TYPE'))
)

# Switch the `y` argument between "DISTRIBUTION" and 'CDF' to change the view.
fig = px.bar(
    df_density_opening_duration.to_pandas(),
    x="DURATION_FIRST_OPENING",
    y="DISTRIBUTION",
    # y='CDF',
    color="MKT_TYPE",
    facet_col='MKT_TYPE',
    facet_col_spacing=0.025,
    # markers=True,
)
fig.update_xaxes(dtick=1)
fig.update_yaxes(showticklabels=True)
fig


In [None]:
# Same distribution as the previous cell, computed on the dataset without COVID.
openings_per_duration = (
    df_scheduled_wo_covid
    # .filter(d.YEAR < 2018)
    .filter(d.IS_OPENING)
    .with_columns(COUNT=1)
    .group_by("DURATION_FIRST_OPENING", "MKT_TYPE")
    .agg(d.COUNT.sum())
)
df_density_opening_duration = (
    openings_per_duration
    .with_columns(DISTRIBUTION=d.COUNT / d.COUNT.sum().over('MKT_TYPE'))
    .sort("MKT_TYPE", "DURATION_FIRST_OPENING")
    # the running sum relies on the sort just above
    .with_columns(CDF=d.DISTRIBUTION.cum_sum().over('MKT_TYPE'))
)

# Switch the `y` argument between "DISTRIBUTION" and 'CDF' to change the view.
fig = px.bar(
    df_density_opening_duration.to_pandas(),
    x="DURATION_FIRST_OPENING",
    y="DISTRIBUTION",
    # y='CDF',
    color="MKT_TYPE",
    facet_col='MKT_TYPE',
    facet_col_spacing=0.025,
    # markers=True,
)
fig.update_xaxes(dtick=1)
fig.update_yaxes(showticklabels=True)
fig


# how many breaks does a route take? (TO CHECK)

In [None]:
# Distribution (and cumulative distribution) of the total number of breaks per
# route, by market type.
# NOTE(review): grouped on TOTAL_BREAKS rather than DURATION_FIRST_OPENING, so
# that this cell answers "how many breaks does a route take" instead of
# repeating the first-opening duration distribution. This assumes TOTAL_BREAKS
# is a route-level total available on the opening row — TODO confirm.
df_density_pause = (
    df_scheduled
    # keep the opening rows only (presumably one per route)
    .filter(d.IS_OPENING)
    .with_columns(COUNT = 1)
    .group_by("TOTAL_BREAKS", "MKT_TYPE")
    .agg(d.COUNT.sum())
    .with_columns(DISTRIBUTION = d.COUNT/d.COUNT.sum().over('MKT_TYPE'))
    .sort("MKT_TYPE", "TOTAL_BREAKS")
    # the running sum relies on the sort just above
    .with_columns(CDF = d.DISTRIBUTION.cum_sum().over('MKT_TYPE'))
)

# Switch the `y` argument between "DISTRIBUTION" and 'CDF' to change the view.
(
    px.bar(
    df_density_pause.to_pandas(),
    x="TOTAL_BREAKS",
    y="DISTRIBUTION",
    # y = 'CDF',
    color="MKT_TYPE",
    facet_col = 'MKT_TYPE',
    facet_col_spacing=0.025,
)
    .update_xaxes(dtick = 1)
    .update_yaxes(showticklabels=True)
)


In [None]:
# Number of rows per year, market type and total number of breaks, with the
# share of each break count inside its year/market.
df_count_break = (
    df_scheduled
    .with_columns(DIR_APT_PAIR_CODE = d.APT_CODE_A + pl.lit('-') + d.APT_CODE_B)
    # YEAR has to be a grouping key: the share below, the sort and the plots of
    # this section all use it, and it does not survive the group_by otherwise
    .group_by(['YEAR', 'TOTAL_BREAKS', 'MKT_TYPE'])
    .agg(d.DIR_APT_PAIR_CODE.count().alias('NB_DIR_APT_RTE'))
    .with_columns(PCT = d.NB_DIR_APT_RTE/d.NB_DIR_APT_RTE.sum().over('YEAR', 'MKT_TYPE'))
    .sort('YEAR', 'MKT_TYPE', 'TOTAL_BREAKS')
    .to_pandas()
)

In [None]:
# Rows per year split by total number of breaks, one panel per market type.
# NOTE(review): if TOTAL_BREAKS is numeric, plotly express uses a continuous
# colour scale and color_discrete_sequence has no effect — confirm the dtype.
fig = px.bar(
    df_count_break,
    x='YEAR',
    y='NB_DIR_APT_RTE',
    color='TOTAL_BREAKS',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    orientation='v',
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# Same view as the previous cell, as a share (PCT) of each year/market total.
# NOTE(review): if TOTAL_BREAKS is numeric, plotly express uses a continuous
# colour scale and color_discrete_sequence has no effect — confirm the dtype.
fig = px.bar(
    df_count_break,
    x='YEAR',
    y='PCT',
    color='TOTAL_BREAKS',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    orientation='v',
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

# how much traffic do new routes represent

In [None]:
# Flights and seats per year and market type, split by the period in which the
# route first existed, plus the seat share of each period inside its year/market.
first_year = d.FIRST_EXISTING_YEAR
# Every value not matched by an explicit period falls into '2020-2023'.
opening_period = (
    pl.when(first_year == 2000).then(pl.lit('2000'))
    .when(first_year.is_in(list(range(2001, 2006)))).then(pl.lit('2001-2005'))
    .when(first_year.is_in(list(range(2006, 2011)))).then(pl.lit('2006-2010'))
    .when(first_year.is_in(list(range(2011, 2016)))).then(pl.lit('2011-2015'))
    .when(first_year.is_in(list(range(2016, 2020)))).then(pl.lit('2016-2019'))
    .otherwise(pl.lit('2020-2023'))
)

df_opening_when_non_directionel = (
    df_scheduled
    .with_columns(OPENING_YEAR=opening_period)
    .group_by(['YEAR', 'OPENING_YEAR', 'MKT_TYPE'])
    .agg(d('FLT', 'SEATS').sum())
    .with_columns(PCT=d.SEATS / d.SEATS.sum().over('YEAR', 'MKT_TYPE'))
    .sort('YEAR', 'MKT_TYPE', 'OPENING_YEAR')
    .to_pandas()
)

In [None]:
# Seats per year stacked by opening period, one panel per market type.
fig = px.bar(
    df_opening_when_non_directionel,
    x='YEAR',
    y='SEATS',
    color='OPENING_YEAR',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(dtick=1, autotickangles=[45, 45, 45])
fig

In [None]:
# Seat share per year stacked by opening period, one panel per market type.
fig = px.bar(
    df_opening_when_non_directionel,
    x='YEAR',
    y='PCT',
    color='OPENING_YEAR',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(dtick=1, autotickangles=[45, 45, 45])
fig

In [None]:
# Same breakdown as above but with one category per first existing year.
# OPENING_YEAR is the year cast to a string so it can be used as a colour label.
seat_share = d.SEATS / d.SEATS.sum().over('YEAR', 'MKT_TYPE')

df_opening_by_year = (
    df_scheduled
    .group_by(['YEAR', 'FIRST_EXISTING_YEAR', 'MKT_TYPE'])
    .agg(d('FLT', 'SEATS').sum())
    .with_columns(
        PCT_SEATS=seat_share,
        OPENING_YEAR=d.FIRST_EXISTING_YEAR.cast(pl.Utf8),
    )
    .sort('YEAR', 'MKT_TYPE', 'OPENING_YEAR')
    .to_pandas()
)

In [None]:
# Seat share per year stacked by first existing year, one panel per market type.
fig = px.bar(
    df_opening_by_year,
    x='YEAR',
    y='PCT_SEATS',
    color='OPENING_YEAR',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(dtick=1, autotickangles=[45, 45, 45])
fig

# range (great-circle distance) of the opening routes

In [None]:
# Haversine great-circle distance between airports A and B, written as lazy
# polars expressions: nothing is computed until they are used in a frame.
R = 6371.0  # sphere radius; the distance is R * c (stored later as DIST_GC_KM)


def _to_radians(column):
    """Return the expression converting a column of degrees to radians."""
    return pl.col(column) * np.pi / 180


lat1, lon1 = _to_radians("LATITUDE_A"), _to_radians("LONGITUDE_A")
lat2, lon2 = _to_radians("LATITUDE_B"), _to_radians("LONGITUDE_B")

dlat = lat2 - lat1
dlon = lon2 - lon1

sin_half_dlat = (dlat / 2).sin()
sin_half_dlon = (dlon / 2).sin()

# a: haversine of the central angle; c: the central angle itself, in radians
a = sin_half_dlat**2 + lat1.cos() * lat2.cos() * sin_half_dlon**2
c = 2 * a.sqrt().arcsin()


In [None]:
# Great-circle distance of every scheduled row, plus its mean and median per
# year, market type and opening flag.
# Uses the frames actually loaded by this notebook: df_scheduled for the routes
# and df_airports_lookup for the coordinates.
# Assumes df_scheduled does not already carry LATITUDE_*/LONGITUDE_* columns —
# TODO confirm. Airports with null coordinates in the lookup give a null distance.
coord_cols = ['APT_CODE', 'LATITUDE', 'LONGITUDE']
airport_coords = df_airports_lookup.select(coord_cols)
dist_window = ['YEAR', 'MKT_TYPE', 'IS_OPENING']

df_test_distance = (
    df_scheduled
    # attach the coordinates of both endpoints as *_A / *_B columns
    .join(airport_coords.rename({name: f"{name}_A" for name in coord_cols}), how = 'left', on = 'APT_CODE_A')
    .join(airport_coords.rename({name: f"{name}_B" for name in coord_cols}), how = 'left', on = 'APT_CODE_B')

    # R and c are the haversine radius and central-angle expression defined in the previous cell
    .with_columns(DIST_GC_KM = R * c)

    .with_columns(MEAN_DIST_GC_YEAR = d.DIST_GC_KM.mean().over(dist_window),
                  MEDIAN_DIST_GC_YEAR = d.DIST_GC_KM.median().over(dist_window),
                 )

    .sort('YEAR')
)

In [None]:
# Distance distribution per year on the international market, openings vs. the rest.
inter_distances = df_test_distance.filter(d.MKT_TYPE == 'INTER').to_pandas()

fig = px.box(inter_distances, x='YEAR', y='DIST_GC_KM', color='IS_OPENING')
fig



In [None]:
# One point per international row: distance against year, coloured by IS_OPENING.
inter_distances = df_test_distance.filter(d.MKT_TYPE == 'INTER').to_pandas()

fig = px.scatter(inter_distances, x='YEAR', y='DIST_GC_KM', color='IS_OPENING')
fig

In [None]:
# Mean distance per year on the international market, openings vs. the rest.
# The mean is repeated on every row of the frame, so each year is drawn many times.
inter_distances = df_test_distance.filter(d.MKT_TYPE == 'INTER').to_pandas()

fig = px.line(
    inter_distances,
    x='YEAR',
    y='MEAN_DIST_GC_YEAR',
    color='IS_OPENING',
    markers=True,
)
fig

# how many airlines are on the routes?

In [None]:
# Route metrics per year, market type and NB_OPE_AL, each with its share of the
# year/market total (columns suffixed with _PCT).
route_metrics = ['NB_RTE', 'NB_OPENING_RTE', 'NB_REOPENING_RTE', 'NB_ENDING_RTE', 'NB_PAUSE_RTE']
share_window = ['YEAR', 'MKT_TYPE']
metric_shares = {
    f'{metric}_PCT': d(metric) / d(metric).sum().over(share_window)
    for metric in route_metrics
}

df_nb_ope = (
    df_scheduled
    # every row counts as one route
    .with_columns(NB_RTE=1)
    # .with_columns(NB_OPENING_RTE = d.NB_OPENING_RTE_L3 + d.NB_OPENING_RTE_ME3)
    .group_by(['YEAR', 'MKT_TYPE', 'NB_OPE_AL'])
    .agg(d(route_metrics).sum())
    .sort('MKT_TYPE', 'YEAR', 'NB_OPE_AL')
    .with_columns(**metric_shares)
)

## in general

In [None]:
# NB_RTE per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_RTE',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# NB_RTE_PCT per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_RTE_PCT',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

## at opening

In [None]:
# NB_OPENING_RTE per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_OPENING_RTE',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# NB_OPENING_RTE_PCT per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_OPENING_RTE_PCT',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

## at reopening

In [None]:
# NB_REOPENING_RTE per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_REOPENING_RTE',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# NB_REOPENING_RTE_PCT per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_REOPENING_RTE_PCT',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

## at end

In [None]:
# NB_ENDING_RTE per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_ENDING_RTE',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# NB_ENDING_RTE_PCT per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_ENDING_RTE_PCT',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

## at break

In [None]:
# NB_PAUSE_RTE per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_PAUSE_RTE',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

In [None]:
# NB_PAUSE_RTE_PCT per year, coloured by NB_OPE_AL, one panel per market type.
fig = px.bar(
    df_nb_ope.to_pandas(),
    x='YEAR',
    y='NB_PAUSE_RTE_PCT',
    color='NB_OPE_AL',
    facet_col='MKT_TYPE',
    facet_col_spacing=0.03,
)
fig.update_yaxes(matches=None, showticklabels=True)
fig

# is it always the same carriers that open new routes on the international market?

Be careful: here we are looking at the operating carrier.

In [None]:
# Number of international route openings per year and airline group.
# Each opening row is repeated once per airline of its LIST_OPE_AL; airlines
# missing from the mapping are grouped under 'unknown'.
airline_groups = df_airline_mapping.rename({'AL': 'OPE_AL'})

inter_openings_by_airline = (
    df_scheduled
    .filter(d.IS_OPENING, d.MKT_TYPE == 'INTER')
    .select('YEAR', d.LIST_OPE_AL.alias('OPE_AL'))
    .explode('OPE_AL')
)

df_plot_al_opening = (
    inter_openings_by_airline
    .join(airline_groups, how='left', on='OPE_AL')
    .with_columns(AL_GROUP=d.AL_GROUP.fill_null('unknown'))
    .group_by(['YEAR', 'AL_GROUP'])
    .agg(pl.len().alias('NB_RTE_OPEN'))
    # biggest openers first inside each year
    .sort(['YEAR', 'NB_RTE_OPEN'], descending=[False, True])
)

In [None]:
# Openings per year stacked by airline group.
fig = px.bar(
    df_plot_al_opening.to_pandas(),
    x='YEAR',
    y='NB_RTE_OPEN',
    color='AL_GROUP',
    orientation='v',
)
fig

## top 10 airline groups by number of routes opened

In [None]:
# Ten airline groups with the most openings, summed over all years.
(
    df_plot_al_opening
    .group_by('AL_GROUP')
    .agg(d.NB_RTE_OPEN.sum())
    .sort('NB_RTE_OPEN', descending=True)
    .head(10)
)

# what happens to routes that don't last more than 3 years? (end or pause)

In [None]:
# Openings before 2017 whose first opening lasted fewer than 4 years, split by
# what happened next: 'END' when the route has no break at all, 'PAUSE' otherwise.
# Counts and shares are given per market type, on the dataset without COVID.
fate_tag = (
    pl.when(d.TOTAL_BREAKS == 0)
    .then(pl.lit('END'))
    .otherwise(pl.lit('PAUSE'))
)

short_first_openings = (
    df_scheduled_wo_covid
    .filter(d.IS_OPENING, d.DURATION_FIRST_OPENING < 4, d.YEAR < 2017)
    # every remaining row counts as one opening
    .with_columns(TAG=fate_tag, NB_OPENING_RTE=1)
)

df_le3 = (
    short_first_openings
    .group_by(['MKT_TYPE', 'TAG'])
    .agg(d.NB_OPENING_RTE.sum())
    .with_columns(NB_OPENING_RTE_PCT=d.NB_OPENING_RTE / d.NB_OPENING_RTE.sum().over('MKT_TYPE'))
    .sort(['MKT_TYPE', 'NB_OPENING_RTE_PCT'], descending=[False, True])
    .rename({
        'NB_OPENING_RTE': 'NB_OPENING_RTE_LESS_3Y',
        'NB_OPENING_RTE_PCT': 'NB_OPENING_RTE_LESS_3Y_PCT',
    })
    .to_pandas()
)

In [None]:
# Share of END vs PAUSE among the short first openings, per market type.
fate_colors = {"END": "red", "PAUSE": "orange"}

fig = px.bar(
    df_le3,
    x='MKT_TYPE',
    y='NB_OPENING_RTE_LESS_3Y_PCT',
    color='TAG',
    color_discrete_map=fate_colors,
)
fig

In [None]:
# Number of END vs PAUSE among the short first openings, per market type.
fate_colors = {"END": "red", "PAUSE": "orange"}

fig = px.bar(
    df_le3,
    x='MKT_TYPE',
    y='NB_OPENING_RTE_LESS_3Y',
    color='TAG',
    color_discrete_map=fate_colors,
)
fig

# visualisation of the airports we're working with

note: some airports (11) have a null latitude and longitude in the df_airports_lookup dataframe, so they are missing from the map

In [None]:
# Distinct airport codes appearing at either end of a scheduled route,
# as a single-column frame named APT_CODE.
apt_series = (
    df_scheduled
    .select('APT_CODE_A', 'APT_CODE_B')
    # stack both endpoint columns into one APT_CODE column
    .unpivot(on=['APT_CODE_A', 'APT_CODE_B'], value_name='APT_CODE')
    .select('APT_CODE')
    .unique()
)

In [None]:
# Coordinates of the airports present in the schedule; airports without a
# latitude or a longitude cannot be placed on the map and are dropped.
# Row counts noted by the author: 741 after the select, 730 once null latitudes are removed.
has_coordinates = d.LATITUDE.is_not_null() & d.LONGITUDE.is_not_null()

df_geo = (
    df_airports_lookup
    .join(apt_series, how='inner', on='APT_CODE')
    .select('APT_CODE', 'LATITUDE', 'LONGITUDE')
    .filter(has_coordinates)
    .to_pandas()
)

In [None]:
# World map with one small red marker per airport.
fig = px.scatter_geo(
    df_geo,
    lat="LATITUDE",
    lon="LONGITUDE",
    projection="equirectangular",
    color_discrete_sequence=["red"],
    title="Airport location",
    height=1000,
)
fig.update_geos(showcountries=True, showcoastlines=True)
fig.update_traces(marker=dict(size=4))
fig

# Ideas

- duration pause/first reopening
- distribution of aircraft: which aircraft go on new routes (for example new aircraft or old ones)
- do roughly the same study but with routes defined at the metropolitan-area level
- frequency of new routes vs. old ones and how the frequency evolves over time; based on the frequencies, how do the routes survive?