<div class="alert alert-block alert-info"> This notebook is a starting template, meant to be completed, for analyzing the relationship between route openings and city metrics. It includes sample code and pre-processed, formatted data. The goal is to examine whether newly opened routes differ from existing routes when compared on city metrics (international market only). </div>

<div class="alert alert-block alert-warning"><b>Warning:</b> Every graph could probably be plotted with a loop instead of one cell per graph, but this is a good start. </div>

# Import

## lib

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import scipy
import polars as pl
import pandas as pd
from polars import col as d
# import plotly
# import plotly.express as px
import glob
import os
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# import seaborn as sns
# import matplotlib.pyplot as plt
# import math

# from scipy.stats import pointbiserialr
# from scipy.stats import mannwhitneyu
# from scipy.stats import ks_2samp
# import statsmodels.api as sm
# from sklearn.feature_selection import mutual_info_classif
# from scipy.stats import chi2_contingency
# from scipy.stats import fisher_exact
# from sklearn.metrics import roc_auc_score, roc_curve
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

## plot

In [None]:
from utils_plot import facet_distribution_plot
from utils_plot import plot_heatmap_by_group

## csv

In [None]:
# Root folder holding the pre-processed data files.
# It can be overridden without editing the notebook by setting the
# ATSLAB_DATA_DIR environment variable; the original location stays the default
# so existing setups keep working.
# os.path.join(..., '') guarantees a trailing path separator, because the
# loading cell builds the file path by plain string concatenation
# (folder_path + "<file name>").
folder_path = os.path.join(
    os.environ.get('ATSLAB_DATA_DIR', '/home/sara/Desktop/ATSLab/data/'),
    '',
)

In [None]:
# Load the pre-processed city-metrics table (parquet) from the data folder.
# folder_path is expected to end with a path separator, since the file name is
# appended by string concatenation.
df_cities_analysis = pl.read_parquet(
    source=folder_path + "df_cities_metrics_preprocessed_for_stat.parquet",
)

## some variables

In [None]:
# Categorical (flag) columns, listed for side A then side B.  ## drop YEAR
CAT_COLUMNS = [
    'IS_SPECIAL_A', 'IS_CAPITAL_A', 'IS_GLOBAL_HUB_A', 'IS_DOMESTIC_HUB_A',
    'IS_SPECIAL_B', 'IS_CAPITAL_B', 'IS_GLOBAL_HUB_B', 'IS_DOMESTIC_HUB_B',
]

# Numerical columns, one A/B pair per line.
NUM_COLUMNS = [
    'POPU_A', 'POPU_B',
    'EVO_POPU_A', 'EVO_POPU_B',
    'INC_LC_A', 'INC_LC_B',
    'EVO_INC_LC_A', 'EVO_INC_LC_B',
    'INC_USD2019_A', 'INC_USD2019_B',
    'EVO_INC_USD2019_A', 'EVO_INC_USD2019_B',
    'POPU_LOG_A', 'POPU_LOG_B',
    'INC_LC_LOG_A', 'INC_LC_LOG_B',
]

# Shorter numerical list: NUM_COLUMNS without the raw POPU_* and INC_LC_*
# columns (their *_LOG_* counterparts are kept), in the same order.
NUM_COLUMNS_SHORT = [
    name
    for name in NUM_COLUMNS
    if name not in ('POPU_A', 'POPU_B', 'INC_LC_A', 'INC_LC_B')
]

In [None]:
# TODO (original note: "to change"): the colours below are provisional.

# Colour per IS_OPENING value.
dico_is_opening = {False: "#E57373", True: "#81C784"}

# Colour per TAG_DURATION_OPENING value.
# Alternative palette considered: SHORT_OPENING "#E69F00", LONG_OPENING "#0072B2".
dico_tag_duration_opening = dict(
    SHORT_OPENING="#FFA500",
    LONG_OPENING="#1f77b4",
    NO_OPENING="#A9A9A9",
)

# Colour per TAG_NEW_OPENING value.
dico_new_opening = dict(
    OLD_2010="#A9A9A9",
    HAS_OPENED="#CC79A7",
    NEW_OPENING="#56B4E9",
)

In [None]:
# Category orderings for the two tag columns (presumably the intended display
# order in plots; they are not referenced by the cells shown in this notebook).
order_tag_duration_opening = [
    'NO_OPENING',
    'SHORT_OPENING',
    'LONG_OPENING',
]
order_new_opening = [
    'OLD_2010',
    'NEW_OPENING',
    'HAS_OPENED',
]

# Data volume for each group

## global

In [None]:
# Row count per IS_OPENING value over the whole table.
df_cities_analysis.group_by('IS_OPENING').agg(COUNT=pl.len())

In [None]:
# Row count per TAG_DURATION_OPENING value over the whole table.
df_cities_analysis.group_by('TAG_DURATION_OPENING').agg(COUNT=pl.len())

In [None]:
# Row count per TAG_NEW_OPENING value over the whole table.
df_cities_analysis.group_by('TAG_NEW_OPENING').agg(COUNT=pl.len())

## 2010 base year

In [None]:
# Row count per IS_OPENING value, keeping only rows that are either from
# YEAR 2010 or flagged IS_OPENING.
(
    df_cities_analysis
    .filter((d('YEAR') == 2010) | d('IS_OPENING'))
    .group_by('IS_OPENING')
    .agg(COUNT=pl.len())
)

In [None]:
# Row count per TAG_DURATION_OPENING value, keeping only rows that are either
# from YEAR 2010 or flagged IS_OPENING.
(
    df_cities_analysis
    .filter((d('YEAR') == 2010) | d('IS_OPENING'))
    .group_by('TAG_DURATION_OPENING')
    .agg(COUNT=pl.len())
)

In [None]:
# pandas copy of the "2010 base year" subset: rows that are either from
# YEAR 2010 or flagged IS_OPENING.
df_cities_analysis_filtered = (
    df_cities_analysis
    .filter((d('YEAR') == 2010) | d('IS_OPENING'))
    .to_pandas()
)

In [None]:
# pandas copy of the full (unfiltered) table, used as input to the
# facet_distribution_plot calls in the "global" histogram cells.
df_cities_analysis_pd = (
    df_cities_analysis
    .to_pandas()
)

# Histogram

## Population

### global

In [None]:
# Distribution of POPU_A / POPU_B split by IS_OPENING on the full table,
# with 30 bins; dtick sets the x-axis tick step on the returned figure.
facet_distribution_plot(
    df=df_cities_analysis_pd,
    category="IS_OPENING",
    value_vars=["POPU_A", "POPU_B"],
    dico_color=dico_is_opening,
    nbins=30,
).update_xaxes(dtick=1_000_000)

In [None]:
# Distribution of POPU_A / POPU_B split by TAG_DURATION_OPENING on the full
# table, with 20 bins; dtick sets the x-axis tick step on the returned figure.
facet_distribution_plot(
    df=df_cities_analysis_pd,
    category="TAG_DURATION_OPENING",
    value_vars=["POPU_A", "POPU_B"],
    dico_color=dico_tag_duration_opening,
    nbins=20,
).update_xaxes(dtick=2_000_000)

In [None]:
# TODO (original note: "change color"): revisit the palette for this figure.
# Distribution of POPU_A / POPU_B split by TAG_NEW_OPENING on the full table,
# with 20 bins; dtick sets the x-axis tick step on the returned figure.
facet_distribution_plot(
    df=df_cities_analysis_pd,
    category="TAG_NEW_OPENING",
    value_vars=["POPU_A", "POPU_B"],
    dico_color=dico_new_opening,
    nbins=20,
).update_xaxes(dtick=2_000_000)

### 2010 base year

In [None]:
# Distribution of POPU_A / POPU_B split by IS_OPENING on the 2010-base-year
# subset, with 30 bins; dtick sets the x-axis tick step on the returned figure.
facet_distribution_plot(
    df=df_cities_analysis_filtered,
    category="IS_OPENING",
    value_vars=["POPU_A", "POPU_B"],
    dico_color=dico_is_opening,
    nbins=30,
).update_xaxes(dtick=1_000_000)

In [None]:
# Distribution of POPU_A / POPU_B split by TAG_DURATION_OPENING on the
# 2010-base-year subset, with 20 bins; dtick sets the x-axis tick step on the
# returned figure.
facet_distribution_plot(
    df=df_cities_analysis_filtered,
    category="TAG_DURATION_OPENING",
    value_vars=["POPU_A", "POPU_B"],
    dico_color=dico_tag_duration_opening,
    nbins=20,
).update_xaxes(dtick=2_000_000)

## Log population

**TODO:** this section remains to be completed.

# Heatmap

## Population

### global

In [None]:
# Width of one population bin.
bin_size = 2_000_000

# Replace POPU_A and POPU_B by the lower edge of their bin
# (floor-divide by the bin width, then multiply back), on the full table.
df_binned = df_cities_analysis.with_columns(
    POPU_A=(d('POPU_A') // bin_size) * bin_size,
    POPU_B=(d('POPU_B') // bin_size) * bin_size,
)

In [None]:
(
    plot_heatmap_by_group(
             df=df_binned,
             group_col="IS_OPENING",
             x_col="POPU_A",
             y_col="POPU_B",
            #  normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=df_binned,
             group_col="TAG_DURATION_OPENING",
             x_col="POPU_A",
             y_col="POPU_B",
             normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=df_binned,
             group_col="TAG_NEW_OPENING",
             x_col="POPU_A",
             y_col="POPU_B",
            #  normalize=True,
    )
)

### 2010 base year

In [None]:
# Width of one population bin.
bin_size = 2_000_000

# Same binning as for the global heatmaps, applied to the 2010-base-year
# subset (converted back from its pandas form). This rebinds `df_binned`.
df_binned = pl.from_pandas(df_cities_analysis_filtered).with_columns(
    POPU_A=(d('POPU_A') // bin_size) * bin_size,
    POPU_B=(d('POPU_B') // bin_size) * bin_size,
)

In [None]:
(
    plot_heatmap_by_group(
             df=df_binned,
             group_col="IS_OPENING",
             x_col="POPU_A",
             y_col="POPU_B",
            #  normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=df_binned,
             group_col="TAG_DURATION_OPENING",
             x_col="POPU_A",
             y_col="POPU_B",
            #  normalize=True,
 )
)

## Log population

**TODO:** this section remains to be completed.

## Is domestic hub

### global

In [None]:
(
    plot_heatmap_by_group(
             df=pl.from_pandas(df_cities_analysis_pd),
             group_col="IS_OPENING",
             x_col="IS_DOMESTIC_HUB_A",
             y_col="IS_DOMESTIC_HUB_B",
            #  normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=pl.from_pandas(df_cities_analysis_pd),
             group_col="TAG_DURATION_OPENING",
             x_col="IS_DOMESTIC_HUB_A",
             y_col="IS_DOMESTIC_HUB_B",
            #  normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=pl.from_pandas(df_cities_analysis_pd),
             group_col="TAG_NEW_OPENING",
             x_col="IS_DOMESTIC_HUB_A",
             y_col="IS_DOMESTIC_HUB_B",
            #  normalize=True,
 )
)

### 2010 base year

In [None]:
(
    plot_heatmap_by_group(
             df=pl.from_pandas(df_cities_analysis_filtered),
             group_col="IS_OPENING",
             x_col="IS_DOMESTIC_HUB_A",
             y_col="IS_DOMESTIC_HUB_B",
            #  normalize=True,
 )
)

In [None]:
(
    plot_heatmap_by_group(
             df=pl.from_pandas(df_cities_analysis_filtered),
             group_col="TAG_DURATION_OPENING",
             x_col="IS_DOMESTIC_HUB_A",
             y_col="IS_DOMESTIC_HUB_B",
            #  normalize=True,
 )
)

# Ideas

- A PCA could be tried, although my first attempt was not very conclusive with this data.
- Statistical tests: point-biserial correlation, CDF comparison, etc.