## **Graphs for Dashboard**

### Import necessary libraries and Select Stylistic Presets

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns

import folium
from matplotlib import cm
from matplotlib.colors import Normalize
import colorsys
import matplotlib.colors as mcolors


In [2]:
# Global plotting presets: Seaborn draws on its "whitegrid" theme and
# Plotly Express uses its built-in "seaborn" template by default.
sns.set_style(style="whitegrid")
px.defaults.template = "seaborn"

### Import Dataframe

In [72]:
# Placeholder path to the CSV exported by the machine-learning training/testing step
file_path = 'yourcsv.csv'

# Load the CSV into the working DataFrame used by every cell below
df_raw = pd.read_csv(filepath_or_buffer=file_path)

### Create Features For Visualization

In [None]:
# --- Data Preparation and Feature Engineering ---

# Steps 1-4: For both the departure and the arrival delay (given in seconds):
#   * add a '<column>_binary' label that is 'delayed' when the delay reaches the
#     15-minute threshold (900 seconds) and 'on_time' otherwise
#     (missing delays compare as False and are therefore labelled 'on_time');
#   * afterwards clamp negative delays (early flights) to zero; missing values stay missing.
# The label is computed before clamping, exactly as the two columns are processed in order.
for delay_col in ('departure_delay', 'arrival_delay'):
    reaches_threshold = df_raw[delay_col] >= 15 * 60
    df_raw[f'{delay_col}_binary'] = reaches_threshold.map({True: 'delayed', False: 'on_time'})
    df_raw[delay_col] = df_raw[delay_col].clip(lower=0)

# Step 5: Build 'ICAO_route' as '<origin ICAO>-<destination ICAO>' (e.g., 'TJSJ-KMCO').
origin_codes = df_raw['origin.code_icao']
destination_codes = df_raw['destination.code_icao']
df_raw['ICAO_route'] = origin_codes + '-' + destination_codes

# Step 6: Drop rows that have missing values in critical columns:
# 'scheduled_out' (scheduled departure time), 'ident_icao' (flight identifier), 'registration' (aircraft ID),
# and 'aircraft_type' (type of the aircraft).
# .copy() makes the result an independent frame so the column assignments below
# do not operate on a slice of the original (avoids SettingWithCopyWarning).
df_raw = df_raw.dropna(subset=['scheduled_out', 'ident_icao', 'registration', 'aircraft_type']).copy()

# Step 7: Convert the 'scheduled_out' column (departure time) to a pandas datetime format for easier manipulation
df_raw['scheduled_out'] = pd.to_datetime(df_raw['scheduled_out'])

# Step 8: Filter data to include only flights scheduled between January 1, 2022 and December 31, 2024.
# The upper bound is exclusive at 2025-01-01: comparing with '<= 2024-12-31' would stop at
# midnight and silently discard every flight scheduled later on December 31, 2024.
in_date_window = (df_raw['scheduled_out'] >= '2022-01-01') & (df_raw['scheduled_out'] < '2025-01-01')
df_raw = df_raw[in_date_window].copy()

# Step 9: Create a new column 'year_week' holding the start timestamp of the weekly period
# that contains the scheduled departure. This allows grouping flights by week (instead of individual days).
df_raw['year_week'] = df_raw['scheduled_out'].dt.to_period('W').dt.to_timestamp()

# Step 10: Create the column 'origin_airport_name' by mapping the origin ICAO airport code
# through the 'icao_to_airport' dictionary (ICAO code -> human-readable airport name).
# Origin codes that are not listed in the dictionary end up as NaN in the new column.
icao_to_airport = {
    'TJSJ': 'Luis Muñoz Marín International Airport',
    'EDDB': 'Berlin Brandenburg Airport',
    'CYUL': 'Montréal–Trudeau International Airport',
    'RCTP': 'Taiwan Taoyuan International Airport',
    'RJAA': 'Narita International Airport',
    'BIKF': 'Keflavík International Airport',
    'MMUN': 'Cancún International Airport',
    'LIRF': 'Leonardo da Vinci–Fiumicino Airport',
    'HECA': 'Cairo International Airport',
    'KMSY': 'Louis Armstrong New Orleans International Airport',
    'OEJN': 'King Abdulaziz International Airport',
    'WIII': 'Soekarno–Hatta International Airport',
    'EGLL': 'London Heathrow Airport',
    'KJFK': 'John F. Kennedy International Airport',
    'VHHH': 'Hong Kong International Airport',
    'OMDB': 'Dubai International Airport',
    'SPJC': 'Jorge Chávez International Airport',
    'KLGA': 'LaGuardia Airport',
    'RKSI': 'Incheon International Airport',
    'ESSA': 'Stockholm Arlanda Airport',
    'EIDW': 'Dublin Airport',
    'DAAG': 'Houari Boumediene Airport',
    'VTBS': 'Suvarnabhumi Airport',
    'CYVR': 'Vancouver International Airport',
    'SKBO': 'El Dorado International Airport',
    'EKCH': 'Copenhagen Airport',
    'VIDP': 'Indira Gandhi International Airport',
    'MMGL': 'Miguel Hidalgo y Costilla Guadalajara International Airport',
    'SABE': 'Aeroparque Jorge Newbery',
    'LEPA': 'Palma de Mallorca Airport',
    'ENGM': 'Oslo Gardermoen Airport',
    'KATL': 'Hartsfield–Jackson Atlanta International Airport',
    'SBGR': 'São Paulo/Guarulhos–Governador André Franco Montoro International Airport',
    'KSFO': 'San Francisco International Airport',
    'SCEL': 'Comodoro Arturo Merino Benítez International Airport',
    'LFPG': 'Charles de Gaulle Airport',
    'LFPO': 'Paris Orly Airport',
    'EHAM': 'Amsterdam Airport Schiphol',
    'RJFF': 'Fukuoka Airport',
    'WMKK': 'Kuala Lumpur International Airport',
    'KMCO': 'Orlando International Airport',
    'LPPT': 'Humberto Delgado Airport (Lisbon Portela Airport)'
}
df_raw['origin_airport_name'] = df_raw['origin.code_icao'].map(icao_to_airport)

# Step 11: Create the column 'origin_airport_location' by mapping the origin ICAO airport code
# through the 'icao_to_location' dictionary (ICAO code -> 'City, Country' string).
# Origin codes that are not listed in the dictionary end up as NaN in the new column.
icao_to_location = {
    'TJSJ': 'San Juan, Puerto Rico',
    'EDDB': 'Berlin, Germany',
    'CYUL': 'Montreal, Canada',
    'RCTP': 'Taipei, Taiwan',
    'RJAA': 'Tokyo, Japan',
    'BIKF': 'Reykjavik, Iceland',
    'MMUN': 'Cancún, Mexico',
    'LIRF': 'Rome, Italy',
    'HECA': 'Cairo, Egypt',
    'KMSY': 'New Orleans, USA',
    'OEJN': 'Jeddah, Saudi Arabia',
    'WIII': 'Jakarta, Indonesia',
    'EGLL': 'London, United Kingdom',
    'KJFK': 'New York City, USA',
    'VHHH': 'Hong Kong, China',
    'OMDB': 'Dubai, United Arab Emirates',
    'SPJC': 'Lima, Peru',
    'KLGA': 'New York City, USA',
    'RKSI': 'Seoul, South Korea',
    'ESSA': 'Stockholm, Sweden',
    'EIDW': 'Dublin, Ireland',
    'DAAG': 'Algiers, Algeria',
    'VTBS': 'Bangkok, Thailand',
    'CYVR': 'Vancouver, Canada',
    'SKBO': 'Bogotá, Colombia',
    'EKCH': 'Copenhagen, Denmark',
    'VIDP': 'New Delhi, India',
    'MMGL': 'Guadalajara, Mexico',
    'SABE': 'Buenos Aires, Argentina',
    'LEPA': 'Palma de Mallorca, Spain',
    'ENGM': 'Oslo, Norway',
    'KATL': 'Atlanta, USA',
    'SBGR': 'São Paulo, Brazil',
    'KSFO': 'San Francisco, USA',
    'SCEL': 'Santiago, Chile',
    'LFPG': 'Paris, France',
    'LFPO': 'Paris, France',
    'EHAM': 'Amsterdam, Netherlands',
    'RJFF': 'Fukuoka, Japan',
    'WMKK': 'Kuala Lumpur, Malaysia',
    'KMCO': 'Orlando, USA',
    'LPPT': 'Lisbon, Portugal'
}
df_raw['origin_airport_location'] = df_raw['origin.code_icao'].map(icao_to_location)

# Step 12: Add a new column 'origin_region' to categorize each origin airport into a region
# (e.g., North America, Europe, etc.). The lookup key is the airport's IATA code
# ('origin.code_iata'); codes missing from the 'regions' dictionary end up as NaN.
# NOTE(review): a few entries use one-off labels that no other airport shares
# ('GUA' -> 'Central America', 'SDQ' -> 'Caribbean', 'SYD' -> 'Australia', 'NOU' -> 'Oceania'),
# while comparable airports (e.g. 'BZE', 'SAP', 'NAS', 'SJU') are filed under 'North America'.
# Confirm whether these should be folded into the main regions.
regions = {
  'LGA': 'North America',
  'BNA': 'North America',
  'CLT': 'North America',
  'ORD': 'North America',
  'CDG': 'Europe',
  'NRT': 'Asia Pacific',
  'YUL': 'North America',
  'ICN': 'Asia Pacific',
  'CPH': 'Europe',
  'DXB': 'Middle East',
  'LIS': 'Europe',
  'TPE': 'Asia Pacific',
  'BOG': 'South America',
  'JFK': 'North America',
  'JED': 'Middle East',
  'MCO': 'North America',
  'PHL': 'North America',
  'SCL': 'South America',
  'LIM': 'South America',
  'KUL': 'Asia Pacific',
  'SIN': 'Asia Pacific',
  'CAI': 'Africa',
  'HKG': 'Asia Pacific',
  'OSL': 'Europe',
  'ARN': 'Europe',
  'YVR': 'North America',
  'BER': 'Europe',
  'GDL': 'North America',
  'LAX': 'North America',
  'CUN': 'North America',
  'KEF': 'Europe',
  'LHR': 'Europe',
  'BKK': 'Asia Pacific',
  'PMI': 'Europe',
  'SJU': 'North America',
  'GRU': 'South America',
  'AMS': 'Europe',
  'SFO': 'North America',
  'ATL': 'North America',
  'MIA': 'North America',
  'DUB': 'Europe',
  'FUK': 'Asia Pacific',
  'MSY': 'North America',
  'FCO': 'Europe',
  'DEL': 'Asia Pacific',
  'ADD': 'Africa',
  'CGK': 'Asia Pacific',
  'DFW': 'North America',
  'RUN': 'Africa',
  'ORY': 'Europe',
  'CZM': 'North America',
  'LIR': 'North America',
  'PTY': 'North America',
  'IST': 'Europe',
  'BOH': 'Europe',
  'HEL': 'Europe',
  'DAL': 'North America',
  'SAT': 'North America',
  'TLV': 'Middle East',
  'DMU': 'South America',
  'BOM': 'Asia Pacific',
  'CVT': 'Europe',
  'MAD': 'Europe',
  'MDW': 'North America',
  'YQB': 'North America',
  'BGO': 'Europe',
  'LYS': 'Europe',
  'ZAG': 'Europe',
  'COS': 'North America',
  'PIT': 'North America',
  'YYZ': 'North America',
  'FUE': 'Europe',
  'HOU': 'North America',
  'LUX': 'Europe',
  'DTW': 'North America',
  'HYC': 'North America',
  'TFS': 'Europe',
  'DUS': 'Europe',
  'PEK': 'Asia Pacific',
  'TYS': 'North America',
  'BUR': 'North America',
  'DEN': 'North America',
  'SLC': 'North America',
  'RNO': 'North America',
  'OAK': 'North America',
  'BWI': 'North America',
  'TPA': 'North America',
  'LAS': 'North America',
  'RSW': 'North America',
  'PBI': 'North America',
  'STL': 'North America',
  'RTM': 'Europe',
  'RDU': 'North America',
  'BBP': 'North America',
  'CVG': 'North America',
  'MED': 'Middle East',
  'HAM': 'Europe',
  'LPA': 'Europe',
  'FRA': 'Europe',
  'FLL': 'North America',
  'BOD': 'Europe',
  'AUS': 'North America',
  'MKE': 'North America',
  'DOH': 'Middle East',
  'DMK': 'Asia Pacific',
  'ZRH': 'Europe',
  'SMF': 'North America',
  'SNA': 'North America',
  'BHD': 'Europe',
  'MSP': 'North America',
  'FAI': 'North America',
  'TUN': 'Africa',
  'STT': 'North America',
  'HRG': 'Africa',
  'LGW': 'Europe',
  'RHO': 'Europe',
  'IXJ': 'Asia Pacific',
  'MDZ': 'South America',
  'OLB': 'Europe',
  'KGS': 'Europe',
  'ANF': 'South America',
  'SAN': 'North America',
  'DSM': 'North America',
  'IAH': 'North America',
  'SEA': 'North America',
  'MTJ': 'North America',
  'ACY': 'North America',
  'BZE': 'North America',
  'HPN': 'North America',
  'LEJ': 'Europe',
  'EWR': 'North America',
  'DWC': 'Middle East',
  'SRQ': 'North America',
  'CMH': 'North America',
  'LEY': 'Europe',
  'GUA': 'Central America',
  'SHJ': 'Middle East',
  'IND': 'North America',
  'ALG': 'Africa',
  'AEP': 'South America',
  'BGR': 'North America',
  'ABQ': 'North America',
  'AUH': 'Middle East',
  'NGO': 'Asia Pacific',
  'OXF': 'Europe',
  'ASW': 'Africa',
  'ONT': 'North America',
  'YYT': 'North America',
  'NAS': 'North America',
  'SYD': 'Australia',
  'HMO': 'North America',
  'SCE': 'North America',
  'NUM': 'South America',
  'GYE': 'South America',
  'EZE': 'South America',
  'SWF': 'North America',
  'BTH': 'Asia Pacific',
  'KIX': 'Asia Pacific',
  'GNV': 'North America',
  'BWN': 'Asia Pacific',
  'BQN': 'North America',
  'GRR': 'North America',
  'KRK': 'Europe',
  'HNL': 'North America',
  'PHX': 'North America',
  'DPS': 'Asia Pacific',
  'KHH': 'Asia Pacific',
  'TAS': 'Asia Pacific',
  'MUC': 'Europe',
  'MCI': 'North America',
  'PVR': 'North America',
  'MNL': 'Asia Pacific',
  'MAN': 'Europe',
  'XRY': 'Europe',
  'COR': 'South America',
  'BLB': 'Europe',
  'HBE': 'Africa',
  'SUF': 'Europe',
  'AMD': 'Asia Pacific',
  'CFU': 'Europe',
  'INT': 'North America',
  'PDX': 'North America',
  'MRU': 'Africa',
  'DZA': 'Europe',
  'MLE': 'Asia Pacific',
  'VLC': 'Europe',
  'GOT': 'Europe',
  'NKM': 'Asia Pacific',
  'TLL': 'Europe',
  'HRL': 'North America',
  'SAP': 'North America',
  'CLE': 'North America',
  'PWM': 'North America',
  'OKC': 'North America',
  'AGP': 'Europe',
  'KAO': 'Asia Pacific',
  'BQH': 'Europe',
  'QSC': 'Europe',
  'RAK': 'Africa',
  'BLL': 'Europe',
  'AMA': 'North America',
  'GOI': 'Asia Pacific',
  'BHM': 'North America',
  'BAH': 'Middle East',
  'CMN': 'Africa',
  'TNG': 'Africa',
  'FLR': 'Europe',
  'VGO': 'Europe',
  'STR': 'Europe',
  'VAA': 'Asia Pacific',
  'ESH': 'Europe',
  'DIL': 'Asia Pacific',
  'HER': 'Europe',
  'VNO': 'Europe',
  'SDQ': 'Caribbean',
  'VIE': 'Europe',
  'AAL': 'Europe',
  'RUH': 'Middle East',
  'SSH': 'Africa',
  'TNR': 'Africa',
  'BOS': 'North America',
  'GAU': 'Asia Pacific',
  'BRC': 'South America',
  'RVN': 'Asia Pacific',
  'DCA': 'North America',
  'ORF': 'North America',
  'ROC': 'North America',
  'OUL': 'Europe',
  'XNA': 'North America',
  'NOU': 'Oceania',
  'LDY': 'North America',
  'BHX': 'Europe',
  'JAX': 'North America',
  'PUS': 'Asia Pacific',
  'ORK': 'Europe',
  'ELP': 'North America',
  'OMA': 'North America',
  'TUS': 'North America',
  'MLU': 'North America',
  'CGN': 'Europe',
  'LXR': 'Africa',
  'MXL': 'North America',
  'SNN': 'Europe'
}
df_raw['origin_region'] = df_raw['origin.code_iata'].map(regions)

# Step 13: Fold the separate 'Africa' and 'Middle East' labels of 'origin_region'
# into one combined 'Africa & Middle East' category; all other labels are left untouched.
merged_region_labels = {
    'Africa': 'Africa & Middle East',
    'Middle East': 'Africa & Middle East',
}
df_raw['origin_region'] = df_raw['origin_region'].replace(merged_region_labels)

# Step 14: Create a dictionary 'ICAO_route_dict' that maps each ICAO route code (departure and arrival airports)
# to their respective city names. The key is the combined ICAO route code (e.g., 'TJSJ-KMCO') and the value is a
# two-element list: [origin city, destination city].
ICAO_route_dict = {
    'TJSJ-KMCO': ['San Juan', 'Orlando'],
    'EDDB-LSZH': ['Berlin', 'Zurich'],
    'CYUL-LFPG': ['Montreal', 'Paris'],
    'RCTP-VHHH': ['Taipei', 'Hong Kong'],
    'RJAA-RCTP': ['Tokyo (Narita)', 'Taipei'],
    'BIKF-LFPG': ['Keflavik', 'Paris'],
    'MMUN-KDFW': ['Cancun', 'Dallas-Fort Worth'],
    'LIRF-LEMD': ['Rome', 'Madrid'],
    'MMUN-CYYZ': ['Cancun', 'Toronto'],
    'HECA-OEJN': ['Cairo', 'Jeddah'],
    'KMSY-KATL': ['New Orleans', 'Atlanta'],
    # ... (other routes omitted for brevity; routes missing here map to NaN below)
    'OMDB-OEJN': ['Dubai', 'Jeddah']
}

# Step 15: Use the 'ICAO_route_dict' to create a new column 'route_cities' in the dataframe by looking up
# each row's 'ICAO_route' value. Every cell of 'route_cities' therefore holds either a
# [origin city, destination city] list or NaN (route not present in the dictionary).
# NOTE(review): list-valued cells are unhashable, so this column cannot be used directly as a
# groupby key or a categorical plot axis; convert it to a string label first.
df_raw['route_cities'] = df_raw['ICAO_route'].map(ICAO_route_dict)

# --- Dropping Unnecessary Columns ---
# Step 16: 'drop_features' lists every column that the visualizations below do not use.
# It is grouped by theme for readability; the order of the names is unchanged.
drop_features = [
    # Flight identification and operator details
    'ident', 'ident_iata', 'actual_runway_off', 'actual_runway_on', 'fa_flight_id',
    'operator', 'operator_iata', 'flight_number', 'atc_ident', 'inbound_fa_flight_id',
    'codeshares', 'codeshares_iata',
    # Status flags and filing details
    'blocked', 'diverted', 'position_only', 'filed_ete', 'foresight_predictions_available',
    # Gate/runway timestamps
    'estimated_out', 'actual_out',
    'scheduled_off', 'estimated_off', 'actual_off',
    'scheduled_on', 'estimated_on', 'actual_on',
    'scheduled_in', 'estimated_in', 'actual_in',
    # Progress, routing and cabin/terminal details
    'progress_percent', 'status', 'route_distance', 'filed_airspeed', 'filed_altitude',
    'route', 'baggage_claim',
    'seats_cabin_business', 'seats_cabin_coach', 'seats_cabin_first',
    'gate_origin', 'gate_destination', 'terminal_origin', 'terminal_destination', 'type',
    # Redundant origin/destination attributes
    'origin.code', 'origin.code_lid', 'origin.timezone', 'origin.airport_info_url',
    'destination.code', 'destination.code_iata', 'destination.code_lid',
    'destination.timezone', 'destination.name', 'destination.city',
    'destination.airport_info_url', 'destination',
    # Columns already consumed by the feature engineering above
    'ident_icao', 'registration', 'cancelled', 'departure_delay', 'arrival_delay',
    'scheduled_out', 'destination.code_icao', 'arrival_delay_binary',
]

# Step 17: Remove those columns from 'df_raw' in place. Every listed column must exist,
# otherwise pandas raises a KeyError.
df_raw.drop(columns=drop_features, inplace=True)


### Bar charts with overview of delays per route

In [None]:
# --- First Plot: Delayed Flights by Route ---
# NOTE(review): this cell reads 'route_icao' and 'true_label', which are not created in this
# notebook - presumably they come straight from the CSV; confirm against the export.

# Step 1: Group the dataset by 'route_icao' and calculate the total number of flights
# and the number of delayed flights (true_label == 1).
df_delayed_route = df_raw.groupby('route_icao').agg(
    delayed_count=('true_label', lambda x: (x == 1).sum()),  # Count delayed flights (true_label = 1)
    total_flights=('true_label', 'size'),  # Total number of flights for each route
)

# Step 2: Calculate the percentage of delayed flights for each route (rounded to whole percent)
df_delayed_route['percentage'] = (
    df_delayed_route['delayed_count'] / df_delayed_route['total_flights'] * 100
).round()

# Step 3: Sort the routes by the calculated 'percentage' of delayed flights in ascending order
df_delayed_route = df_delayed_route.sort_values("percentage", ascending=True)

# Reset the index so 'route_icao' is available as a regular column
df_delayed_route = df_delayed_route.reset_index()

# --- Attach Route City Labels ---
# Step 4: The grouped frame only contains the aggregates, so the 'route_cities' column that the
# chart uses as its y-axis has to be added explicitly (previously it was missing and px.bar failed).
# Take the first 'route_cities' entry seen for each route, turn the [origin, destination] list into
# an "Origin - Destination" string, and fall back to the route code when no city names are known.
route_city_lookup = (
    df_raw.drop_duplicates(subset='route_icao')
    .set_index('route_icao')['route_cities']
)
df_delayed_route['route_cities'] = [
    ' - '.join(cities) if isinstance(cities, (list, tuple)) else route
    for route, cities in zip(
        df_delayed_route['route_icao'],
        df_delayed_route['route_icao'].map(route_city_lookup),
    )
]

# --- Create the Plotly Bar Chart Using 'route_cities' as the Bar Labels ---
# Step 5: Create a horizontal bar chart using Plotly Express.
# We plot 'percentage' of delayed flights on the x-axis and 'route_cities' on the y-axis.
# The color of each bar corresponds to the delay percentage, and we include a text label
# on each bar to show the count of delayed flights out of the total flights for each route.
bar_text = (
    df_delayed_route["delayed_count"].astype(str)
    + " out of "
    + df_delayed_route["total_flights"].astype(str)
)

fig = px.bar(
    df_delayed_route,
    x="percentage",  # Percentage of delayed flights on the x-axis
    y="route_cities",  # User-friendly city names on the y-axis
    orientation="h",  # Horizontal bars
    title="Flights Delayed at Origin by Route",  # Title of the plot
    labels={"percentage": "Percentage", "route_cities": "Route"},  # Axis labels
    text=bar_text,  # Text label on bars
    color="percentage",  # Color the bars based on the delay percentage
    color_continuous_scale='reds'  # Use a red color scale to indicate higher delay percentages
)

# Step 6: Calculate the average delay percentage and add a vertical line to indicate the average delay.
# This is the unweighted mean of the per-route (rounded) percentages.
avg_route_percentage = df_delayed_route["percentage"].mean()
fig.add_vline(
    x=avg_route_percentage,  # Add vertical line at the average percentage
    line=dict(color="black", dash="dash"),  # Line style (dashed)
    annotation_text=f"Average: {avg_route_percentage:.1f}%",  # Display the average percentage as annotation
    annotation_position="bottom right"  # Position the annotation at the bottom right of the plot
)

# --- Update Layout for Better Presentation ---
# Step 7: Adjust the layout for improved readability and appearance of the plot.
fig.update_layout(
    xaxis_title="Percentage of Delayed Flights",  # Label for the x-axis
    yaxis_title="Route",  # Label for the y-axis
    showlegend=True,  # Show the legend in the plot
    width=1000,  # Set the plot width
    height=1200,  # Set the plot height
    xaxis_title_font_size=20,  # Font size for the x-axis title
    yaxis_title_font_size=20  # Font size for the y-axis title
)

# Step 8: Update text formatting on bars for better visibility and aesthetics
fig.update_traces(
    textfont=dict(size=30),  # Set the text font size to 30
    textposition="outside",  # Position the text outside the bars
    cliponaxis=False  # Prevent clipping of text
)

# Step 9: Display the plot using Plotly's default rendering engine
fig.show()

### Historical delay overview per region

In [69]:
def _select_departures(df_raw, icao_dep, regionalize, routes, operators, aircraft_types):
    """Return the rows of ``df_raw`` the weekly plot should cover, or raise ValueError if none."""
    airport_rows = df_raw[df_raw['origin.code_icao'] == icao_dep]
    if airport_rows.empty:
        raise ValueError(f"No departures from '{icao_dep}' found in the dataset.")

    if regionalize:
        # Widen the selection to every airport that shares the region of icao_dep.
        region = airport_rows['origin_region'].iloc[0]
        selected = df_raw[df_raw['origin_region'] == region]
    else:
        selected = airport_rows
        optional_filters = (
            ('ICAO_route', routes),
            ('operator_icao', operators),
            ('aircraft_type', aircraft_types),
        )
        for column, allowed in optional_filters:
            if len(allowed) > 0:
                selected = selected[selected[column].isin(allowed)]

    if selected.empty:
        raise ValueError(f"No departures match the requested selection for '{icao_dep}'.")
    return selected


def _weekly_delay_counts(df_filtered):
    """Count departures per week with 'delayed', 'on_time' and 'total' columns."""
    weekly = (
        df_filtered.groupby(['year_week', 'departure_delay_binary'])
        .size()
        .unstack(fill_value=0)
        # reindex guarantees both status columns exist even when the selection
        # contains only delayed or only on-time flights.
        .reindex(columns=['delayed', 'on_time'], fill_value=0)
    )
    weekly['total'] = weekly.sum(axis=1)
    return weekly


def _add_seasonal_overlays(fig, region):
    """Shade the region-specific season and the post-Covid recovery period on ``fig``."""
    if region == 'Africa & Middle East':
        # Ramadan 2022; the recovery label sits at the right edge of its band.
        season = ("2022-04-02", "2022-05-01", "sandybrown", "2022-04-17", "Ramadan 2022")
        recovery_label_x = "2022-08-15"
    elif region == 'Asia Pacific':
        # Monsoon season 2022
        season = ("2022-05-15", "2022-10-15", "mediumseagreen", "2022-09-30", "Monsoon 2022")
        recovery_label_x = "2022-05-15"
    else:
        # Winter 2022-2023 is the default for every other (or unknown) region
        season = ("2022-12-01", "2023-01-31", "cornflowerblue", "2022-12-31", "Winter 22-23")
        recovery_label_x = "2022-05-15"
    # TODO: only the first season is highlighted; repeat for the 2023 and 2024 seasons.

    label_font = dict(size=10, color='black')
    x0, x1, fillcolor, label_x, label_text = season
    fig.add_vrect(x0=x0, x1=x1, fillcolor=fillcolor, opacity=0.3, layer="below", line_width=0)
    fig.add_annotation(x=label_x, y=0, text=label_text, showarrow=False, font=label_font)

    # Post-Covid recovery period from early 2022 until August 15, 2022
    fig.add_vrect(x0="2022-01-31", x1="2022-08-15", fillcolor="wheat", opacity=0.3, layer="below", line_width=0)
    fig.add_annotation(x=recovery_label_x, y=0, text="Post-Covid Recovery", showarrow=False, font=label_font)


def _major_week_ticks(week_starts):
    """Return the first week plus the first week start falling in each January and July."""
    return [
        date
        for i, date in enumerate(week_starts)
        if i == 0 or (date.month in (1, 7) and date.day <= 7)
    ]


def _title_filter_label(values):
    """Return 'All' for an empty filter, otherwise at most the first five values."""
    if len(values) == 0:
        return 'All'
    if len(values) > 5:
        return values[0:5]
    return values


def departure_delay_prog_for_route_group(df_raw, icao_dep, regionalize=False, routes=None, operators=None, aircraft_types=None):
    """Plot weekly total and delayed departures for one airport or its whole region.

    The figure shows a 4-week moving average of all departures and of delayed
    departures, the raw weekly counts as faint lines, and shaded bands for a
    region-specific season plus the post-Covid recovery period.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Flight data with the columns 'origin.code_icao', 'origin_region',
        'year_week' and 'departure_delay_binary' ('delayed' / 'on_time'), plus
        'ICAO_route', 'operator_icao', 'aircraft_type', 'origin_airport_name'
        and 'origin_airport_location' when ``regionalize`` is False.
    icao_dep : str
        ICAO code of the departure airport.
    regionalize : bool
        If True, plot every flight whose origin shares the region of
        ``icao_dep``. The route/operator/aircraft filters are NOT applied in
        this mode (they are still echoed in the subtitle).
    routes, operators, aircraft_types : sequence or None
        Optional allow-lists applied to 'ICAO_route', 'operator_icao' and
        'aircraft_type'. ``None`` or an empty sequence means no filtering.

    Raises
    ------
    ValueError
        If ``icao_dep`` has no departures in ``df_raw`` or the filters leave
        no flights to plot.
    """
    # TODO: Consider the regional option for visualization and integrating this into a Streamlit app for interactive display
    # None replaces the former mutable list defaults; callers passing lists are unaffected.
    routes = [] if routes is None else routes
    operators = [] if operators is None else operators
    aircraft_types = [] if aircraft_types is None else aircraft_types

    df_filtered = _select_departures(df_raw, icao_dep, regionalize, routes, operators, aircraft_types)
    df_pivot_weekly = _weekly_delay_counts(df_filtered)

    fig = go.Figure()
    series_styles = (('total', 'blue', 'All Departures'), ('delayed', 'red', 'Delayed Departures'))

    # 4-week moving averages smooth the weekly counts for trend reading
    for column, colour, label in series_styles:
        fig.add_trace(go.Scatter(x=df_pivot_weekly.index, y=df_pivot_weekly[column].rolling(window=4).mean(), mode='lines', line=dict(color=colour), name=label))

    # Raw weekly counts, faint and hidden from the legend, for comparison with the averages
    for column, colour, label in series_styles:
        fig.add_trace(go.Scatter(x=df_pivot_weekly.index, y=df_pivot_weekly[column], mode='lines', line=dict(color=colour), opacity=0.15, name=label, showlegend=False))

    _add_seasonal_overlays(fig, df_filtered['origin_region'].iloc[0])

    # Major ticks at the first plotted week and at the start of each January and July.
    # (The previous test `month == 7 and week == 1` could never match, so mid-year ticks were lost.)
    major_ticks = _major_week_ticks(df_pivot_weekly.index.to_list())
    major_ticktext = [date.strftime('%b\n%Y') for date in major_ticks]

    # Customize X-Axis appearance
    fig.update_xaxes(ticks="outside", ticklabelmode="period", tickcolor="black", ticklen=10, tickvals=major_ticks, ticktext=major_ticktext, tickangle=15, tickmode='array', tickson='boundaries', minor=dict(ticklen=4, dtick="M3", tick0="2022-04-01", griddash='dot', gridcolor='white'))

    # Subtitle fragments: 'All' when no filter was given, otherwise at most five entries
    routes = _title_filter_label(routes)
    operators = _title_filter_label(operators)
    aircraft_types = _title_filter_label(aircraft_types)

    # Determine the title based on whether regionalization is used
    if regionalize:
        region_name = df_filtered['origin_region'].iloc[0]
        title_text = f"Weekly Departures in Dataset for {region_name} region<br><sup>Routes: {routes} -- Operators: {operators} -- Aircraft: {aircraft_types}</sup>"
    else:
        airport_name = df_filtered['origin_airport_name'].iloc[0]
        airport_location = df_filtered['origin_airport_location'].iloc[0]
        title_text = f"Weekly Departures in Dataset: {airport_name} ({icao_dep}) - {airport_location}<br><sup>Routes: {routes} -- Operators: {operators} -- Aircraft: {aircraft_types}</sup>"

    # Update the layout of the plot with the title and axis labels
    fig.update_layout(title={'text': title_text, 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
                      xaxis_title='Date', yaxis_title='Number of Departures per Week', legend_title='4-week Moving Average', height=600, width=1200)

    # Customize the legend display
    fig.update_layout(legend=dict(x=0.85, y=-0.2, traceorder='normal', bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='black', borderwidth=1))

    # Display the plot
    fig.show()


# Example: plot the weekly departure history for a single airport (Dubai, ICAO code 'OMDB')
icao_dep = 'OMDB'
departure_delay_prog_for_route_group(df_raw=df_raw, icao_dep=icao_dep, regionalize=False)

### World Map

In [None]:
# Step 1: Load the airport data from an external source (OpenFlights airports.dat, no header row)
url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
# Define column names for clarity
columns = ["ID", "Name", "City", "Country", "IATA", "ICAO", "Latitude", "Longitude", "Altitude", "Timezone", "DST", "Tz database time zone", "Type", "Source"]
# Load airport data into a dataframe
df_airport_data = pd.read_csv(url, header=None, names=columns)

# Step 2: Work on a real copy of df_raw so the helper columns added below do not leak into df_raw
df_airport = df_raw.copy()

# Step 3: Extract unique IATA airport codes from the dataset
airport_codes = df_airport['origin.code_iata'].unique()

# Step 4: Build a lookup of (Latitude, Longitude) per IATA code, keeping the first match per code
airport_lookup = (
    df_airport_data.dropna(subset=['IATA'])
    .drop_duplicates(subset='IATA')
    .set_index('IATA')[['Latitude', 'Longitude']]
)
# Dictionary form of the lookup for the codes present in the dataset; (None, None) marks unknown codes
coordinates = {
    code: tuple(airport_lookup.loc[code]) if code in airport_lookup.index else (None, None)
    for code in airport_codes
}

# Steps 5-6: Map each origin IATA code to its latitude and longitude.
# Mapping the two columns directly keeps them numeric and copes with missing/unknown codes (-> NaN).
df_airport['origin_airport_lat'] = df_airport['origin.code_iata'].map(airport_lookup['Latitude'])
df_airport['origin_airport_lon'] = df_airport['origin.code_iata'].map(airport_lookup['Longitude'])

# Step 7: Drop rows with missing latitude or longitude values
df_airport = df_airport.dropna(subset=['origin_airport_lat', 'origin_airport_lon'])

# Step 8: Keep only the columns needed below. 'true_label' must be retained because the
# aggregation in Step 10 counts delayed flights from it.
df_airport = df_airport[["origin.code_iata", 'origin.name', 'origin.city', 'departure_delay_status_numeric', 'true_label', 'origin_airport_lat', 'origin_airport_lon']]

# Step 9: Remove flights that were canceled (departure_delay_status_numeric == 3)
df_airport = df_airport[df_airport["departure_delay_status_numeric"] != 3]

# Step 10: Group the data by IATA airport code and calculate delayed flight counts and total flights
df_grouped = df_airport.groupby('origin.code_iata').agg(
    delayed_count=('true_label', lambda x: (x == 1).sum()),  # Count delayed flights
    num_flights=('true_label', 'size')  # Total number of flights
).reset_index()

# Step 11: Merge the grouped data with the airport details to include name, city, and coordinates
airport_details = df_airport[['origin.code_iata', 'origin.name', 'origin.city', 'origin_airport_lat', 'origin_airport_lon']].drop_duplicates()
df_grouped = df_grouped.merge(airport_details, on='origin.code_iata', how='left')

# Step 12: Calculate the percentage of delayed flights for each airport
df_grouped['delay_percentage'] = df_grouped['delayed_count'] / df_grouped['num_flights'] * 100

# Step 13: Set up a colormap for visualization (red-yellow-green color scale).
# The colormap is taken from the module attribute because cm.get_cmap() was deprecated in
# Matplotlib 3.7 and removed in 3.9.
colormap = cm.RdYlGn
# Normalize the color scale based on the minimum and maximum delay percentages
norm = Normalize(vmin=df_grouped['delay_percentage'].min(), vmax=df_grouped['delay_percentage'].max())

# Step 14: Helper that flips the normalized scale so that high delay percentages map to the
# low end of the colormap and vice versa
def reverse_normalization(value):
    """Return ``1 - norm(value)`` using the module-level ``norm`` instance."""
    normalized = norm(value)
    return 1 - normalized

# Step 15: Define a function to enhance color saturation for better visualization
def enhance_saturation(hex_color, factor=1.5):
    """Return `hex_color` with its HLS saturation multiplied by `factor`.

    Args:
        hex_color: Color given as a hex string (e.g. '#aabbcc').
        factor: Multiplier applied to the saturation; the result is capped at 1.

    Returns:
        The adjusted color as a hex string. Hue and lightness are unchanged.
    """
    red, green, blue = mcolors.hex2color(hex_color)
    hue, lightness, saturation = colorsys.rgb_to_hls(red, green, blue)
    # Cap at 1 so the boosted saturation stays a valid HLS component
    boosted = min(1, saturation * factor)
    return mcolors.rgb2hex(colorsys.hls_to_rgb(hue, lightness, boosted))

# Step 16: Create a folium map with a world-level view (zoom 2).
# The center point (about 48.3 N, 8.8 W) lies in the eastern North Atlantic,
# not over any particular city.
m = folium.Map(location=[48.33282274287975, -8.7853686787791], zoom_start=2)

# Step 17: Add one circle marker per airport, colored by delay percentage and sized by traffic
for _, airport in df_grouped.iterrows():
    delay_pct = airport['delay_percentage']
    flight_count = airport['num_flights']

    # Reverse the normalized value before the colormap lookup so the colors run
    # in the opposite direction to the raw delay percentage, then boost the
    # saturation for better visibility
    base_rgba = colormap(reverse_normalization(delay_pct))
    marker_color = enhance_saturation(mcolors.rgb2hex(base_rgba), factor=1.5)

    popup_text = f"{airport['origin.code_iata']} - {airport['origin.name']} - {delay_pct:.2f}% delayed - {flight_count} flights"

    folium.CircleMarker(
        location=(airport['origin_airport_lat'], airport['origin_airport_lon']),  # (latitude, longitude)
        radius=flight_count / 500,  # Marker radius scales with the number of flights
        color=marker_color,  # Border color
        weight=1,  # Border width
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=popup_text,
    ).add_to(m)

# Step 18: Build a ScalarMappable for the same norm and colormap.
# NOTE(review): nothing in this cell attaches it to the map, so no legend is
# actually rendered; it also rebinds `colormap`, so re-running Step 17 after
# this point would no longer get a Colormap object.
colormap = cm.ScalarMappable(norm=norm, cmap='RdYlGn')
colormap.set_array([])

# Step 19: Save the generated map to an HTML file for visualization
m.save("../graph_departure_airport_delay_map.html")

# Step 20: Leave the folium map as the cell's last expression so the notebook displays it
m


### Statistical plots

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _prediction_outcome_pie(title, trace_name, labels, values, colors):
    """Build a donut chart splitting one predicted class into correct and incorrect predictions."""
    fig = go.Figure()
    fig.add_trace(go.Pie(labels=labels,
                         values=values,
                         name=trace_name,
                         marker=dict(colors=colors)))
    fig.update_traces(hole=.4, hoverinfo="label+percent+name", textinfo="percent",
                      marker=dict(line=dict(color='black', width=2)))
    fig.update_layout(
        title_text=title,
        autosize=False, width=400, height=241,
        margin=dict(l=0, r=0, t=40, b=20),
        legend=dict(orientation="h")  # Horizontal legend
    )
    return fig


def streamlit_prediction_stats(df_raw, route, threshold):
    """Compute threshold-dependent prediction statistics and charts for one route.

    Args:
        df_raw: DataFrame with at least the columns 'route_code',
            'predicted_prob_class_1' and 'true_label' (1 = delayed, 0 = on time).
        route: Value of 'route_code' selecting the rows to analyse.
        threshold: Probability cut-off; rows with predicted_prob_class_1 >= threshold
            are treated as predicted delayed, rows below it as predicted on time.

    Returns:
        A tuple (df_route_stats, fig_hist, fig_pie_delayed, fig_pie_ontime):
        a one-row DataFrame of counts, rates and metrics, a histogram of the
        predicted probabilities, and two donut charts (predicted-delayed and
        predicted-on-time outcomes).

    Every ratio falls back to 0 when its denominator is 0. This covers a route
    with no rows and an F1 score with precision + recall == 0, which previously
    raised ZeroDivisionError or produced NaN.
    """
    # Step 1: Filter the dataframe for the given route to focus only on relevant data
    df_route = df_raw[df_raw['route_code'] == route].copy()

    # Step 2: Dynamically classify predictions as 'On-Time' or 'Delayed' based on the threshold.
    # The two masks are kept separate on purpose: a missing probability is neither
    # "< threshold" nor ">= threshold", so it is labelled 'Delayed' for the histogram
    # (as before) but counted in none of the confusion-matrix cells.
    probabilities = df_route['predicted_prob_class_1']
    below_threshold = probabilities < threshold
    at_or_above_threshold = probabilities >= threshold
    df_route['Prediction'] = np.where(below_threshold, 'On-Time', 'Delayed')

    # Step 3: Calculate the confusion matrix counts for the chosen threshold
    actually_delayed = df_route['true_label'] == 1
    actually_ontime = df_route['true_label'] == 0
    TP = (actually_delayed & at_or_above_threshold).sum()  # Delayed flights predicted as delayed
    FP = (actually_ontime & at_or_above_threshold).sum()   # On-time flights predicted as delayed
    TN = (actually_ontime & below_threshold).sum()         # On-time flights predicted as on-time
    FN = (actually_delayed & below_threshold).sum()        # Delayed flights predicted as on-time

    # Step 4: Calculate precision, recall and F1, guarding every division against a zero denominator
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

    # Step 5: Assemble the one-row statistics table for the route.
    # Rates are fractions of the rows that have a non-null 'true_label'.
    labelled_flights = df_route['true_label'].count()
    total_delayed = df_route['true_label'].sum()
    df_route_stats = pd.DataFrame({
        'route': [route],
        'total_flights': [df_route.shape[0]],
        'total_delayed': [total_delayed],
        'total_ontime': [labelled_flights - total_delayed],
        'percent_delayed': [_safe_ratio(total_delayed, labelled_flights)],  # Fraction in [0, 1], not multiplied by 100
        'total_FP': [FP],
        'total_FN': [FN],
        'total_TP': [TP],
        'total_TN': [TN],
        'FP_rate': [_safe_ratio(FP, labelled_flights)],
        'FN_rate': [_safe_ratio(FN, labelled_flights)],
        'TP_rate': [_safe_ratio(TP, labelled_flights)],
        'TN_rate': [_safe_ratio(TN, labelled_flights)],
        'accuracy': [_safe_ratio(TP + TN, labelled_flights)],
        'precision': [precision],
        'recall': [recall],
        'f1_score': [f1_score]
    })

    # Step 6: Plot a histogram to visualize the distribution of predicted probabilities
    fig_hist = px.histogram(df_route,
                            x='predicted_prob_class_1',
                            nbins=20,
                            color='Prediction',
                            color_discrete_map={'On-Time': 'cornflowerblue', 'Delayed': 'crimson'})
    fig_hist.update_xaxes(title_text='Probability of Delay', range=[0, 1])
    fig_hist.update_yaxes(title_text='No. of Predictions Made')

    # Step 7: Customize histogram appearance (bin opacity and border)
    fig_hist.update_traces(marker=dict(
        line=dict(color='black', width=1.5),  # Black border for better visibility
        opacity=0.5  # Set opacity for visual clarity
    ))

    # Step 8: Add a vertical line and a label marking the selected threshold.
    # y is in "paper" coordinates, so the line spans the lower 90% of the plot height.
    fig_hist.add_shape(
        type="line",
        x0=threshold, y0=0, x1=threshold, y1=0.9,
        line=dict(color="red", width=5, dash="dash"),  # Red dashed line for emphasis
        xref="x", yref="paper"
    )
    fig_hist.add_annotation(
        x=threshold, y=0.97,
        text=f"Threshold = {threshold:.1f}",  # Label is rounded to one decimal place
        showarrow=False,
        yshift=10,  # Slight vertical offset for better label placement
        font=dict(color="red"),
        xref="x",
        yref="paper"
    )

    # Step 9: Adjust figure layout (size, margins, legend position)
    fig_hist.update_layout(
        autosize=False,
        width=600,
        height=400,
        margin=dict(l=20, r=20, t=0, b=20),
        legend=dict(
            # x > 1 with a right anchor puts the legend beside the plot area, near the bottom
            x=1.20,
            xanchor="right",
            y=0.18,
            yanchor="top"
        )
    )

    # Step 10: Donut charts showing how often each predicted class was right or wrong

    # Predicted delayed: True Positives vs. False Positives
    fig_pie_delayed = _prediction_outcome_pie(
        title='Departure Delay',
        trace_name='Delayed',
        labels=['Correctly Predicted (TP)', 'Falsely Predicted (FP)'],
        values=[TP, FP],
        colors=['crimson', 'lightcoral'],
    )

    # Predicted on-time: True Negatives vs. False Negatives
    fig_pie_ontime = _prediction_outcome_pie(
        title='Departure On-Time',
        trace_name='On-Time',
        labels=['Correctly Predicted (TN)', 'Falsely Predicted (FN)'],
        values=[TN, FN],
        colors=['cornflowerblue', 'lightblue'],
    )

    # Step 11: Return prediction statistics and visualization figures
    return df_route_stats, fig_hist, fig_pie_delayed, fig_pie_ontime
