In [None]:
import pandas as pd
import os
import plotly.express as px
import plotly.io as pio

In [None]:
# Folder that the delay CSVs used in this notebook are read from.
DATASETS_PATH = 'data/Arrival_Departure/Working'

# Name and location of the dataset treated as the "master" one.
MASTER_DATASET = 'post_covid'
MASTER_DATASET_PATH = os.path.join(DATASETS_PATH, MASTER_DATASET)

# Route IDs of interest, kept as strings.
imp_routes = [
    '22', '29', '15', '45', '44', '42', '17',
    '23', '31', '26', '111', '24', '33', '14',
]

In [None]:
def process_csv(path, imp_routes = [ '22', '29', '15', '45', '44', '42', '17', '23', '31', '26', '111', '24', '33', '14']):
    """Load a delay CSV and keep only clean rows for the selected routes.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain the columns ``route_id``,
        ``service_date`` and ``delay``.
    imp_routes : iterable, optional
        Route IDs to keep. Values are compared as strings, so ``22`` and
        ``'22'`` select the same route. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        A new frame in which rows containing any missing value have been
        dropped, only the requested routes remain, ``service_date`` is parsed
        to datetime, and negative ``delay`` values are raised to 0.
    """
    # Read route_id as text: when every ID in the file is numeric, pandas
    # would otherwise infer an integer column and the string route IDs below
    # would match nothing, silently producing an empty frame.
    df = pd.read_csv(path, dtype={'route_id': str})

    # Drop rows with a missing value in ANY column (not only 'delay').
    df = df.dropna()

    wanted_routes = [str(route) for route in imp_routes]
    df = df[df['route_id'].isin(wanted_routes)]

    # .assign builds a new frame, so no column is written onto a filtered
    # slice (which would raise SettingWithCopyWarning).
    return df.assign(
        service_date=pd.to_datetime(df['service_date']),
        delay=df['delay'].clip(lower=0),
    )

In [None]:
def _status_counts_by_route(df, delay_threshold):
    """Count 'Delayed' vs 'On Time' rows per route and return them in long form."""
    statuses = ['Delayed', 'On Time']

    labelled = df.assign(
        status=df['delay'].gt(delay_threshold).map({True: 'Delayed', False: 'On Time'})
    )

    counts = (
        labelled.groupby(['route_id', 'status'])
        .size()
        .unstack(fill_value=0)
        # Guarantee both columns exist even when one status never occurs;
        # otherwise the melt below raises KeyError.
        .reindex(columns=statuses, fill_value=0)
    )

    return counts.reset_index().melt(
        id_vars='route_id',
        value_vars=statuses,
        var_name='Status',
        value_name='Count',
    )


def delay_freq(path, delay_threshold = 3, period = None, save_plot = False):
    """Plot, per route, how many records are delayed versus on time.

    Parameters
    ----------
    path : str or path-like
        CSV file passed to ``process_csv``.
    delay_threshold : number, optional
        A record counts as 'Delayed' when its ``delay`` is strictly greater
        than this value; otherwise it is 'On Time'.
    period : str, optional
        Suffix inserted into the output file name. Required when
        ``save_plot`` is True.
    save_plot : bool, optional
        When True, the figure is also written to
        ``./Plots/Delay_Freq/delay_freq_<period>.html``.

    Returns
    -------
    None
        The figure is displayed (and optionally saved) as a side effect.

    Raises
    ------
    ValueError
        If ``save_plot`` is True but ``period`` is None.
    """
    # Fail before loading and plotting rather than with an opaque TypeError
    # from string concatenation after the figure has been built.
    if save_plot and period is None:
        raise ValueError("period must be provided when save_plot is True")

    df = process_csv(path)
    melted = _status_counts_by_route(df, delay_threshold)

    fig = px.bar(
        melted,
        x='route_id',
        y='Count',
        color='Status',
        color_discrete_map={
            'On Time': 'blue',
            'Delayed': 'red'
        },
        barmode='stack',
        title='Frequency of Delays vs On-Time by Route',
        # The bars stack delayed AND on-time rows, so the axis is a record
        # count, not a delay count.
        labels={'route_id': 'Route ID', 'Count': 'Number of Records'}
    )

    fig.show()

    if save_plot:
        out_path = "./Plots/Delay_Freq/delay_freq_"+period+".html"
        # Create the output folder on first use so write_html does not fail.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.write_html(out_path)

    return

# Pre-COVID

In [None]:
# Pre-COVID run: delay values above 4 count as "Delayed"; the plot is saved.
d = 'data/Arrival_Departure/Working/delay_pre_covid.csv'
period = '_pre_covid'

delay_freq(d, delay_threshold=4, period=period, save_plot=True)

# COVID

In [None]:
# COVID run: delay values above 4 count as "Delayed"; the plot is saved.
d = 'data/Arrival_Departure/Working/delay_covid.csv'
period = '_covid'

delay_freq(d, delay_threshold=4, period=period, save_plot=True)

# Post-COVID

In [None]:
# Post-COVID run: delay values above 4 count as "Delayed"; the plot is saved.
d = 'data/Arrival_Departure/Working/delay_post_covid.csv'
period = '_post_covid'

delay_freq(d, delay_threshold=4, period=period, save_plot=True)