## Packages

In [None]:
# =============================================================================
# TRANSLINK BRISBANE OD ANALYSIS
# Complete Analysis Pipeline
# =============================================================================

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
import calendar
import logging
import warnings
warnings.filterwarnings('ignore')

# Plot style (clean, blog-friendly): select matplotlib's "default" style sheet,
# then apply seaborn's "talk" plotting context.
plt.style.use("default")
sns.set_context("talk")

In [None]:
# Logging configuration: timestamped records, WARNING threshold by default
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.WARNING)

# The notebook's own logger is allowed to emit INFO messages
app_logger = logging.getLogger("analysis")
app_logger.setLevel(logging.INFO)

# These library loggers are pinned to WARNING
for _quiet_name in ("selenium", "kaleido", "plotly"):
    logging.getLogger(_quiet_name).setLevel(logging.WARNING)
del _quiet_name

In [None]:
# Folder layout, resolved relative to the parent of the working directory
PROJECT_ROOT = Path.cwd().resolve().parents[0]
DATA_DIR = PROJECT_ROOT.joinpath("data")
FIGURES_PATH = PROJECT_ROOT.joinpath("analysis", "figures")

# Input files
DATA_FILE_PATH = DATA_DIR.joinpath("processed", "od_data.csv")
SUBURBS_SHAPEFILE_PATH = DATA_DIR.joinpath("suburbs", "Locality_Boundaries.shp")

In [None]:
# Analysis parameters
offset_years = 3    # Calendar years
trend_years = 3     # For plotting
analysis_year = 1
lga = "All"         # "All" keeps every LGA
locality = "All"    # "All" keeps every suburb

# Colour per transport mode
mode_colors = dict(
    Bus="#4E79A7",
    Rail="#59A14F",
    Ferry="#F28E2B",
)

# Colour per ticket type
ticket_colors = dict(
    zip(
        ("Go Card", "Emv", "Paper"),
        ("#004b88", "#00857C", "#B08A00"),
    )
)

In [None]:
# Figure styling
def apply_plotly_style(fig, title=None, x_title=None, y_title=None, analysis_period_text=None):
    """
    Apply consistent formatting to a Plotly figure.

    Sets the ``plotly_white`` template, disables drag interaction, centres the
    heading and draws mirrored black axis lines on the figure's x- and y-axes.

    Args:
        fig: Figure to restyle; it is updated in place.
        title: Main heading text. Left out of the heading when ``None``.
        x_title: Title passed to ``update_xaxes``.
        y_title: Title passed to ``update_yaxes``.
        analysis_period_text: Subtitle shown in small gray text on a second
            heading line. Left out of the heading when ``None``.

    Returns:
        The same ``fig`` object that was passed in.
    """
    # Build the heading from the parts that were actually supplied so that a
    # missing title or subtitle is not rendered as the literal text "None".
    heading_parts = []
    if title is not None:
        heading_parts.append(f"{title}")
    if analysis_period_text is not None:
        heading_parts.append(
            f"<span style='font-size:12px; color:gray;'>{analysis_period_text}</span>"
        )
    heading_text = "<br>".join(heading_parts)

    fig.update_layout(
        template='plotly_white',
        dragmode=False,
        font=dict(size=12, color='black'),
        title={'text': heading_text,
               'x': 0.5,
               'xanchor': 'center',
               'font': {'size': 14,
                        'color': 'black'}},
        hoverlabel=dict(bgcolor='white'),
        margin=dict(t=50, l=60, r=30, b=80)
    )

    fig.update_xaxes(
        title_text=x_title,
        showline=True,
        linecolor='black',
        linewidth=1.5,
        mirror=True
    )

    fig.update_yaxes(
        title_text=y_title,
        showline=True,
        linecolor='black',
        linewidth=1.5,
        mirror=True,
        fixedrange=False
    )

    return fig

# Analysis period
def analysis_period(df):
    """
    Describe the span of ``df['date']`` as a subtitle string.

    Args:
        df: DataFrame with a ``date`` column of datetime-like values.

    Returns:
        ``"Analysis Period: <start>"`` when only one distinct date is present,
        ``"Analysis Period: <start> to <end>"`` otherwise, with each bound
        formatted as ``%Y-%B`` (e.g. ``2024-January``). When the column holds
        no usable dates, ``"Analysis Period: No data"`` is returned instead of
        raising.
    """
    # Missing dates are ignored; NaT cannot be formatted with strftime.
    dates = pd.to_datetime(df['date']).dropna()
    if dates.empty:
        return "Analysis Period: No data"

    start = dates.min().strftime("%Y-%B")
    end = dates.max().strftime("%Y-%B")

    if dates.nunique() == 1:
        return f"Analysis Period: {start}"
    return f"Analysis Period: {start} to {end}"

# Load and Prep Data

In [None]:
# =============================================================================
# 1. DATA LOADING AND PREPARATION
# =============================================================================

In [None]:
def _attach_suburb_attributes(df: pd.DataFrame, suburb_lookup: pd.DataFrame, side: str) -> pd.DataFrame:
    """Left-join suburb name, LGA and centroid onto ``df`` for one trip end (``side``)."""
    return (
        pd.merge(
            df,
            suburb_lookup,
            how='left',
            left_on=f'{side}_loc_code',
            right_on='loc_code',
        )
        .rename(columns={
            'locality': f'{side}_locality',
            'lga': f'{side}_lga',
            'longitude': f'{side}_longitude',
            'latitude': f'{side}_latitude',
        })
        .drop('loc_code', axis=1)
    )


def load_and_prepare_data(data_file_path: Path,
                          suburbs_shapefile_path: Path,
                          offset_years: int,
                          lga: str,
                          locality: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Load the OD CSV, attach suburb attributes and apply the notebook filters.

    Args:
        data_file_path: CSV read with ``pd.read_csv``. The code reads the
            columns ``origin_loc_code``, ``destination_loc_code``, ``month``
            (parsed as ``%Y-%m``), ``time_period``, ``mode``, ``ticket_type``
            and ``quantity`` from it.
        suburbs_shapefile_path: Boundary file read with ``gpd.read_file``; it
            must provide ``locality``, ``lga`` and ``loc_code``.
        offset_years: Rows are kept when their year lies between the latest
            year in the data minus ``offset_years`` and the latest year,
            both inclusive.
        lga: LGA name to keep (matching origin or destination), or ``"All"``.
        locality: Suburb name to keep (matching origin or destination), or
            ``"All"``. Compared before the names are stripped/title-cased.

    Returns:
        ``(unfiltered_data, df)``: a copy of the merged frame taken before any
        filtering or derived columns, and the filtered, cleaned frame with the
        added ``date``, ``year``, ``month_num``, ``month_name``, ``quarter``,
        ``year_quarter``, ``day_type`` and ``patronage`` columns.
    """
    app_logger.info("Loading and preparing data...")

    df = pd.read_csv(data_file_path, low_memory=False)
    suburbs = gpd.read_file(suburbs_shapefile_path)
    suburbs['longitude'] = suburbs.centroid.x
    suburbs['latitude'] = suburbs.centroid.y
    suburb_lookup = suburbs[['locality', 'lga', 'loc_code', 'longitude', 'latitude']]

    # Attach suburb attributes for both ends of every trip
    df = _attach_suburb_attributes(df, suburb_lookup, 'origin')
    df = _attach_suburb_attributes(df, suburb_lookup, 'destination')

    unfiltered_data = df.copy()
    app_logger.info("Total records before filtering: %d", len(df))

    # Filter time period
    app_logger.info("Filtering to last %s years...", offset_years)
    df["date"] = pd.to_datetime(df["month"], format='%Y-%m')
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month_name'] = df['date'].dt.month_name()
    df['quarter'] = df['date'].dt.quarter
    df['year_quarter'] = df['year'].astype(str) + '-Q' + df['quarter'].astype(str)
    latest_year = df['date'].max().year
    df = df[df['year'].between(latest_year - offset_years, latest_year)]
    app_logger.info("Total records after time filtering: %d", len(df))

    # Filter geographic data (a row is kept when either trip end matches)
    if lga != "All":
        app_logger.info("Filtering to %s lga...", lga)
        df = df[(df['origin_lga'] == lga) | (df['destination_lga'] == lga)]
    if locality != "All":
        app_logger.info("Filtering to %s locality...", locality)
        df = df[(df['origin_locality'] == locality) | (df['destination_locality'] == locality)]
    df = df.drop(columns=['origin_loc_code', 'destination_loc_code', 'origin_lga', 'destination_lga'])
    app_logger.info("Total records after geographic filtering: %d", len(df))

    # Weekend when the time_period text contains 'Weekend', otherwise Weekday
    # (missing values therefore count as Weekday)
    is_weekend = df['time_period'].astype(str).str.contains('Weekend', regex=False, na=False)
    df['day_type'] = np.where(is_weekend, 'Weekend', 'Weekday')

    # Standardize category labels
    df['mode'] = df['mode'].str.title()
    df['ticket_type'] = df['ticket_type'].str.title()

    # Clean suburb names
    df['origin_locality'] = df['origin_locality'].str.strip().str.title()
    df['destination_locality'] = df['destination_locality'].str.strip().str.title()

    # Ensure patronage is numeric; unparseable quantities become 0
    df['patronage'] = pd.to_numeric(df['quantity'], errors='coerce').fillna(0)

    print(f"Dataset loaded: {len(df):,} records")
    print(f"Date range: {df['date'].min().strftime('%B %Y')} to {df['date'].max().strftime('%B %Y')}")
    print(f"Total patronage: {df['patronage'].sum():,.0f} passengers")

    return unfiltered_data, df

In [None]:
# Run the loader with the notebook parameters; `data` is the filtered frame
# used by the cells below, `unfiltered_data` the merged frame before filtering.
unfiltered_data, data = load_and_prepare_data(
    data_file_path=DATA_FILE_PATH,
    suburbs_shapefile_path=SUBURBS_SHAPEFILE_PATH,
    offset_years=offset_years,
    lga=lga,
    locality=locality
)

## Generate Summary Statistics

In [None]:
def generate_summary_statistics(df: pd.DataFrame) -> dict:
    """
    Print and return summary statistics for the dataset.

    Prints the headline figures, followed by total patronage and percentage
    share per transport mode and per ticket type (largest first).

    Args:
        df: Frame with ``date``, ``patronage``, ``origin_locality``,
            ``destination_locality``, ``mode`` and ``ticket_type`` columns.

    Returns:
        The dictionary of headline figures that was printed, keyed by label.
    """
    print("\n" + "="*60)
    print("DATASET SUMMARY STATISTICS")
    print("="*60)

    total_patronage = df['patronage'].sum()

    summary = {
        "Total Records": len(df),
        "Date Range": f"{df['date'].min().strftime('%B %Y')} to {df['date'].max().strftime('%B %Y')}",
        "Total Patronage": f"{total_patronage:,.0f} patronages",
        "Unique Origins": df['origin_locality'].nunique(),
        "Unique Destinations": df['destination_locality'].nunique(),
        "Unique OD Pairs": df.groupby(['origin_locality', 'destination_locality']).ngroups,
        "Transport modes": df['mode'].unique(),
        "Ticket Types": df['ticket_type'].unique(),
    }

    for key, value in summary.items():
        print(f"{key}: {value}")

    print("\n--- Patronage by Mode ---")
    mode_summary = df.groupby('mode')['patronage'].sum().sort_values(ascending=False)
    for mode, pat in mode_summary.items():
        pct = pat / total_patronage * 100
        print(f"{mode}: {pat:,.0f} ({pct:.1f}%)")

    print("\n--- Patronage by Ticket Type ---")
    ticket_summary = df.groupby('ticket_type')['patronage'].sum().sort_values(ascending=False)
    for ticket, pat in ticket_summary.items():
        pct = pat / total_patronage * 100
        print(f"{ticket}: {pat:,.0f} ({pct:.1f}%)")

    # The signature promises a dict; hand the figures back to the caller.
    return summary
# Print the summary for the filtered dataset.
generate_summary_statistics(data)

# Temporal Analysis

## Monthly Trends

### Patronage

In [None]:
def plot_monthly_trends(df, trend_years=1, save_path='graphs/'):
    """
    Plot total monthly patronage with a dotted average line per calendar year.

    Args:
        df: Prepared data with ``date``, ``year`` and ``patronage`` columns.
        trend_years: Rows are kept when their year lies between the latest
            year in ``df`` minus ``trend_years`` and the latest year, inclusive.
        save_path: Directory the chart is written to as
            ``monthly_trends.html``.

    Returns:
        The Plotly figure. A short text summary comparing the first and last
        year's average monthly patronage is printed as a side effect.
    """
    latest_year = df['date'].max().year
    df = df[df['year'].between(latest_year - trend_years, latest_year)]
    # Analysis period
    analysis_period_text = analysis_period(df)
    monthly = df.groupby('date')['patronage'].sum().reset_index()

    fig = px.line(
        monthly,
        x='date',
        y='patronage',
        markers=True,
        hover_data={}
    )

    fig.update_traces(line=dict(width=2, color='#1f77b4'), marker=dict(color='#1f77b4', size=6))

    # ---- Yearly average horizontal segments ----
    yearly_avg = (
        monthly
        .assign(year=monthly['date'].dt.year)
        .groupby('year')['patronage']
        .mean()
    )

    for year, avg in yearly_avg.items():
        # Each segment spans the full calendar year, even if data is partial
        start_date = pd.Timestamp(year=year, month=1, day=1)
        end_date = pd.Timestamp(year=year, month=12, day=31)

        fig.add_shape(
            type="line",
            x0=start_date,
            x1=end_date,
            y0=avg,
            y1=avg,
            line=dict(dash="dot", width=2),
            opacity=0.7
        )

        fig.add_annotation(
            x=end_date,
            y=avg,
            text=f"{year} Avg: {avg / 1_000_000:.2f}M",
            showarrow=False,
            xanchor="right",
            yanchor="bottom",
            font=dict(size=11)
        )

    # Apply the reusable style
    fig = apply_plotly_style(
        fig,
        title='Monthly Patronage',
        x_title='Month',
        y_title='Patronage',
        analysis_period_text=analysis_period_text
    )

    # Hover shows patronage in millions
    fig.update_traces(
        customdata=np.array(fig.data[0].y)/1e6,
        hovertemplate='<b>%{x|%b %Y}</b><br>Patronage: %{customdata:,.2f} M<extra></extra>'
    )

    # Add vertical lines at the start of each year
    for year in df['year'].unique():
        fig.add_vline(
            x=pd.Timestamp(f'{year}-01-01'),
            line_dash="dash",
            line_color="gray",
            opacity=0.5
        )

    # Save files
    fig.write_html(f'{save_path}/monthly_trends.html', include_plotlyjs='cdn')

    # Statistics: compare the first and last year's monthly average
    start_year = yearly_avg.index.min()
    end_year = yearly_avg.index.max()

    start_avg = yearly_avg.loc[start_year]
    end_avg = yearly_avg.loc[end_year]

    abs_change = end_avg - start_avg
    pct_change = (abs_change / start_avg) * 100

    # Word the summary according to the sign of the change instead of always
    # reporting an "increase" (which read as "increased by -N" on a decline).
    if abs_change >= 0:
        verb, noun = "increased", "increase"
    else:
        verb, noun = "decreased", "decrease"

    print(f"Patronage: {monthly['patronage'].sum():,.0f} patronage")
    print(f"Average patronage {verb} from {start_year} to {end_year} by {abs(abs_change):,.0f} patronage per month.")
    print(f"That represents a {abs(pct_change):.2f}% {noun} of patronage per month over the period.")

    return fig

# Draw the monthly trend for the `trend_years` window and save it under FIGURES_PATH.
plot_monthly_trends(data, trend_years = trend_years, save_path=FIGURES_PATH)

## Mode Analysis

In [None]:
def plot_mode_trends(df, trend_years=1, save_path='graphs/', mode_colors=None):
    """
    Plot monthly patronage per transport mode with yearly average lines.

    Args:
        df: Prepared data with ``date``, ``year``, ``mode`` and ``patronage``.
        trend_years: Rows are kept when their year lies between the latest
            year in ``df`` minus ``trend_years`` and the latest year, inclusive.
        save_path: Directory the chart is written to as ``mode_trends.html``.
        mode_colors: Optional mapping of mode name to line colour. Modes that
            are not in the mapping (or a ``None`` mapping) get no explicit
            colour.

    Returns:
        The Plotly figure. The change in average monthly patronage between the
        first and last year is printed per mode as a side effect.
    """
    palette = mode_colors or {}

    latest_year = df['date'].max().year
    df = df[df['year'].between(latest_year - trend_years, latest_year)]
    # Analysis period
    analysis_period_text = analysis_period(df)
    mode_monthly = df.groupby(['date', 'mode'])['patronage'].sum().reset_index().sort_values(by=['date', 'mode'])

    # Compute yearly averages
    mode_monthly['year'] = mode_monthly['date'].dt.year
    yearly_avg = mode_monthly.groupby(['year', 'mode'])['patronage'].mean().reset_index()

    fig = go.Figure()
    for mode in mode_monthly['mode'].unique():
        df_mode = mode_monthly[mode_monthly['mode'] == mode]

        # Main line
        fig.add_trace(
            go.Scatter(
                x=df_mode['date'],
                y=df_mode['patronage'],
                mode='lines+markers',
                name=mode,
                line=dict(color=palette.get(mode), width=2),
                marker=dict(size=6),
                legendgroup=mode,
                customdata=np.stack((df_mode['mode'], df_mode['patronage'] / 1_000_000), axis=-1),
                hovertemplate='%{customdata[0]}: %{customdata[1]:.1f} M<extra></extra>'
            )
        )

        # Average lines (one per year); they share the mode's legend group
        df_avg_mode = yearly_avg[yearly_avg['mode'] == mode]
        for _, row in df_avg_mode.iterrows():
            year = row['year']
            avg = row['patronage']
            start_date = pd.Timestamp(year=year, month=1, day=1)
            end_date = pd.Timestamp(year=year, month=12, day=31)

            fig.add_trace(
                go.Scatter(
                    x=[start_date, end_date],
                    y=[avg, avg],
                    mode='lines+text',
                    line=dict(color='gray', dash='dot', width=1),
                    name=f"{mode} {year} Avg",
                    text=[' ', f"{mode} {year} Avg: {avg/1_000_000:.1f}M"],
                    textposition='top left',
                    textfont=dict(size=10),
                    showlegend=False,
                    legendgroup=mode,
                    hoverinfo='skip'
                )
            )

    # Empty trace that only provides the legend key for the dotted average
    # lines. It gets its own legend group: reusing the loop variable here tied
    # the key to whichever mode happened to be iterated last (and raised a
    # NameError when there were no modes at all).
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            line=dict(color='gray', dash='dot', width=1),
            name="Average<br>Monthly<br>Patronage",
            legendgroup='yearly-average'
        )
    )

    fig.update_layout(
        template='plotly_white',
        hovermode='x unified'
    )

    fig = apply_plotly_style(
        fig,
        title='Monthly Patronage by Mode',
        x_title='Month',
        y_title='Patronage',
        analysis_period_text=analysis_period_text
    )

    # Save files
    fig.write_html(f'{save_path}/mode_trends.html', include_plotlyjs='cdn')

    # Statistics
    start_year = yearly_avg['year'].min()
    end_year = yearly_avg['year'].max()

    for mode in yearly_avg['mode'].unique():
        avg_by_year = yearly_avg[yearly_avg['mode'] == mode].set_index('year')['patronage']
        # A mode may have no rows in the first or last year of the window
        if start_year not in avg_by_year.index or end_year not in avg_by_year.index:
            print(f"{mode}: no data in {start_year} and/or {end_year}; change not computed")
            continue

        start_avg = avg_by_year.loc[start_year]
        end_avg = avg_by_year.loc[end_year]

        abs_change = end_avg - start_avg
        pct_change = (abs_change / start_avg) * 100

        print(f"{mode}: Average patronage change from {start_year} to {end_year}: "
            f"{abs_change:,.0f} patronages per month ({pct_change:.2f}%)")

    return fig

# Draw the per-mode trend chart and save it under FIGURES_PATH.
# NOTE(review): this passes `offset_years` as the look-back, while the monthly
# chart uses `trend_years` (both are 3 in the Params cell) -- confirm intended.
plot_mode_trends(data, trend_years=offset_years, mode_colors=mode_colors, save_path=FIGURES_PATH)

In [None]:
def plot_mode_share_grouped_bar(df, trend_years=4, mode_colors=None, save_path='graphs/'):
    """
    Draw each mode's share of yearly patronage as grouped (unstacked) bars.

    Modes sit on the x-axis and every year in the look-back window gets its
    own bar inside a mode's group. The chart is written to
    ``<save_path>/mode_share_grouped_bar.html`` and the difference in share
    between the earliest and latest year is printed per mode.
    ``mode_colors`` is accepted but not used by this chart.
    """
    # Keep the latest year in the data plus the `trend_years` before it
    latest_year = df['date'].max().year
    in_window = df['year'].between(latest_year - trend_years, latest_year)
    df = df[in_window]
    analysis_period_text = analysis_period(df)

    # Denominator: patronage across all modes per year
    totals_by_year = (
        df.groupby('year')['patronage']
        .sum()
        .reset_index()
        .rename(columns={'patronage': 'year_total'})
    )

    # Numerator: patronage per mode and year
    patronage_by_mode_year = (
        df.groupby(['mode', 'year'])['patronage']
        .sum()
        .reset_index()
    )

    mode_year_share = patronage_by_mode_year.merge(totals_by_year, on='year')
    mode_year_share['share_percent'] = (
        mode_year_share['patronage'] / mode_year_share['year_total'] * 100
    )

    # Years become strings so they are coloured as discrete categories
    mode_year_share['year'] = mode_year_share['year'].astype(str)

    bar_labels = mode_year_share['share_percent'].apply(lambda share: f"{share:.0f}%")
    fig = px.bar(
        mode_year_share,
        x='mode',
        y='share_percent',
        color='year',
        barmode='group',
        text=bar_labels,
        color_discrete_sequence=px.colors.qualitative.Safe
    )

    fig.update_traces(textposition='outside')
    fig.update_layout(legend_title_text="Year")

    fig = apply_plotly_style(
        fig,
        title="Mode Share by Year",
        y_title='% of Patronage',
        x_title='Mode',
        analysis_period_text=analysis_period_text
    )

    fig.write_html(f'{save_path}/mode_share_grouped_bar.html', include_plotlyjs='cdn')

    # Share in the last year minus share in the first year, per mode
    stats = mode_year_share.pivot(index='mode', columns='year', values='share_percent')
    first_year = stats.columns.min()
    last_year = stats.columns.max()
    stats['pct_change'] = stats[last_year] - stats[first_year]

    print("---Change in Mode Share (%) per Mode---")
    for mode, share_delta in stats['pct_change'].items():
        print(f"{mode}: {share_delta:.1f}% change")

    return fig

# Draw the mode-share chart and save it under FIGURES_PATH.
# NOTE(review): the look-back is hard-coded to 3 here rather than taken from
# the `trend_years` parameter -- confirm intended.
plot_mode_share_grouped_bar(data, trend_years=3, mode_colors=mode_colors, save_path=FIGURES_PATH)

In [None]:
def plot_mode_by_day_type_pie(df, analysis_years=1, mode_colors=None, save_path='graphs/'):
    """
    Show the mode split of patronage as one pie chart per day type.

    The data is limited to the latest year in ``df`` and the
    ``analysis_years`` before it. Slices are coloured from ``mode_colors``
    (``#999999`` for modes without an entry) and the figure is saved as
    ``<save_path>/mode_day_type_pie.html`` before being returned.
    """
    latest_year = df['date'].max().year
    within_window = df['year'].between(latest_year - analysis_years, latest_year)
    df = df[within_window]

    # Patronage per (day type, mode) and its fraction of the day type's total
    mode_day = df.groupby(['day_type', 'mode'])['patronage'].sum().reset_index()
    mode_day['percent'] = (
        mode_day.groupby('day_type')['patronage']
        .transform(lambda x: x / x.sum())
    )

    analysis_period_text = analysis_period(df)

    day_types = mode_day['day_type'].unique()

    # Two side-by-side domain cells, one pie each
    fig = make_subplots(
        rows=1,
        cols=2,
        specs=[[{'type': 'domain'}, {'type': 'domain'}]],
        subplot_titles=[f"{day_label}" for day_label in day_types],
        horizontal_spacing=0.01
    )

    # Move the subplot titles below the pies
    for subplot_title in fig['layout']['annotations']:
        subplot_title['y'] = -0.10
        subplot_title['yanchor'] = 'top'

    for column, day_type in enumerate(day_types, start=1):
        day_rows = mode_day[mode_day['day_type'] == day_type]
        slice_text = day_rows['percent'].apply(lambda x: f"{x:.1%}")
        slice_colors = [mode_colors.get(m, '#999999') for m in day_rows['mode']]

        pie = go.Pie(
            labels=day_rows['mode'],
            values=day_rows['percent'],
            text=slice_text,
            textinfo='label+text',
            marker_colors=slice_colors,
            name=day_type,
            showlegend=False,
            hoverinfo='label+text'
        )
        fig.add_trace(pie, row=1, col=column)

    fig.update_layout(template='plotly_white')

    # Shared notebook styling
    fig = apply_plotly_style(
        fig,
        title='Mode Distribution by Day Type',
        x_title=None,
        y_title=None,
        analysis_period_text=analysis_period_text
    )

    fig.write_html(f'{save_path}/mode_day_type_pie.html', include_plotlyjs='cdn')

    return fig

# analysis_years=0 limits the pies to the latest calendar year in `data`.
plot_mode_by_day_type_pie(data, analysis_years = 0, mode_colors=mode_colors, save_path=FIGURES_PATH)

## Ticket Type

In [None]:
def plot_ticket_type_trends(df, trend_years=1, ticket_colors=None, save_path='graphs/'):
    """
    Stacked monthly bars of each ticket type's share of patronage.

    Rows are limited to the latest year in ``df`` and the ``trend_years``
    before it, and rows whose ticket type is ``count`` (any casing) are
    removed. The chart is saved as ``<save_path>/ticket_type_trends.html``
    and returned.
    """
    latest_year = df['date'].max().year
    window_mask = df['year'].between(latest_year - trend_years, latest_year)
    df = df[window_mask]

    # Drop the 'Count' ticket type
    is_count = df['ticket_type'].str.lower() == 'count'
    df = df[~is_count]

    # Patronage per month and ticket type, plus its fraction of the month
    monthly = (
        df.groupby(['date', 'ticket_type'])['patronage']
        .sum()
        .reset_index()
        .sort_values(by=['date', 'ticket_type'])
    )
    monthly['percent'] = (
        monthly.groupby(['date'])['patronage']
        .transform(lambda x: x/x.sum())
    )

    analysis_period_text = analysis_period(df)

    fig = px.bar(
        monthly,
        x='date',
        y='percent',
        color='ticket_type',
        color_discrete_map=ticket_colors,
        labels={'ticket_type': 'Ticket Type'},
        barmode='stack',
        text='percent',
        text_auto='.0%'
    )

    fig.update_traces(textangle=270, textfont_size=11)

    # Shared notebook styling
    fig = apply_plotly_style(
        fig,
        title='Ticket Type Trends Over Time',
        x_title='Month',
        y_title='% of Total Patronage',
        analysis_period_text=analysis_period_text
    )

    fig.update_layout(
        template='plotly_white',
        hovermode='x',
        barnorm='percent',
        legend_itemclick=False,
        legend_itemdoubleclick=False
    )

    # Hover text: patronage in millions plus the share, per ticket type trace
    hover_template = (
        "<b>%{x|%b %Y}</b><br>" +
        "Ticket Type: %{fullData.name}<br>" +
        "Patronage: %{customdata[0]:,.2f} M<br>" +
        "Share: %{customdata[1]:.1%}<extra></extra>"
    )
    for trace in fig.data:
        rows_for_trace = monthly[monthly['ticket_type'] == trace.name]
        trace.customdata = np.stack(
            [rows_for_trace['patronage']/1e6, rows_for_trace['percent']],
            axis=-1
        )
        trace.hovertemplate = hover_template

    # Dashed vertical marker at 1 January of every year in the data
    for year in df['year'].unique():
        fig.add_vline(
            x=pd.Timestamp(f'{year}-01-01'),
            line_dash="dash",
            line_color="gray",
            opacity=0.5
        )

    fig.write_html(f'{save_path}/ticket_type_trends.html', include_plotlyjs='cdn')

    return fig
# Ticket-type shares for the latest year in `data` and the one before it.
plot_ticket_type_trends(data, trend_years=1, ticket_colors=ticket_colors, save_path=FIGURES_PATH)

In [None]:
def plot_ticket_by_mode_pie(df, month=4, year=2025, ticket_colors=None, save_path='graphs/'):
    """
    One pie chart per mode showing the ticket type split, labelled with percentages.

    Args:
        df: Prepared data with ``date``, ``mode``, ``ticket_type`` and
            ``patronage`` columns.
        month: First month (1-12) of the period to include.
        year: Year of the first month to include. Every row dated on or after
            the first day of ``month``/``year`` is used.
        ticket_colors: Optional mapping of ticket type to slice colour;
            unmapped types are drawn in ``#999999``.
        save_path: Directory the chart is written to as
            ``ticket_by_mode_pie.html``.

    Returns:
        The Plotly figure.
    """
    palette = ticket_colors or {}

    # Filter data from the first day of the requested month onwards. Comparing
    # year and month separately (year >= Y and month >= M) would wrongly drop
    # the months before M in every later year.
    period_start = pd.Timestamp(year=year, month=month, day=1)
    df = df[df['date'] >= period_start]
    df = df[df['ticket_type'].str.lower() != 'count']

    # Analysis Period
    analysis_period_text = analysis_period(df)

    # Aggregate
    ticket_mode = df.groupby(['mode', 'ticket_type'])['patronage'].sum().reset_index()
    modes = ticket_mode['mode'].unique()

    # Create subplots: one domain cell per mode
    fig = make_subplots(
        rows=1,
        cols=len(modes),
        specs=[[{'type': 'domain'}]*len(modes)],
        subplot_titles=modes,
        horizontal_spacing=0.05
    )

    # Move the subplot titles below the pies
    for ann in fig['layout']['annotations']:
        ann['y'] = -0.10
        ann['yanchor'] = 'top'

    for i, mode in enumerate(modes, start=1):
        df_mode = ticket_mode[ticket_mode['mode'] == mode]
        fig.add_trace(
            go.Pie(
                labels=df_mode['ticket_type'],
                values=df_mode['patronage'],
                textinfo='label+percent',
                marker_colors=[palette.get(t, '#999999') for t in df_mode['ticket_type']],
                showlegend=False,
                hoverinfo='label+percent'
            ),
            row=1,
            col=i
        )

    fig = apply_plotly_style(
        fig,
        title=f"Ticket Type Distribution by Mode (From {calendar.month_name[month]}-{year})",
        x_title=None,
        y_title='% of Total Patronage',
        analysis_period_text=analysis_period_text
    )

    fig.write_html(f'{save_path}/ticket_by_mode_pie.html', include_plotlyjs='cdn')

    return fig

# Ticket-type split per mode, using April 2025 as the start of the period.
plot_ticket_by_mode_pie(data, month=4, year=2025, ticket_colors=ticket_colors, save_path=FIGURES_PATH)

## Origin Destination Analysis

In [None]:
def top_n_locality(df, top_n=20, analysis_years=0, mode_colors=None, save_path='graphs/'):
    """
    Chart the localities with the highest patronage, stacked by mode.

    Args:
        df: Prepared data with ``date``, ``origin_locality``,
            ``destination_locality``, ``mode`` and ``patronage`` columns.
        top_n: Number of localities to show.
        analysis_years: Rows are kept when their year lies between the latest
            year in ``df`` minus ``analysis_years`` and the latest year.
        mode_colors: Mapping of mode name to bar colour.
        save_path: Directory the chart is written to as
            ``top_n_locality_by_mode.html``.

    Returns:
        ``(top_n_localities, df, movement)``: the selected locality names in
        ascending order of total patronage, the filtered frame with missing
        locality names replaced by ``'Unknown'``, and the long-format frame
        with one row per trip end.
    """
    latest_year = df['date'].max().year
    # Work on an independent copy: the locality columns are overwritten below
    # and assigning into a filtered slice is a chained assignment.
    df = df[df['date'].dt.year.between(latest_year - analysis_years, latest_year)].copy()

    df['destination_locality'] = df['destination_locality'].fillna('Unknown')
    df['origin_locality'] = df['origin_locality'].fillna('Unknown')
    # Analysis Period
    analysis_period_text = analysis_period(df)

    # One row per trip end (origin and destination), carrying the patronage
    movement = (
        df.melt(
            value_vars=['origin_locality', 'destination_locality'],
            value_name='locality',
            id_vars='patronage'
        )
    )

    # Aggregate total movement per locality
    total_movement = (
        movement.groupby('locality')['patronage']
        .sum()
        .reset_index(name='total_patronage')
    )

    # Top N localities, smallest first
    # NOTE(review): this ranking counts a trip that starts and ends in the same
    # locality twice, whereas the per-mode totals plotted below count it once
    # -- confirm which measure is intended.
    top_n_localities = (
        total_movement.nlargest(top_n, 'total_patronage')
        .sort_values('total_patronage')
        ['locality']
        .tolist()
    )

    # Aggregate movement by OD pair and mode
    od_mode = (
        df.groupby(['origin_locality', 'destination_locality', 'mode'], as_index=False)['patronage'].sum()
    )

    # Credit each OD pair's patronage to its origin and, when it differs, to
    # its destination. Done with column operations rather than a row-wise
    # apply/explode over every OD pair.
    origin_side = (
        od_mode[['origin_locality', 'mode', 'patronage']]
        .rename(columns={'origin_locality': 'locality'})
    )
    crosses_localities = od_mode['origin_locality'] != od_mode['destination_locality']
    destination_side = (
        od_mode.loc[crosses_localities, ['destination_locality', 'mode', 'patronage']]
        .rename(columns={'destination_locality': 'locality'})
    )
    movement_by_mode = (
        pd.concat([origin_side, destination_side], ignore_index=True)
        .groupby(['locality', 'mode'], as_index=False)['patronage']
        .sum()
        .rename(columns={'patronage': 'total_patronage'})
    )

    # Filter to top N localities (copy: a column is added right after)
    movement_by_mode = (
        movement_by_mode[movement_by_mode['locality'].isin(top_n_localities)].copy()
    )

    # Add locality_patronage for sorting
    movement_by_mode['locality_patronage'] = movement_by_mode.groupby('locality')['total_patronage'].transform('sum')
    movement_by_mode = movement_by_mode.sort_values(by=['locality_patronage'], ascending=[True])

    # Plot bar chart
    fig = px.bar(
        movement_by_mode,
        x='total_patronage',
        y='locality',
        orientation='h',
        labels={'total_patronage': 'Total Patronage', 'locality': 'Locality', 'mode': 'Mode'},
        color='mode',
        color_discrete_map=mode_colors,
        barmode='stack'
    )

    fig = apply_plotly_style(
        fig,
        title=f'Top {top_n} Locality by Patronage by Mode',
        x_title='Patronage',
        y_title='Locality',
        analysis_period_text=analysis_period_text
    )

    # Save files
    fig.write_html(f'{save_path}/top_n_locality_by_mode.html', include_plotlyjs='cdn')

    fig.show()

    return top_n_localities, df, movement

# Top 20 localities for the latest calendar year in `data`.
# NOTE(review): this binds the notebook-level names `df` and `movement` to the
# function's filtered frame and long-format frame.
top_localities, df, movement = top_n_locality(data, top_n=20, analysis_years=0, mode_colors=mode_colors, save_path=FIGURES_PATH)

In [None]:
def top_n_od_pairs(df, top_n=10, analysis_years=1, mode_colors=None, save_path='graphs/'):
    """
    Draw a Sankey diagram of the busiest origin-destination pairs by mode.

    Trips whose origin and destination locality are the same are excluded.
    Links are coloured by mode; hovering a node shows its total patronage and
    mode share within the selected pairs. The figure is saved as
    ``<save_path>/top_n_OD_pairs.html`` and shown; nothing is returned.

    Args:
        df: Prepared data with ``date``, ``year``, ``mode``, ``patronage`` and
            the origin/destination locality, latitude and longitude columns.
        top_n: Number of OD pairs to keep, ranked by total patronage.
        analysis_years: Rows are kept when their year lies between the latest
            year in ``df`` minus ``analysis_years`` and the latest year.
        mode_colors: Optional mapping of mode name to link colour; unmapped
            modes are drawn in ``#888888``.
        save_path: Output directory for the HTML file.
    """
    palette = mode_colors or {}

    # Filter years
    latest_year = df['date'].max().year
    df = df[df['year'].between(latest_year - analysis_years, latest_year)]
    # Analysis Period
    analysis_period_text = analysis_period(df)

    # Remove same-origin/destination
    df_valid = df[df['origin_locality'] != df['destination_locality']]

    # Aggregate total patronage per OD pair
    od_totals = (
        df_valid.groupby(['origin_locality', 'destination_locality'])['patronage']
        .sum()
        .reset_index()
    )

    # Top N OD pairs
    top_n_od = (
        od_totals.nlargest(top_n, 'patronage')
        .assign(od_pair=lambda x: x['origin_locality'] + " → " + x['destination_locality'])
    )

    # Filter original df to only those OD pairs
    df_top_n = df_valid.merge(
        top_n_od[['origin_locality', 'destination_locality']],
        on=['origin_locality', 'destination_locality'],
        how='inner'
    )

    # Aggregate by mode
    modal_breakdown = (
        df_top_n.groupby([
            'origin_locality', 'destination_locality', 'mode',
            'origin_latitude', 'origin_longitude',
            'destination_latitude', 'destination_longitude'
        ])['patronage']
        .sum()
        .reset_index()
    )

    # --- Create node labels ---
    origins = modal_breakdown['origin_locality'].unique()
    destinations = modal_breakdown['destination_locality'].unique()

    # A trailing space keeps destination labels distinct from origin labels
    # when the same locality appears on both sides
    dest_labels = [f"{d} " for d in destinations]
    labels = list(origins) + dest_labels

    label_to_index = {label: i for i, label in enumerate(labels)}

    # --- Build links ---
    source, target, value, link_color, hover_text = [], [], [], [], []

    for _, row in modal_breakdown.iterrows():
        o = row['origin_locality']
        d = f"{row['destination_locality']} "
        m = row['mode']
        p = row['patronage']

        source.append(label_to_index[o])
        target.append(label_to_index[d])
        value.append(p)
        link_color.append(palette.get(m, '#888888'))
        hover_text.append(
            f"Origin: {o}<br>"
            f"Destination: {row['destination_locality']}<br>"
            f"Mode: {m}<br>"
            f"Patronage: {p:,}"
        )

    # NODE-LEVEL HOVER: total patronage + mode share
    origin_totals = modal_breakdown.groupby('origin_locality')['patronage'].sum()
    destination_totals = modal_breakdown.groupby('destination_locality')['patronage'].sum()

    # Mode share per node, indexed by (locality, mode). Dividing by a
    # group-wise transform keeps that two-level index on every pandas version;
    # groupby(...).apply(...) changes the index shape between versions, which
    # the hover text below previously depended on.
    origin_mode_totals = modal_breakdown.groupby(['origin_locality', 'mode'])['patronage'].sum()
    origin_mode_share = (
        origin_mode_totals / origin_mode_totals.groupby(level=0).transform('sum') * 100
    ).round(1)

    destination_mode_totals = modal_breakdown.groupby(['destination_locality', 'mode'])['patronage'].sum()
    destination_mode_share = (
        destination_mode_totals / destination_mode_totals.groupby(level=0).transform('sum') * 100
    ).round(1)

    # Build hover text for each node
    node_hover = []

    for label in labels:
        if label in origin_totals.index:
            role, name = "Origin", label
            total = origin_totals[name]
            shares = origin_mode_share.loc[name]
        else:
            # Destination labels carry the trailing space added above
            role, name = "Destination", label.rstrip()
            total = destination_totals[name]
            shares = destination_mode_share.loc[name]

        share_text = "<br>".join(
            f"{mode_name}: {share}%" for mode_name, share in shares.items()
        )

        node_hover.append(
            f"<b>{role}: {name}</b><br>"
            f"Total patronage: {total:,}<br><br>"
            f"<b>Mode share</b><br>{share_text}"
        )

    # Create Sankey
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            label=labels,
            line=dict(color="black", width=1),
            customdata=node_hover,
            hovertemplate="%{customdata}<extra></extra>"
        ),
        link=dict(
            source=source,
            target=target,
            value=value,
            color=link_color,
            customdata=hover_text,
            hovertemplate="%{customdata}<extra></extra>"
        )
    ))

    # Legend markers: placeholder traces on hidden secondary axes, added only
    # so that each mode colour gets a legend entry
    for mode, color in palette.items():
        fig.add_trace(go.Scatter(
            x=[0], y=[0],
            mode='lines',
            marker=dict(size=12, color=color),
            name=mode,
            showlegend=True,
            hoverinfo='skip',
            xaxis='x2',
            yaxis='y2',
            opacity=1,
        ))

    fig.update_layout(
        xaxis2=dict(visible=False),
        yaxis2=dict(visible=False),
        legend_title_text="Mode"
    )

    fig = apply_plotly_style(
        fig,
        title=f"{top_n} Origin Destination Pairs",
        analysis_period_text=analysis_period_text
    )

    # Save files
    fig.write_html(f'{save_path}/top_n_OD_pairs.html', include_plotlyjs='cdn')

    fig.show()
# Sankey of the 20 busiest OD pairs in the latest calendar year of `data`.
top_n_od_pairs(data, top_n=20, analysis_years = 0, mode_colors=mode_colors, save_path=FIGURES_PATH)

In [None]:
def analyze_suburb_growth(df, top_n=10, analysis_years=1, save_path='graphs/'):
    """
    Chart the localities with the largest year-over-year gains and losses.

    For every locality the average monthly patronage (origin plus destination
    trip ends) is computed per year. Localities are ranked for the latest year
    by ``impact_score`` (absolute change multiplied by ``log1p`` of the
    previous year's average); the ``top_n`` lowest and ``top_n`` highest are
    drawn as horizontal bars of percent change, green for a non-negative score
    and red otherwise. The figure is saved as
    ``<save_path>/top_n_highest_lowest_change.html`` and shown; nothing is
    returned.

    Args:
        df: Prepared data with ``date``, ``year``, ``patronage`` and the
            origin/destination locality columns.
        top_n: Number of localities taken from each end of the ranking.
        analysis_years: Rows are kept when their year lies between the latest
            year in ``df`` minus ``analysis_years`` and the latest year.
        save_path: Output directory for the HTML file.
    """
    # Filter to analysis years
    latest_year = df['date'].max().year
    df = df[df['year'].between(latest_year - analysis_years, latest_year)]
    # Analysis Period
    analysis_period_text = analysis_period(df)

    # Monthly patronage per locality, counting both trip ends
    monthly_movement = (
        df.melt(
            value_vars=['origin_locality', 'destination_locality'],
            value_name='locality',
            id_vars=['patronage', 'date']
        )
        .drop(columns=['variable'])
        .groupby(['date', 'locality'])['patronage']
        .sum()
        .reset_index()
    )

    # Average monthly patronage per year and locality
    yearly_avg = (
        monthly_movement
        .assign(year=monthly_movement['date'].dt.year)
        .groupby(['year', 'locality'])['patronage']
        .mean()
        .reset_index()
    )

    # Year-over-year change per locality
    yearly_avg = (
        yearly_avg
        .sort_values(['locality', 'year'])
        .assign(
            change=lambda d: d.groupby('locality')['patronage'].diff(),
            pct_change=lambda d: d.groupby('locality')['patronage'].pct_change() * 100,
            impact_score=lambda d: d['change'] * np.log1p(d.groupby('locality')['patronage'].shift(1))
        )
    )

    # Only the latest year's change is charted
    yearly_avg = yearly_avg[yearly_avg['year'] == yearly_avg['year'].max()]

    df_sorted = yearly_avg.sort_values('impact_score').dropna()
    lowest = df_sorted.head(top_n)
    highest = df_sorted.tail(top_n)

    # With fewer than 2 * top_n localities the two selections overlap; keep
    # each locality once so its bar is not drawn (and stacked) twice.
    selected = pd.concat([lowest, highest])
    selected = selected[~selected.index.duplicated(keep='first')]
    df_plot = selected.sort_values('pct_change')

    # Assign muted colors
    df_plot['color'] = np.where(
        df_plot['impact_score'] >= 0,
        '#6BAF92',   # muted green
        '#C97A7A'    # muted red
    )

    fig = px.bar(
        df_plot,
        x='pct_change',
        y='locality',
        orientation='h',
        color='color',
        color_discrete_map='identity'
    )

    fig.add_vline(x=0, line_width=1, line_color="black")
    # Dashed divider drawn after the first top_n bars
    divider_y = top_n - 0.5
    fig.add_hline(
        y=divider_y,
        line_width=2,
        line_color="black",
        line_dash="dash"
    )

    fig = apply_plotly_style(fig,
                             title=f'Top {top_n} Localities with Highest and Lowest Year-over-Year Impactful Change',
                             x_title='Year‑over‑Year Percent Change in Average Monthly Patronage',
                             y_title='Locality',
                             analysis_period_text=analysis_period_text
                            )

    # Save files
    fig.write_html(f'{save_path}/top_n_highest_lowest_change.html', include_plotlyjs='cdn')

    fig.show()

    

# Compare the latest calendar year in `data` with the year before it.
analyze_suburb_growth(data, top_n = 20, analysis_years = 1, save_path=FIGURES_PATH)