In [3]:
import pandas as pd
import numpy as np

# Step 0 - Environment preparation

# Connect with Google Drive (for Colab).
# `google.colab` is only importable inside a Colab runtime, so the import is
# guarded: elsewhere the cell reports that the mount was skipped instead of
# aborting the notebook with an ImportError. IN_COLAB records which case applied.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab not available - skipping Google Drive mount")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#!/usr/bin/env python3
"""
Hypothesis 1: Small Hub Revolution Analysis
Testing: Small hub airports demonstrate superior performance compared to other airport categories in post-COVID recovery

This script performs comprehensive statistical analysis to test whether small hub airports
are significantly outperforming other airport types during the post-COVID recovery period.

Statistical Analysis Framework:
- FAA-compliant airport classification system
- Growth rate analysis (aggregate and individual airport)
- Winner/loser distribution analysis
- Statistical significance testing (ANOVA, t-tests, chi-square)

Author: Aviation Recovery Research
Date: 2025
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, f_oneway
import warnings
warnings.filterwarnings('ignore')

def main():
    """Run the Hypothesis 1 ("Small Hub Revolution") analysis end to end.

    Steps, in order:
      1. Load T-100 passenger data (quarterly file first, consolidated file as
         fallback) from the Google Drive project folder.
      2. Total passengers per ORIGIN airport for the pre-COVID (2018-2019) and
         post-COVID (2023-2024) periods and compute the change between them.
      3. Keep airports with at least 10,000 passengers in both periods and
         assign each a hub category from its share of pre-COVID passengers.
      4. Print growth statistics, the winner/loser distribution and three
         significance tests (one-way ANOVA, Welch t-tests, chi-square).
      5. Score the hypothesis against three criteria, list top performers and
         try to save the result tables as CSV files in the project folder.

    Returns:
        dict: ``meaningful_airports`` (per-airport DataFrame including
        HUB_TYPE and PERFORMANCE_CATEGORY), ``growth_by_hub``,
        ``winner_distribution``, ``winner_counts`` (DataFrames indexed by hub
        type) and ``criteria_met`` (int between 0 and 3).

    Raises:
        FileNotFoundError: if neither input CSV can be found.
        ValueError: if no airport reaches the passenger threshold in both
            periods, so there is nothing to analyse.
    """
    print("=== HYPOTHESIS 1: SMALL HUB REVOLUTION ANALYSIS ===")
    print("Testing: Small hub airports demonstrate superior performance compared to other airport categories")
    print()

    # Every input and output file lives in this single Drive folder.
    data_dir = '/content/drive/MyDrive/airline_data_analysis_v2'

    # Minimum passengers an airport needs in EACH period to be analysed.
    # NOTE(review): period totals below are summed over both years of a period,
    # so this is a two-year total, not an annual figure - confirm that is intended.
    min_passengers = 10000

    # Data Loading and Preprocessing
    print("Data Loading and Initial Processing...")

    try:
        # Load quarterly aggregated T-100 data
        df = pd.read_csv(f'{data_dir}/t100_quarterly_aggregated.csv')
        print(f"Loaded quarterly aggregated data: {len(df):,} rows")
        data_source = "quarterly"
    except FileNotFoundError:
        # Fallback to consolidated data
        df = pd.read_csv(f'{data_dir}/consolidated_t100_data_cleaned.csv')
        print(f"Loaded consolidated data: {len(df):,} rows")
        data_source = "consolidated"

    print(f"Data source: {data_source}")
    print(f"Columns available: {list(df.columns)}")
    print(f"Years in data: {sorted(df['YEAR'].unique())}")
    print()

    # Time Period Definition
    print("Defining Analysis Time Periods...")
    PRE_COVID_YEARS = [2018, 2019]
    POST_COVID_YEARS = [2023, 2024]

    print(f"Pre-COVID baseline: {PRE_COVID_YEARS}")
    print(f"Post-COVID recovery: {POST_COVID_YEARS}")

    pre_covid_data = df[df['YEAR'].isin(PRE_COVID_YEARS)].copy()
    post_covid_data = df[df['YEAR'].isin(POST_COVID_YEARS)].copy()

    print(f"Pre-COVID records: {len(pre_covid_data):,}")
    print(f"Post-COVID records: {len(post_covid_data):,}")
    print()

    # Airport Classification System
    print("Implementing FAA-Compliant Airport Classification System...")

    def classify_airport_hub(passengers, total_us_passengers):
        """Return the hub category for one airport.

        The category is driven by the airport's percentage share of
        ``total_us_passengers``: >= 1% Major, >= 0.25% Medium, >= 0.05% Small.
        Below that, airports with at least ``min_passengers`` are
        "Non-Hub Primary" and everything else is "Regional/Local".
        A zero total yields "Regional/Local" to avoid dividing by zero.
        """
        if total_us_passengers == 0:
            return "Regional/Local"

        market_share = (passengers / total_us_passengers) * 100

        if market_share >= 1.0:
            return "Major Hub"
        elif market_share >= 0.25:
            return "Medium Hub"
        elif market_share >= 0.05:
            return "Small Hub"
        elif passengers >= min_passengers:
            return "Non-Hub Primary"
        else:
            return "Regional/Local"

    # Calculate passenger totals by airport for each period
    print("Calculating passenger totals by airport...")

    pre_covid_totals = pre_covid_data.groupby('ORIGIN')['PASSENGERS'].sum().reset_index()
    pre_covid_totals.columns = ['AIRPORT', 'PASSENGERS_PRE']

    post_covid_totals = post_covid_data.groupby('ORIGIN')['PASSENGERS'].sum().reset_index()
    post_covid_totals.columns = ['AIRPORT', 'PASSENGERS_POST']

    # Merge and calculate changes; airports missing from one period get 0 there
    airport_comparison = pd.merge(pre_covid_totals, post_covid_totals, on='AIRPORT', how='outer')
    airport_comparison = airport_comparison.fillna(0)

    airport_comparison['CHANGE_ABSOLUTE'] = airport_comparison['PASSENGERS_POST'] - airport_comparison['PASSENGERS_PRE']
    # Percent change is only defined for a positive baseline; otherwise report 0
    airport_comparison['CHANGE_PERCENT'] = np.where(
        airport_comparison['PASSENGERS_PRE'] > 0,
        (airport_comparison['CHANGE_ABSOLUTE'] / airport_comparison['PASSENGERS_PRE']) * 100,
        0
    )

    # Filter for meaningful airports (at least `min_passengers` in both periods,
    # counted on the period totals computed above)
    meaningful_airports = airport_comparison[
        (airport_comparison['PASSENGERS_PRE'] >= min_passengers) &
        (airport_comparison['PASSENGERS_POST'] >= min_passengers)
    ].copy()

    print(f"Total airports in analysis: {len(meaningful_airports)}")

    # Fail early with a clear message: every later step (classification,
    # group statistics, idxmax) needs at least one airport.
    if meaningful_airports.empty:
        raise ValueError(
            "No airports meet the minimum passenger threshold "
            f"({min_passengers:,}) in both periods; nothing to analyse"
        )

    # Calculate total US passengers for hub classification
    # NOTE(review): the denominator only covers airports that passed the filter
    # above, not every airport in the data - confirm this matches the intended
    # hub-share definition.
    total_us_passengers_pre = meaningful_airports['PASSENGERS_PRE'].sum()
    print(f"Total pre-COVID passengers (baseline): {total_us_passengers_pre:,.0f}")

    # Apply hub classification (a plain list is assigned positionally, which
    # avoids the overhead and empty-frame pitfalls of row-wise DataFrame.apply)
    meaningful_airports['HUB_TYPE'] = [
        classify_airport_hub(passengers, total_us_passengers_pre)
        for passengers in meaningful_airports['PASSENGERS_PRE']
    ]

    # Performance category per airport: change <= -5% is a Loser, change in
    # (-5%, 5%] is Stable, anything above 5% is a Winner.
    meaningful_airports['PERFORMANCE_CATEGORY'] = pd.cut(
        meaningful_airports['CHANGE_PERCENT'],
        bins=[-np.inf, -5, 5, np.inf],
        labels=['Loser', 'Stable', 'Winner']
    )

    print()

    # Metric 1: Growth Rate Analysis
    print("METRIC 1: GROWTH RATE ANALYSIS")
    print("=" * 60)

    def analyze_growth_by_hub_type(airports):
        """Summarise passenger totals and growth per hub type.

        Returns a DataFrame indexed by HUB_TYPE with airport count, period
        totals, mean/median/std of per-airport CHANGE_PERCENT and the
        aggregate growth computed from the period totals.
        """
        growth_stats = airports.groupby('HUB_TYPE').agg({
            'PASSENGERS_PRE': ['count', 'sum'],
            'PASSENGERS_POST': 'sum',
            'CHANGE_PERCENT': ['mean', 'median', 'std']
        }).round(1)

        # Flatten column names
        growth_stats.columns = ['Airport_Count', 'Total_Pre', 'Total_Post', 'Average_Growth', 'Median_Growth', 'Std_Growth']

        # Aggregate growth weights airports by size, unlike Average_Growth
        growth_stats['Aggregate_Growth'] = ((growth_stats['Total_Post'] - growth_stats['Total_Pre']) /
                                          growth_stats['Total_Pre'] * 100).round(1)

        return growth_stats

    growth_by_hub = analyze_growth_by_hub_type(meaningful_airports)

    print("GROWTH PERFORMANCE BY HUB TYPE:")
    print("-" * 80)
    print(f"{'Hub Type':<15} {'Count':<6} {'Pre-COVID (M)':<12} {'Post-COVID (M)':<13} {'Aggregate %':<11} {'Average %':<10}")
    print("-" * 80)

    for hub_type in growth_by_hub.index:
        row = growth_by_hub.loc[hub_type]
        pre_millions = row['Total_Pre'] / 1_000_000
        post_millions = row['Total_Post'] / 1_000_000
        print(f"{hub_type:<15} {int(row['Airport_Count']):<6} {pre_millions:<12.1f} {post_millions:<13.1f} "
              f"{row['Aggregate_Growth']:<11.1f} {row['Average_Growth']:<10.1f}")

    print()

    # Metric 2: Winner/Loser Distribution Analysis
    print("METRIC 2: WINNER/LOSER DISTRIBUTION ANALYSIS")
    print("=" * 60)

    def analyze_winner_distribution(airports):
        """Return (percentage table, count table) of performance categories per hub type."""
        by_hub = airports.groupby('HUB_TYPE')['PERFORMANCE_CATEGORY']

        # Share of each category within a hub type, in percent
        distribution = by_hub.value_counts(normalize=True).unstack(fill_value=0) * 100
        distribution = distribution.round(1)

        # Raw counts for context
        counts = by_hub.value_counts().unstack(fill_value=0)

        return distribution, counts

    winner_distribution, winner_counts = analyze_winner_distribution(meaningful_airports)

    print("WINNER/LOSER DISTRIBUTION BY HUB TYPE:")
    print("-" * 60)
    print(f"{'Hub Type':<15} {'Total':<6} {'Winners':<8} {'Stable':<7} {'Losers':<7} {'Winner %':<9}")
    print("-" * 60)

    for hub_type in winner_distribution.index:
        total = winner_counts.loc[hub_type].sum()
        winners = winner_counts.loc[hub_type, 'Winner'] if 'Winner' in winner_counts.columns else 0
        stable = winner_counts.loc[hub_type, 'Stable'] if 'Stable' in winner_counts.columns else 0
        losers = winner_counts.loc[hub_type, 'Loser'] if 'Loser' in winner_counts.columns else 0
        winner_pct = winner_distribution.loc[hub_type, 'Winner'] if 'Winner' in winner_distribution.columns else 0

        print(f"{hub_type:<15} {total:<6} {winners:<8} {stable:<7} {losers:<7} {winner_pct:<9.1f}")

    print()

    # Metric 3: Statistical Significance Testing
    print("METRIC 3: STATISTICAL SIGNIFICANCE TESTING")
    print("=" * 60)

    # Test 1: ANOVA across all hub types
    print("TEST 1: Analysis of Variance (ANOVA) - Overall Differences")
    print("-" * 50)

    # Stays None when the ANOVA cannot be run; Criterion 3 checks this sentinel.
    p_value_anova = None

    hub_groups = []
    hub_labels = []

    for hub_type in meaningful_airports['HUB_TYPE'].unique():
        if pd.notna(hub_type):
            group_data = meaningful_airports[meaningful_airports['HUB_TYPE'] == hub_type]['CHANGE_PERCENT'].dropna()
            if len(group_data) >= 3:  # Minimum sample size
                hub_groups.append(group_data)
                hub_labels.append(hub_type)

    if len(hub_groups) >= 2:
        f_statistic, p_value_anova = f_oneway(*hub_groups)
        print(f"F-statistic: {f_statistic:.4f}")
        print(f"p-value: {p_value_anova:.6f}")
        print(f"Degrees of freedom: Between groups = {len(hub_groups) - 1}, Within groups = {sum(len(g) for g in hub_groups) - len(hub_groups)}")

        if p_value_anova < 0.05:
            print("Statistical result: Significant differences exist between hub types")
        else:
            print("Statistical result: No significant differences between hub types")
    else:
        print("Error: Insufficient groups for ANOVA analysis")

    print()

    # Test 2: Pairwise t-tests (Small Hub vs others)
    print("TEST 2: Pairwise Comparisons - Small Hub vs Other Categories")
    print("-" * 62)

    small_hub_data = meaningful_airports[meaningful_airports['HUB_TYPE'] == 'Small Hub']['CHANGE_PERCENT'].dropna()

    if len(small_hub_data) >= 3:
        for hub_type in ['Major Hub', 'Medium Hub', 'Non-Hub Primary']:
            comparison_data = meaningful_airports[meaningful_airports['HUB_TYPE'] == hub_type]['CHANGE_PERCENT'].dropna()

            if len(comparison_data) >= 3:
                # Welch's t-test (equal_var=False): group variances may differ
                t_statistic, p_value = ttest_ind(small_hub_data, comparison_data, equal_var=False)
                mean_diff = small_hub_data.mean() - comparison_data.mean()

                print(f"Small Hub vs {hub_type}:")
                print(f"  Sample sizes: Small Hub = {len(small_hub_data)}, {hub_type} = {len(comparison_data)}")
                print(f"  Mean difference: {mean_diff:.2f} percentage points")
                print(f"  t-statistic: {t_statistic:.4f}")
                print(f"  p-value: {p_value:.6f}")

                if p_value < 0.05:
                    direction = "OUTPERFORM" if mean_diff > 0 else "UNDERPERFORM"
                    print(f"  Result: Small hubs significantly {direction} {hub_type}")
                else:
                    print(f"  Result: No significant difference")
                print()
            else:
                print(f"Small Hub vs {hub_type}: Insufficient sample size")
                print()
    else:
        # Previously this case printed nothing, leaving an empty section.
        print("Pairwise comparisons skipped: insufficient Small Hub sample size")
        print()

    # Test 3: Chi-square test for winner distribution
    print("TEST 3: Chi-square test - Winner distribution across hub types")
    print("-" * 54)

    contingency_table = pd.crosstab(meaningful_airports['HUB_TYPE'], meaningful_airports['PERFORMANCE_CATEGORY'])

    if 'Winner' not in contingency_table.columns:
        print("Chi-square test skipped: no airports classified as winners")
    else:
        # Collapse to Winners vs Non-Winners for simpler analysis
        winners_by_hub = contingency_table['Winner']
        contingency_simple = pd.DataFrame({
            'Winners': winners_by_hub,
            'Non-Winners': contingency_table.sum(axis=1) - winners_by_hub
        })

        print("Contingency Table (Winners vs Non-Winners):")
        print(contingency_simple)
        print()

        # chi2_contingency raises ValueError when an expected frequency is 0,
        # which happens whenever a whole row or column of the table is empty
        # (e.g. every airport is a winner). Skip instead of aborting the run.
        has_empty_margin = bool(
            (contingency_simple.sum(axis=0) == 0).any() or
            (contingency_simple.sum(axis=1) == 0).any()
        )

        if has_empty_margin:
            print("Chi-square test skipped: contingency table has an empty row or column")
        else:
            chi2_stat, p_value_chi2, dof, expected = chi2_contingency(contingency_simple)

            print(f"Chi-square statistic: {chi2_stat:.4f}")
            print(f"p-value: {p_value_chi2:.6f}")
            print(f"Degrees of freedom: {dof}")

            if p_value_chi2 < 0.05:
                print("Result: Winner distribution varies significantly across hub types")
            else:
                print("Result: No significant variation in winner distribution")

    print()

    # Hypothesis Evaluation
    print("HYPOTHESIS EVALUATION")
    print("=" * 60)
    print("HYPOTHESIS: Small hub airports demonstrate superior performance compared to other airport categories")
    print()

    # Success criteria assessment
    criteria_met = 0

    # Criterion 1: Highest aggregate growth rate
    print("CRITERION 1: Highest Aggregate Growth Rate")
    small_hub_growth = growth_by_hub.loc['Small Hub', 'Aggregate_Growth'] if 'Small Hub' in growth_by_hub.index else 0
    max_growth_hub = growth_by_hub['Aggregate_Growth'].idxmax()
    max_growth_value = growth_by_hub['Aggregate_Growth'].max()

    print(f"  Small Hub growth: {small_hub_growth:.1f}%")
    print(f"  Highest growth: {max_growth_value:.1f}% ({max_growth_hub})")

    if max_growth_hub == 'Small Hub':
        print("  CRITERION 1 MET: Small hubs demonstrate highest aggregate growth rate")
        criteria_met += 1
    else:
        print("  CRITERION 1 NOT MET: Small hubs do not have highest aggregate growth rate")

    print()

    # Criterion 2: Highest winner percentage
    print("CRITERION 2: Highest Winner Percentage")
    if 'Winner' in winner_distribution.columns:
        small_hub_winner_rate = winner_distribution.loc['Small Hub', 'Winner'] if 'Small Hub' in winner_distribution.index else 0
        max_winner_hub = winner_distribution['Winner'].idxmax()
        max_winner_rate = winner_distribution['Winner'].max()

        print(f"  Small Hub winner rate: {small_hub_winner_rate:.1f}%")
        print(f"  Highest winner rate: {max_winner_rate:.1f}% ({max_winner_hub})")

        if max_winner_hub == 'Small Hub':
            print("  CRITERION 2 MET: Small hubs have highest winner percentage")
            criteria_met += 1
        else:
            print("  CRITERION 2 NOT MET: Small hubs do not have highest winner percentage")
    else:
        print("  CRITERION 2 CANNOT BE EVALUATED: Winner data unavailable")

    print()

    # Criterion 3: Statistical significance
    # NOTE(review): a significant ANOVA only says that hub types differ from
    # each other; it does not by itself show that small hubs are the best group.
    print("CRITERION 3: Statistical Significance")
    if p_value_anova is not None:
        print(f"  ANOVA p-value: {p_value_anova:.6f}")
    else:
        print("  ANOVA p-value: Not calculated")

    if p_value_anova is not None and p_value_anova < 0.05:
        print("  CRITERION 3 MET: Statistically significant differences exist between hub types")
        criteria_met += 1
    else:
        print("  CRITERION 3 NOT MET: No statistically significant differences found")

    print()

    # Final verdict
    print("FINAL VERDICT:")
    print("=" * 40)
    print(f"Criteria met: {criteria_met}/3")

    if criteria_met >= 2:
        print("HYPOTHESIS SUPPORTED: Statistical analysis reveals that small hubs demonstrate superior performance")
        print("Evidence supports the 'Small Hub Revolution' phenomenon")
    elif criteria_met == 1:
        print("HYPOTHESIS PARTIALLY SUPPORTED: Mixed evidence for small hub outperformance")
        print("Some indicators support the hypothesis, but conclusive evidence is limited")
    else:
        print("HYPOTHESIS NOT SUPPORTED: Insufficient evidence for small hub outperformance")
        print("The data does not support the 'Small Hub Revolution' hypothesis")

    print()

    # Additional Analysis: Top Performers
    print("ADDITIONAL INSIGHTS: TOP PERFORMERS BY HUB TYPE")
    print("=" * 60)

    for hub_type in ['Major Hub', 'Medium Hub', 'Small Hub', 'Non-Hub Primary']:
        if hub_type in meaningful_airports['HUB_TYPE'].values:
            top_performers = meaningful_airports[meaningful_airports['HUB_TYPE'] == hub_type].nlargest(5, 'CHANGE_PERCENT')

            print(f"{hub_type.upper()} - Top 5 Performers:")
            print("-" * 40)
            for i, (_, airport) in enumerate(top_performers.iterrows(), 1):
                print(f"{i}. {airport['AIRPORT']}: {airport['CHANGE_PERCENT']:+.1f}%")
            print()

    # Save results for further analysis. Saving is best-effort: a failure is
    # reported but must not discard the analysis that was just computed.
    try:
        meaningful_airports.to_csv(f'{data_dir}/hypothesis1_airport_analysis.csv', index=False)
        growth_by_hub.to_csv(f'{data_dir}/hypothesis1_growth_by_hub.csv')
        winner_distribution.to_csv(f'{data_dir}/hypothesis1_winner_distribution.csv')
        winner_counts.to_csv(f'{data_dir}/hypothesis1_winner_counts.csv')
        print("Results saved for subsequent analysis:")
        print("  - hypothesis1_airport_analysis.csv")
        print("  - hypothesis1_growth_by_hub.csv")
        print("  - hypothesis1_winner_distribution.csv")
        print("  - hypothesis1_winner_counts.csv")
    except Exception as e:
        print(f"Note: Results not saved ({e})")

    print()
    print("=== ANALYSIS COMPLETE ===")

    return {
        'meaningful_airports': meaningful_airports,
        'growth_by_hub': growth_by_hub,
        'winner_distribution': winner_distribution,
        'winner_counts': winner_counts,
        'criteria_met': criteria_met
    }

# Entry point: run the analysis only when this code is executed directly
# (not when it is imported as a module). The dictionary returned by main()
# is kept in `results`.
if __name__ == "__main__":
    results = main()

=== HYPOTHESIS 1: SMALL HUB REVOLUTION ANALYSIS ===
Testing: Small hub airports demonstrate superior performance compared to other airport categories

Data Loading and Initial Processing...
Loaded quarterly aggregated data: 371,481 rows
Data source: quarterly
Columns available: ['YEAR', 'QUARTER', 'ORIGIN', 'DEST', 'PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'ORIGIN_STATE_ABR', 'DEST_STATE_ABR']
Years in data: [np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]

Defining Analysis Time Periods...
Pre-COVID baseline: [2018, 2019]
Post-COVID recovery: [2023, 2024]
Pre-COVID records: 103,640
Post-COVID records: 112,700

Implementing FAA-Compliant Airport Classification System...
Calculating passenger totals by airport...
Total airports in analysis: 419
Total pre-COVID passengers (baseline): 1,550,135,650.0

METRIC 1: GROWTH RATE ANALYSIS
GROWTH PERFORMANCE BY HUB TYPE:
----