In [1]:
import pandas as pd
import numpy as np

# Raw on-time performance extract. FlightDate is parsed to datetime at load
# time; low_memory=False makes pandas parse the file in one pass so column
# dtypes are inferred consistently instead of chunk by chunk.
df_raw = pd.read_csv(
    "On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2025_1.csv",
    low_memory=False,
    parse_dates=["FlightDate"],
)

# Cleaned counterpart of the raw extract (presumably written by an earlier
# sanitization step -- confirm against the pipeline that produces it).
df_final = pd.read_csv("jan_2025_sanitized.csv")
def professional_data_profile(raw: pd.DataFrame, final: pd.DataFrame) -> None:
    """Print a four-section profile comparing the raw and cleaned tables.

    Sections written to stdout:
      1. Row counts of both frames, the retention rate and the row difference.
      2. The ten columns of ``raw`` with the most null values.
      3. Mean of ``DepDel15`` (as a percentage) for each frame.
      4. Distinct counts of ``Reporting_Airline``, ``Origin`` and
         ``Tail_Number`` in ``final``.

    Args:
        raw: DataFrame holding the unfiltered observations.
        final: DataFrame holding the cleaned observations; must contain the
            three columns used in section 4.

    Raises:
        KeyError: If ``final`` lacks one of the section-4 columns.
    """
    print("=== 1. DATA VOLUME & DATA LOSS ===")
    raw_count = len(raw)
    final_count = len(final)
    print(f"Initial Raw Observations: {raw_count:,}")
    print(f"Final Cleaned Observations: {final_count:,}")
    if raw_count:
        print(f"Data Retention Rate: {(final_count/raw_count)*100:.2f}%")
    else:
        # An empty raw frame would otherwise raise ZeroDivisionError.
        print("Data Retention Rate: n/a (raw dataset is empty)")
    print(f"Records Removed (Cancellations/Missing Logic): {raw_count - final_count:,}")

    print("\n=== 2. RAW FEATURE COMPLETENESS (Top 10) ===")
    # Null count per column, largest first, i.e. the least complete columns.
    missing = raw.isnull().sum().sort_values(ascending=False).head(10)
    print(missing)

    print("\n=== 3. TARGET CLASS BALANCE (RAW vs. FINAL) ===")
    # DepDel15 is read from BOTH frames, so each one is checked on its own;
    # guarding on `raw` alone raises KeyError when `final` lacks the column.
    for label, frame in (("Raw", raw), ("Final", final)):
        if 'DepDel15' in frame.columns:
            delay_rate = frame['DepDel15'].mean() * 100
            print(f"{label} Delay Rate: {delay_rate:.2f}%")
        else:
            print(f"{label} Delay Rate: n/a (DepDel15 column not present)")

    print("\n=== 4. NETWORK BREADTH ===")
    print(f"Unique Carriers: {final['Reporting_Airline'].nunique()}")
    print(f"Unique Origin Airports: {final['Origin'].nunique()}")
    print(f"Unique Tail Numbers (Airframes): {final['Tail_Number'].nunique():,}")

# Profile the raw extract against its cleaned counterpart; the report goes to stdout.
professional_data_profile(raw=df_raw, final=df_final)

=== 1. DATA VOLUME & DATA LOSS ===
Initial Raw Observations: 539,747
Final Cleaned Observations: 522,224
Data Retention Rate: 96.75%
Records Removed (Cancellations/Missing Logic): 17,523

=== 2. RAW FEATURE COMPLETENESS (Top 10) ===
Div5WheelsOn        539747
Div5TotalGTime      539747
Div5LongestGTime    539747
Div5WheelsOff       539747
Div5TailNum         539747
Div4TailNum         539747
Div4WheelsOff       539747
Div4LongestGTime    539747
Div4TotalGTime      539747
Div4WheelsOn        539747
dtype: int64

=== 3. TARGET CLASS BALANCE (RAW vs. FINAL) ===
Raw Delay Rate: 18.26%
Final Delay Rate: 18.20%

=== 4. NETWORK BREADTH ===
Unique Carriers: 14
Unique Origin Airports: 329
Unique Tail Numbers (Airframes): 5,609
