In [1]:
import os
import pickle
import time
import warnings

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Show the pandas / scikit-learn versions this notebook runs against.
(pd.__version__, sklearn.__version__)

('2.1.1', '1.3.1')

In [2]:
# Data directory selection.
# Previously the /scratch path was assigned and then unconditionally
# overwritten by the local path, so switching environments required editing
# this cell. Now the /scratch path is used whenever it exists on this machine
# (presumably the class cluster - confirm), otherwise the repo-relative
# folder used for local dev.
CLUSTER_SCRATCH_DIR = "/scratch/siads696f23_class_root/siads696f23_class/psollars"
LOCAL_DATA_DIR = "./../data"

SCRATCH_DIR = (
    CLUSTER_SCRATCH_DIR if os.path.isdir(CLUSTER_SCRATCH_DIR) else LOCAL_DATA_DIR
)

In [4]:
# Load the 2019 flights table from the data directory and report how long the
# read took. perf_counter() is a monotonic clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
parquet_path = f"{SCRATCH_DIR}/top_airline_airport_2019.parquet"

start_time = time.perf_counter()
df_top_airline_airport_2019 = pd.read_parquet(parquet_path)
end_time = time.perf_counter()

print(f"Elapsed time: {(end_time - start_time):.4f} seconds")

# df_top_airline_airport_2019
# 2769026 rows × 109 columns

Elapsed time: 14.8701 seconds


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginCityMarketID,...,faa_registration_origin,Origin_DISPLAY_AIRPORT_NAME,Origin_DISPLAY_AIRPORT_CITY_NAME_FULL,Origin_LATITUDE,Origin_LONGITUDE,Dest_DISPLAY_AIRPORT_NAME,Dest_DISPLAY_AIRPORT_CITY_NAME_FULL,Dest_LATITUDE,Dest_LONGITUDE,delayed
24798424,2019,1,1,11,5,2019-01-11 00:00:00,AS,364VA,1592,34262,...,2022.0,Palm Springs International,"Palm Springs, CA",33.829722,-116.506667,San Francisco International,"San Francisco, CA",37.618889,-122.375556,False
24798426,2019,1,1,11,5,2019-01-11 00:00:00,AS,628VA,1595,34262,...,2023.0,Palm Springs International,"Palm Springs, CA",33.829722,-116.506667,San Francisco International,"San Francisco, CA",37.618889,-122.375556,True
24798428,2019,1,1,11,5,2019-01-11 00:00:00,AS,844VA,1621,30423,...,2023.0,Austin - Bergstrom International,"Austin, TX",30.194444,-97.670000,Seattle/Tacoma International,"Seattle, WA",47.450000,-122.311667,False
24798429,2019,1,1,11,5,2019-01-11 00:00:00,AS,844VA,1621,30423,...,2023.0,Robert Mueller Municipal,"Austin, TX",30.298056,-97.701389,Seattle/Tacoma International,"Seattle, WA",47.450000,-122.311667,False
24798430,2019,1,1,11,5,2019-01-11 00:00:00,AS,839VA,1625,32211,...,2023.0,Harry Reid International,"Las Vegas, NV",36.080000,-115.152222,Portland International,"Portland, OR",45.588611,-122.596944,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32288689,2019,4,12,1,7,2019-12-01 00:00:00,DL,680DA,825,30397,...,2023.0,Hartsfield-Jackson Atlanta International,"Atlanta, GA",33.636667,-84.427778,Norman Y. Mineta San Jose International,"San Jose, CA",37.363056,-121.928611,True
32288690,2019,4,12,1,7,2019-12-01 00:00:00,DL,680DA,825,32457,...,2023.0,Norman Y. Mineta San Jose International,"San Jose, CA",37.363056,-121.928611,Hartsfield-Jackson Atlanta International,"Atlanta, GA",33.636667,-84.427778,False
32288692,2019,4,12,1,7,2019-12-01 00:00:00,DL,678DL,827,32467,...,2023.0,Fort Lauderdale-Hollywood International,"Fort Lauderdale, FL",26.071667,-80.149722,Hartsfield-Jackson Atlanta International,"Atlanta, GA",33.636667,-84.427778,False
32288693,2019,4,12,1,7,2019-12-01 00:00:00,DL,865DA,828,31650,...,2023.0,Minneapolis-St Paul International,"Minneapolis, MN",44.881944,-93.221667,Hartsfield-Jackson Atlanta International,"Atlanta, GA",33.636667,-84.427778,True


In [4]:
# Feature selection carried over from "feature_selection_2019_UA.ipynb".
# Each list is assembled from small themed groups; the resulting column order
# is the order used when the working frame is sliced.

df_top_airline_airport_2019_num_cols = (
    # Calendar position of the flight
    ["Quarter", "Month", "DayofMonth", "DayOfWeek"]
    # Scheduled (CRS) times and route length
    + ["CRSDepTime", "CRSArrTime", "CRSElapsedTime", "Distance", "DistanceGroup"]
    # Aircraft attributes.
    # "HORSEPOWER" and "THRUST" are intentionally left out: most values are 0.
    # "NO-SEATS" is later chunked into groups of 50.
    + ["YEAR MFR", "NO-SEATS"]
    # Airport coordinates
    + ["Origin_LATITUDE", "Origin_LONGITUDE", "Dest_LATITUDE", "Dest_LONGITUDE"]
)

df_top_airline_airport_2019_cat_cols = (
    # Our selected airlines: ["WN", "DL", "AA", "B6", "UA", "AS"]
    ["Reporting_Airline", "Tail_Number"]
    # Our top 10 airports for these airlines (lat/lon is the model feature):
    # ["ATL", "LAX", "DFW", "DEN", "ORD", "PHX", "SEA", "SFO", "LAS", "BOS"]
    + ["Origin", "Dest"]
    + [
        "TYPE REGISTRANT",  # 3 or 7 - Corporations and LLCs
        "NAME",
        "CERTIFICATION",  # 1T, standard transport
        "TYPE ENGINE",  # 4 or 5
        "STATUS CODE",
    ]
    + [
        "MFR",  # Engine manufacturer
        "MODEL",  # Engine model
        "TYPE",  # 4 or 5 - Fixed wing single or multi engine
        "MFR_aircraft",
        "MODEL_aircraft",
        # 0 only, we don't care about kit built aircraft, these will be filtered out
        "BUILD-CERT-IND",
    ]
    # Deliberately excluded display-name columns:
    #   "Origin_DISPLAY_AIRPORT_NAME", "Origin_DISPLAY_AIRPORT_CITY_NAME_FULL",
    #   "Dest_DISPLAY_AIRPORT_NAME", "Dest_DISPLAY_AIRPORT_CITY_NAME_FULL"
)

# Delay indicator flags followed by the per-cause delay columns.
delay_cols = ["DepDel15", "ArrDel15"] + [
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
]

In [5]:
# Build the working frame from the selected numeric and categorical features.
# The delay columns are added as well so we can filter on them later.
consolidated_df = (
    df_top_airline_airport_2019[
        df_top_airline_airport_2019_num_cols
        + df_top_airline_airport_2019_cat_cols
        + delay_cols
    ]
    .copy()
    .reset_index(drop=True)
)

# Identify problematic columns
# for col in consolidated_df.columns:
#     try:
#         consolidated_df[col].astype(float)
#     except ValueError:
#         print(f"Column {col} contains non-convertible values.")

# Code-like columns that must be numeric for the filters below.
cols_to_convert = [
    "YEAR MFR",
    "TYPE REGISTRANT",
    "TYPE",
    "TYPE ENGINE",
    "BUILD-CERT-IND",
]

for col in cols_to_convert:
    try:
        # String columns: trim whitespace, then parse. Unparseable values
        # become NaN (errors="coerce") and are removed by the dropna below.
        consolidated_df[col] = pd.to_numeric(
            consolidated_df[col].str.strip(), errors="coerce"
        )
    except AttributeError:
        # The .str accessor raises AttributeError on non-string columns.
        # Only that case is handled here; any other error should surface
        # instead of being swallowed by a bare `except`.
        print(col, "isn't a string")
        consolidated_df[col] = consolidated_df[col].astype("float")

# Clean up some of the weird, non-commercial aircraft registrations
consolidated_df = consolidated_df[
    consolidated_df["TYPE REGISTRANT"].isin([3, 7])  # 3 or 7 - Corporations and LLCs
    & consolidated_df["TYPE"].isin([4, 5])  # 4 or 5 - Fixed wing single or multi engine
    & consolidated_df["TYPE ENGINE"].isin([4, 5])  # 4 or 5
    & consolidated_df["BUILD-CERT-IND"].eq(0)  # 0 only (no kit built aircraft)
]

# Drop any features that have all identical values
nunique = consolidated_df.nunique()
consolidated_df = consolidated_df.drop(nunique[nunique.eq(1)].index, axis=1)

# Drop NaNs
consolidated_df = consolidated_df.dropna(how="any", axis="index")

print(f"Removed {len(df_top_airline_airport_2019) - len(consolidated_df)} rows")

# Cast the whole-number columns to int now that NaNs are gone. Only columns
# that survived the constant-column drop are cast: the filters above can leave
# a code column with a single value, in which case it no longer exists and an
# unconditional cast would raise KeyError.
int_cols = ["YEAR MFR", "NO-SEATS", "TYPE REGISTRANT", "TYPE ENGINE", "TYPE"]
consolidated_df = consolidated_df.astype(
    {col: int for col in int_cols if col in consolidated_df.columns}
)

consolidated_df
# 2626181 rows × 34 columns

BUILD-CERT-IND isn't a string
Removed 142845 rows


Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,DistanceGroup,YEAR MFR,...,TYPE,MFR_aircraft,MODEL_aircraft,DepDel15,ArrDel15,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1,1,11,5,0940,1110,90,421,2,2016,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
1,1,1,11,5,1810,1945,95,421,2,2007,...,5,AIRBUS,A320-214,1,1,0,0,0,0,29
2,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
3,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
4,1,1,11,5,1420,1640,140,763,4,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769021,4,12,1,7,1924,2144,320,2116,9,1992,...,5,BOEING,757-232,1,1,17,0,1,0,0
2769022,4,12,1,7,2245,0611,266,2116,9,1992,...,5,BOEING,757-232,0,0,0,0,0,0,0
2769023,4,12,1,7,0630,0823,113,581,3,1992,...,5,BOEING,757-232,0,0,0,0,0,0,0
2769024,4,12,1,7,2237,0204,147,907,4,1999,...,5,BOEING,777-232,1,1,62,0,1,0,0


In [6]:
# Chunk seat counts into groups of 50, labelled by the lower edge of the group
# (e.g. 150-199 -> 150). Vectorized floor division gives the same result as the
# previous per-row Python lambda without a Python-level loop over every row.
SEAT_BIN_WIDTH = 50

consolidated_df["NO-SEATS"] = (
    consolidated_df["NO-SEATS"] // SEAT_BIN_WIDTH
) * SEAT_BIN_WIDTH

consolidated_df.value_counts("NO-SEATS")

NO-SEATS
150    1016593
100     923547
200     346340
350     204845
0        65285
250      36220
400      20819
300      10696
450        948
550        888
Name: count, dtype: int64

In [7]:
# Registrant names that keep their own category; every other registrant is
# folded into a single "other" bucket.
keep_NAME = [
    "SOUTHWEST AIRLINES CO",
    "DELTA AIR LINES INC",
    "UNITED AIRLINES INC",
    "AMERICAN AIRLINES INC",
    "BANK OF UTAH TRUSTEE",
    "WELLS FARGO TRUST CO NA TRUSTEE",
    "WILMINGTON TRUST CO TRUSTEE",
    "ALASKA AIRLINES INC",
    "UMB BANK NA TRUSTEE",
    "JETBLUE AIRWAYS CORP",
    "U S BANK NA TRUSTEE",
]

# Trim surrounding whitespace before matching against the keep-list.
registrant_names = consolidated_df["NAME"].str.strip()

consolidated_df["NAME"] = registrant_names.where(
    registrant_names.isin(keep_NAME), "other"
)

consolidated_df.value_counts("NAME")

NAME
SOUTHWEST AIRLINES CO              491928
DELTA AIR LINES INC                450301
UNITED AIRLINES INC                412838
AMERICAN AIRLINES INC              327693
BANK OF UTAH TRUSTEE               211526
WELLS FARGO TRUST CO NA TRUSTEE    135171
other                              127532
ALASKA AIRLINES INC                122346
WILMINGTON TRUST CO TRUSTEE        122169
UMB BANK NA TRUSTEE                109401
JETBLUE AIRWAYS CORP                87278
U S BANK NA TRUSTEE                 27998
Name: count, dtype: int64

In [8]:
# CERTIFICATION value counts noted by the author (whitespace-trimmed labels):
# 1T            2627937
# 1N               4388
# 41               3802
# 1                3338
# 410              2487
# 1A               2155
# 40               1788
# 4103             1475
# 1C                 64
# 1NU                12

# 1T is standard transport; everything else becomes "other".
keep_CERTIFICATION = ["1T"]

certification_codes = consolidated_df["CERTIFICATION"].str.strip()

consolidated_df["CERTIFICATION"] = certification_codes.where(
    certification_codes.isin(keep_CERTIFICATION), "other"
)

consolidated_df.value_counts("CERTIFICATION")

CERTIFICATION
1T       2606672
other      19509
Name: count, dtype: int64

In [9]:
# Aircraft manufacturer labels and counts noted by the author:
# BOEING                            1631691
# AIRBUS                             443497
# AIRBUS INDUSTRIE                   282916
# MCDONNELL DOUGLAS AIRCRAFT CO      102934
# EMBRAER                             65131
# MCDONNELL DOUGLAS                   60941
# MCDONNELL DOUGLAS CORPORATION       25814
# AIRBUS SAS                          24686
# C SERIES AIRCRAFT LTD PTNRSP         5818
# AIRBUS CANADA LTD PTNRSP             3864
# GULFSTREAM AEROSPACE                   64
# PILATUS AIRCRAFT LTD                   64
# DASSAULT                               14
# CESSNA                                 12

keep_MFR_aircraft = ["BOEING", "AIRBUS", "MCDONNELL DOUGLAS", "EMBRAER"]

# Collapse spelling variants onto one canonical label each; the patterns are
# applied one after another, in this order.
aircraft_mfr_patterns = [
    (r".*AIRBUS.*", "AIRBUS"),
    (r".*EMBRAER.*", "EMBRAER"),
    (r".*MCDONNELL DOUGLAS.*", "MCDONNELL DOUGLAS"),
]

aircraft_mfr = consolidated_df["MFR_aircraft"]
for pattern, label in aircraft_mfr_patterns:
    aircraft_mfr = aircraft_mfr.replace(pattern, label, regex=True)

# Trim whitespace on the remaining labels, then bucket everything outside
# the keep-list as "other".
aircraft_mfr = aircraft_mfr.str.strip()

consolidated_df["MFR_aircraft"] = aircraft_mfr.where(
    aircraft_mfr.isin(keep_MFR_aircraft), "other"
)

consolidated_df.value_counts("MFR_aircraft")

MFR_aircraft
BOEING               1615002
AIRBUS                754907
MCDONNELL DOUGLAS     185169
EMBRAER                65131
other                   5972
Name: count, dtype: int64

In [10]:
# STATUS CODE value counts noted by the author:
# V     2620654
# 25      21180
# 26       3756
# 28       1270
# 27        586

# "V" is a valid registration; the others are on the renewal list and are
# grouped together as "other".
keep_STATUS_CODE = ["V"]

status_codes = consolidated_df["STATUS CODE"].str.strip()

consolidated_df["STATUS CODE"] = status_codes.where(
    status_codes.isin(keep_STATUS_CODE), "other"
)

consolidated_df.value_counts("STATUS CODE")

STATUS CODE
V        2599389
other      26792
Name: count, dtype: int64

In [11]:
# MFR (engine manufacturer) labels and counts noted by the author:
# CFM INTL      1136507
# CFM INTL.      550122
# IAE            489794
# P & W          260097
# GE              93741
# ROLLS-ROYC      93724
# P&W             16997
# ROLLS DEUT       5441
# IVCHENKO          923
# WILLIAMS           64
# HONEYWELL          14
# P&W CANADA         12
# ROLLS-ROY          10

keep_MFR = ["CFM", "IAE", "P & W", "ROLLS-ROYCE", "GE"]

# Collapse spelling variants onto one canonical label each; the patterns are
# applied one after another, in this order.
engine_mfr_patterns = [
    (r".*CFM.*", "CFM"),
    (r".*ROLLS.*", "ROLLS-ROYCE"),
    (r".*P&W.*", "P & W"),
]

engine_mfr = consolidated_df["MFR"]
for pattern, label in engine_mfr_patterns:
    engine_mfr = engine_mfr.replace(pattern, label, regex=True)

# Trim whitespace on the remaining labels, then bucket everything outside
# the keep-list as "other".
engine_mfr = engine_mfr.str.strip()

consolidated_df["MFR"] = engine_mfr.where(engine_mfr.isin(keep_MFR), "other")

consolidated_df.value_counts("MFR")

MFR
CFM            1677275
IAE             477939
P & W           277106
ROLLS-ROYCE      99119
GE               93741
other             1001
Name: count, dtype: int64

In [12]:
# Keep engine models that occur in more than 10,000 rows; bucket the rest as
# "other".
#
# Labels are whitespace-trimmed BEFORE counting. Previously the counts were
# taken on the raw labels and trimming happened afterwards, so the same model
# with different padding was thresholded as separate categories.
engine_models = consolidated_df["MODEL"].str.strip()

MODEL_counts = engine_models.value_counts()
MODEL_counts = MODEL_counts[MODEL_counts.gt(10_000)]

consolidated_df["MODEL"] = engine_models.where(
    engine_models.isin(MODEL_counts.index), "other"
)

consolidated_df.value_counts("MODEL")

MODEL
CFM56 SERIES     558219
CFM56-7B27E      287030
V2500SERIES      231913
V2533-A5         201963
CFM56-7B24E      164024
JT8D SERIES      155009
CFM56-7B24       133859
other            110650
CFM56-7B27E/F     89425
CFM56-7B24/3      79288
PW2037            74430
CFM56-5B3/3       67656
CF34-10E6         64128
RB-211 SERIES     42833
CFM56-7B26/3      38740
BR 700 SERIES     33598
CFM56-5B4         29326
CFM56-7B26E       27994
CFM56-7B22        27922
CFM56-7B27        25980
CFM56-7B26        23940
CFM56-5B2         23446
PW2040            18748
V2527-A5          18635
PW4000 SER        16997
CFM56-7B27E/B     15711
CFM56-5B4/P       14455
CFM56-5B6         14353
CFM56-5B4/3       12778
V2524-A5          12235
V2528-D5          10896
Name: count, dtype: int64

In [13]:
# Keep aircraft models that occur in more than 5,000 rows; bucket the rest as
# "other".
#
# Labels are whitespace-trimmed BEFORE counting. Previously the counts were
# taken on the raw labels and trimming happened afterwards, so the same model
# with different padding was thresholded as separate categories.
aircraft_models = consolidated_df["MODEL_aircraft"].str.strip()

MODEL_aircraft_counts = aircraft_models.value_counts()
MODEL_aircraft_counts = MODEL_aircraft_counts[MODEL_aircraft_counts.gt(5_000)]

consolidated_df["MODEL_aircraft"] = aircraft_models.where(
    aircraft_models.isin(MODEL_aircraft_counts.index), "other"
)

consolidated_df.value_counts("MODEL_aircraft")

MODEL_aircraft
737-7H4            288419
737-823            256235
A321-231           203180
A320-232           146493
MD-88              127756
737-924ER          124121
737-800             97808
A321-211            93800
737-8H4             92056
737-824             91877
737-900ER           84980
other               67061
ERJ 190-100 IGW     65131
A320-214            58917
737-932ER           57510
A319-131            50760
737-832             47485
737-990ER           45947
A319-132            42605
A319-112            41305
717-200             40014
757-232             38971
MD-90-30            35792
737-890             35734
A320-212            27029
DC-9-83(MD-83)      26345
A319-114            25855
737-7BD             24710
A321-213            23446
737-76N             20697
757-224             19515
757-251             19188
737-724             16514
737-990             14120
A319-115            13759
757-351             13423
777-222             11895
A320-211            118

In [14]:
# The aircraft-registration column names are confusing, so map each one to a
# descriptive snake_case feature name. Columns not listed keep their names.
readable_column_names = {
    # airframe
    "YEAR MFR": "year_of_manufacture",
    "NO-SEATS": "num_seats",
    "TYPE": "aircraft_type",
    "MFR_aircraft": "aircraft_manufacturer",
    "MODEL_aircraft": "aircraft_model",
    # registration
    "TYPE REGISTRANT": "company_type",
    "NAME": "registrant",
    "CERTIFICATION": "aircraft_usage",
    "STATUS CODE": "registration_status",
    # engine
    "TYPE ENGINE": "engine_type",
    "MFR": "engine_manufacturer",
    "MODEL": "engine_model",
}

consolidated_df = consolidated_df.rename(columns=readable_column_names)

In [15]:
# Display the frame to check the renamed feature columns.
consolidated_df


Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,DistanceGroup,year_of_manufacture,...,aircraft_type,aircraft_manufacturer,aircraft_model,DepDel15,ArrDel15,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1,1,11,5,0940,1110,90,421,2,2016,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
1,1,1,11,5,1810,1945,95,421,2,2007,...,5,AIRBUS,A320-214,1,1,0,0,0,0,29
2,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
3,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
4,1,1,11,5,1420,1640,140,763,4,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769021,4,12,1,7,1924,2144,320,2116,9,1992,...,5,BOEING,757-232,1,1,17,0,1,0,0
2769022,4,12,1,7,2245,0611,266,2116,9,1992,...,5,BOEING,757-232,0,0,0,0,0,0,0
2769023,4,12,1,7,0630,0823,113,581,3,1992,...,5,BOEING,757-232,0,0,0,0,0,0,0
2769024,4,12,1,7,2237,0204,147,907,4,1999,...,5,BOEING,other,1,1,62,0,1,0,0


In [16]:
# Checkpoint: write the consolidated feature table to parquet in SCRATCH_DIR.

consolidated_df.to_parquet(f"{SCRATCH_DIR}/top_airline_airport_consolidated_features_2019.parquet")

# To resume from the checkpoint, uncomment the line below (the path must match
# the file written above).
# consolidated_df = pd.read_parquet(f"{SCRATCH_DIR}/top_airline_airport_consolidated_features_2019.parquet")