In [1]:
import os
import pickle
import time
import warnings

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Display the pandas and scikit-learn versions used for this run.
(pd.__version__, sklearn.__version__)

('2.1.1', '1.3.1')

In [2]:
# Root directory that holds the input parquet file and receives the pickled
# checkpoints written at the end of the notebook.
#
# Defaults to the local development data folder. On the cluster, export
# SCRATCH_DIR before starting the kernel, e.g.
#   SCRATCH_DIR=/scratch/siads696f23_class_root/siads696f23_class/psollars
# so that switching environments no longer requires editing this cell.
SCRATCH_DIR = os.environ.get("SCRATCH_DIR", "./../data")

In [3]:
# Load the consolidated flights table from SCRATCH_DIR and preview the
# first five rows.
consolidated_df = pd.read_parquet(
    f"{SCRATCH_DIR}/19_top_airline_airport_consolidated.parquet"
)

consolidated_df.head(n=5)

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,DistanceGroup,year_of_manufacture,...,aircraft_type,aircraft_manufacturer,aircraft_model,DepDel15,ArrDel15,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
24798424,1,1,11,5,940,1110,90,421,2,2016,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
24798426,1,1,11,5,1810,1945,95,421,2,2007,...,5,AIRBUS,A320-214,1,1,0,0,0,0,29
24798428,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
24798429,1,1,11,5,1915,2150,275,1770,8,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0
24798430,1,1,11,5,1420,1640,140,763,4,2011,...,5,AIRBUS,A320-214,0,0,0,0,0,0,0


In [4]:
# Show every column name; the head() preview elides the middle columns.
consolidated_df.columns

Index(['Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
       'CRSArrTime', 'CRSElapsedTime', 'Distance', 'DistanceGroup',
       'year_of_manufacture', 'num_seats', 'Origin_LATITUDE',
       'Origin_LONGITUDE', 'Dest_LATITUDE', 'Dest_LONGITUDE',
       'Reporting_Airline', 'Tail_Number', 'company_type', 'registrant',
       'aircraft_usage', 'engine_type', 'registration_status',
       'engine_manufacturer', 'engine_model', 'aircraft_type',
       'aircraft_manufacturer', 'aircraft_model', 'DepDel15', 'ArrDel15',
       'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay'],
      dtype='object')

In [5]:
# Numeric predictors (standardised before modelling).
num_cols = [
    # Calendar position of the flight
    "Quarter", "Month", "DayofMonth", "DayOfWeek",
    # Scheduled times and route length
    "CRSDepTime", "CRSArrTime", "CRSElapsedTime", "Distance", "DistanceGroup",
    # Aircraft attributes
    "year_of_manufacture", "num_seats",
    # Origin / destination coordinates
    "Origin_LATITUDE", "Origin_LONGITUDE", "Dest_LATITUDE", "Dest_LONGITUDE",
]

# Categorical predictors (ordinal-encoded before modelling).
cat_cols = [
    # Carrier and individual airframe
    "Reporting_Airline", "Tail_Number",
    # Registration details
    "company_type", "registrant", "aircraft_usage", "registration_status",
    # Engine details
    "engine_type", "engine_manufacturer", "engine_model",
    # Aircraft details
    "aircraft_type", "aircraft_manufacturer", "aircraft_model",
]
# Restore the original ordering of the categorical columns, which determines
# the column order of the encoded feature matrix.
cat_cols = [
    cat_cols[0], cat_cols[1], cat_cols[2], cat_cols[3], cat_cols[4],
    cat_cols[6], cat_cols[5], cat_cols[7], cat_cols[8],
    cat_cols[9], cat_cols[10], cat_cols[11],
]

# Delay indicator flags followed by the per-cause delay columns.
delay_cols = [
    "DepDel15", "ArrDel15",
    "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
]

In [6]:
# Keep the predictor columns plus the two 15-minute delay flags, working on an
# independent copy with a fresh 0..n-1 index.
selected_cols = num_cols + cat_cols + ["DepDel15", "ArrDel15"]
delay_df = consolidated_df[selected_cols].copy().reset_index(drop=True)

# A flight counts as "delayed" when either the departure or the arrival flag
# equals 1; the two source flags are then dropped so only the label remains.
departed_late = delay_df["DepDel15"] == 1
arrived_late = delay_df["ArrDel15"] == 1
delay_df["delayed"] = departed_late | arrived_late
delay_df = delay_df.drop(columns=["DepDel15", "ArrDel15"])

delay_df.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,DistanceGroup,year_of_manufacture,...,registrant,aircraft_usage,engine_type,registration_status,engine_manufacturer,engine_model,aircraft_type,aircraft_manufacturer,aircraft_model,delayed
0,1,1,11,5,940,1110,90,421,2,2016,...,other,1T,5,V,CFM,CFM56-5B4/3,5,AIRBUS,A320-214,False
1,1,1,11,5,1810,1945,95,421,2,2007,...,WELLS FARGO TRUST CO NA TRUSTEE,1T,5,V,CFM,CFM56-5B4/P,5,AIRBUS,A320-214,True
2,1,1,11,5,1915,2150,275,1770,8,2011,...,BANK OF UTAH TRUSTEE,1T,5,V,CFM,CFM56-5B4,5,AIRBUS,A320-214,False
3,1,1,11,5,1915,2150,275,1770,8,2011,...,BANK OF UTAH TRUSTEE,1T,5,V,CFM,CFM56-5B4,5,AIRBUS,A320-214,False
4,1,1,11,5,1420,1640,140,763,4,2011,...,WELLS FARGO TRUST CO NA TRUSTEE,1T,5,V,CFM,CFM56-5B4,5,AIRBUS,A320-214,False


In [7]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer


start_time = time.time()

# Separate the boolean "delayed" target from the predictor columns.
y = delay_df["delayed"]
X = delay_df.drop(columns=["delayed"])

# Standardise the numeric columns and map each categorical column to integer
# codes. The transformed matrix lists the "num" columns first, then "cat".
# NOTE(review): the scaler and encoder are fit on every row, before the
# train/test split that follows, so held-out rows influence the fitted
# preprocessing — consider fitting on the training split only.
column_steps = [
    ("num", StandardScaler(), num_cols),
    ("cat", OrdinalEncoder(), cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

X_transformed = preprocessor.fit_transform(X)

end_time = time.time()

print(f"Elapsed time: {(end_time - start_time):.4f} seconds")

Elapsed time: 17.1163 seconds


In [8]:
from sklearn.model_selection import train_test_split

start_time = time.time()

# Hold out 20% of the rows for testing. Stratifying on y keeps the
# delayed / not-delayed proportions the same in both splits, and the fixed
# random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

end_time = time.time()

print(f"Elapsed time: {(end_time - start_time):.4f} seconds")

Elapsed time: 1.5973 seconds


In [9]:
from sklearn.linear_model import Ridge


# Baseline ridge fit on the original (un-resampled) training split. The same
# model is refit on the SMOTE-resampled data in a later cell for comparison.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Feature labels in the column order of the transformed matrix:
# numeric columns first, then the ordinal-encoded categorical columns.
cat_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(
    cat_cols
)
feature_names = [*num_cols, *cat_feature_names]

# Pair every feature with its fitted coefficient.
feature_importance = list(zip(feature_names, ridge.coef_))

# Rank by absolute coefficient, largest magnitude first.
sorted_features = sorted(
    feature_importance,
    key=lambda name_and_coef: abs(name_and_coef[1]),
    reverse=True,
)

sorted_features

# [('CRSElapsedTime', 0.058797395314588476),
#  ('CRSDepTime', 0.054962718107172674),
#  ('Distance', -0.03845101592826966),
#  ('Quarter', -0.038094548416350356),
#  ('registration_status', -0.03543082965616708),
#  ('aircraft_type', 0.03133828402863409),
#  ('Month', 0.027004226802218027),
#  ('CRSArrTime', 0.024341084535749142),
#  ('aircraft_manufacturer', -0.015470553260812498),
#  ('engine_type', 0.011535754258068019),
#  ('year_of_manufacture', -0.01005103714121433),
#  ('DistanceGroup', -0.00999339975708305),
#  ('aircraft_usage', -0.009437157225809015),
#  ('Origin_LATITUDE', 0.007279142710498935),
#  ('Dest_LONGITUDE', 0.005757960807406346),
#  ('company_type', -0.004769433011575718),
#  ('Reporting_Airline', -0.003999530932439397),
#  ('Dest_LATITUDE', 0.0037647388631305684),
#  ('DayofMonth', 0.00358174127006179),
#  ('num_seats', -0.0034141379582565546),
#  ('engine_manufacturer', -0.0029729629081447495),
#  ('Origin_LONGITUDE', 0.002474202892780725),
#  ('DayOfWeek', 0.0011776804474205008),
#  ('registrant', 0.001027923676222743),
#  ('engine_model', 0.0010081569815473365),
#  ('aircraft_model', 3.110818903387603e-05),
#  ('Tail_Number', -1.285832808810039e-06)]

[('CRSElapsedTime', 0.05963607760504255),
 ('CRSDepTime', 0.054715355656140094),
 ('Distance', -0.03836627947445691),
 ('registration_status', -0.038045288837413954),
 ('Quarter', -0.03709220492901056),
 ('aircraft_type', 0.03098632762566204),
 ('Month', 0.025973290330834474),
 ('CRSArrTime', 0.024638340975429773),
 ('aircraft_usage', -0.014724062012299139),
 ('aircraft_manufacturer', -0.014695402979223322),
 ('engine_type', 0.011957675032119341),
 ('DistanceGroup', -0.010921499464425192),
 ('year_of_manufacture', -0.010263257005629524),
 ('Origin_LATITUDE', 0.007254060241894922),
 ('Dest_LONGITUDE', 0.00572832096906162),
 ('company_type', -0.0041815994560149075),
 ('Dest_LATITUDE', 0.0038941946598902307),
 ('Reporting_Airline', -0.003845721679336085),
 ('DayofMonth', 0.003457897360706589),
 ('engine_manufacturer', -0.0031703334553380485),
 ('Origin_LONGITUDE', 0.002604978389422402),
 ('num_seats', -0.002441069843584137),
 ('DayOfWeek', 0.0011776404945832378),
 ('registrant', 0.0010876

In [10]:
from sklearn.linear_model import Lasso


# Baseline lasso fit on the original (un-resampled) training split. The same
# model is refit on the SMOTE-resampled data in a later cell for comparison.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Feature labels in the column order of the transformed matrix:
# numeric columns first, then the ordinal-encoded categorical columns.
cat_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(
    cat_cols
)
feature_names = [*num_cols, *cat_feature_names]

# Positions of the coefficients the L1 penalty did not shrink to exactly zero.
important_features = np.flatnonzero(lasso.coef_)

[feature_names[idx] for idx in important_features]

# ['Tail_Number'] huh?
# NOTE(review): presumably a scale artefact rather than a real signal. Only
# num_cols pass through StandardScaler; the ordinal codes are left unscaled, so
# Tail_Number's integer codes likely span a far wider range than any other
# feature and a tiny coefficient can survive alpha=0.1 while the unit-variance
# features are zeroed. TODO confirm by scaling or dropping Tail_Number.

['Tail_Number']

In [11]:
from imblearn.over_sampling import SMOTE


start_time = time.time()

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours, so
# the ordinal-encoded categorical columns can receive fractional codes that
# correspond to no real category. imbalanced-learn's SMOTENC targets mixed
# numeric/categorical data — consider whether it fits here.
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

end_time = time.time()

print(f"Elapsed time: {(end_time - start_time):.4f} seconds")

# Report the feature-matrix and label shapes before and after resampling.
for label, features, labels in (
    (" Original:", X_train, y_train),
    ("Resampled:", X_res, y_res),
):
    print(label, features.shape, labels.shape)

# Elapsed time: 571.2915 seconds
#  Original: (2100944, 27) (2100944,)
# Resampled: (3210460, 27) (3210460,)

Elapsed time: 777.7924 seconds
 Original: (2100944, 27) (2100944,)
Resampled: (3210262, 27) (3210262,)


In [12]:
from sklearn.linear_model import Ridge


# Ridge fit on the SMOTE-resampled training data, for comparison with the
# earlier fit on the original training split.
ridge = Ridge(alpha=1.0).fit(X_res, y_res)

# Feature labels in the column order of the transformed matrix:
# numeric columns first, then the ordinal-encoded categorical columns.
cat_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(
    cat_cols
)
feature_names = [*num_cols, *cat_feature_names]

# Pair every feature with its fitted coefficient.
feature_importance = list(zip(feature_names, ridge.coef_))

# Rank by absolute coefficient, largest magnitude first.
sorted(
    feature_importance,
    key=lambda name_and_coef: abs(name_and_coef[1]),
    reverse=True,
)

# [('CRSElapsedTime', 0.08981884548913494),
#  ('CRSDepTime', 0.08178813712084468),
#  ('registration_status', -0.059265756180862654),
#  ('Distance', -0.05531908449996627),
#  ('Quarter', -0.053458079295902856),
#  ('aircraft_type', 0.04336089057775277),
#  ('CRSArrTime', 0.03872814419484962),
#  ('Month', 0.03713229810912497),
#  ('DistanceGroup', -0.023790334346930255),
#  ('aircraft_manufacturer', -0.021091675045188323),
#  ('engine_type', 0.016783046752453495),
#  ('year_of_manufacture', -0.01357410077257083),
#  ('aircraft_usage', -0.011003516199439485),
#  ('Origin_LATITUDE', 0.009934255179438333),
#  ('Dest_LONGITUDE', 0.008402504294804342),
#  ('company_type', -0.007959626736081676),
#  ('Reporting_Airline', -0.006806991426982998),
#  ('DayofMonth', 0.006071482005436058),
#  ('Dest_LATITUDE', 0.00421629658131746),
#  ('num_seats', -0.003998122148951938),
#  ('engine_manufacturer', -0.003506668447531717),
#  ('registrant', 0.0015409811822836726),
#  ('engine_model', 0.001453691612936399),
#  ('Origin_LONGITUDE', -0.0014481015377547594),
#  ('DayOfWeek', 0.0014195829747761706),
#  ('aircraft_model', 1.2055987827169126e-05),
#  ('Tail_Number', -2.8825649472524914e-06)]

[('CRSElapsedTime', 0.08994441614498917),
 ('CRSDepTime', 0.08151455045905233),
 ('registration_status', -0.06393330932724607),
 ('Distance', -0.05627265590745997),
 ('Quarter', -0.05457060508996335),
 ('aircraft_type', 0.04239320642416624),
 ('CRSArrTime', 0.03911088202055778),
 ('Month', 0.03792558263484903),
 ('DistanceGroup', -0.022577242415909256),
 ('aircraft_usage', -0.02159979115032986),
 ('aircraft_manufacturer', -0.020595520474938067),
 ('engine_type', 0.01568546618531647),
 ('year_of_manufacture', -0.013928755330237875),
 ('Origin_LATITUDE', 0.00965426552600732),
 ('Dest_LONGITUDE', 0.008864721010500122),
 ('Reporting_Airline', -0.006358367255558869),
 ('DayofMonth', 0.006041177904060091),
 ('company_type', -0.00508759979810754),
 ('Dest_LATITUDE', 0.004505349256116086),
 ('engine_manufacturer', -0.004186997076355235),
 ('num_seats', -0.0031694701623587677),
 ('registrant', 0.0014894605420313523),
 ('engine_model', 0.001467033489274472),
 ('DayOfWeek', 0.0014278312888372493)

In [13]:
from sklearn.linear_model import Lasso


start_time = time.time()

# Lasso fit on the SMOTE-resampled training data, for comparison with the
# earlier fit on the original training split.
lasso = Lasso(alpha=0.1)
lasso.fit(X_res, y_res)

# Feature labels in the column order of the transformed matrix:
# numeric columns first, then the ordinal-encoded categorical columns.
feature_names = num_cols + list(
    preprocessor.named_transformers_["cat"].get_feature_names_out(cat_cols)
)

# Positions of the coefficients the L1 penalty did not shrink to exactly zero.
important_features = np.nonzero(lasso.coef_)[0]

# start_time was previously recorded but never reported; print the elapsed
# time the same way the other timed cells do. It is printed before the final
# expression so that the feature list remains the cell's displayed result.
end_time = time.time()

print(f"Elapsed time: {(end_time - start_time):.4f} seconds")

[feature_names[i] for i in important_features]

['Tail_Number']

In [14]:
# Checkpoint

DATASET = "19_consolidated_all_delays"

# Pickle the resampled training data and the held-out test split under
# SCRATCH_DIR, one file per object, written in the order listed.
# NOTE(review): the fitted preprocessor is not saved here — confirm whether
# downstream notebooks need it.
checkpoint_targets = [
    (f"{SCRATCH_DIR}/{DATASET}_X_res.pkl", X_res),
    (f"{SCRATCH_DIR}/{DATASET}_y_res.pkl", y_res),
    (f"{SCRATCH_DIR}/{DATASET}_X_test.pkl", X_test),
    (f"{SCRATCH_DIR}/{DATASET}_y_test.pkl", y_test),
]

for checkpoint_path, payload in checkpoint_targets:
    with open(checkpoint_path, "wb") as f:
        pickle.dump(payload, f)