What's new in this version?

- Hyperparameter tuning

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
from datetime import datetime

from pathlib import Path

%matplotlib notebook

In [2]:
# Collect every "<AIRPORT>_weather.csv" file in the data directory and index
# the paths by the airport code that precedes the first underscore.
csv_glob = list(Path('../data/').glob('*_weather.csv'))

mapper = dict(
    (csv_path.name.partition('_')[0], csv_path)
    for csv_path in csv_glob
)
print(mapper)

{'EWR': PosixPath('../data/EWR_weather.csv'), 'DEN': PosixPath('../data/DEN_weather.csv'), 'IAD': PosixPath('../data/IAD_weather.csv'), 'ORD': PosixPath('../data/ORD_weather.csv'), 'SYR': PosixPath('../data/SYR_weather.csv')}


In [116]:
# Load the arrival statistics, parsing the date column while reading, and
# drop every row that contains a missing value.
flight_df = pd.read_csv(
    '../data/Detailed_Statistics_Arrivals.csv',
    parse_dates=['Date (MM/DD/YYYY)'],
).dropna(axis='rows')

def convert_to_timedelta(x):
    """Convert an ``"HH:MM"`` clock string into an offset from midnight.

    Parameters
    ----------
    x : str or missing scalar
        Colon-separated time; the first field is read as hours and the
        second as minutes (any further fields are ignored). ``None``,
        ``NaN``, ``NaT`` and ``pd.NA`` are treated as missing.

    Returns
    -------
    pd.Timedelta
        ``hours`` + ``minutes`` for string input, or ``pd.NaT`` when ``x``
        is a missing value.

    Raises
    ------
    TypeError
        If ``x`` is neither a string nor a missing scalar.
    IndexError, ValueError
        If the string has no ``:`` separator or the fields are not integers.
    """
    if not isinstance(x, str):
        # The original fell through to the return statement with ``hh``/``mm``
        # unbound, raising an opaque UnboundLocalError for any non-string.
        if pd.api.types.is_scalar(x) and pd.isna(x):
            return pd.NaT
        raise TypeError(
            f"expected an 'HH:MM' string, got {type(x).__name__}"
        )

    parts = x.split(':')
    hh, mm = parts[0], parts[1]
    return pd.Timedelta(hours=int(hh), minutes=int(mm))

# Turn the scheduled arrival clock strings into offsets from midnight.
flight_df['TimeDT'] = flight_df['Scheduled Arrival Time'].apply(convert_to_timedelta)

# We need another column with Date + Time
flight_df['Arrival DateTime'] = flight_df['Date (MM/DD/YYYY)'] + flight_df['TimeDT']

# Retain only the relevant columns.
flight_df = flight_df[['Origin Airport', 'Arrival DateTime', 'Arrival Delay (Minutes)']]

# Sort by date and renumber the rows from zero.
flight_df = flight_df.sort_values(
    by='Arrival DateTime',
    ascending=True,
    axis=0
).reset_index(drop=True)

# Round to the nearest hour; this column is used later as the join key against
# the weather tables. The lowercase 'h' alias is used because the uppercase
# 'H' spelling is deprecated since pandas 2.2 and emits a FutureWarning.
flight_df['Arrival DateTime (rounded)'] = flight_df['Arrival DateTime'].dt.round('h')

display(flight_df.head())

Unnamed: 0,Origin Airport,Arrival DateTime,Arrival Delay (Minutes),Arrival DateTime (rounded)
0,ORD,2017-06-08 16:37:00,24.0,2017-06-08 17:00:00
1,ORD,2017-06-09 16:37:00,-6.0,2017-06-09 17:00:00
2,ORD,2017-06-10 16:37:00,-15.0,2017-06-10 17:00:00
3,ORD,2017-06-10 20:41:00,20.0,2017-06-10 21:00:00
4,ORD,2017-06-11 16:37:00,-11.0,2017-06-11 17:00:00


In [117]:
from datetime import time

def get_time_of_day(x):
    """Name the part of the day that a timestamp falls into.

    ``x`` must provide a ``.time()`` method (for example a ``datetime`` or a
    pandas ``Timestamp``). Buckets are half-open on the right:

    * ``[00:00, 06:00)`` -> "Midnight"
    * ``[06:00, 12:00)`` -> "Morning"
    * ``[12:00, 17:00)`` -> "Afternoon"
    * ``[17:00, 21:00)`` -> "Evening"
    * ``[21:00, 24:00)`` -> "Night"
    """
    clock = x.time()

    # (exclusive upper bound, label), in ascending order; the first bound the
    # clock time is strictly below decides the label.
    buckets = (
        (time.fromisoformat('06:00:00'), "Midnight"),
        (time.fromisoformat('12:00:00'), "Morning"),
        (time.fromisoformat('17:00:00'), "Afternoon"),
        (time.fromisoformat('21:00:00'), "Evening"),
    )

    for upper_bound, label in buckets:
        if clock < upper_bound:
            return label

    return "Night"
    

def create_label(delay):
    """Bucket an arrival delay (in minutes) into a punctuality class.

    * below -10          -> "Early"
    * -10 to 10 inclusive -> "On Time"
    * above 10, up to 30  -> "Late"
    * anything else       -> "Severely Late"
    """
    if delay < -10:
        label = "Early"
    elif delay <= 10:
        label = "On Time"
    elif delay <= 30:
        label = "Late"
    else:
        # Also reached when every comparison above is False (e.g. NaN).
        label = "Severely Late"
    return label

In [118]:
# Syracuse weather: every column gets a "syr_" prefix, which keeps it distinct
# from the un-prefixed origin-airport weather columns; the timestamp column is
# then exposed as "Syr_time" for the join below.
syr_weather = (
    pd.read_csv(mapper['SYR'], parse_dates=['time'])
    .add_prefix('syr_')
    .rename(columns={'syr_time': 'Syr_time'})
)

origin_dfs = []

for origin in ['EWR', 'DEN', 'ORD', 'IAD']:
    # Weather at the origin airport, with its timestamp renamed to "Date".
    weather_df = pd.read_csv(mapper[origin], parse_dates=['time']).rename(
        columns={'time': 'Date'}
    )

    # Flights that departed from this origin.
    sub_df = flight_df[flight_df['Origin Airport'] == origin].reset_index(drop=True)

    # Attach origin weather, then Syracuse weather, both keyed on the rounded
    # arrival hour. Inner joins drop flights with no matching weather row.
    merged = sub_df.merge(
        weather_df,
        how='inner',
        left_on='Arrival DateTime (rounded)',
        right_on='Date',
    ).merge(
        syr_weather,
        how='inner',
        left_on='Arrival DateTime (rounded)',
        right_on='Syr_time',
    )

    origin_dfs.append(merged)

# Stack the per-airport frames, add calendar features and the target label,
# then remove the columns that were only needed to build them.
# NOTE(review): the calendar features are derived from "Date" (the weather
# timestamp matched to the hour-rounded arrival), not from the exact
# scheduled arrival time — confirm this is intended.
main_df = (
    pd.concat(origin_dfs, axis=0)
    .assign(
        DayOfWeek=lambda df: df['Date'].dt.dayofweek,
        TimeOfDay=lambda df: df['Date'].apply(get_time_of_day),
        MonthOfYear=lambda df: df['Date'].dt.month,
    )
    .sort_values(by=['Arrival DateTime (rounded)'], axis=0, ascending=True)
    .assign(status=lambda df: df['Arrival Delay (Minutes)'].apply(create_label))
    .drop(
        columns=[
            'Arrival DateTime',
            'Arrival DateTime (rounded)',
            'Date',
            'Arrival Delay (Minutes)',
        ]
    )
)

In [119]:
# Preview the first five rows of the merged, labelled table.
display(main_df.head(n=5))

Unnamed: 0,Origin Airport,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,...,syr_cloudcover_high,syr_windspeed_10m,syr_winddirection_10m,syr_windgusts_10m,syr_et0_fao_evapotranspiration,syr_vapor_pressure_deficit,DayOfWeek,TimeOfDay,MonthOfYear,status
0,ORD,25.6,37,9.9,25.8,1011.9,988.7,0.0,0.0,0.0,...,15,2.7,23,19.8,0.23,0.81,3,Evening,6,Late
1,ORD,24.9,58,16.2,26.1,1007.9,984.8,0.0,0.0,0.0,...,0,14.0,293,38.2,0.39,1.28,4,Evening,6,On Time
2,ORD,29.7,36,12.9,27.0,1007.8,985.0,0.0,0.0,0.0,...,35,8.1,249,31.0,0.37,1.14,5,Evening,6,Early
3,ORD,26.7,37,10.7,23.6,1007.8,984.8,0.0,0.0,0.0,...,24,9.9,199,11.9,0.01,0.15,5,Night,6,Late
4,ORD,31.4,42,17.0,31.2,1011.5,988.8,0.0,0.0,0.0,...,61,14.3,260,34.9,0.51,2.0,6,Evening,6,Early


In [120]:
# One-hot encode the categorical features (origin airport, time of day, day of week).
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# One dense one-hot encoder per categorical column.
# NOTE: this cell rebinds `main_df` without the 'TimeOfDay' / 'DayOfWeek'
# columns, so re-running it without first re-running the cell that builds
# `main_df` fails with a KeyError.
origin_encoder = OneHotEncoder(sparse_output=False)
time_encoder = OneHotEncoder(sparse_output=False)
day_encoder = OneHotEncoder(sparse_output=False)

# Each encoded frame reuses main_df's index so the column-wise concat below
# lines the rows up one-to-one.
origin_encoded = pd.DataFrame(
    origin_encoder.fit_transform(main_df[['Origin Airport']]),
    index=main_df.index,
    columns=origin_encoder.get_feature_names_out(),
)

time_encoded = pd.DataFrame(
    time_encoder.fit_transform(main_df[['TimeOfDay']]),
    index=main_df.index,
    columns=time_encoder.get_feature_names_out(),
)

day_encoded = pd.DataFrame(
    day_encoder.fit_transform(main_df[['DayOfWeek']]),
    index=main_df.index,
    columns=day_encoder.get_feature_names_out(),
)

# Append the indicator columns, then drop the raw categoricals and the
# Syracuse timestamp. 'Origin Airport' stays as a plain column.
main_df = pd.concat(
    [main_df, origin_encoded, time_encoded, day_encoded], axis=1
).drop(columns=['TimeOfDay', 'DayOfWeek', 'Syr_time'])

display(main_df.head())

Unnamed: 0,Origin Airport,temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,...,TimeOfDay_Midnight,TimeOfDay_Morning,TimeOfDay_Night,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6
0,ORD,25.6,37,9.9,25.8,1011.9,988.7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,ORD,24.9,58,16.2,26.1,1007.9,984.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,ORD,29.7,36,12.9,27.0,1007.8,985.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ORD,26.7,37,10.7,23.6,1007.8,984.8,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,ORD,31.4,42,17.0,31.2,1011.5,988.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [133]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Fit one gradient-boosted classifier per origin airport and gather, across
# airports, the names of each model's ten most important features.
# NOTE(review): `models` is created here but nothing in this cell appends to
# it — confirm whether the fitted classifiers were meant to be stored.
models = []
important_features = set()

for origin in ['EWR', 'DEN', 'ORD', 'IAD']:
    # Rows for this airport only; the raw airport label is not a feature.
    sub_df = (
        main_df[main_df['Origin Airport'] == origin]
        .reset_index(drop=True)
        .drop(columns=['Origin Airport'])
    )

    # Every column except the label is a feature.
    X = sub_df.drop(columns=['status'])
    Y = sub_df['status']

    X_tr, X_te, Y_tr, Y_te = train_test_split(
        X, Y, stratify=Y, shuffle=True, random_state=250
    )

    # The scaler is fitted on the training split only and reused for the test split.
    origin_scaler = StandardScaler()
    X_tr_scaled = pd.DataFrame(origin_scaler.fit_transform(X_tr), columns=X_tr.columns)
    X_te_scaled = pd.DataFrame(origin_scaler.transform(X_te), columns=X_te.columns)

    clf = GradientBoostingClassifier(
        n_estimators=150,
        max_depth=100,
        random_state=50,
    )
    clf.fit(X_tr_scaled, Y_tr)

    print(f"Train score for {origin}: {clf.score(X_tr_scaled, Y_tr)}")
    print(f"Test score for {origin}: {clf.score(X_te_scaled, Y_te)}")

    # Ten (feature name, importance) pairs, highest importance first.
    importances = sorted(
        zip(clf.feature_names_in_, clf.feature_importances_),
        key=lambda pair: pair[-1],
        reverse=True,
    )[:10]

    important_features.update(name for name, _ in importances)

    print(importances)

    print('-' * 50)

Train score for EWR: 1.0
Test score for EWR: 0.3125
[('syr_relativehumidity_2m', 0.14549619134867833), ('windgusts_10m', 0.11822745545545553), ('syr_dewpoint_2m', 0.07557591882780454), ('syr_cloudcover_low', 0.07303044536767472), ('winddirection_10m', 0.06381514772980698), ('precipitation', 0.046926222066595524), ('syr_cloudcover', 0.04442056666103416), ('rain', 0.03978479484312657), ('syr_surface_pressure', 0.037499188429179486), ('apparent_temperature', 0.03587816826216374)]
--------------------------------------------------
Train score for DEN: 1.0
Test score for DEN: 0.4318181818181818
[('windgusts_10m', 0.07613376265835854), ('cloudcover_mid', 0.06539606584910461), ('et0_fao_evapotranspiration', 0.056850308500347176), ('pressure_msl', 0.052782817293966486), ('temperature_2m', 0.045355499753345206), ('syr_cloudcover', 0.04228709986786115), ('winddirection_10m', 0.04215845305113838), ('dewpoint_2m', 0.04208624940122352), ('syr_vapor_pressure_deficit', 0.04171924006206141), ('syr_win

In [135]:
# Union of the top-10 feature names collected from each per-airport model.
important_features

{'apparent_temperature',
 'cloudcover_mid',
 'dewpoint_2m',
 'et0_fao_evapotranspiration',
 'precipitation',
 'pressure_msl',
 'rain',
 'relativehumidity_2m',
 'surface_pressure',
 'syr_cloudcover',
 'syr_cloudcover_low',
 'syr_dewpoint_2m',
 'syr_relativehumidity_2m',
 'syr_surface_pressure',
 'syr_vapor_pressure_deficit',
 'syr_winddirection_10m',
 'syr_windgusts_10m',
 'syr_windspeed_10m',
 'temperature_2m',
 'vapor_pressure_deficit',
 'winddirection_10m',
 'windgusts_10m',
 'windspeed_10m'}