In [173]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error


pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

## First Flight

In [174]:
import pandas as pd

# Read the Excel file
first_df = pd.read_excel("First_Flight_Details.xlsx")

first_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes)
0,UA,2023-01-01,2645,N23721,ORD,SYR,21:10:00,21:07:00,-3,23:57:00,23:47:00,-10
1,UA,2023-01-02,1998,N802UA,ORD,SYR,18:14:00,18:17:00,3,21:07:00,20:46:00,-21
2,UA,2023-01-03,1998,N854UA,ORD,SYR,18:14:00,18:13:00,-1,21:07:00,21:04:00,-3
3,UA,2023-01-04,1998,N893UA,ORD,SYR,18:24:00,18:41:00,17,21:17:00,21:31:00,14
4,UA,2020-01-05,2012,N825UA,ORD,SYR,19:00:00,19:21:00,21,21:47:00,21:55:00,8


In [175]:
first_df.dtypes

Carrier Code                         object
Date (MM/DD/YYYY)            datetime64[ns]
Flight Number                         int64
Tail Number                          object
Origin Airport                       object
Destination Airport                  object
Scheduled departure time             object
Actual departure time                object
Departure delay (Minutes)             int64
Scheduled Arrival Time               object
Actual Arrival Time                  object
Arrival Delay (Minutes)               int64
dtype: object

In [176]:
first_df.isna().sum()

Carrier Code                  0
Date (MM/DD/YYYY)             0
Flight Number                 0
Tail Number                  53
Origin Airport                0
Destination Airport           0
Scheduled departure time      0
Actual departure time         0
Departure delay (Minutes)     0
Scheduled Arrival Time        0
Actual Arrival Time           0
Arrival Delay (Minutes)       0
dtype: int64

In [177]:
first_df.dropna(subset=['Tail Number'], inplace=True)

In [178]:
All_weather = pd.read_csv("Weather_Data.csv")

All_weather.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather_description,pres,slp,vis,snow,Location
0,2019-01-01:05,2019-01-01T00:00:00,9.7,9.7,100,2.5,100,5.15,11.6,160,Light rain,1008,1009,3,0.0,JFK
1,2019-01-01:06,2019-01-01T01:00:00,9.4,9.4,100,2.0,100,5.09,10.4,180,Light rain,1007,1007,0,0.0,JFK
2,2019-01-01:07,2019-01-01T02:00:00,8.9,8.9,100,1.0,100,3.22,11.6,180,Light rain,1004,1005,0,0.0,JFK
3,2019-01-01:08,2019-01-01T03:00:00,8.9,8.9,100,0.0,100,4.34,13.6,205,Fog,1002,1003,1,0.0,JFK
4,2019-01-01:09,2019-01-01T04:00:00,9.4,9.4,100,0.0,100,1.5,13.6,280,Fog,1002,1003,2,0.0,JFK


In [179]:
syr_weather = pd.read_csv("SY_Weather_Data.csv")

syr_weather.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow
0,2019-01-01:05,2019-01-01T00:00:00,7.2,4.3,87.0,1.5,85.0,4.59,9.8,150.0,Light rain,982.0,997.0,16.0,0.0
1,2019-01-01:06,2019-01-01T01:00:00,8.9,8.9,100.0,0.5,79.0,5.09,11.8,190.0,Overcast clouds,984.0,999.0,16.0,0.0
2,2019-01-01:07,2019-01-01T02:00:00,9.4,9.4,100.0,0.0,83.0,4.59,12.9,180.0,Overcast clouds,982.0,997.0,16.0,0.0
3,2019-01-01:08,2019-01-01T03:00:00,11.1,11.1,87.0,0.0,80.0,8.19,13.9,200.0,Overcast clouds,980.0,996.0,16.0,0.0
4,2019-01-01:09,2019-01-01T04:00:00,10.6,10.6,100.0,0.5,85.0,6.2,16.8,230.0,Overcast clouds,981.0,996.0,16.0,0.0


In [180]:
All_weather.dtypes

datetime                object
timestamp_local         object
temp                   float64
app_temp               float64
clouds                   int64
precip                 float64
rh                       int64
wind_spd               float64
wind_gust_spd          float64
wind_dir                 int64
weather_description     object
pres                     int64
slp                      int64
vis                      int64
snow                   float64
Location                object
dtype: object

In [181]:
All_weather['timestamp_local'] = pd.to_datetime(All_weather['timestamp_local'])
syr_weather['timestamp_local'] = pd.to_datetime(syr_weather['timestamp_local'])
syr_weather.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow
0,2019-01-01:05,2019-01-01 00:00:00,7.2,4.3,87.0,1.5,85.0,4.59,9.8,150.0,Light rain,982.0,997.0,16.0,0.0
1,2019-01-01:06,2019-01-01 01:00:00,8.9,8.9,100.0,0.5,79.0,5.09,11.8,190.0,Overcast clouds,984.0,999.0,16.0,0.0
2,2019-01-01:07,2019-01-01 02:00:00,9.4,9.4,100.0,0.0,83.0,4.59,12.9,180.0,Overcast clouds,982.0,997.0,16.0,0.0
3,2019-01-01:08,2019-01-01 03:00:00,11.1,11.1,87.0,0.0,80.0,8.19,13.9,200.0,Overcast clouds,980.0,996.0,16.0,0.0
4,2019-01-01:09,2019-01-01 04:00:00,10.6,10.6,100.0,0.5,85.0,6.2,16.8,230.0,Overcast clouds,981.0,996.0,16.0,0.0


In [182]:
All_weather['Date'] = All_weather['timestamp_local'].dt.date
All_weather['Hour'] = All_weather['timestamp_local'].dt.hour

syr_weather['Date'] = syr_weather['timestamp_local'].dt.date
syr_weather['Hour'] = syr_weather['timestamp_local'].dt.hour

syr_weather.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow,Date,Hour
0,2019-01-01:05,2019-01-01 00:00:00,7.2,4.3,87.0,1.5,85.0,4.59,9.8,150.0,Light rain,982.0,997.0,16.0,0.0,2019-01-01,0
1,2019-01-01:06,2019-01-01 01:00:00,8.9,8.9,100.0,0.5,79.0,5.09,11.8,190.0,Overcast clouds,984.0,999.0,16.0,0.0,2019-01-01,1
2,2019-01-01:07,2019-01-01 02:00:00,9.4,9.4,100.0,0.0,83.0,4.59,12.9,180.0,Overcast clouds,982.0,997.0,16.0,0.0,2019-01-01,2
3,2019-01-01:08,2019-01-01 03:00:00,11.1,11.1,87.0,0.0,80.0,8.19,13.9,200.0,Overcast clouds,980.0,996.0,16.0,0.0,2019-01-01,3
4,2019-01-01:09,2019-01-01 04:00:00,10.6,10.6,100.0,0.5,85.0,6.2,16.8,230.0,Overcast clouds,981.0,996.0,16.0,0.0,2019-01-01,4


In [183]:
All_weather['Date'] = pd.to_datetime(All_weather['Date'])

syr_weather['Date'] = pd.to_datetime(syr_weather['Date'])

In [184]:
syr_weather.drop(['datetime', 'timestamp_local'], axis=1, inplace=True)

All_weather.drop(['datetime', 'timestamp_local'], axis=1, inplace=True)

In [185]:
syr_weather.dtypes

temp                    float64
app_temp                float64
clouds                  float64
precip                  float64
rh                      float64
wind_spd                float64
wind_gust_spd           float64
wind_dir                float64
weather                  object
pres                    float64
slp                     float64
vis                     float64
snow                    float64
Date             datetime64[ns]
Hour                      int32
dtype: object

In [186]:
len(syr_weather)
len(first_df)

46321

4488

In [187]:
first_df.dtypes

Carrier Code                         object
Date (MM/DD/YYYY)            datetime64[ns]
Flight Number                         int64
Tail Number                          object
Origin Airport                       object
Destination Airport                  object
Scheduled departure time             object
Actual departure time                object
Departure delay (Minutes)             int64
Scheduled Arrival Time               object
Actual Arrival Time                  object
Arrival Delay (Minutes)               int64
dtype: object

In [188]:
#even_rows['Date (MM/DD/YYYY)'] = pd.to_datetime(even_rows['Date (MM/DD/YYYY)'])

first_df['Day of Week'] = first_df['Date (MM/DD/YYYY)'].dt.dayofweek

# Map the day of the week to day names
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
first_df['Day Name'] = first_df['Day of Week'].map(lambda x: day_names[x])

first_df['Day Name'] = pd.Categorical(first_df['Day Name'], categories=day_names)

print(first_df['Day Name'].cat.categories)


Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'],
      dtype='object')


In [189]:
first_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Day of Week,Day Name
0,UA,2023-01-01,2645,N23721,ORD,SYR,21:10:00,21:07:00,-3,23:57:00,23:47:00,-10,6,Sunday
1,UA,2023-01-02,1998,N802UA,ORD,SYR,18:14:00,18:17:00,3,21:07:00,20:46:00,-21,0,Monday
2,UA,2023-01-03,1998,N854UA,ORD,SYR,18:14:00,18:13:00,-1,21:07:00,21:04:00,-3,1,Tuesday
3,UA,2023-01-04,1998,N893UA,ORD,SYR,18:24:00,18:41:00,17,21:17:00,21:31:00,14,2,Wednesday
4,UA,2020-01-05,2012,N825UA,ORD,SYR,19:00:00,19:21:00,21,21:47:00,21:55:00,8,6,Sunday


In [190]:
columns_to_keep = ['Carrier Code', 'Date (MM/DD/YYYY)','Origin Airport', 'Scheduled departure time', 'Actual departure time','Arrival Delay (Minutes)',
                   'Scheduled Arrival Time', 'Actual Arrival Time','Day Name']

first_df = first_df[columns_to_keep]

In [191]:
first_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Origin Airport,Scheduled departure time,Actual departure time,Arrival Delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Day Name
0,UA,2023-01-01,ORD,21:10:00,21:07:00,-10,23:57:00,23:47:00,Sunday
1,UA,2023-01-02,ORD,18:14:00,18:17:00,-21,21:07:00,20:46:00,Monday
2,UA,2023-01-03,ORD,18:14:00,18:13:00,-3,21:07:00,21:04:00,Tuesday
3,UA,2023-01-04,ORD,18:24:00,18:41:00,14,21:17:00,21:31:00,Wednesday
4,UA,2020-01-05,ORD,19:00:00,19:21:00,8,21:47:00,21:55:00,Sunday


In [192]:
first_df.dtypes

Carrier Code                        object
Date (MM/DD/YYYY)           datetime64[ns]
Origin Airport                      object
Scheduled departure time            object
Actual departure time               object
Arrival Delay (Minutes)              int64
Scheduled Arrival Time              object
Actual Arrival Time                 object
Day Name                          category
dtype: object

In [193]:
# Convert 'Scheduled departure time' to datetime
first_df['Scheduled departure time'] = pd.to_datetime(first_df['Scheduled departure time'], format='%H:%M:%S')

first_df['Scheduled Arrival Time'] = pd.to_datetime(first_df['Scheduled Arrival Time'], format='%H:%M:%S')

In [194]:
# Extract hour, minute, and second components
first_df['Scheduled departure hour'] = first_df['Scheduled departure time'].dt.hour
first_df['Scheduled departure minute'] = first_df['Scheduled departure time'].dt.minute

first_df['Scheduled Arrival hour'] = first_df['Scheduled Arrival Time'].dt.hour
first_df['Scheduled Arrival minute'] = first_df['Scheduled Arrival Time'].dt.minute

In [195]:
first_df.dtypes

Carrier Code                          object
Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Scheduled departure time      datetime64[ns]
Actual departure time                 object
Arrival Delay (Minutes)                int64
Scheduled Arrival Time        datetime64[ns]
Actual Arrival Time                   object
Day Name                            category
Scheduled departure hour               int32
Scheduled departure minute             int32
Scheduled Arrival hour                 int32
Scheduled Arrival minute               int32
dtype: object

In [196]:
first_df.drop(['Scheduled departure time', 'Actual departure time', 'Scheduled Arrival Time', 'Actual Arrival Time'], axis=1, inplace=True)

In [197]:
first_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Origin Airport,Arrival Delay (Minutes),Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute
0,UA,2023-01-01,ORD,-10,Sunday,21,10,23,57
1,UA,2023-01-02,ORD,-21,Monday,18,14,21,7
2,UA,2023-01-03,ORD,-3,Tuesday,18,14,21,7
3,UA,2023-01-04,ORD,14,Wednesday,18,24,21,17
4,UA,2020-01-05,ORD,8,Sunday,19,0,21,47


In [198]:
# Define a function to determine the arrival status
def determine_arrival_status(delay):
    if delay > 5:
        return 2
    elif delay < -5:
        return 1
    else:
        return 0

# Apply the function to create the "Arrival Status" column
first_df['Arrival Status'] = first_df['Arrival Delay (Minutes)'].apply(determine_arrival_status)

In [199]:
first_df.drop('Arrival Delay (Minutes)', axis=1, inplace=True)

In [200]:
first_df.dtypes

Carrier Code                          object
Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Day Name                            category
Scheduled departure hour               int32
Scheduled departure minute             int32
Scheduled Arrival hour                 int32
Scheduled Arrival minute               int32
Arrival Status                         int64
dtype: object

In [201]:
merged_df = pd.merge(first_df, syr_weather, left_on=["Date (MM/DD/YYYY)", "Scheduled Arrival hour"], right_on=["Date", "Hour"], how="left")

In [202]:
merged_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Origin Airport,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,Arrival Status,temp,...,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow,Date,Hour
0,UA,2023-01-01,ORD,Sunday,21,10,23,57,1,3.9,...,1.6,3.6,280.0,Overcast clouds,1000.0,1015.0,11.0,0.0,2023-01-01,23
1,UA,2023-01-02,ORD,Monday,18,14,21,7,1,4.4,...,1.2,2.4,55.0,Overcast clouds,1006.0,1021.0,16.0,0.0,2023-01-02,21
2,UA,2023-01-03,ORD,Tuesday,18,14,21,7,0,5.0,...,1.5,4.8,70.0,Overcast clouds,991.0,1006.0,10.0,0.0,2023-01-03,21
3,UA,2023-01-04,ORD,Wednesday,18,24,21,17,2,7.2,...,3.1,13.2,80.0,Overcast clouds,990.0,1005.0,10.0,0.0,2023-01-04,21
4,UA,2020-01-05,ORD,Sunday,19,0,21,47,2,-0.6,...,2.6,3.2,210.0,Overcast clouds,998.0,1013.0,16.0,0.0,2020-01-05,21


In [203]:
merged_df.isna().sum()

Carrier Code                  0
Date (MM/DD/YYYY)             0
Origin Airport                0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
Arrival Status                0
temp                          0
app_temp                      0
clouds                        0
precip                        0
rh                            0
wind_spd                      0
wind_gust_spd                 0
wind_dir                      0
weather                       0
pres                          0
slp                           0
vis                           0
snow                          0
Date                          0
Hour                          0
dtype: int64

In [204]:
len(merged_df)
len(All_weather)

4488

177793

In [205]:
merged_df = pd.merge(merged_df, All_weather, how='left', left_on=['Date (MM/DD/YYYY)', 'Scheduled departure hour', 'Origin Airport'], right_on=['Date', 'Hour', 'Location'])

In [206]:
merged_df.drop(['Date_x','Hour_x','Date_y', 'Hour_y'], axis=1, inplace=True)
merged_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Origin Airport,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,Arrival Status,temp_x,...,rh_y,wind_spd_y,wind_gust_spd_y,wind_dir_y,weather_description,pres_y,slp_y,vis_y,snow_y,Location
0,UA,2023-01-01,ORD,Sunday,21,10,23,57,1,3.9,...,87,1.6,3.6,250,Overcast clouds,992,1017,6,0.0,ORD
1,UA,2023-01-02,ORD,Monday,18,14,21,7,1,4.4,...,85,4.59,6.4,80,Overcast clouds,991,1015,11,0.0,ORD
2,UA,2023-01-03,ORD,Tuesday,18,14,21,7,0,5.0,...,88,2.1,2.8,20,Haze,977,1001,0,0.0,ORD
3,UA,2023-01-04,ORD,Wednesday,18,24,21,17,2,7.2,...,88,3.6,8.4,220,Overcast clouds,984,1009,6,0.0,ORD
4,UA,2020-01-05,ORD,Sunday,19,0,21,47,2,-0.6,...,69,7.32,11.6,280,Scattered clouds,988,1014,16,0.0,ORD


In [207]:
len(merged_df)

4488

In [208]:
merged_df.isna().sum()

Carrier Code                  0
Date (MM/DD/YYYY)             0
Origin Airport                0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
Arrival Status                0
temp_x                        0
app_temp_x                    0
clouds_x                      0
precip_x                      0
rh_x                          0
wind_spd_x                    0
wind_gust_spd_x               0
wind_dir_x                    0
weather                       0
pres_x                        0
slp_x                         0
vis_x                         0
snow_x                        0
temp_y                        0
app_temp_y                    0
clouds_y                      0
precip_y                      0
rh_y                          0
wind_spd_y                    0
wind_gust_spd_y               0
wind_dir_y                    0
weather_description           0
pres_y  

In [209]:
merged_df.drop(['Location'], axis=1, inplace=True)

In [210]:
# Convert columns to object type
merged_df['Scheduled departure hour'] = merged_df['Scheduled departure hour'].astype(str)
merged_df['Scheduled departure minute'] = merged_df['Scheduled departure minute'].astype(str)
merged_df['Scheduled Arrival hour'] = merged_df['Scheduled Arrival hour'].astype(str)
merged_df['Scheduled Arrival minute'] = merged_df['Scheduled Arrival minute'].astype(str)

In [211]:
merged_df.dtypes

Carrier Code                          object
Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Day Name                            category
Scheduled departure hour              object
Scheduled departure minute            object
Scheduled Arrival hour                object
Scheduled Arrival minute              object
Arrival Status                         int64
temp_x                               float64
app_temp_x                           float64
clouds_x                             float64
precip_x                             float64
rh_x                                 float64
wind_spd_x                           float64
wind_gust_spd_x                      float64
wind_dir_x                           float64
weather                               object
pres_x                               float64
slp_x                                float64
vis_x                                float64
snow_x                               float64
temp_y    

In [212]:
all_hours = pd.Series(range(24))

# Convert to categorical variable with specified categories
merged_df['Scheduled departure hour'] = pd.Categorical(merged_df['Scheduled departure hour'], categories=all_hours, ordered=True)
merged_df['Scheduled Arrival hour'] = pd.Categorical(merged_df['Scheduled Arrival hour'], categories=all_hours, ordered=True)

In [213]:
all_minutes = pd.Series(range(60))

# Convert to categorical variable with specified categories
merged_df['Scheduled departure minute'] = pd.Categorical(merged_df['Scheduled departure minute'], categories=all_minutes, ordered=True)
merged_df['Scheduled Arrival minute'] = pd.Categorical(merged_df['Scheduled Arrival minute'], categories=all_minutes, ordered=True)

In [214]:
merged_df['weather'] = pd.Categorical(merged_df['weather'], ordered=True)
merged_df['weather_description'] = pd.Categorical(merged_df['weather_description'], ordered=True)

In [215]:
merged_df.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Origin Airport', 'Day Name',
       'Scheduled departure hour', 'Scheduled departure minute',
       'Scheduled Arrival hour', 'Scheduled Arrival minute', 'Arrival Status',
       'temp_x', 'app_temp_x', 'clouds_x', 'precip_x', 'rh_x', 'wind_spd_x',
       'wind_gust_spd_x', 'wind_dir_x', 'weather', 'pres_x', 'slp_x', 'vis_x',
       'snow_x', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_y', 'rh_y',
       'wind_spd_y', 'wind_gust_spd_y', 'wind_dir_y', 'weather_description',
       'pres_y', 'slp_y', 'vis_y', 'snow_y'],
      dtype='object')

In [216]:
merged_df.drop(['weather','weather_description','Carrier Code'], axis=1, inplace=True)

In [217]:
date_column = first_df['Date (MM/DD/YYYY)']

# Drop the date column before encoding
first_df_encoded = pd.get_dummies(merged_df.drop(columns=['Date (MM/DD/YYYY)']), drop_first=True)

# Add the date column back to the encoded DataFrame
#first_df_encoded['Date (MM/DD/YYYY)'] = date_column
#first_df_encoded.head()

In [218]:
# Convert 'Arrival Status' to categorical
first_df_encoded['Arrival Status'] = first_df_encoded['Arrival Status'].astype('category')

In [219]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier


# Define your base models
base_models = [
    ('logistic', LogisticRegression(fit_intercept=True, solver = 'newton-cg',multi_class='multinomial', penalty = None, max_iter = 100000)),
   ('random_forest', RandomForestClassifier(
    n_estimators=200,  # Number of trees in the forest
    max_depth=None,  # Maximum depth of the tree
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_features=10,  # Number of features to consider when looking for the best split
    random_state=42  # Random state for reproducibility
)) 
]

# Define meta-model
meta_model = GradientBoostingClassifier(learning_rate = 0.001, max_features = 'log2', n_estimators = 500, min_samples_leaf = 1)

# Create the stacking classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Create pipeline with standard scaler and stacking classifier
pipeline = make_pipeline(stacking_clf)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(first_df_encoded.drop(columns = ['Arrival Status']), first_df_encoded['Arrival Status'], test_size=0.20, random_state=42)

# Train the stacking model
pipeline.fit(X_train_1, y_train_1)

# Make predictions
train_accuracy = pipeline.score(X_train_1, y_train_1)
test_accuracy = pipeline.score(X_test_1, y_test_1)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.5596100278551532
Test Accuracy: 0.5011135857461024


In [220]:
#X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(first_df_encoded.drop(columns = ['Arrival Status']), first_df_encoded['Arrival Status'], test_size=0.20, random_state=35)

#X_train_1
#X_test_1
#y_train_1
#y_test_1

In [221]:
#X_train_1 = X_train_1.drop(columns=['Date (MM/DD/YYYY)'])
#X_test_1 = X_test_1.drop(columns=['Date (MM/DD/YYYY)'])

#X_train_1
#X_test_1

In [222]:
# rf_model = LogisticRegression(fit_intercept=True, solver = 'newton-cg',multi_class='multinomial', penalty = None, max_iter = 100000)
# rf_model.fit(X_train_1, y_train_1)

# rf_model.score(X_train_1, y_train_1)

In [223]:
#from sklearn.ensemble import GradientBoostingClassifier

#gb_model = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 7, max_features = 'sqrt', n_estimators = 200, min_samples_leaf = 2)
#gb_model.fit(X_train_1, y_train_1)

#gb_model.score(X_train_1, y_train_1)

In [224]:
#gb_y_train_1_pred = gb_model.predict(X_train_1)
#gb_y_train_1_pred

In [225]:
#gb_y_test_1_pred = gb_model.predict(X_test_1)
#rf_y_test_1_pred

In [226]:
# Create DataFrame with predictions
#predictions_df = pd.DataFrame({'Prediction': gb_y_test_1_pred})

# Append DataFrame with ground truth
#predictions_df['Ground_Truth'] = y_test_1.values  # Assuming y_test_1 is a pandas Series

# Now predictions_df contains both predictions and ground truth
#predictions_df.head()

In [227]:
#predictions_df_train = pd.DataFrame({'Prediction_train': gb_y_train_1_pred})

# Append DataFrame with ground truth
#predictions_df_train['Ground_Truth'] = y_train_1.values  # Assuming y_test_1 is a pandas Series

# Now predictions_df contains both predictions and ground truth
#predictions_df_train.head()

In [228]:
#X_test_1.head()

In [229]:
#gb_model.score(X_train_1, y_train_1)

In [230]:
#gb_model.score(X_test_1, y_test_1)

## Second Flight

In [231]:
# Read the Excel file
second_df = pd.read_excel("Second_Flight_Details.xlsx")
first_df_reload = pd.read_excel("First_Flight_Details.xlsx")
second_df.head()
first_df_reload.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes)
0,MQ,2019-01-01,3538,N523AE,ORD,SYR,11:45:00,11:43:00,-2,14:27:00,14:35:00,8
1,MQ,2019-01-01,3685,N538EG,ORD,SYR,18:50:00,18:50:00,0,21:39:00,21:59:00,20
2,MQ,2019-01-01,3946,N256NN,ORD,SYR,14:47:00,16:30:00,103,17:34:00,19:31:00,117
3,MQ,2019-01-01,4011,N223NN,ORD,SYR,22:10:00,22:09:00,-1,00:54:00,00:54:00,0
4,MQ,2022-01-01,4316,N900AE,ORD,SYR,15:28:00,00:00:00,0,18:15:00,00:00:00,0


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes)
0,UA,2023-01-01,2645,N23721,ORD,SYR,21:10:00,21:07:00,-3,23:57:00,23:47:00,-10
1,UA,2023-01-02,1998,N802UA,ORD,SYR,18:14:00,18:17:00,3,21:07:00,20:46:00,-21
2,UA,2023-01-03,1998,N854UA,ORD,SYR,18:14:00,18:13:00,-1,21:07:00,21:04:00,-3
3,UA,2023-01-04,1998,N893UA,ORD,SYR,18:24:00,18:41:00,17,21:17:00,21:31:00,14
4,UA,2020-01-05,2012,N825UA,ORD,SYR,19:00:00,19:21:00,21,21:47:00,21:55:00,8


In [232]:
second_df.isna().sum()
first_df_reload.isna().sum()

Carrier Code                  0
Date (MM/DD/YYYY)             0
Flight Number                 0
Tail Number                  31
Origin Airport                0
Destination Airport           0
Scheduled departure time      0
Actual departure time         0
Departure delay (Minutes)     0
Scheduled Arrival Time        0
Actual Arrival Time           0
Arrival Delay (Minutes)       0
dtype: int64

Carrier Code                  0
Date (MM/DD/YYYY)             0
Flight Number                 0
Tail Number                  53
Origin Airport                0
Destination Airport           0
Scheduled departure time      0
Actual departure time         0
Departure delay (Minutes)     0
Scheduled Arrival Time        0
Actual Arrival Time           0
Arrival Delay (Minutes)       0
dtype: int64

In [233]:
second_df.dropna(subset=['Tail Number'], inplace=True)
second_df.isna().sum()

first_df_reload.dropna(subset=['Tail Number'], inplace=True)
first_df_reload.isna().sum()

Carrier Code                 0
Date (MM/DD/YYYY)            0
Flight Number                0
Tail Number                  0
Origin Airport               0
Destination Airport          0
Scheduled departure time     0
Actual departure time        0
Departure delay (Minutes)    0
Scheduled Arrival Time       0
Actual Arrival Time          0
Arrival Delay (Minutes)      0
dtype: int64

Carrier Code                 0
Date (MM/DD/YYYY)            0
Flight Number                0
Tail Number                  0
Origin Airport               0
Destination Airport          0
Scheduled departure time     0
Actual departure time        0
Departure delay (Minutes)    0
Scheduled Arrival Time       0
Actual Arrival Time          0
Arrival Delay (Minutes)      0
dtype: int64

In [234]:
len(second_df)

7584

In [235]:
# Define a function to determine the arrival status
def determine_arrival_status(delay):
    if delay > 5:
        return 2
    elif delay < -5:
        return 1
    else:
        return 0

# Apply the function to create the "Arrival Status" column
second_df['Arrival Status Second'] = second_df['Arrival Delay (Minutes)'].apply(determine_arrival_status)
first_df_reload['Arrival Status First'] = first_df_reload['Arrival Delay (Minutes)'].apply(determine_arrival_status)

first_df_reload.head()
second_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status First
0,UA,2023-01-01,2645,N23721,ORD,SYR,21:10:00,21:07:00,-3,23:57:00,23:47:00,-10,1
1,UA,2023-01-02,1998,N802UA,ORD,SYR,18:14:00,18:17:00,3,21:07:00,20:46:00,-21,1
2,UA,2023-01-03,1998,N854UA,ORD,SYR,18:14:00,18:13:00,-1,21:07:00,21:04:00,-3,0
3,UA,2023-01-04,1998,N893UA,ORD,SYR,18:24:00,18:41:00,17,21:17:00,21:31:00,14,2
4,UA,2020-01-05,2012,N825UA,ORD,SYR,19:00:00,19:21:00,21,21:47:00,21:55:00,8,2


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second
0,MQ,2019-01-01,3538,N523AE,ORD,SYR,11:45:00,11:43:00,-2,14:27:00,14:35:00,8,2
1,MQ,2019-01-01,3685,N538EG,ORD,SYR,18:50:00,18:50:00,0,21:39:00,21:59:00,20,2
2,MQ,2019-01-01,3946,N256NN,ORD,SYR,14:47:00,16:30:00,103,17:34:00,19:31:00,117,2
3,MQ,2019-01-01,4011,N223NN,ORD,SYR,22:10:00,22:09:00,-1,00:54:00,00:54:00,0,0
4,MQ,2022-01-01,4316,N900AE,ORD,SYR,15:28:00,00:00:00,0,18:15:00,00:00:00,0,0


In [236]:
first_df_reload.head()
second_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status First
0,UA,2023-01-01,2645,N23721,ORD,SYR,21:10:00,21:07:00,-3,23:57:00,23:47:00,-10,1
1,UA,2023-01-02,1998,N802UA,ORD,SYR,18:14:00,18:17:00,3,21:07:00,20:46:00,-21,1
2,UA,2023-01-03,1998,N854UA,ORD,SYR,18:14:00,18:13:00,-1,21:07:00,21:04:00,-3,0
3,UA,2023-01-04,1998,N893UA,ORD,SYR,18:24:00,18:41:00,17,21:17:00,21:31:00,14,2
4,UA,2020-01-05,2012,N825UA,ORD,SYR,19:00:00,19:21:00,21,21:47:00,21:55:00,8,2


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second
0,MQ,2019-01-01,3538,N523AE,ORD,SYR,11:45:00,11:43:00,-2,14:27:00,14:35:00,8,2
1,MQ,2019-01-01,3685,N538EG,ORD,SYR,18:50:00,18:50:00,0,21:39:00,21:59:00,20,2
2,MQ,2019-01-01,3946,N256NN,ORD,SYR,14:47:00,16:30:00,103,17:34:00,19:31:00,117,2
3,MQ,2019-01-01,4011,N223NN,ORD,SYR,22:10:00,22:09:00,-1,00:54:00,00:54:00,0,0
4,MQ,2022-01-01,4316,N900AE,ORD,SYR,15:28:00,00:00:00,0,18:15:00,00:00:00,0,0


In [237]:
columns_to_keep = ['Carrier Code','Date (MM/DD/YYYY)','Origin Airport','Scheduled departure time','Arrival Status First']

first_df_reload = first_df_reload[columns_to_keep]

first_df_reload.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Origin Airport,Scheduled departure time,Arrival Status First
0,UA,2023-01-01,ORD,21:10:00,1
1,UA,2023-01-02,ORD,18:14:00,1
2,UA,2023-01-03,ORD,18:14:00,0
3,UA,2023-01-04,ORD,18:24:00,2
4,UA,2020-01-05,ORD,19:00:00,2


In [238]:
merged_df_new = pd.merge(second_df, first_df_reload, on=['Origin Airport', 'Date (MM/DD/YYYY)'], how='left')

merged_df_new.head()

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time_x,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second,Carrier Code_y,Scheduled departure time_y,Arrival Status First
0,MQ,2019-01-01,3538,N523AE,ORD,SYR,11:45:00,11:43:00,-2,14:27:00,14:35:00,8,2,,,
1,MQ,2019-01-01,3685,N538EG,ORD,SYR,18:50:00,18:50:00,0,21:39:00,21:59:00,20,2,,,
2,MQ,2019-01-01,3946,N256NN,ORD,SYR,14:47:00,16:30:00,103,17:34:00,19:31:00,117,2,,,
3,MQ,2019-01-01,4011,N223NN,ORD,SYR,22:10:00,22:09:00,-1,00:54:00,00:54:00,0,0,,,
4,MQ,2022-01-01,4316,N900AE,ORD,SYR,15:28:00,00:00:00,0,18:15:00,00:00:00,0,0,,,


In [239]:
len(merged_df_new)
merged_df_new.dtypes

10799

Carrier Code_x                        object
Date (MM/DD/YYYY)             datetime64[ns]
Flight Number                          int64
Tail Number                           object
Origin Airport                        object
Destination Airport                   object
Scheduled departure time_x            object
Actual departure time                 object
Departure delay (Minutes)              int64
Scheduled Arrival Time                object
Actual Arrival Time                   object
Arrival Delay (Minutes)                int64
Arrival Status Second                  int64
Carrier Code_y                        object
Scheduled departure time_y            object
Arrival Status First                 float64
dtype: object

In [240]:
syr_weather.head()

Unnamed: 0,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow,Date,Hour
0,7.2,4.3,87.0,1.5,85.0,4.59,9.8,150.0,Light rain,982.0,997.0,16.0,0.0,2019-01-01,0
1,8.9,8.9,100.0,0.5,79.0,5.09,11.8,190.0,Overcast clouds,984.0,999.0,16.0,0.0,2019-01-01,1
2,9.4,9.4,100.0,0.0,83.0,4.59,12.9,180.0,Overcast clouds,982.0,997.0,16.0,0.0,2019-01-01,2
3,11.1,11.1,87.0,0.0,80.0,8.19,13.9,200.0,Overcast clouds,980.0,996.0,16.0,0.0,2019-01-01,3
4,10.6,10.6,100.0,0.5,85.0,6.2,16.8,230.0,Overcast clouds,981.0,996.0,16.0,0.0,2019-01-01,4


In [241]:
# Convert 'Scheduled departure time_x' and 'Scheduled departure time_y' columns to datetime format
merged_df_new['Scheduled departure time_x'] = pd.to_datetime(merged_df_new['Scheduled departure time_x'], format='%H:%M:%S')
merged_df_new['Scheduled departure time_y'] = pd.to_datetime(merged_df_new['Scheduled departure time_y'], format='%H:%M:%S')

# Calculate the time difference in hours
merged_df_new['time_diff'] = (merged_df_new['Scheduled departure time_x'] - merged_df_new['Scheduled departure time_y']).dt.total_seconds() / 3600

# Drop rows where the time difference is greater than or equal to 3 hours or less than 0
merged_df_new = merged_df_new.drop(merged_df_new[(merged_df_new['time_diff'] > 3) | (merged_df_new['time_diff'] < 0)].index)

# Drop the 'time_diff' column if not needed anymore
merged_df_new = merged_df_new.drop(columns=['time_diff'])

merged_df_new.head()

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time_x,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second,Carrier Code_y,Scheduled departure time_y,Arrival Status First
0,MQ,2019-01-01,3538,N523AE,ORD,SYR,1900-01-01 11:45:00,11:43:00,-2,14:27:00,14:35:00,8,2,,NaT,
1,MQ,2019-01-01,3685,N538EG,ORD,SYR,1900-01-01 18:50:00,18:50:00,0,21:39:00,21:59:00,20,2,,NaT,
2,MQ,2019-01-01,3946,N256NN,ORD,SYR,1900-01-01 14:47:00,16:30:00,103,17:34:00,19:31:00,117,2,,NaT,
3,MQ,2019-01-01,4011,N223NN,ORD,SYR,1900-01-01 22:10:00,22:09:00,-1,00:54:00,00:54:00,0,0,,NaT,
4,MQ,2022-01-01,4316,N900AE,ORD,SYR,1900-01-01 15:28:00,00:00:00,0,18:15:00,00:00:00,0,0,,NaT,


In [242]:
merged_df_new.isna().sum()

Carrier Code_x                   0
Date (MM/DD/YYYY)                0
Flight Number                    0
Tail Number                      0
Origin Airport                   0
Destination Airport              0
Scheduled departure time_x       0
Actual departure time            0
Departure delay (Minutes)        0
Scheduled Arrival Time           0
Actual Arrival Time              0
Arrival Delay (Minutes)          0
Arrival Status Second            0
Carrier Code_y                1810
Scheduled departure time_y    1810
Arrival Status First          1810
dtype: int64

In [243]:
merged_df_new.dropna(inplace=True)

In [244]:
merged_df_new.head()
len(merged_df_new)

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time_x,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second,Carrier Code_y,Scheduled departure time_y,Arrival Status First
48,MQ,2019-01-07,3685,N533AE,ORD,SYR,1900-01-01 18:50:00,19:09:00,19,21:37:00,21:48:00,11,2,UA,1900-01-01 18:00:00,1.0
54,MQ,2019-01-08,3685,N527EA,ORD,SYR,1900-01-01 18:50:00,18:47:00,-3,21:37:00,21:27:00,-10,1,UA,1900-01-01 18:00:00,2.0
61,MQ,2019-01-09,3685,N543EA,ORD,SYR,1900-01-01 18:50:00,18:42:00,-8,21:37:00,21:47:00,10,2,UA,1900-01-01 18:00:00,1.0
67,MQ,2019-01-10,3685,N532EA,ORD,SYR,1900-01-01 18:50:00,18:48:00,-2,21:37:00,21:48:00,11,2,UA,1900-01-01 18:00:00,2.0
71,MQ,2019-01-11,3685,N239NN,ORD,SYR,1900-01-01 18:50:00,18:46:00,-4,21:37:00,21:29:00,-8,1,UA,1900-01-01 18:00:00,0.0


1465

In [245]:
merged_df_new.tail()

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Destination Airport,Scheduled departure time_x,Actual departure time,Departure delay (Minutes),Scheduled Arrival Time,Actual Arrival Time,Arrival Delay (Minutes),Arrival Status Second,Carrier Code_y,Scheduled departure time_y,Arrival Status First
10054,B6,2023-06-10,656,N566JB,MCO,SYR,1900-01-01 16:15:00,16:04:00,-11,19:02:00,18:45:00,-17,1,WN,1900-01-01 13:40:00,1.0
10079,B6,2023-06-17,656,N339JB,MCO,SYR,1900-01-01 16:39:00,17:56:00,77,19:20:00,21:12:00,112,2,WN,1900-01-01 13:40:00,2.0
10104,B6,2023-06-24,656,N328JB,MCO,SYR,1900-01-01 16:39:00,16:41:00,2,19:20:00,19:33:00,13,2,WN,1900-01-01 13:40:00,1.0
10129,B6,2023-07-01,656,N329JB,MCO,SYR,1900-01-01 16:39:00,16:36:00,-3,19:20:00,19:26:00,6,2,WN,1900-01-01 13:40:00,2.0
10734,B6,2021-12-18,656,N294JB,MCO,SYR,1900-01-01 11:15:00,11:03:00,-12,13:52:00,13:31:00,-21,1,WN,1900-01-01 10:30:00,1.0


In [246]:
columns_to_keep = ['Carrier Code_x', 'Date (MM/DD/YYYY)','Origin Airport', 'Scheduled departure time_x', 'Actual departure time',
                   'Scheduled Arrival Time', 'Actual Arrival Time','Arrival Status Second','Carrier Code_y','Arrival Status First']

#'Departure Status Second' 'Departure Status First'
merged_df_new = merged_df_new[columns_to_keep]
merged_df_new.head()

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Origin Airport,Scheduled departure time_x,Actual departure time,Scheduled Arrival Time,Actual Arrival Time,Arrival Status Second,Carrier Code_y,Arrival Status First
48,MQ,2019-01-07,ORD,1900-01-01 18:50:00,19:09:00,21:37:00,21:48:00,2,UA,1.0
54,MQ,2019-01-08,ORD,1900-01-01 18:50:00,18:47:00,21:37:00,21:27:00,1,UA,2.0
61,MQ,2019-01-09,ORD,1900-01-01 18:50:00,18:42:00,21:37:00,21:47:00,2,UA,1.0
67,MQ,2019-01-10,ORD,1900-01-01 18:50:00,18:48:00,21:37:00,21:48:00,2,UA,2.0
71,MQ,2019-01-11,ORD,1900-01-01 18:50:00,18:46:00,21:37:00,21:29:00,1,UA,0.0


In [247]:
merged_df_new.dtypes

Carrier Code_x                        object
Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Scheduled departure time_x    datetime64[ns]
Actual departure time                 object
Scheduled Arrival Time                object
Actual Arrival Time                   object
Arrival Status Second                  int64
Carrier Code_y                        object
Arrival Status First                 float64
dtype: object

In [248]:
merged_df_new['Day of Week'] = merged_df_new['Date (MM/DD/YYYY)'].dt.dayofweek

# Map the day of the week to day names
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
merged_df_new['Day Name'] = merged_df_new['Day of Week'].map(lambda x: day_names[x])

merged_df_new['Day Name'] = pd.Categorical(merged_df_new['Day Name'], categories=day_names)

In [249]:
# Convert 'Scheduled departure time' to datetime
merged_df_new['Scheduled Arrival Time'] = pd.to_datetime(merged_df_new['Scheduled Arrival Time'], format='%H:%M:%S')

In [250]:
# Extract hour, minute, and second components
merged_df_new['Scheduled departure hour'] = merged_df_new['Scheduled departure time_x'].dt.hour
merged_df_new['Scheduled departure minute'] = merged_df_new['Scheduled departure time_x'].dt.minute

merged_df_new['Scheduled Arrival hour'] = merged_df_new['Scheduled Arrival Time'].dt.hour
merged_df_new['Scheduled Arrival minute'] = merged_df_new['Scheduled Arrival Time'].dt.minute

In [251]:
merged_df_new = pd.merge(merged_df_new, syr_weather, left_on=["Date (MM/DD/YYYY)", "Scheduled Arrival hour"], right_on=["Date", "Hour"], how="left")
merged_df_new.head()

Unnamed: 0,Carrier Code_x,Date (MM/DD/YYYY),Origin Airport,Scheduled departure time_x,Actual departure time,Scheduled Arrival Time,Actual Arrival Time,Arrival Status Second,Carrier Code_y,Arrival Status First,...,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow,Date,Hour
0,MQ,2019-01-07,ORD,1900-01-01 18:50:00,19:09:00,1900-01-01 21:37:00,21:48:00,2,UA,1.0,...,5.7,13.2,130.0,Light snow,1001.0,1017.0,16.0,7.5,2019-01-07,21
1,MQ,2019-01-08,ORD,1900-01-01 18:50:00,18:47:00,1900-01-01 21:37:00,21:27:00,1,UA,2.0,...,2.1,7.6,100.0,Mix snow/rain,985.0,1000.0,8.0,1.0,2019-01-08,21
2,MQ,2019-01-09,ORD,1900-01-01 18:50:00,18:42:00,1900-01-01 21:37:00,21:47:00,2,UA,1.0,...,4.59,11.3,280.0,Light snow,989.0,1004.0,1.0,10.0,2019-01-09,21
3,MQ,2019-01-10,ORD,1900-01-01 18:50:00,18:48:00,1900-01-01 21:37:00,21:48:00,2,UA,2.0,...,8.8,11.8,290.0,Overcast clouds,1002.0,1018.0,5.0,0.0,2019-01-10,21
4,MQ,2019-01-11,ORD,1900-01-01 18:50:00,18:46:00,1900-01-01 21:37:00,21:29:00,1,UA,0.0,...,3.6,6.0,250.0,Overcast clouds,1015.0,1031.0,14.0,0.0,2019-01-11,21


In [252]:
merged_df_new['weather'] = pd.Categorical(merged_df_new['weather'], ordered=True)

In [253]:
merged_df_new = pd.merge(merged_df_new, All_weather, how='left', left_on=['Date (MM/DD/YYYY)', 'Scheduled departure hour', 'Origin Airport'], right_on=['Date', 'Hour', 'Location'])

In [254]:
merged_df_new.drop(['Scheduled departure time_x', 'Actual departure time', 'Scheduled Arrival Time', 'Actual Arrival Time','Date (MM/DD/YYYY)','Day of Week'], axis=1, inplace=True)

In [255]:
merged_df_new.drop(['Date_x','Hour_x','Date_y', 'Hour_y'], axis=1, inplace=True)

In [256]:
merged_df_new.head()

Unnamed: 0,Carrier Code_x,Origin Airport,Arrival Status Second,Carrier Code_y,Arrival Status First,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,...,rh_y,wind_spd_y,wind_gust_spd_y,wind_dir_y,weather_description,pres_y,slp_y,vis_y,snow_y,Location
0,MQ,ORD,2,UA,1.0,Monday,18,50,21,37,...,82,9.8,15.4,250,Broken clouds,980,1003,16,0.0,ORD
1,MQ,ORD,1,UA,2.0,Tuesday,18,50,21,37,...,59,10.3,17.0,280,Overcast clouds,988,1012,16,0.0,ORD
2,MQ,ORD,2,UA,1.0,Wednesday,18,50,21,37,...,58,3.6,10.0,330,Scattered clouds,1002,1027,16,0.0,ORD
3,MQ,ORD,2,UA,2.0,Thursday,18,50,21,37,...,71,0.8,1.2,190,Overcast clouds,1004,1029,16,0.0,ORD
4,MQ,ORD,1,UA,0.0,Friday,18,50,21,37,...,68,1.5,3.2,190,Overcast clouds,1002,1027,16,0.0,ORD


In [257]:
merged_df_new.drop(['Location'], axis=1, inplace=True)

In [258]:
# Convert columns to object type
merged_df_new['Scheduled departure hour'] = merged_df_new['Scheduled departure hour'].astype(str)
merged_df_new['Scheduled departure minute'] = merged_df_new['Scheduled departure minute'].astype(str)
merged_df_new['Scheduled Arrival hour'] = merged_df_new['Scheduled Arrival hour'].astype(str)
merged_df_new['Scheduled Arrival minute'] = merged_df_new['Scheduled Arrival minute'].astype(str)

In [259]:
all_hours = pd.Series(range(24))

# Convert to categorical variable with specified categories
merged_df_new['Scheduled departure hour'] = pd.Categorical(merged_df_new['Scheduled departure hour'], categories=all_hours, ordered=True)
merged_df_new['Scheduled Arrival hour'] = pd.Categorical(merged_df_new['Scheduled departure hour'], categories=all_hours, ordered=True)

In [260]:
all_minutes = pd.Series(range(60))

# Convert to categorical variable with specified categories
merged_df_new['Scheduled departure minute'] = pd.Categorical(merged_df_new['Scheduled departure minute'], categories=all_minutes, ordered=True)
merged_df_new['Scheduled Arrival minute'] = pd.Categorical(merged_df_new['Scheduled Arrival minute'], categories=all_minutes, ordered=True)

In [261]:
merged_df_new.dtypes

Carrier Code_x                  object
Origin Airport                  object
Arrival Status Second            int64
Carrier Code_y                  object
Arrival Status First           float64
Day Name                      category
Scheduled departure hour      category
Scheduled departure minute    category
Scheduled Arrival hour        category
Scheduled Arrival minute      category
temp_x                         float64
app_temp_x                     float64
clouds_x                       float64
precip_x                       float64
rh_x                           float64
wind_spd_x                     float64
wind_gust_spd_x                float64
wind_dir_x                     float64
weather                       category
pres_x                         float64
slp_x                          float64
vis_x                          float64
snow_x                         float64
temp_y                         float64
app_temp_y                     float64
clouds_y                 

In [262]:
len(merged_df_new)

1465

In [263]:
merged_df_new['Arrival Status Second'] = merged_df_new['Arrival Status Second'].astype('category')
merged_df_new['Arrival Status First'] = merged_df_new['Arrival Status First'].astype('category')

#merged_df_new['Departure Status Second'] = merged_df_new['Departure Status Second'].astype('category')
#merged_df_new['Departure Status First'] = merged_df_new['Departure Status First'].astype('category')

In [264]:
merged_df_new.drop(['weather','weather_description'], axis=1, inplace=True)

In [265]:
Arr_status = merged_df_new['Arrival Status Second']

merged_df_new = pd.get_dummies(merged_df_new.drop(columns=['Arrival Status Second','Carrier Code_x','Carrier Code_y']), drop_first=True)

merged_df_new['Arrival Status Second'] = Arr_status

merged_df_new.head()

Unnamed: 0,temp_x,app_temp_x,clouds_x,precip_x,rh_x,wind_spd_x,wind_gust_spd_x,wind_dir_x,pres_x,slp_x,...,Scheduled Arrival minute_51,Scheduled Arrival minute_52,Scheduled Arrival minute_53,Scheduled Arrival minute_54,Scheduled Arrival minute_55,Scheduled Arrival minute_56,Scheduled Arrival minute_57,Scheduled Arrival minute_58,Scheduled Arrival minute_59,Arrival Status Second
0,-0.6,-6.1,87.0,0.5,49.0,5.7,13.2,130.0,1001.0,1017.0,...,False,False,False,False,False,False,False,False,False,2
1,1.1,-1.3,100.0,0.5,88.0,2.1,7.6,100.0,985.0,1000.0,...,False,False,False,False,False,False,False,False,False,1
2,-2.8,-8.1,100.0,0.5,100.0,4.59,11.3,280.0,989.0,1004.0,...,False,False,False,False,False,False,False,False,False,2
3,-7.2,-16.1,100.0,0.0,80.0,8.8,11.8,290.0,1002.0,1018.0,...,False,False,False,False,False,False,False,False,False,2
4,-9.4,-15.4,87.0,0.0,69.0,3.6,6.0,250.0,1015.0,1031.0,...,False,False,False,False,False,False,False,False,False,1


In [579]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Define your base models
base_models = [
    ('logistic', LogisticRegression(fit_intercept=True, solver = 'newton-cg',multi_class='multinomial', penalty = None, max_iter = 100000)),
    ('random_forest', RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    max_depth=None,  # Maximum depth of the tree
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_features=10,  # Number of features to consider when looking for the best split
    random_state=42  # Random state for reproducibility
))]

# Define your meta-model
meta_model = GradientBoostingClassifier(learning_rate = 0.01, max_features = 'sqrt', n_estimators = 300, min_samples_leaf = 2)

# Create the stacking classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Create pipeline with standard scaler and stacking classifier
pipeline_2 = make_pipeline(stacking_clf)

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(merged_df_new.drop(columns = ['Arrival Status Second']), merged_df_new['Arrival Status Second'], test_size=0.20, random_state=42)

# Train the stacking model
pipeline_2.fit(X_train_2, y_train_2)

# Make predictions
train_accuracy = pipeline_2.score(X_train_2, y_train_2)
test_accuracy = pipeline_2.score(X_test_2, y_test_2)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.5938566552901023
Test Accuracy: 0.5392491467576792


In [267]:
#X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(merged_df_new.drop(columns = ['Arrival Status Second']), merged_df_new['Arrival Status Second'], test_size=0.20, random_state=35)

#X_train_2
#X_test_2
#y_train_2
#y_test_2

In [268]:

# Assuming X_train and y_train are your training data
# Calculate sample weights based on the importance of Arrival Status First_1.0 and Arrival Status First_2.0
# Assign higher weights to samples where these features are present
# custom_weights = np.where((X_train_2['Arrival Status First_1.0'] == 1) | (X_train_2['Arrival Status First_2.0'] == 1), 2, 1)

In [269]:
#from sklearn.ensemble import GradientBoostingClassifier

#rf_model = LogisticRegression(fit_intercept=True, solver = 'newton-cg',multi_class='multinomial', penalty = None, max_iter = 100000)
#gb_model_2 = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 7, max_features = 'sqrt', n_estimators = 200)
#gb_model_2.fit(X_train_2, y_train_2, sample_weight=custom_weights)

#gb_model_2.score(X_train_2, y_train_2)

In [270]:
#gb_model_2.score(X_test_2, y_test_2)

## Initial Predictions Previous Flight

In [580]:
even_rows = pd.read_excel("Final_Earlier_flights.xlsx")

odd_rows = pd.read_excel("Final_Later_flights.xlsx")

In [581]:
even_rows.head()
odd_rows.head()

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,2024-04-19,FRIDAY,UA 1400,ORD,18:52:00,21:47:00,,,,
1,2024-04-19,FRIDAY,B6 116,JFK,13:34:00,14:51:00,,,,
2,2024-04-19,FRIDAY,WN 5285,MCO,11:35:00,14:20:00,,,,
3,2024-04-20,SATURDAY,UA 1400,ORD,18:52:00,21:47:00,,,,
4,2024-04-20,SATURDAY,B6 116,JFK,13:25:00,14:41:00,,,,


Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,2024-04-19,FRIDAY,AA 3402,ORD,19:59:00,22:52:00,,,,
1,2024-04-19,FRIDAY,DL 5182,JFK,14:55:00,16:21:00,,,,
2,2024-04-19,FRIDAY,B6 656,MCO,13:35:00,16:25:00,,,,
3,2024-04-20,SATURDAY,AA 3402,ORD,19:59:00,22:52:00,,,,
4,2024-04-20,SATURDAY,DL 5182,JFK,14:55:00,16:21:00,,,,


In [582]:
test_merged_weather_df = pd.read_csv("merged_forecast_data.csv")
test_merged_weather_df.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,weather,pres,slp,vis,snow,Location
0,2024-04-18:20,2024-04-18T16:00:00,5.2,0.9,94,0.0,76,6.4,9.3,50,Overcast clouds,1019.5,1019.6,16.0,0,JFK
1,2024-04-18:21,2024-04-18T17:00:00,9.0,9.0,92,0.0,74,6.4,9.8,50,Overcast clouds,1019.0,1019.1,17.49,0,JFK
2,2024-04-18:22,2024-04-18T18:00:00,8.9,8.9,90,0.0,73,6.0,9.0,50,Overcast clouds,1019.5,1019.6,16.99,0,JFK
3,2024-04-18:23,2024-04-18T19:00:00,8.4,8.4,94,0.0,76,5.2,8.1,50,Overcast clouds,1021.0,1021.1,15.8,0,JFK
4,2024-04-19:00,2024-04-18T20:00:00,8.2,8.2,95,0.0,72,5.2,7.7,50,Overcast clouds,1021.0,1021.1,15.7,0,JFK


In [583]:
columns_to_keep = ['datetime', 'timestamp_local', 'temp', 'app_temp', 'clouds', 'precip', 'rh', 'wind_spd', 'wind_gust_spd', 'wind_dir', 'pres', 'slp', 'vis', 'snow', 'Location']
test_merged_weather_df = test_merged_weather_df.loc[:, columns_to_keep]

test_merged_weather_df.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,pres,slp,vis,snow,Location
0,2024-04-18:20,2024-04-18T16:00:00,5.2,0.9,94,0.0,76,6.4,9.3,50,1019.5,1019.6,16.0,0,JFK
1,2024-04-18:21,2024-04-18T17:00:00,9.0,9.0,92,0.0,74,6.4,9.8,50,1019.0,1019.1,17.49,0,JFK
2,2024-04-18:22,2024-04-18T18:00:00,8.9,8.9,90,0.0,73,6.0,9.0,50,1019.5,1019.6,16.99,0,JFK
3,2024-04-18:23,2024-04-18T19:00:00,8.4,8.4,94,0.0,76,5.2,8.1,50,1021.0,1021.1,15.8,0,JFK
4,2024-04-19:00,2024-04-18T20:00:00,8.2,8.2,95,0.0,72,5.2,7.7,50,1021.0,1021.1,15.7,0,JFK


In [584]:
test_merged_weather_df.dtypes

datetime            object
timestamp_local     object
temp               float64
app_temp           float64
clouds               int64
precip             float64
rh                   int64
wind_spd           float64
wind_gust_spd      float64
wind_dir             int64
pres               float64
slp                float64
vis                float64
snow                 int64
Location            object
dtype: object

In [585]:
#test_merged_weather_df = test_merged_weather_df.rename(columns={'Source': 'Location', 'weather.description': 'weather_description'})

In [586]:
#columns_to_keep = ['datetime', 'timestamp_local', 'temp', 'app_temp', 'clouds', 'precip', 'rh', 'wind_spd', 'wind_gust_spd', 'wind_dir', 'weather.description', 'pres', 'slp', 'vis', 'snow']

#pred_weather_SYR = pred_weather_SYR.loc[:, columns_to_keep]

In [587]:
#pred_weather_SYR = pred_weather_SYR.rename(columns={'weather.description': 'weather'})
#pred_weather_SYR.head()

In [588]:
#pred_weather_SYR.dtypes

In [589]:
test_merged_weather_df['timestamp_local'] = pd.to_datetime(test_merged_weather_df['timestamp_local'])

In [590]:
test_merged_weather_df['Date'] = test_merged_weather_df['timestamp_local'].dt.date
test_merged_weather_df['Hour'] = test_merged_weather_df['timestamp_local'].dt.hour

test_merged_weather_df.head()

Unnamed: 0,datetime,timestamp_local,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,pres,slp,vis,snow,Location,Date,Hour
0,2024-04-18:20,2024-04-18 16:00:00,5.2,0.9,94,0.0,76,6.4,9.3,50,1019.5,1019.6,16.0,0,JFK,2024-04-18,16
1,2024-04-18:21,2024-04-18 17:00:00,9.0,9.0,92,0.0,74,6.4,9.8,50,1019.0,1019.1,17.49,0,JFK,2024-04-18,17
2,2024-04-18:22,2024-04-18 18:00:00,8.9,8.9,90,0.0,73,6.0,9.0,50,1019.5,1019.6,16.99,0,JFK,2024-04-18,18
3,2024-04-18:23,2024-04-18 19:00:00,8.4,8.4,94,0.0,76,5.2,8.1,50,1021.0,1021.1,15.8,0,JFK,2024-04-18,19
4,2024-04-19:00,2024-04-18 20:00:00,8.2,8.2,95,0.0,72,5.2,7.7,50,1021.0,1021.1,15.7,0,JFK,2024-04-18,20


In [591]:
even_rows.dtypes

DATE                                 datetime64[ns]
DAY                                          object
FLIGHT NUMBER                                object
ORIGIN                                       object
DEPARTURE TIME                               object
ARRIVAL TIME                                 object
ARRIVAL STATUS                              float64
ARRIVAL STATUS_Prev_flight_early            float64
ARRIVAL STATUS_Prev_flight_ontime           float64
ARRIVAL STATUS_Prev_flight_late             float64
dtype: object

In [592]:
#even_rows['Carrier Code'] = even_rows['FLIGHT NUMBER'].str[:2]

even_rows = even_rows.rename(columns={'DATE': 'Date (MM/DD/YYYY)'})

In [593]:
even_rows['Date (MM/DD/YYYY)'] = pd.to_datetime(even_rows['Date (MM/DD/YYYY)'])

even_rows['Day of Week'] = even_rows['Date (MM/DD/YYYY)'].dt.dayofweek

# Map the day of the week to day names
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
even_rows['Day Name'] = even_rows['Day of Week'].map(lambda x: day_names[x])

even_rows['Day Name'] = pd.Categorical(even_rows['Day Name'], categories=day_names)

print(even_rows['Day Name'].cat.categories)

Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'],
      dtype='object')


In [594]:
# Convert 'Scheduled departure time' to datetime
even_rows['Scheduled departure time'] = pd.to_datetime(even_rows['DEPARTURE TIME'], format='%H:%M:%S')

even_rows['Scheduled Arrival Time'] = pd.to_datetime(even_rows['ARRIVAL TIME'], format='%H:%M:%S')

In [595]:
# Extract hour, minute, and second components
even_rows['Scheduled departure hour'] = even_rows['Scheduled departure time'].dt.hour
even_rows['Scheduled departure minute'] = even_rows['Scheduled departure time'].dt.minute

even_rows['Scheduled Arrival hour'] = even_rows['Scheduled Arrival Time'].dt.hour
even_rows['Scheduled Arrival minute'] = even_rows['Scheduled Arrival Time'].dt.minute

In [596]:
even_rows.drop(['FLIGHT NUMBER', 'Scheduled departure time', 'Scheduled Arrival Time', 'Day of Week'], axis=1, inplace=True)
even_rows.drop(['DEPARTURE TIME', 'ARRIVAL TIME', 'ARRIVAL STATUS','ARRIVAL STATUS_Prev_flight_early','ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'], axis=1, inplace=True)           

In [597]:
# Define the new column names and order
new_column_names = [ 'Date (MM/DD/YYYY)', 'Origin Airport', 'Day Name', 
                    'Scheduled departure hour', 'Scheduled departure minute', 
                    'Scheduled Arrival hour', 'Scheduled Arrival minute']

# Rename the columns
even_rows = even_rows.rename(columns={
    'ORIGIN': 'Origin Airport'
})

# Reorder the columns
even_rows = even_rows.reindex(columns=new_column_names)
even_rows.dtypes

Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Day Name                            category
Scheduled departure hour               int32
Scheduled departure minute             int32
Scheduled Arrival hour                 int32
Scheduled Arrival minute               int32
dtype: object

In [598]:
odd_rows.dtypes

DATE                                 datetime64[ns]
DAY                                          object
FLIGHT NUMBER                                object
ORIGIN                                       object
DEPARTURE TIME                               object
ARRIVAL TIME                                 object
ARRIVAL STATUS                              float64
ARRIVAL STATUS_Prev_flight_early            float64
ARRIVAL STATUS_Prev_flight_ontime           float64
ARRIVAL STATUS_Prev_flight_late             float64
dtype: object

In [599]:
#odd_rows['Carrier Code_x'] = odd_rows['FLIGHT NUMBER'].str[:2]

odd_rows = odd_rows.rename(columns={'DATE': 'Date (MM/DD/YYYY)'})

In [600]:
odd_rows['Date (MM/DD/YYYY)'] = pd.to_datetime(odd_rows['Date (MM/DD/YYYY)'])

odd_rows['Day of Week'] = odd_rows['Date (MM/DD/YYYY)'].dt.dayofweek

# Map the day of the week to day names
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
odd_rows['Day Name'] = odd_rows['Day of Week'].map(lambda x: day_names[x])

odd_rows['Day Name'] = pd.Categorical(odd_rows['Day Name'], categories=day_names)

print(odd_rows['Day Name'].cat.categories)


Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'],
      dtype='object')


In [601]:
# Convert 'Scheduled departure time' to datetime
odd_rows['Scheduled departure time'] = pd.to_datetime(odd_rows['DEPARTURE TIME'], format='%H:%M:%S')

odd_rows['Scheduled Arrival Time'] = pd.to_datetime(odd_rows['ARRIVAL TIME'], format='%H:%M:%S')

In [602]:
# Extract hour, minute, and second components
odd_rows['Scheduled departure hour'] = odd_rows['Scheduled departure time'].dt.hour
odd_rows['Scheduled departure minute'] = odd_rows['Scheduled departure time'].dt.minute

odd_rows['Scheduled Arrival hour'] = odd_rows['Scheduled Arrival Time'].dt.hour
odd_rows['Scheduled Arrival minute'] = odd_rows['Scheduled Arrival Time'].dt.minute

In [603]:
odd_rows.drop(['FLIGHT NUMBER', 'Scheduled departure time', 'Scheduled Arrival Time', 'Day of Week'], axis=1, inplace=True)
odd_rows.drop(['DEPARTURE TIME', 'ARRIVAL TIME', 'ARRIVAL STATUS','ARRIVAL STATUS_Prev_flight_early','ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'], axis=1, inplace=True)           

In [604]:
# Map the day of the week to day names
status = ['0', '1', '2']
odd_rows['Arrival Status First'] = '1'

odd_rows['Arrival Status First'] = pd.Categorical(odd_rows['Arrival Status First'], categories=status)

print(odd_rows['Arrival Status First'].cat.categories)

Index(['0', '1', '2'], dtype='object')


In [605]:
odd_rows.head()

Unnamed: 0,Date (MM/DD/YYYY),DAY,ORIGIN,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,Arrival Status First
0,2024-04-19,FRIDAY,ORD,Friday,19,59,22,52,1
1,2024-04-19,FRIDAY,JFK,Friday,14,55,16,21,1
2,2024-04-19,FRIDAY,MCO,Friday,13,35,16,25,1
3,2024-04-20,SATURDAY,ORD,Saturday,19,59,22,52,1
4,2024-04-20,SATURDAY,JFK,Saturday,14,55,16,21,1


In [606]:
# Define the new column names and order
new_column_names = ['Date (MM/DD/YYYY)', 'Origin Airport',
                    'Arrival Status First', 'Day Name', 
                    'Scheduled departure hour', 'Scheduled departure minute', 
                    'Scheduled Arrival hour', 'Scheduled Arrival minute']

# Rename the columns
odd_rows = odd_rows.rename(columns={
    'ORIGIN': 'Origin Airport'
})

# Reorder the columns
odd_rows = odd_rows.reindex(columns=new_column_names)
odd_rows.dtypes

Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Arrival Status First                category
Day Name                            category
Scheduled departure hour               int32
Scheduled departure minute             int32
Scheduled Arrival hour                 int32
Scheduled Arrival minute               int32
dtype: object

In [607]:
test_merged_weather_df['Date'] = pd.to_datetime(test_merged_weather_df['Date'])

In [608]:
test_merged_weather_df.drop(['datetime', 'timestamp_local'], axis=1, inplace=True)

In [609]:
even_rows.dtypes

Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Day Name                            category
Scheduled departure hour               int32
Scheduled departure minute             int32
Scheduled Arrival hour                 int32
Scheduled Arrival minute               int32
dtype: object

In [610]:
# Filter test_merged_weather_df to include only rows where Location is SYR
filtered_weather_df = test_merged_weather_df[test_merged_weather_df['Location'] == 'SYR']
filtered_weather_df.head()

# Merge even_rows with the filtered_weather_df
merged_df_test_even = pd.merge(even_rows, filtered_weather_df, left_on=["Date (MM/DD/YYYY)", "Scheduled Arrival hour"], right_on=["Date", "Hour"], how="left")

merged_df_test_even.head()

Unnamed: 0,temp,app_temp,clouds,precip,rh,wind_spd,wind_gust_spd,wind_dir,pres,slp,vis,snow,Location,Date,Hour
324,4.8,2.1,87,0.0,76,3.2,4.9,90,1002.0,1017.4,12.4,0,SYR,2024-04-18,16
325,12.6,12.6,91,0.76,77,3.2,4.2,70,1002.5,1017.9,12.5,0,SYR,2024-04-18,17
326,12.1,12.1,88,0.5,77,3.6,4.2,80,1003.5,1018.9,12.4,0,SYR,2024-04-18,18
327,11.6,11.6,92,0.25,78,3.2,4.1,80,1003.5,1018.9,12.5,0,SYR,2024-04-18,19
328,11.0,11.0,89,0.0,80,2.8,4.1,90,1003.5,1019.0,12.7,0,SYR,2024-04-18,20


Unnamed: 0,Date (MM/DD/YYYY),Origin Airport,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,temp,app_temp,clouds,...,wind_spd,wind_gust_spd,wind_dir,pres,slp,vis,snow,Location,Date,Hour
0,2024-04-19,ORD,Friday,18,52,21,47,12.3,12.3,68,...,5.2,8.0,260,999.5,1014.9,18.8,0,SYR,2024-04-19,21
1,2024-04-19,JFK,Friday,13,34,14,51,16.3,16.3,84,...,9.2,14.2,170,996.0,1011.2,13.3,0,SYR,2024-04-19,14
2,2024-04-19,MCO,Friday,11,35,14,20,16.3,16.3,84,...,9.2,14.2,170,996.0,1011.2,13.3,0,SYR,2024-04-19,14
3,2024-04-20,ORD,Saturday,18,52,21,47,6.6,2.9,28,...,6.26,9.2,280,998.5,1014.2,24.13,0,SYR,2024-04-20,21
4,2024-04-20,JFK,Saturday,13,25,14,41,10.6,10.6,75,...,8.4,11.8,280,997.0,1012.6,17.5,0,SYR,2024-04-20,14


In [611]:
merged_df_test_odd = pd.merge(odd_rows, filtered_weather_df, left_on=["Date (MM/DD/YYYY)", "Scheduled Arrival hour"], right_on=["Date", "Hour"], how="left")

merged_df_test_odd.head()

Unnamed: 0,Date (MM/DD/YYYY),Origin Airport,Arrival Status First,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,temp,app_temp,...,wind_spd,wind_gust_spd,wind_dir,pres,slp,vis,snow,Location,Date,Hour
0,2024-04-19,ORD,1,Friday,19,59,22,52,11.5,11.5,...,5.6,8.6,260,1000.0,1015.5,20.9,0,SYR,2024-04-19,22
1,2024-04-19,JFK,1,Friday,14,55,16,21,15.9,15.9,...,9.6,14.4,180,996.0,1011.2,16.59,0,SYR,2024-04-19,16
2,2024-04-19,MCO,1,Friday,13,35,16,25,15.9,15.9,...,9.6,14.4,180,996.0,1011.2,16.59,0,SYR,2024-04-19,16
3,2024-04-20,ORD,1,Saturday,19,59,22,52,6.0,2.3,...,5.73,8.4,280,999.0,1014.7,24.13,0,SYR,2024-04-20,22
4,2024-04-20,JFK,1,Saturday,14,55,16,21,10.3,10.3,...,8.66,12.0,280,996.5,1012.0,21.3,0,SYR,2024-04-20,16


In [612]:
merged_df_test_even.isna().sum()

Date (MM/DD/YYYY)             0
Origin Airport                0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
temp                          0
app_temp                      0
clouds                        0
precip                        0
rh                            0
wind_spd                      0
wind_gust_spd                 0
wind_dir                      0
pres                          0
slp                           0
vis                           0
snow                          0
Location                      0
Date                          0
Hour                          0
dtype: int64

In [613]:
merged_df_test_odd.isna().sum()

Date (MM/DD/YYYY)             0
Origin Airport                0
Arrival Status First          0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
temp                          0
app_temp                      0
clouds                        0
precip                        0
rh                            0
wind_spd                      0
wind_gust_spd                 0
wind_dir                      0
pres                          0
slp                           0
vis                           0
snow                          0
Location                      0
Date                          0
Hour                          0
dtype: int64

In [614]:
test_merged_weather_df.dtypes

temp                    float64
app_temp                float64
clouds                    int64
precip                  float64
rh                        int64
wind_spd                float64
wind_gust_spd           float64
wind_dir                  int64
pres                    float64
slp                     float64
vis                     float64
snow                      int64
Location                 object
Date             datetime64[ns]
Hour                      int32
dtype: object

In [615]:
merged_df_test_even = pd.merge(merged_df_test_even, test_merged_weather_df, how='left', left_on=['Date (MM/DD/YYYY)', 'Scheduled departure hour', 'Origin Airport'], right_on=['Date', 'Hour', 'Location'])

In [616]:
merged_df_test_even.isna().sum()

Date (MM/DD/YYYY)             0
Origin Airport                0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
temp_x                        0
app_temp_x                    0
clouds_x                      0
precip_x                      0
rh_x                          0
wind_spd_x                    0
wind_gust_spd_x               0
wind_dir_x                    0
pres_x                        0
slp_x                         0
vis_x                         0
snow_x                        0
Location_x                    0
Date_x                        0
Hour_x                        0
temp_y                        0
app_temp_y                    0
clouds_y                      0
precip_y                      0
rh_y                          0
wind_spd_y                    0
wind_gust_spd_y               0
wind_dir_y                    0
pres_y                        0
slp_y   

In [617]:
len(merged_df_test_even)

12

In [618]:
merged_df_test_odd = pd.merge(merged_df_test_odd, test_merged_weather_df, how='left', left_on=['Date (MM/DD/YYYY)', 'Scheduled departure hour', 'Origin Airport'], right_on=['Date', 'Hour', 'Location'])

In [619]:
merged_df_test_odd.head()
merged_df_test_even.head()

Unnamed: 0,Date (MM/DD/YYYY),Origin Airport,Arrival Status First,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,temp_x,app_temp_x,...,wind_spd_y,wind_gust_spd_y,wind_dir_y,pres_y,slp_y,vis_y,snow_y,Location_y,Date_y,Hour_y
0,2024-04-19,ORD,1,Friday,19,59,22,52,11.5,11.5,...,7.2,11.0,280,996.5,1021.1,24.0,0,ORD,2024-04-19,19
1,2024-04-19,JFK,1,Friday,14,55,16,21,15.9,15.9,...,5.6,7.4,120,1020.0,1020.1,24.0,0,JFK,2024-04-19,14
2,2024-04-19,MCO,1,Friday,13,35,16,25,15.9,15.9,...,2.0,3.2,280,1013.0,1015.7,24.0,0,MCO,2024-04-19,13
3,2024-04-20,ORD,1,Saturday,19,59,22,52,6.0,2.3,...,4.0,6.4,300,994.5,1019.1,24.13,0,ORD,2024-04-20,19
4,2024-04-20,JFK,1,Saturday,14,55,16,21,10.3,10.3,...,4.4,6.2,260,1009.5,1009.6,24.13,0,JFK,2024-04-20,14


Unnamed: 0,Date (MM/DD/YYYY),Origin Airport,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,temp_x,app_temp_x,clouds_x,...,wind_spd_y,wind_gust_spd_y,wind_dir_y,pres_y,slp_y,vis_y,snow_y,Location_y,Date_y,Hour_y
0,2024-04-19,ORD,Friday,18,52,21,47,12.3,12.3,68,...,8.0,12.0,280,996.0,1020.5,24.0,0,ORD,2024-04-19,18
1,2024-04-19,JFK,Friday,13,34,14,51,16.3,16.3,84,...,5.2,7.2,110,1020.5,1020.6,24.0,0,JFK,2024-04-19,13
2,2024-04-19,MCO,Friday,11,35,14,20,16.3,16.3,84,...,1.6,2.4,290,1015.0,1017.8,24.0,0,MCO,2024-04-19,11
3,2024-04-20,ORD,Saturday,18,52,21,47,6.6,2.9,28,...,4.53,7.1,300,994.0,1018.6,24.13,0,ORD,2024-04-20,18
4,2024-04-20,JFK,Saturday,13,25,14,41,10.6,10.6,75,...,4.13,5.73,270,1010.5,1010.6,24.13,0,JFK,2024-04-20,13


In [620]:
merged_df_test_odd.isna().sum()

Date (MM/DD/YYYY)             0
Origin Airport                0
Arrival Status First          0
Day Name                      0
Scheduled departure hour      0
Scheduled departure minute    0
Scheduled Arrival hour        0
Scheduled Arrival minute      0
temp_x                        0
app_temp_x                    0
clouds_x                      0
precip_x                      0
rh_x                          0
wind_spd_x                    0
wind_gust_spd_x               0
wind_dir_x                    0
pres_x                        0
slp_x                         0
vis_x                         0
snow_x                        0
Location_x                    0
Date_x                        0
Hour_x                        0
temp_y                        0
app_temp_y                    0
clouds_y                      0
precip_y                      0
rh_y                          0
wind_spd_y                    0
wind_gust_spd_y               0
wind_dir_y                    0
pres_y  

In [621]:
len(merged_df_test_odd)

11

In [622]:
merged_df_test_even.drop(['Date_x','Hour_x','Date_y', 'Hour_y','Location_y'], axis=1, inplace=True)

In [623]:
merged_df_test_odd.drop(['Date_x','Hour_x','Date_y', 'Hour_y','Location_y'], axis=1, inplace=True)

In [624]:
#Even rows
all_hours = pd.Series(range(24))

# Convert to categorical variable with specified categories
merged_df_test_even['Scheduled departure hour'] = pd.Categorical(merged_df_test_even['Scheduled departure hour'], categories=all_hours, ordered=True)
merged_df_test_even['Scheduled Arrival hour'] = pd.Categorical(merged_df_test_even['Scheduled Arrival hour'], categories=all_hours, ordered=True)

In [625]:
all_minutes = pd.Series(range(60))

# Convert to categorical variable with specified categories
merged_df_test_even['Scheduled departure minute'] = pd.Categorical(merged_df_test_even['Scheduled departure minute'], categories=all_minutes, ordered=True)
merged_df_test_even['Scheduled Arrival minute'] = pd.Categorical(merged_df_test_even['Scheduled Arrival minute'], categories=all_minutes, ordered=True)

In [626]:
date_column = merged_df_test_even['Date (MM/DD/YYYY)']

# Drop the date column before encoding
merged_df_test_even = pd.get_dummies(merged_df_test_even.drop(columns=['Date (MM/DD/YYYY)']), drop_first=True)

# Add the date column back to the encoded DataFrame
#merged_df_test['Date (MM/DD/YYYY)'] = date_column
merged_df_test_even.head()

Unnamed: 0,temp_x,app_temp_x,clouds_x,precip_x,rh_x,wind_spd_x,wind_gust_spd_x,wind_dir_x,pres_x,slp_x,...,Scheduled Arrival minute_50,Scheduled Arrival minute_51,Scheduled Arrival minute_52,Scheduled Arrival minute_53,Scheduled Arrival minute_54,Scheduled Arrival minute_55,Scheduled Arrival minute_56,Scheduled Arrival minute_57,Scheduled Arrival minute_58,Scheduled Arrival minute_59
0,12.3,12.3,68,0.5,71,5.2,8.0,260,999.5,1014.9,...,False,False,False,False,False,False,False,False,False,False
1,16.3,16.3,84,0.5,55,9.2,14.2,170,996.0,1011.2,...,False,True,False,False,False,False,False,False,False,False
2,16.3,16.3,84,0.5,55,9.2,14.2,170,996.0,1011.2,...,False,False,False,False,False,False,False,False,False,False
3,6.6,2.9,28,0.0,51,6.26,9.2,280,998.5,1014.2,...,False,False,False,False,False,False,False,False,False,False
4,10.6,10.6,75,0.0,43,8.4,11.8,280,997.0,1012.6,...,False,False,False,False,False,False,False,False,False,False


In [627]:
all_hours = pd.Series(range(24))

# Convert to categorical variable with specified categories
merged_df_test_odd['Scheduled departure hour'] = pd.Categorical(merged_df_test_odd['Scheduled departure hour'], categories=all_hours, ordered=True)
merged_df_test_odd['Scheduled Arrival hour'] = pd.Categorical(merged_df_test_odd['Scheduled Arrival hour'], categories=all_hours, ordered=True)

In [628]:
all_minutes = pd.Series(range(60))

# Convert to categorical variable with specified categories
merged_df_test_odd['Scheduled departure minute'] = pd.Categorical(merged_df_test_odd['Scheduled departure minute'], categories=all_minutes, ordered=True)
merged_df_test_odd['Scheduled Arrival minute'] = pd.Categorical(merged_df_test_odd['Scheduled Arrival minute'], categories=all_minutes, ordered=True)

In [629]:
merged_df_test_odd.columns

Index(['Date (MM/DD/YYYY)', 'Origin Airport', 'Arrival Status First',
       'Day Name', 'Scheduled departure hour', 'Scheduled departure minute',
       'Scheduled Arrival hour', 'Scheduled Arrival minute', 'temp_x',
       'app_temp_x', 'clouds_x', 'precip_x', 'rh_x', 'wind_spd_x',
       'wind_gust_spd_x', 'wind_dir_x', 'pres_x', 'slp_x', 'vis_x', 'snow_x',
       'Location_x', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_y', 'rh_y',
       'wind_spd_y', 'wind_gust_spd_y', 'wind_dir_y', 'pres_y', 'slp_y',
       'vis_y', 'snow_y'],
      dtype='object')

In [630]:
merged_df_test_odd.head()

Unnamed: 0,Date (MM/DD/YYYY),Origin Airport,Arrival Status First,Day Name,Scheduled departure hour,Scheduled departure minute,Scheduled Arrival hour,Scheduled Arrival minute,temp_x,app_temp_x,...,clouds_y,precip_y,rh_y,wind_spd_y,wind_gust_spd_y,wind_dir_y,pres_y,slp_y,vis_y,snow_y
0,2024-04-19,ORD,1,Friday,19,59,22,52,11.5,11.5,...,53,0.0,38,7.2,11.0,280,996.5,1021.1,24.0,0
1,2024-04-19,JFK,1,Friday,14,55,16,21,15.9,15.9,...,89,0.0,61,5.6,7.4,120,1020.0,1020.1,24.0,0
2,2024-04-19,MCO,1,Friday,13,35,16,25,15.9,15.9,...,5,0.0,45,2.0,3.2,280,1013.0,1015.7,24.0,0
3,2024-04-20,ORD,1,Saturday,19,59,22,52,6.0,2.3,...,69,0.0,41,4.0,6.4,300,994.5,1019.1,24.13,0
4,2024-04-20,JFK,1,Saturday,14,55,16,21,10.3,10.3,...,59,0.0,44,4.4,6.2,260,1009.5,1009.6,24.13,0


In [631]:
merged_df_test_odd.dtypes

Date (MM/DD/YYYY)             datetime64[ns]
Origin Airport                        object
Arrival Status First                category
Day Name                            category
Scheduled departure hour            category
Scheduled departure minute          category
Scheduled Arrival hour              category
Scheduled Arrival minute            category
temp_x                               float64
app_temp_x                           float64
clouds_x                               int64
precip_x                             float64
rh_x                                   int64
wind_spd_x                           float64
wind_gust_spd_x                      float64
wind_dir_x                             int64
pres_x                               float64
slp_x                                float64
vis_x                                float64
snow_x                                 int64
Location_x                            object
temp_y                               float64
app_temp_y

In [632]:
#from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
#scaler = StandardScaler()

# Fit the scaler to the data and transform it
#scaled_data = scaler.fit_transform(merged_df_test_even)

# Replace the original data with the scaled data
#merged_df_test_even = pd.DataFrame(scaled_data, columns=merged_df_test_even.columns)

In [633]:
date_column = merged_df_test_odd['Date (MM/DD/YYYY)']

# Drop the date column before encoding
merged_df_test_odd = pd.get_dummies(merged_df_test_odd.drop(columns=['Date (MM/DD/YYYY)']), drop_first=True)

# Add the date column back to the encoded DataFrame
#merged_df_test['Date (MM/DD/YYYY)'] = date_column
merged_df_test_odd.head()

Unnamed: 0,temp_x,app_temp_x,clouds_x,precip_x,rh_x,wind_spd_x,wind_gust_spd_x,wind_dir_x,pres_x,slp_x,...,Scheduled Arrival minute_50,Scheduled Arrival minute_51,Scheduled Arrival minute_52,Scheduled Arrival minute_53,Scheduled Arrival minute_54,Scheduled Arrival minute_55,Scheduled Arrival minute_56,Scheduled Arrival minute_57,Scheduled Arrival minute_58,Scheduled Arrival minute_59
0,11.5,11.5,65,0.25,72,5.6,8.6,260,1000.0,1015.5,...,False,False,True,False,False,False,False,False,False,False
1,15.9,15.9,84,0.76,58,9.6,14.4,180,996.0,1011.2,...,False,False,False,False,False,False,False,False,False,False
2,15.9,15.9,84,0.76,58,9.6,14.4,180,996.0,1011.2,...,False,False,False,False,False,False,False,False,False,False
3,6.0,2.3,29,0.0,53,5.73,8.4,280,999.0,1014.7,...,False,False,True,False,False,False,False,False,False,False
4,10.3,10.3,63,0.0,41,8.66,12.0,280,996.5,1012.0,...,False,False,False,False,False,False,False,False,False,False


In [634]:
merged_df_test_odd.rename(columns={'Arrival Status First_1': 'Arrival Status First_1.0', 'Arrival Status First_2': 'Arrival Status First_2.0'}, inplace=True)

In [635]:
#from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
#scaled_data = scaler.fit_transform(merged_df_test_odd)

# Replace the original data with the scaled data
#merged_df_test_odd = pd.DataFrame(scaled_data, columns=merged_df_test_odd.columns)

In [636]:
gb_y_train_1_pred = pipeline.predict(merged_df_test_even)
gb_y_train_1_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [637]:
print(merged_df_test_even.columns)

Index(['temp_x', 'app_temp_x', 'clouds_x', 'precip_x', 'rh_x', 'wind_spd_x',
       'wind_gust_spd_x', 'wind_dir_x', 'pres_x', 'slp_x',
       ...
       'Scheduled Arrival minute_50', 'Scheduled Arrival minute_51',
       'Scheduled Arrival minute_52', 'Scheduled Arrival minute_53',
       'Scheduled Arrival minute_54', 'Scheduled Arrival minute_55',
       'Scheduled Arrival minute_56', 'Scheduled Arrival minute_57',
       'Scheduled Arrival minute_58', 'Scheduled Arrival minute_59'],
      dtype='object', length=196)


In [638]:
# Define a dictionary to map numerical labels to their meanings
label_mapping = {
    2: "late",
    1: "early",
    0: "ontime"
}

# Convert the numerical labels into their corresponding meanings
gb_y_train_1_pred_meanings = [label_mapping[label] for label in gb_y_train_1_pred]

# Print the predicted labels with their corresponding meanings
print(gb_y_train_1_pred_meanings)


['early', 'early', 'early', 'early', 'early', 'early', 'early', 'early', 'early', 'early', 'early', 'early']


In [639]:
gb_y_train_2_pred = pipeline_2.predict(merged_df_test_odd)
gb_y_train_2_pred

array([1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 2])

In [640]:
# Define a dictionary to map numerical labels to their meanings
label_mapping = {
    2: "late",
    1: "early",
    0: "ontime"
}

# Convert the numerical labels into their corresponding meanings
gb_y_train_1_pred_meanings = [label_mapping[label] for label in gb_y_train_2_pred]

# Print the predicted labels with their corresponding meanings
print(gb_y_train_1_pred_meanings)

['early', 'early', 'late', 'early', 'ontime', 'early', 'early', 'early', 'early', 'early', 'late']


In [641]:
import pandas as pd
import itertools

# Define a dictionary to map numerical labels to their meanings
label_mapping = {
    2: "late",
    1: "early",
    0: "ontime"
}

# Define the possible combinations of values for two columns
column_combinations = [(True, False), (False, True), (False, False)]

# Initialize an empty dictionary to store predictions
predictions_dict = {}

# Iterate over each combination of column values
for comb in column_combinations:
    # Create a copy of the DataFrame
    df_copy = merged_df_test_odd.copy()
    
    # Set the values of the columns based on the current combination
    df_copy['Arrival Status First_1.0'] = comb[0]
    df_copy['Arrival Status First_2.0'] = comb[1]
    
    # Make predictions using the model pipeline
    predictions = pipeline_2.predict(df_copy)
    
    # Map numerical labels to their corresponding meanings
    predictions_meanings = [label_mapping[label] for label in predictions]
    
    # Store the predictions in the dictionary with the combination as the key
    predictions_dict[comb] = predictions_meanings

# Access predictions for each combination from the dictionary
for comb, predictions in predictions_dict.items():
    print("Predictions for combination:", comb, ":", predictions)


Predictions for combination: (True, False) : ['early', 'early', 'late', 'early', 'ontime', 'early', 'early', 'early', 'early', 'early', 'late']
Predictions for combination: (False, True) : ['early', 'early', 'ontime', 'early', 'early', 'early', 'early', 'late', 'early', 'early', 'late']
Predictions for combination: (False, False) : ['early', 'early', 'late', 'early', 'early', 'early', 'early', 'late', 'early', 'early', 'late']
