In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
# fetch data 
airline_data = pd.read_csv('airline_dataset_2022_23.csv')
#flight_data = pd.read_csv('On_Time_Marketing_Carrier_On_Time_Performance_(Beginning_January_2018)_2023_12.csv')
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des
0,9E,2023-01-01,5190.0,N325PQ,JFK,22:41:00,22:13,91.0,68.0,-28.0,...,1672625700,0.0,12,804,c04n,Overcast clouds,300,2.2,0.9,SYR
1,9E,2022-01-02,5531.0,N678CA,JFK,14:12:00,17:12,77.0,84.0,180.0,...,1641146400,0.6,8,c04d,Overcast clouds,804,280,9.2,4.59,SYR
2,9E,2023-01-02,5190.0,N195PQ,JFK,22:41:00,22:46,91.0,77.0,5.0,...,1672712100,0.0,16,804,c04n,Overcast clouds,60,2.4,0.37,SYR
3,9E,2022-01-03,5531.0,N602LR,JFK,14:12:00,14:24,77.0,81.0,12.0,...,1641232800,0.8,16,c04d,Overcast clouds,804,310,7.6,3.1,SYR
4,9E,2023-01-03,5190.0,N303PQ,JFK,22:41:00,22:23,91.0,78.0,-18.0,...,1672798500,0.0,9,804,c04n,Overcast clouds,145,4.6,1.65,SYR


In [3]:
airline_data.shape

(5623, 90)

In [4]:
pd.set_option('display.max_columns', None)
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [5]:
OP_CARRIER_unique_values = airline_data['Carrier Code'].unique()
print(OP_CARRIER_unique_values)

['9E' 'MQ' 'B6' 'WN' 'UA']


In [6]:
airline_data.isna().sum()

Carrier Code          0
Date (MM/DD/YYYY)     0
Flight Number         0
Tail Number          11
Origin Airport        0
                     ..
weather.code_des      0
wind_dir_des          0
wind_gust_spd_des     0
wind_spd_des          0
Location_des          0
Length: 90, dtype: int64

In [7]:
airline_data.dropna(inplace=True)

In [8]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
weather.code_des     0
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Length: 90, dtype: int64

In [9]:
airline_data['Arrival Delay (Minutes)']=airline_data['Arrival Delay (Minutes)'].astype(np.int64)

In [10]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5612 entries, 0 to 5622
Data columns (total 90 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carrier Code                                5612 non-null   object 
 1   Date (MM/DD/YYYY)                           5612 non-null   object 
 2   Flight Number                               5612 non-null   float64
 3   Tail Number                                 5612 non-null   object 
 4   Origin Airport                              5612 non-null   object 
 5   Scheduled Arrival Time                      5612 non-null   object 
 6   Actual Arrival Time                         5612 non-null   object 
 7   Scheduled Elapsed Time (Minutes)            5612 non-null   float64
 8   Actual Elapsed Time (Minutes)               5612 non-null   float64
 9   Arrival Delay (Minutes)                     5612 non-null   int64  
 10  Wheels-on Time   

In [11]:
def set_dependent_variable(time):
    if time<-5:
        return 'Early'
    elif (time>=-5 and time<=5):
        return 'On-time'
    else:
        return 'Late'

In [12]:
airline_data['Status']=airline_data['Arrival Delay (Minutes)'].apply(set_dependent_variable)

In [13]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [14]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Status               0
Length: 91, dtype: int64

In [15]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [16]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [17]:
numeric_columns = airline_data.select_dtypes(include=['float64', 'int64'])

correl = numeric_columns.corr()

trace = go.Heatmap(z=correl.values,
                  x=correl.index.values,
                  y=correl.columns.values)
data=[trace]
plotly.offline.iplot(data, filename='Airline data heatmap')

In [18]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [19]:
airline_data.drop(columns=['Tail Number','Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x', 'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y', 'app_temp', 'azimuth', 'clouds',
       'dewpt', 'dhi', 'dni', 'elev_angle', 'ghi', 'pod', 'precip_rate',
       'pres', 'revision_status', 'rh', 'slp', 'snow_rate', 'solar_rad',
       'temp', 'timestamp_local', 'timestamp_utc', 'ts', 'uv', 'vis',
       'weather.code', 'weather.icon', 'weather.description', 'wind_dir',
       'wind_gust_spd', 'wind_spd', 'Location'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,Arrival_time,departure_time,app_temp_des,azimuth_des,clouds_des,dewpt_des,dhi_des,dni_des,elev_angle_des,ghi_des,pod_des,precip_rate_des,pres_des,revision_status_des,rh_des,slp_des,snow_rate_des,solar_rad_des,temp_des,timestamp_local_des,timestamp_utc_des,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des,Status
0,9E,2023-01-01,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-01 22:45:00,2023-01-01 21:15:00,3.4,285.8,100,1.8,0,0,-49.1,0,n,0.0,1000,final,88,1015,0.0,0,3.6,2023-01-01 21:15:00,2023-01-02 02:15:00,1672625700,0.0,12,804,c04n,Overcast clouds,300,2.2,0.9,SYR,Early
1,9E,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,2022-01-02 14:15:00,2022-01-02 13:00:00,-10.9,192.9,100,-6.2,86,736,22.9,365,d,0.0,994,final,91,1009,0.0,123,-5.0,2022-01-02 13:00:00,2022-01-02 18:00:00,1641146400,0.6,8,c04d,Overcast clouds,804,280,9.2,4.59,SYR,Late
2,9E,2023-01-02,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-02 22:45:00,2023-01-02 21:15:00,5.8,285.8,100,0.9,0,0,-48.9,0,n,0.0,1005,final,78,1021,0.0,0,4.4,2023-01-02 21:15:00,2023-01-03 02:15:00,1672712100,0.0,16,804,c04n,Overcast clouds,60,2.4,0.37,SYR,On-time
3,9E,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,2022-01-03 14:15:00,2022-01-03 13:00:00,-13.0,192.8,87,-14.4,86,738,23.0,367,d,0.0,1009,final,59,1025,0.0,161,-7.8,2022-01-03 13:00:00,2022-01-03 18:00:00,1641232800,0.8,16,c04d,Overcast clouds,804,310,7.6,3.1,SYR,Late
4,9E,2023-01-03,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-03 22:45:00,2023-01-03 21:15:00,3.8,285.8,100,3.8,0,0,-48.8,0,n,0.0,991,final,92,1006,0.0,0,5.0,2023-01-03 21:15:00,2023-01-04 02:15:00,1672798500,0.0,9,804,c04n,Overcast clouds,145,4.6,1.65,SYR,Early


In [20]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Origin Airport',
       'Scheduled Arrival Time', 'Destination Airport',
       'Scheduled departure time', 'Arrival_time', 'departure_time',
       'app_temp_des', 'azimuth_des', 'clouds_des', 'dewpt_des', 'dhi_des',
       'dni_des', 'elev_angle_des', 'ghi_des', 'pod_des', 'precip_rate_des',
       'pres_des', 'revision_status_des', 'rh_des', 'slp_des', 'snow_rate_des',
       'solar_rad_des', 'temp_des', 'timestamp_local_des', 'timestamp_utc_des',
       'ts_des', 'uv_des', 'vis_des', 'weather.icon_des',
       'weather.description_des', 'weather.code_des', 'wind_dir_des',
       'wind_gust_spd_des', 'wind_spd_des', 'Location_des', 'Status'],
      dtype='object')

In [21]:
len(airline_data.columns)

39

In [22]:
airline_data.drop(columns=['Arrival_time', 'departure_time',
                            'timestamp_local_des', 'timestamp_utc_des', 'Location_des',
                            'elev_angle_des', 'ts_des', 'app_temp_des',
                          'azimuth_des', 'dhi_des', 'dni_des', 'elev_angle_des', 
                           'ghi_des', 'pod_des', 'revision_status_des', 'weather.code_des',
                           'weather.icon_des', 'weather.description_des', 'solar_rad_des',
                           'vis_des'], 
                  inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status
0,9E,2023-01-01,5190.0,JFK,22:41:00,SYR,21:10:00,100,1.8,0.0,1000,88,1015,0.0,3.6,0.0,300,2.2,0.9,Early
1,9E,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,Late
2,9E,2023-01-02,5190.0,JFK,22:41:00,SYR,21:10:00,100,0.9,0.0,1005,78,1021,0.0,4.4,0.0,60,2.4,0.37,On-time
3,9E,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,Late
4,9E,2023-01-03,5190.0,JFK,22:41:00,SYR,21:10:00,100,3.8,0.0,991,92,1006,0.0,5.0,0.0,145,4.6,1.65,Early


In [23]:
# weather_data = pd.read_csv('weather_data.csv')
# weather_data.head()

In [24]:
# weather_data[weather_data['ts']== 1672631100]

In [25]:
len(airline_data.columns)

20

In [26]:
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status
0,9E,2023-01-01,5190.0,JFK,22:41:00,SYR,21:10:00,100,1.8,0.0,1000,88,1015,0.0,3.6,0.0,300,2.2,0.9,Early
1,9E,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,Late
2,9E,2023-01-02,5190.0,JFK,22:41:00,SYR,21:10:00,100,0.9,0.0,1005,78,1021,0.0,4.4,0.0,60,2.4,0.37,On-time
3,9E,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,Late
4,9E,2023-01-03,5190.0,JFK,22:41:00,SYR,21:10:00,100,3.8,0.0,991,92,1006,0.0,5.0,0.0,145,4.6,1.65,Early


In [27]:



# Sort the dataset based on "Carrier Code" and "Date (MM/DD/YYYY)"
airline_data.sort_values(by=["Carrier Code", "Date (MM/DD/YYYY)", "Scheduled Arrival Time"], inplace=True)

# Add a new column to store the status of the previous flight
airline_data['Previous Flight Status'] = airline_data.groupby(['Carrier Code', 'Date (MM/DD/YYYY)'])['Status'].shift(1)

# Handle the first flight within each group
first_flight_mask = airline_data['Previous Flight Status'].isna()
airline_data.loc[first_flight_mask, 'Previous Flight Status'] = 'Unknown'  # Replace "N/A" with "Unknown"

# Print the updated dataset
airline_data.head()


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status
1,9E,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,Late,Unknown
3,9E,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,Late,Unknown
5,9E,2022-01-04,5531.0,JFK,14:12:00,SYR,12:55:00,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,Early,Unknown
7,9E,2022-01-05,5446.0,JFK,13:45:00,SYR,12:30:00,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,Early,Unknown
9,9E,2022-01-06,5446.0,JFK,13:45:00,SYR,12:30:00,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,Early,Unknown


In [28]:
airline_data.dtypes

Carrier Code                 object
Date (MM/DD/YYYY)            object
Flight Number               float64
Origin Airport               object
Scheduled Arrival Time       object
Destination Airport          object
Scheduled departure time     object
clouds_des                    int64
dewpt_des                   float64
precip_rate_des             float64
pres_des                      int64
rh_des                        int64
slp_des                       int64
snow_rate_des               float64
temp_des                    float64
uv_des                      float64
wind_dir_des                  int64
wind_gust_spd_des           float64
wind_spd_des                float64
Status                       object
Previous Flight Status       object
dtype: object

In [29]:
# first_flights[first_flights['Previous Flight Status'] != 'Unknown']

In [30]:
# subsequent_flights[subsequent_flights['Previous Flight Status'] == 'Unknown']

In [31]:
# airline_data[airline_data['Carrier Code'] == '9E']

In [32]:
airline_data['Scheduled Arrival Hour']= pd.to_datetime(airline_data['Scheduled Arrival Time']).dt.hour
airline_data['Scheduled Arrival Minutes']= pd.to_datetime(airline_data['Scheduled Arrival Time']).dt.minute

airline_data['Scheduled departure Hour']= pd.to_datetime(airline_data['Scheduled departure time']).dt.hour
airline_data['Scheduled departure Minutes']= pd.to_datetime(airline_data['Scheduled departure time']).dt.minute

airline_data


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
1,9E,2022-01-02,5531.00,JFK,14:12:00,SYR,12:55:00,100,-6.20,0.00,994,91,1009,0.00,-5.00,0.60,280,9.20,4.59,Late,Unknown,14,12,12,55
3,9E,2022-01-03,5531.00,JFK,14:12:00,SYR,12:55:00,87,-14.40,0.00,1009,59,1025,0.00,-7.80,0.80,310,7.60,3.10,Late,Unknown,14,12,12,55
5,9E,2022-01-04,5531.00,JFK,14:12:00,SYR,12:55:00,100,-4.40,0.00,1008,69,1024,0.00,0.60,0.60,240,8.40,4.09,Early,Unknown,14,12,12,55
7,9E,2022-01-05,5446.00,JFK,13:45:00,SYR,12:30:00,100,-3.60,1.00,990,58,1005,0.00,3.90,0.60,180,12.00,6.45,Early,Unknown,13,45,12,30
9,9E,2022-01-06,5446.00,JFK,13:45:00,SYR,12:30:00,93,-8.30,0.00,997,57,1013,0.00,-0.80,0.80,250,11.80,7.45,Early,Unknown,13,45,12,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4756,WN,2023-12-27,3149.00,MCO,13:00:00,SYR,10:20:00,100,7.20,0.00,1000,96,1016,0.00,7.80,0.50,80,6.90,3.47,Late,Unknown,13,0,10,20
4757,WN,2023-12-28,3149.00,MCO,13:00:00,SYR,10:20:00,96,6.90,0.00,993,96,1008,0.00,7.50,0.60,80,5.80,2.60,Late,Unknown,13,0,10,20
4758,WN,2023-12-29,3149.00,MCO,13:00:00,SYR,10:20:00,100,4.40,0.00,990,100,1005,0.00,4.40,0.50,260,7.60,3.60,Early,Unknown,13,0,10,20
4759,WN,2023-12-30,3208.00,MCO,12:30:00,SYR,09:50:00,87,-1.20,0.00,989,96,1005,0.00,-0.60,0.60,310,7.10,4.09,Early,Unknown,12,30,9,50


In [33]:
airline_data.drop(columns=['Scheduled Arrival Time'],inplace=True)
airline_data.drop(columns=['Scheduled departure time'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
1,9E,2022-01-02,5531.0,JFK,SYR,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,Late,Unknown,14,12,12,55
3,9E,2022-01-03,5531.0,JFK,SYR,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,Late,Unknown,14,12,12,55
5,9E,2022-01-04,5531.0,JFK,SYR,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,Early,Unknown,14,12,12,55
7,9E,2022-01-05,5446.0,JFK,SYR,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,Early,Unknown,13,45,12,30
9,9E,2022-01-06,5446.0,JFK,SYR,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,Early,Unknown,13,45,12,30


In [34]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5612 entries, 1 to 4761
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Carrier Code                 5612 non-null   object 
 1   Date (MM/DD/YYYY)            5612 non-null   object 
 2   Flight Number                5612 non-null   float64
 3   Origin Airport               5612 non-null   object 
 4   Destination Airport          5612 non-null   object 
 5   clouds_des                   5612 non-null   int64  
 6   dewpt_des                    5612 non-null   float64
 7   precip_rate_des              5612 non-null   float64
 8   pres_des                     5612 non-null   int64  
 9   rh_des                       5612 non-null   int64  
 10  slp_des                      5612 non-null   int64  
 11  snow_rate_des                5612 non-null   float64
 12  temp_des                     5612 non-null   float64
 13  uv_des                 

In [35]:
airline_data['Date'] = airline_data['Date (MM/DD/YYYY)'].astype('datetime64[ns]')


In [36]:
airline_data.drop(columns=['Date (MM/DD/YYYY)'],inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
1,9E,5531.0,JFK,SYR,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,Late,Unknown,14,12,12,55,2022-01-02
3,9E,5531.0,JFK,SYR,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,Late,Unknown,14,12,12,55,2022-01-03
5,9E,5531.0,JFK,SYR,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,Early,Unknown,14,12,12,55,2022-01-04
7,9E,5446.0,JFK,SYR,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,Early,Unknown,13,45,12,30,2022-01-05
9,9E,5446.0,JFK,SYR,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,Early,Unknown,13,45,12,30,2022-01-06


In [37]:
airline_data['Status'].replace("Early",0,inplace=True)
airline_data['Status'].replace("Late",1,inplace=True)
airline_data['Status'].replace("On-time",2,inplace=True)
airline_data.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
1,9E,5531.0,JFK,SYR,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,1,Unknown,14,12,12,55,2022-01-02
3,9E,5531.0,JFK,SYR,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,1,Unknown,14,12,12,55,2022-01-03
5,9E,5531.0,JFK,SYR,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,0,Unknown,14,12,12,55,2022-01-04
7,9E,5446.0,JFK,SYR,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,0,Unknown,13,45,12,30,2022-01-05
9,9E,5446.0,JFK,SYR,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,0,Unknown,13,45,12,30,2022-01-06


In [38]:
set(airline_data['Origin Airport'])

{'JFK', 'MCO', 'ORD'}

In [39]:
len(airline_data.columns)

23

In [40]:
# airline_data.drop(columns=['Date'],inplace=True
# Filter the dataset to create two subsets
first_flights = airline_data[airline_data['Previous Flight Status'] == 'Unknown']
subsequent_flights = airline_data[airline_data['Previous Flight Status'] != 'Unknown']


In [41]:
first_flights = airline_data[airline_data['Previous Flight Status'] == 'Unknown']
subsequent_flights = airline_data[airline_data['Previous Flight Status'] != 'Unknown']
first_flights.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
1,9E,5531.0,JFK,SYR,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,1,Unknown,14,12,12,55,2022-01-02
3,9E,5531.0,JFK,SYR,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,1,Unknown,14,12,12,55,2022-01-03
5,9E,5531.0,JFK,SYR,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,0,Unknown,14,12,12,55,2022-01-04
7,9E,5446.0,JFK,SYR,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,0,Unknown,13,45,12,30,2022-01-05
9,9E,5446.0,JFK,SYR,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,0,Unknown,13,45,12,30,2022-01-06


In [42]:
subsequent_flights.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
245,9E,4730.0,JFK,SYR,100,2.8,1.0,982,89,997,0.0,4.4,1.0,80,8.8,3.6,1,Early,16,20,15,0,2022-03-07
250,9E,4730.0,JFK,SYR,87,-9.5,0.0,1006,45,1021,0.0,1.1,1.3,300,8.4,5.7,1,Early,16,20,15,0,2022-03-08
255,9E,4730.0,JFK,SYR,100,-0.7,0.5,997,95,1013,7.5,0.0,1.0,80,4.0,1.5,1,Late,16,20,15,0,2022-03-09
260,9E,4730.0,JFK,SYR,87,-4.0,0.0,1005,52,1020,0.0,5.0,1.3,255,6.8,2.1,0,Early,16,20,15,0,2022-03-10
265,9E,4730.0,JFK,SYR,100,0.5,0.0,995,60,1010,0.0,7.8,1.0,50,9.3,4.09,0,Late,16,20,15,0,2022-03-11


In [43]:
first_flights.drop(columns=['Previous Flight Status'],inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [44]:
first_flights = pd.get_dummies(first_flights, drop_first = True)
first_flights.head()

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
1,5531.0,100,-6.2,0.0,994,91,1009,0.0,-5.0,0.6,280,9.2,4.59,1,14,12,12,55,2022-01-02,False,False,False,False,False,False
3,5531.0,87,-14.4,0.0,1009,59,1025,0.0,-7.8,0.8,310,7.6,3.1,1,14,12,12,55,2022-01-03,False,False,False,False,False,False
5,5531.0,100,-4.4,0.0,1008,69,1024,0.0,0.6,0.6,240,8.4,4.09,0,14,12,12,55,2022-01-04,False,False,False,False,False,False
7,5446.0,100,-3.6,1.0,990,58,1005,0.0,3.9,0.6,180,12.0,6.45,0,13,45,12,30,2022-01-05,False,False,False,False,False,False
9,5446.0,93,-8.3,0.0,997,57,1013,0.0,-0.8,0.8,250,11.8,7.45,0,13,45,12,30,2022-01-06,False,False,False,False,False,False


In [45]:
subsequent_flights['Previous Flight Status'].replace("Early",0,inplace=True)
subsequent_flights['Previous Flight Status'].replace("Late",1,inplace=True)
subsequent_flights['Previous Flight Status'].replace("On-time",2,inplace=True)
subsequent_flights.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Carrier Code,Flight Number,Origin Airport,Destination Airport,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date
245,9E,4730.0,JFK,SYR,100,2.8,1.0,982,89,997,0.0,4.4,1.0,80,8.8,3.6,1,0,16,20,15,0,2022-03-07
250,9E,4730.0,JFK,SYR,87,-9.5,0.0,1006,45,1021,0.0,1.1,1.3,300,8.4,5.7,1,0,16,20,15,0,2022-03-08
255,9E,4730.0,JFK,SYR,100,-0.7,0.5,997,95,1013,7.5,0.0,1.0,80,4.0,1.5,1,1,16,20,15,0,2022-03-09
260,9E,4730.0,JFK,SYR,87,-4.0,0.0,1005,52,1020,0.0,5.0,1.3,255,6.8,2.1,0,0,16,20,15,0,2022-03-10
265,9E,4730.0,JFK,SYR,100,0.5,0.0,995,60,1010,0.0,7.8,1.0,50,9.3,4.09,0,1,16,20,15,0,2022-03-11


In [46]:
subsequent_flights = pd.get_dummies(subsequent_flights, drop_first = True)
subsequent_flights.head()

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Date,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
245,4730.0,100,2.8,1.0,982,89,997,0.0,4.4,1.0,80,8.8,3.6,1,0,16,20,15,0,2022-03-07,False,False,False,False,False,False
250,4730.0,87,-9.5,0.0,1006,45,1021,0.0,1.1,1.3,300,8.4,5.7,1,0,16,20,15,0,2022-03-08,False,False,False,False,False,False
255,4730.0,100,-0.7,0.5,997,95,1013,7.5,0.0,1.0,80,4.0,1.5,1,1,16,20,15,0,2022-03-09,False,False,False,False,False,False
260,4730.0,87,-4.0,0.0,1005,52,1020,0.0,5.0,1.3,255,6.8,2.1,0,0,16,20,15,0,2022-03-10,False,False,False,False,False,False
265,4730.0,100,0.5,0.0,995,60,1010,0.0,7.8,1.0,50,9.3,4.09,0,1,16,20,15,0,2022-03-11,False,False,False,False,False,False


In [47]:
print(len(first_flights.columns))
print(len(subsequent_flights.columns))

25
26


In [48]:
X_train, X_test, y_train, y_test = train_test_split(first_flights.drop(columns= ['Date','Status'], axis=1), first_flights['Status'], stratify = first_flights['Status'], test_size=0.20, random_state = 35)
X_train.head()
X_test.head()
y_train.head()
y_test.head()


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
5510,1998.0,0,-4.7,0.0,1024,54,1039,0.0,3.8,0.0,335,5.4,2.97,21,4,18,10,False,False,True,False,False,True
3970,412.0,99,7.3,0.0,1006,88,1020,0.0,9.2,0.6,65,2.8,1.77,9,19,8,10,True,False,False,False,False,False
346,5350.0,90,-2.6,0.0,995,55,1010,0.0,5.7,1.2,265,14.8,9.55,11,14,9,50,False,False,False,False,False,False
5001,538.0,50,6.9,0.0,1002,32,1017,0.0,24.7,1.5,315,4.1,3.6,21,9,18,18,False,False,True,False,False,True
4710,3149.0,93,1.5,0.0,993,33,1007,0.0,18.1,1.0,185,10.3,7.2,14,10,11,30,False,False,False,True,True,False


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
2140,3854.0,87,15.5,0.0,993,76,1007,0.0,19.8,1.0,270,9.7,4.34,11,10,8,15,False,True,False,False,False,True
4935,1185.0,87,1.6,0.0,987,60,1002,0.0,8.9,0.7,360,5.2,2.6,21,59,19,6,False,False,True,False,False,True
1171,5202.0,87,12.7,0.0,1009,77,1023,0.0,16.7,1.4,70,4.8,2.1,11,20,9,59,False,False,False,False,False,False
1943,3772.0,62,16.1,0.0,988,75,1003,0.0,20.7,3.0,110,6.1,2.47,12,40,9,50,False,True,False,False,False,True
3955,656.0,87,5.4,0.0,1011,85,1026,0.0,7.8,0.7,80,4.8,2.4,10,50,8,7,True,False,False,False,True,False


5510    0
3970    2
346     0
5001    1
4710    1
Name: Status, dtype: int64

2140    2
4935    0
1171    0
1943    2
3955    0
Name: Status, dtype: int64

In [49]:
#scaling the data using standard scaler

if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
    X_train
    X_test
    y_train
    y_test

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
5510,-0.36,-2.89,-0.92,-0.18,2.94,-0.56,2.86,-0.11,-0.73,-0.83,1.45,-0.61,-0.39,1.28,-1.25,0.71,-1.07,-0.60,-0.46,1.79,-0.26,-0.37,1.20
3970,-1.16,0.78,0.31,-0.18,0.67,1.37,0.53,-0.11,-0.22,-0.44,-1.59,-1.39,-0.93,-0.45,-0.42,-1.28,-1.07,1.67,-0.46,-0.56,-0.26,-0.37,-0.83
346,1.33,0.45,-0.71,-0.18,-0.72,-0.51,-0.70,-0.11,-0.55,-0.05,0.66,2.19,2.58,-0.16,-0.70,-1.08,1.10,-0.60,-0.46,-0.56,-0.26,-0.37,-0.83
5001,-1.10,-1.03,0.27,-0.18,0.16,-1.81,0.16,-0.11,1.25,0.15,1.22,-1.00,-0.11,1.28,-0.98,0.71,-0.64,-0.60,-0.46,1.79,-0.26,-0.37,1.20
4710,0.22,0.56,-0.29,-0.18,-0.97,-1.76,-1.06,-0.11,0.63,-0.18,-0.24,0.85,1.52,0.27,-0.92,-0.68,0.01,-0.60,-0.46,-0.56,3.92,2.70,-0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2831,-1.31,-0.37,-3.16,-0.18,2.06,-0.45,2.13,-0.11,-3.00,-0.31,1.51,-0.07,-0.33,-0.31,0.92,-1.08,-0.26,1.67,-0.46,-0.56,-0.26,-0.37,-0.83
4918,-0.77,0.82,-0.96,-0.18,-0.72,-0.45,-0.70,-0.11,-0.83,-0.50,-1.31,1.00,0.56,1.28,1.81,0.91,-1.29,-0.60,-0.46,1.79,-0.26,-0.37,1.20
2497,0.30,0.34,0.14,-0.18,1.05,-1.30,1.15,-0.11,0.73,0.22,-1.48,-0.43,-0.51,0.56,-0.81,-0.28,-0.80,-0.60,2.19,-0.56,-0.26,-0.37,1.20
4907,-1.10,0.34,-1.02,-0.18,0.29,-1.02,0.29,-0.11,-0.62,-0.37,1.28,-0.67,0.00,1.28,-0.59,0.71,-0.04,-0.60,-0.46,1.79,-0.26,-0.37,1.20


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
2140,0.57,0.34,1.15,-0.18,-0.97,0.69,-1.06,-0.11,0.79,-0.18,0.72,0.67,0.23,-0.16,-0.92,-1.28,-0.80,-0.60,2.19,-0.56,-0.26,-0.37,1.20
4935,-0.77,0.34,-0.27,-0.18,-1.73,-0.22,-1.68,-0.11,-0.25,-0.37,1.73,-0.67,-0.56,1.28,1.81,0.91,-1.29,-0.60,-0.46,1.79,-0.26,-0.37,1.20
1171,1.26,0.34,0.87,-0.18,1.05,0.74,0.90,-0.11,0.49,0.09,-1.54,-0.79,-0.78,-0.16,-0.36,-1.08,1.59,-0.60,-0.46,-0.56,-0.26,-0.37,-0.83
1943,0.53,-0.59,1.22,-0.18,-1.61,0.63,-1.56,-0.11,0.87,1.13,-1.09,-0.40,-0.62,-0.02,0.75,-1.08,1.10,-0.60,2.19,-0.56,-0.26,-0.37,1.20
3955,-1.04,0.34,0.12,-0.18,1.30,1.20,1.27,-0.11,-0.35,-0.37,-1.42,-0.79,-0.65,-0.31,1.31,-1.28,-1.23,1.67,-0.46,-0.56,-0.26,2.70,-0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046,1.39,0.82,1.43,-0.18,-0.72,1.26,-0.82,-0.11,0.86,-0.83,-1.59,-0.67,-0.78,-1.75,-0.08,1.51,1.10,-0.60,-0.46,-0.56,-0.26,-0.37,-0.83
2874,-1.31,0.82,-0.79,-0.18,-0.22,-0.05,-0.20,-0.11,-0.82,-0.37,-0.47,0.58,0.06,0.27,0.14,-0.28,-0.74,1.67,-0.46,-0.56,-0.26,-0.37,-0.83
2382,0.72,0.67,0.58,-0.18,-1.48,0.63,-1.43,-0.11,0.26,-0.18,-1.09,0.22,-0.28,-0.02,0.75,-1.08,1.10,-0.60,2.19,-0.56,-0.26,-0.37,1.20
1183,1.26,0.34,1.33,-0.18,-0.72,0.12,-0.70,-0.11,1.18,0.09,-0.07,0.16,0.11,-0.16,-0.36,-1.08,1.59,-0.60,-0.46,-0.56,-0.26,-0.37,-0.83


5510    0
3970    2
346     0
5001    1
4710    1
       ..
2831    1
4918    0
2497    0
4907    0
5456    2
Name: Status, Length: 2240, dtype: int64

2140    2
4935    0
1171    0
1943    2
3955    0
       ..
1046    1
2874    0
2382    2
1183    2
2408    2
Name: Status, Length: 561, dtype: int64

In [50]:
len(X_train)
len(y_train)
len(X_test)
len(y_test)

2240

2240

561

561

In [51]:
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import StratifiedKFold

# # Define classifiers with different parameters
# classifiers = {
#     'Random Forest': RandomForestClassifier(random_state=50, min_samples_leaf=6, max_features="sqrt", n_estimators=1000),
#     'Bagging': BaggingClassifier(base_estimator=RandomForestClassifier(random_state=50), n_estimators=100, max_samples=100),
#     'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),
#     'Logistic Regression': LogisticRegression(),
# #     'Support Vector Machine': SVC(),
# #     'K-Nearest Neighbors': KNeighborsClassifier()
# }

# # Initialize lists to store results
# results = {}
# std_devs = {}

# # Define cross-validation strategy
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

# # Perform cross-validation for each classifier
# for clf_name, clf in classifiers.items():
#     scores = cross_val_score(clf,X_train, y_train, cv=cv, scoring='accuracy')
#     results[clf_name] = scores.mean()
#     std_devs[clf_name] = scores.std()

# # Print results
# print("Mean Accuracy Scores:")
# for clf_name, mean_acc in results.items():
#     print(f"{clf_name}: {mean_acc:.4f} ± {std_devs[clf_name]:.4f}")

# # Select classifier with highest mean accuracy
# best_clf = max(results, key=results.get)
# print(f"\nBest Classifier: {best_clf}")


In [52]:
# Define parameter grid
# param_grid = {
#     'n_estimators': [100, 1000, 150],
#     'max_depth': [None, 5, 10],
#     'min_samples_leaf': [1, 3, 5],
#     'max_features': [None, 'sqrt', 'log2']
# }

# best_accuracy = 0  # Initialize with 0
# best_params = None

# # Perform grid search
# for n_estimators in param_grid['n_estimators']:
#     for max_depth in param_grid['max_depth']:
#         for min_samples_leaf in param_grid['min_samples_leaf']:
#             for max_features in param_grid['max_features']:
#                 # Create Random Forest Classifier with current parameters
#                 rf = RandomForestClassifier(n_estimators=n_estimators, 
#                                              max_depth=max_depth, 
#                                              min_samples_leaf=min_samples_leaf,
#                                              max_features=max_features,
#                                              random_state=50)
        
#                 # Perform cross-validation to get predicted values
#                 y_pred = cross_val_predict(rf, X_train, y_train, cv=5)
        
#                 # Calculate accuracy
#                 accuracy = accuracy_score(y_train, y_pred)
        
#                 # Check if current parameters yield a better accuracy
#                 if accuracy > best_accuracy:
#                     best_accuracy = accuracy
#                     best_params = {'n_estimators': n_estimators, 
#                                    'max_depth': max_depth, 
#                                    'min_samples_leaf': min_samples_leaf,
#                                    'max_features': max_features}
        
#                 print(f"Parameters: n_estimators={n_estimators}, max_depth={max_depth}, min_samples_leaf={min_samples_leaf}, max_features={max_features}, Accuracy: {accuracy}")

# print("Best Parameters:", best_params)
# print("Best Accuracy:", best_accuracy)


In [53]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_leaf=3, max_features="sqrt", random_state=42)
rf = rf.fit(X_train, y_train)

In [54]:
from sklearn.metrics import accuracy_score

# Predict the labels for the test data
y_pred_test = rf.predict(X_test)

# Calculate the accuracy score on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)

print("Accuracy on test data:", accuracy_test)

Accuracy on test data: 0.5080213903743316


In [55]:
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(subsequent_flights.drop(columns= ['Date', 'Status'], axis=1), subsequent_flights['Status'], stratify = subsequent_flights['Status'], test_size=0.20, random_state = 35)
X_train_sub.head()
X_test_sub.head()
y_train_sub.head()
y_test_sub.head()


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
4562,2516.0,87,-4.5,0.0,1005,62,1021,0.0,1.9,0.0,120,8.0,0.75,1,23,41,22,30,True,False,False,False,False,False
1952,4203.0,93,14.6,0.0,993,62,1008,0.0,22.3,3.4,190,14.0,3.6,2,16,23,13,35,False,True,False,False,False,True
1900,4203.0,37,0.8,0.0,1016,49,1031,0.0,11.1,6.2,300,10.8,4.9,0,16,21,13,35,False,True,False,False,False,True
257,5224.0,90,-5.6,0.0,1010,69,1026,0.0,-0.6,0.7,300,7.2,3.6,0,18,16,16,50,False,False,False,False,False,False
2722,3392.0,100,-12.6,0.0,1006,55,1021,0.0,-5.0,0.0,215,5.2,3.84,1,20,19,17,33,False,True,False,False,False,True


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
5517,1802.0,100,-3.1,0.0,1001,38,1016,0.0,10.6,0.8,280,10.8,4.59,0,16,45,13,55,False,False,True,False,False,True
1814,3392.0,87,-5.3,0.0,997,62,1012,0.0,1.1,0.0,90,2.4,1.12,0,22,24,19,40,False,True,False,False,False,True
439,5350.0,0,6.7,0.0,996,33,1010,0.0,23.9,4.3,260,8.8,6.2,0,11,15,9,55,False,False,False,False,False,False
362,4730.0,100,-9.1,0.0,1001,43,1017,0.0,2.2,1.7,90,10.0,3.6,2,16,20,15,0,False,False,False,False,False,False
2102,3402.0,87,16.6,0.0,995,81,1009,0.0,20.0,0.9,310,8.8,5.09,1,21,32,18,36,False,True,False,False,False,True


4562    1
1952    2
1900    1
257     0
2722    1
Name: Status, dtype: int64

5517    0
1814    1
439     0
362     1
2102    2
Name: Status, dtype: int64

In [56]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train_sub = pd.DataFrame(sc.fit_transform(X_train_sub), columns = X_train_sub.columns, index = X_train_sub.index)
    X_test_sub = pd.DataFrame(sc.transform(X_test_sub), columns = X_test_sub.columns, index = X_test_sub.index)
    X_train_sub
    X_test_sub
    y_train_sub
    y_test_sub

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
4562,-0.28,0.37,-0.95,-0.22,0.57,0.04,0.68,-0.10,-1.02,-0.77,-1.03,0.19,-1.43,0.34,1.19,0.49,1.41,0.05,1.24,-0.47,-0.25,-0.02,-0.43,-0.56
1952,0.60,0.57,1.13,-0.22,-0.97,0.04,-0.94,-0.10,1.00,1.39,-0.25,1.96,-0.14,1.59,-0.71,-0.41,-0.98,0.34,-0.81,2.12,-0.25,-0.02,-0.43,1.77
1900,0.60,-1.37,-0.38,-0.22,1.98,-0.68,1.93,-0.10,-0.11,3.17,0.97,1.02,0.44,-0.92,-0.71,-0.51,-0.98,0.34,-0.81,2.12,-0.25,-0.02,-0.43,1.77
257,1.14,0.47,-1.07,-0.22,1.21,0.42,1.31,-0.10,-1.27,-0.32,0.97,-0.04,-0.14,-0.92,-0.17,-0.76,-0.19,1.20,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
2722,0.18,0.82,-1.83,-0.22,0.70,-0.35,0.68,-0.10,-1.71,-0.77,0.02,-0.63,-0.03,0.34,0.38,-0.61,0.08,0.22,-0.81,2.12,-0.25,-0.02,-0.43,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,1.08,-0.92,1.10,-0.22,0.19,-0.95,0.18,-0.10,1.54,1.90,0.25,0.19,0.56,0.34,-0.71,-1.42,-0.72,-0.01,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
827,1.01,0.37,1.33,-0.22,-0.46,0.15,-0.56,-0.10,1.15,0.50,-0.86,-0.48,-0.14,-0.92,-0.17,-0.06,-0.19,1.20,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
3088,-1.60,0.82,-0.59,-0.22,-0.20,1.08,-0.19,-0.10,-1.04,-0.77,0.36,-0.57,-0.48,1.59,0.92,0.80,1.14,0.34,1.24,-0.47,-0.25,-0.02,-0.43,-0.56
4200,-1.26,0.05,0.90,-0.22,0.31,0.20,0.18,-0.10,0.70,-0.77,-0.31,1.11,1.71,0.34,-2.33,-0.41,-2.57,0.57,1.24,-0.47,-0.25,-0.02,2.33,-0.56


Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
5517,-0.66,0.82,-0.80,-0.22,0.06,-1.28,0.06,-0.10,-0.16,-0.26,0.75,1.02,0.30,-0.92,-0.71,0.70,-0.98,1.48,-0.81,-0.47,3.97,-0.02,-0.43,1.77
1814,0.18,0.37,-1.04,-0.22,-0.46,0.04,-0.44,-0.10,-1.10,-0.77,-1.36,-1.45,-1.26,-0.92,0.92,-0.36,0.61,0.62,-0.81,2.12,-0.25,-0.02,-0.43,1.77
439,1.21,-2.66,0.27,-0.22,-0.58,-1.56,-0.69,-0.10,1.16,1.96,0.52,0.43,1.03,-0.92,-2.06,-0.81,-2.04,1.48,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
362,0.88,0.82,-1.45,-0.22,0.06,-1.01,0.18,-0.10,-0.99,0.31,-1.36,0.78,-0.14,1.59,-0.71,-0.56,-0.45,-1.67,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
2102,0.18,0.37,1.34,-0.22,-0.71,1.08,-0.81,-0.10,0.77,-0.20,1.08,0.43,0.53,0.34,0.65,0.04,0.34,0.39,-0.81,2.12,-0.25,-0.02,-0.43,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,0.96,-1.79,0.76,-0.22,0.57,-0.29,0.56,-0.10,0.80,1.71,-0.53,-1.04,-1.49,-0.92,-2.33,1.10,-2.04,-0.01,-0.81,-0.47,-0.25,-0.02,-0.43,-0.56
3429,-1.26,0.37,-0.76,-0.22,0.19,-1.83,0.18,-0.10,0.34,0.50,1.19,0.40,0.60,-0.92,0.11,-1.47,-0.19,-0.81,1.24,-0.47,-0.25,-0.02,2.33,-0.56
3049,-1.58,0.82,-1.19,-0.22,0.44,-0.29,0.43,-0.10,-1.10,-0.20,-1.70,-0.75,0.08,-0.92,-0.71,1.00,-0.72,-1.27,1.24,-0.47,-0.25,-0.02,2.33,-0.56
2279,0.45,0.82,1.51,-0.22,0.19,1.52,0.18,-0.10,0.77,-0.26,-1.70,-0.63,-0.59,1.59,0.38,-1.57,0.08,-0.93,-0.81,2.12,-0.25,-0.02,-0.43,1.77


4562    1
1952    2
1900    1
257     0
2722    1
       ..
1099    0
827     2
3088    1
4200    0
1494    0
Name: Status, Length: 2248, dtype: int64

5517    0
1814    1
439     0
362     1
2102    2
       ..
910     0
3429    0
3049    0
2279    0
1558    2
Name: Status, Length: 563, dtype: int64

In [57]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_leaf=3, max_features="sqrt", random_state=42)
rf = rf.fit(X_train_sub, y_train_sub)

In [58]:
from sklearn.metrics import accuracy_score

# Predict the labels for the test data
y_pred_test_sub = rf.predict(X_test_sub)

# Calculate the accuracy score on the test data
accuracy_test = accuracy_score(y_test_sub, y_pred_test_sub)

print("Accuracy on test data:", accuracy_test)

Accuracy on test data: 0.5452930728241563


In [59]:
test_sub_output = pd.DataFrame(rf.predict(X_test_sub), index = X_test_sub.index, columns = ['pred_arr_status'])
test_sub_output.head(50)

Unnamed: 0,pred_arr_status
5517,0
1814,1
439,0
362,0
2102,1
2935,2
4185,1
2729,2
609,0
1179,0


In [60]:
set(test_sub_output['pred_arr_status'])

{0, 1, 2}

In [61]:
value_counts = test_sub_output['pred_arr_status'].value_counts()
value_counts

pred_arr_status
0    317
1    203
2     43
Name: count, dtype: int64

In [62]:
# fetch data 
test_data = pd.read_csv('test_Flight_weather2.csv')
#flight_data = pd.read_csv('On_Time_Marketing_Carrier_On_Time_Performance_(Beginning_January_2018)_2023_12.csv')
test_data.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,Dest,clouds_des,temp_des,max_temp_des,min_temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,snow_depth_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM


In [63]:
test_data

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,Dest,clouds_des,temp_des,max_temp_des,min_temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,snow_depth_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM
5,4/19/24,B6,56,MCO,SYR,84,12.4,16.7,8.9,171,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM
6,4/20/24,UA,538,ORD,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
7,4/20/24,MQ,3402,ORD,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
8,4/20/24,B6,116,JFK,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,1:25 PM,2:41 PM
9,4/20/24,9E,5340,JFK,SYR,49,10.1,19.8,4.6,227,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM


In [64]:
OP_CARRIER_unique_values

array(['9E', 'MQ', 'B6', 'WN', 'UA'], dtype=object)

In [65]:
test_data.columns

Index(['Date', 'CarrierCode', 'FlightNumber', 'Origin', 'Dest', 'clouds_des',
       'temp_des', 'max_temp_des', 'min_temp_des', 'wind_dir_des',
       'wind_spd_des', 'wind_gust_spd_des', 'snow_rate_des', 'snow_depth_des',
       'precip_rate_des', 'pres_des', 'uv_des', 'dewpt_des', 'rh_des',
       'slp_des', 'Scheduled departure time', 'Scheduled Arrival Time'],
      dtype='object')

In [66]:
test_data.drop(columns=['Dest', 'max_temp_des', 'min_temp_des', 'snow_depth_des'],inplace=True)
test_data.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,4/19/24,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,4/19/24,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
2,4/19/24,B6,116,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:34 PM,2:51 PM
3,4/19/24,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM
4,4/19/24,WN,491,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,11:35 AM,2:20 PM


In [67]:
test_data['Date'] = pd.to_datetime(test_data['Date'])


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [68]:
test_data.dtypes

Date                        datetime64[ns]
CarrierCode                         object
FlightNumber                         int64
Origin                              object
clouds_des                           int64
temp_des                           float64
wind_dir_des                         int64
wind_spd_des                       float64
wind_gust_spd_des                  float64
snow_rate_des                        int64
precip_rate_des                    float64
pres_des                           float64
uv_des                             float64
dewpt_des                          float64
rh_des                               int64
slp_des                            float64
Scheduled departure time            object
Scheduled Arrival Time              object
dtype: object

In [69]:
pred_ORD = test_data[test_data['Origin'] == 'ORD']
pred_JFK = test_data[test_data['Origin'] == 'JFK']
pred_MCO = test_data[test_data['Origin'] == 'MCO']

In [70]:
pred_ORD

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM
1,2024-04-19,MQ,3402,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,7:59 PM,10:52 PM
6,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM
7,2024-04-20,MQ,3402,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,7:59 PM,10:52 PM
11,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM
12,2024-04-21,MQ,3402,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,7:59 PM,10:52 PM
17,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM
18,2024-04-22,MQ,3402,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,7:59 PM,10:52 PM


In [71]:
def calculate_previous_status(group):
    # Check if the date and carrier code are the same within the group
    same_date_and_carrier = group["Date"] == group["Date"].shift()
    
    # Fill the "Previous Flight Status" column with boolean values
    group["Previous Flight Status"] = same_date_and_carrier
    
    return group

In [72]:
# Process data_JFK
pred_ORD.sort_values(by=["Date",'Scheduled Arrival Time'], inplace=True)
pred_ORD = pred_ORD.groupby(["Date"]).apply(calculate_previous_status)
pred_ORD.dropna(subset=["Previous Flight Status"], inplace=True)

pred_JFK.sort_values(by=["Date",'Scheduled Arrival Time'], inplace=True)
pred_JFK = pred_JFK.groupby(["Date"]).apply(calculate_previous_status)
pred_JFK.dropna(subset=["Previous Flight Status"], inplace=True)

pred_MCO.sort_values(by=["Date",'Scheduled Arrival Time'], inplace=True)
pred_MCO = pred_MCO.groupby(["Date"]).apply(calculate_previous_status)
pred_MCO.dropna(subset=["Previous Flight Status"], inplace=True)

pred_ORD = pred_ORD[pred_ORD['Previous Flight Status']]
pred_JFK = pred_JFK[pred_JFK['Previous Flight Status']]
pred_MCO = pred_MCO[pred_MCO['Previous Flight Status']]

pred_MCO.isna().sum()
pred_JFK.isna().sum()
pred_ORD.isna().sum()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

Date                        0
CarrierCode                 0
FlightNumber                0
Origin                      0
clouds_des                  0
temp_des                    0
wind_dir_des                0
wind_spd_des                0
wind_gust_spd_des           0
snow_rate_des               0
precip_rate_des             0
pres_des                    0
uv_des                      0
dewpt_des                   0
rh_des                      0
slp_des                     0
Scheduled departure time    0
Scheduled Arrival Time      0
Previous Flight Status      0
dtype: int64

In [73]:
pred_ORD

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM,True
2024-04-20,6,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM,True
2024-04-21,11,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM,True
2024-04-22,17,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM,True


In [74]:
pred_JFK.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,3,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True
2024-04-20,9,2024-04-20,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM,True
2024-04-21,14,2024-04-21,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2:55 PM,4:21 PM,True
2024-04-22,20,2024-04-22,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2:55 PM,4:21 PM,True


In [75]:
pred_MCO.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-04-19,5,2024-04-19,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM,True
2024-04-21,16,2024-04-21,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1:35 PM,4:25 PM,True
2024-04-22,22,2024-04-22,B6,56,MCO,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,1:34 PM,4:25 PM,True


In [76]:
combined_df = pd.concat([pred_ORD, pred_JFK, pred_MCO], ignore_index=True)

In [77]:
combined_df.head()

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM,True
1,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM,True
2,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM,True
3,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM,True
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True


In [78]:
combined_df['Scheduled Arrival Hour']= pd.to_datetime(combined_df['Scheduled Arrival Time']).dt.hour
combined_df['Scheduled Arrival Minutes']= pd.to_datetime(combined_df['Scheduled Arrival Time']).dt.minute

combined_df['Scheduled departure Hour']= pd.to_datetime(combined_df['Scheduled departure time']).dt.hour
combined_df['Scheduled departure Minutes']= pd.to_datetime(combined_df['Scheduled departure time']).dt.minute

combined_df.head()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM,True,21,47,18,52
1,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM,True,21,47,18,52
2,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM,True,21,47,18,52
3,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM,True,21,47,18,52
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True,16,21,14,55


In [79]:
set(combined_df['CarrierCode'])

{'9E', 'B6', 'UA'}

In [80]:
combined_df

Unnamed: 0,Date,CarrierCode,FlightNumber,Origin,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Scheduled departure time,Scheduled Arrival Time,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,2024-04-19,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,6:52 PM,9:47 PM,True,21,47,18,52
1,2024-04-20,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,6:52 PM,9:47 PM,True,21,47,18,52
2,2024-04-21,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,6:52 PM,9:47 PM,True,21,47,18,52
3,2024-04-22,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,6:52 PM,9:47 PM,True,21,47,18,52
4,2024-04-19,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,2:55 PM,4:21 PM,True,16,21,14,55
5,2024-04-20,9E,5340,JFK,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,2:55 PM,4:21 PM,True,16,21,14,55
6,2024-04-21,9E,5340,JFK,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,2:55 PM,4:21 PM,True,16,21,14,55
7,2024-04-22,9E,5340,JFK,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,2:55 PM,4:21 PM,True,16,21,14,55
8,2024-04-19,B6,56,MCO,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,1:35 PM,4:25 PM,True,16,25,13,35
9,2024-04-21,B6,56,MCO,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,1:35 PM,4:25 PM,True,16,25,13,35


In [81]:
combined_df.rename(columns={'Origin': 'Origin Airport'}, inplace=True)

In [82]:
combined_df.rename(columns={'CarrierCode': 'Carrier Code'}, inplace=True)

In [83]:
combined_df.rename(columns={'FlightNumber': 'Flight Number'}, inplace=True)

In [84]:
combined_df.drop(columns=['Date', 'Scheduled departure time', 'Scheduled Arrival Time'], inplace=True)

In [85]:
subsequent_flights.columns

Index(['Flight Number', 'clouds_des', 'dewpt_des', 'precip_rate_des',
       'pres_des', 'rh_des', 'slp_des', 'snow_rate_des', 'temp_des', 'uv_des',
       'wind_dir_des', 'wind_gust_spd_des', 'wind_spd_des', 'Status',
       'Previous Flight Status', 'Scheduled Arrival Hour',
       'Scheduled Arrival Minutes', 'Scheduled departure Hour',
       'Scheduled departure Minutes', 'Date', 'Carrier Code_B6',
       'Carrier Code_MQ', 'Carrier Code_UA', 'Carrier Code_WN',
       'Origin Airport_MCO', 'Origin Airport_ORD'],
      dtype='object')

In [86]:
combined_df.columns

Index(['Carrier Code', 'Flight Number', 'Origin Airport', 'clouds_des',
       'temp_des', 'wind_dir_des', 'wind_spd_des', 'wind_gust_spd_des',
       'snow_rate_des', 'precip_rate_des', 'pres_des', 'uv_des', 'dewpt_des',
       'rh_des', 'slp_des', 'Previous Flight Status', 'Scheduled Arrival Hour',
       'Scheduled Arrival Minutes', 'Scheduled departure Hour',
       'Scheduled departure Minutes'],
      dtype='object')

In [87]:
combined_df.head()

Unnamed: 0,Carrier Code,Flight Number,Origin Airport,clouds_des,temp_des,wind_dir_des,wind_spd_des,wind_gust_spd_des,snow_rate_des,precip_rate_des,pres_des,uv_des,dewpt_des,rh_des,slp_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes
0,UA,538,ORD,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,True,21,47,18,52
1,UA,538,ORD,49,10.1,227,3.4,6.5,0,25.0,1007.5,8.4,6.3,78,1023.7,True,21,47,18,52
2,UA,538,ORD,90,12.2,110,1.9,2.7,0,13.19,1002.8,8.4,9.6,85,1018.7,True,21,47,18,52
3,UA,538,ORD,72,12.4,131,5.5,10.9,0,17.38,999.5,8.5,8.5,78,1015.4,True,21,47,18,52
4,9E,5340,JFK,84,12.4,171,4.2,4.6,0,42.56,1003.5,8.3,3.9,57,1019.5,True,16,21,14,55


In [88]:
# L_flight_data_merged_pred = pd.get_dummies(L_flight_data_merged_pred, drop_first = True)

NameError: name 'L_flight_data_merged_pred' is not defined

In [89]:
# Assuming flight_data_merged_pred is your DataFrame
# combined_df['Status'].replace("Late",1,inplace=True)
combined_df['Previous Flight Status'] = 1
L_flight_data_merged_pred = combined_df.copy()

combined_df['Previous Flight Status'] = 0
E_flight_data_merged_pred = combined_df.copy()

combined_df['Previous Flight Status'] = 2
O_flight_data_merged_pred = combined_df.copy()

In [90]:
L_flight_data_merged_pred = pd.get_dummies(L_flight_data_merged_pred, drop_first = True)

In [91]:
# Example: Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train_sub.columns) - set(L_flight_data_merged_pred.columns)
for c in missing_cols:
    L_flight_data_merged_pred[c] = 0
L_flight_data_merged_pred = L_flight_data_merged_pred[X_train_sub.columns]


In [92]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    L_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(L_flight_data_merged_pred), columns = L_flight_data_merged_pred.columns, index = L_flight_data_merged_pred.index)
L_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,-0.67,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
1,-0.67,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
2,-0.67,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
3,-0.67,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
4,1.32,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
5,1.32,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
6,1.32,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
7,1.32,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
8,-0.87,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76
9,-0.87,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76


In [93]:
# Predict the labels for the test data
y_pred_test_sub_L_flight = rf.predict(L_flight_data_merged_pred) 
print(y_pred_test_sub_L_flight)

[1 1 1 1 0 0 0 0 0 0 0]


In [94]:
E_flight_data_merged_pred = pd.get_dummies(E_flight_data_merged_pred, drop_first = True)

In [95]:
# Example: Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train_sub.columns) - set(E_flight_data_merged_pred.columns)
for c in missing_cols:
    E_flight_data_merged_pred[c] = 0
E_flight_data_merged_pred = E_flight_data_merged_pred[X_train_sub.columns]


In [96]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    E_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(E_flight_data_merged_pred), columns = E_flight_data_merged_pred.columns, index = E_flight_data_merged_pred.index)
E_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,-0.67,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
1,-0.67,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
2,-0.67,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
3,-0.67,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
4,1.32,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
5,1.32,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
6,1.32,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
7,1.32,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
8,-0.87,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76
9,-0.87,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76


In [97]:
# Predict the labels for the test data
y_pred_test_sub_E_flight = rf.predict(E_flight_data_merged_pred) 
print(y_pred_test_sub_E_flight)

[1 1 1 1 0 0 0 0 0 0 0]


In [98]:
O_flight_data_merged_pred = pd.get_dummies(O_flight_data_merged_pred, drop_first = True)

In [99]:
# Example: Ensuring the test dataset has the same dummy columns as the training dataset
# combined_pred = pd.get_dummies(combined_pred)
missing_cols = set(X_train_sub.columns) - set(O_flight_data_merged_pred.columns)
for c in missing_cols:
    O_flight_data_merged_pred[c] = 0
O_flight_data_merged_pred = O_flight_data_merged_pred[X_train_sub.columns]


In [100]:
if True: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    O_flight_data_merged_pred = pd.DataFrame(sc.fit_transform(O_flight_data_merged_pred), columns = O_flight_data_merged_pred.columns, index = O_flight_data_merged_pred.index)
O_flight_data_merged_pred

Unnamed: 0,Flight Number,clouds_des,dewpt_des,precip_rate_des,pres_des,rh_des,slp_des,snow_rate_des,temp_des,uv_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Previous Flight Status,Scheduled Arrival Hour,Scheduled Arrival Minutes,Scheduled departure Hour,Scheduled departure Minutes,Carrier Code_B6,Carrier Code_MQ,Carrier Code_UA,Carrier Code_WN,Origin Airport_MCO,Origin Airport_ORD
0,-0.67,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
1,-0.67,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
2,-0.67,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
3,-0.67,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,1.32,1.31,1.3,0.43,-0.61,0.0,1.32,0.0,-0.61,1.32
4,1.32,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
5,1.32,-1.87,-0.37,0.04,1.71,0.35,1.73,0.0,-2.11,0.0,1.77,0.11,-0.28,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
6,1.32,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
7,1.32,-0.28,0.6,-0.61,-1.29,0.35,-1.28,0.0,0.55,1.35,-0.55,1.5,1.26,0.0,-0.76,-0.89,-0.55,0.78,-0.61,0.0,-0.76,0.0,-0.61,-0.76
8,-0.87,0.55,-1.43,1.54,0.21,-1.57,0.21,0.0,0.55,-1.35,0.42,-0.49,0.31,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76
9,-0.87,0.97,1.08,-0.96,-0.05,0.99,-0.08,0.0,0.32,0.0,-1.05,-1.09,-1.39,0.0,-0.76,-0.56,-1.01,-1.57,1.63,0.0,-0.76,0.0,1.63,-0.76


In [101]:
# Predict the labels for the test data
y_pred_test_sub_O_flight = rf.predict(O_flight_data_merged_pred) 
print(y_pred_test_sub_O_flight)

[1 1 1 1 0 0 0 0 0 0 0]
