In [264]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
import datetime

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.core.display import display, HTML

  from IPython.core.display import display, HTML


In [265]:
Airline_Data = pd.read_csv('Detailed_Statistics_Arrivals-weather.csv',parse_dates = ["Date"])
Airline_Data.head()

Unnamed: 0,Carrier Code,Date,Flight_Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,org_snow,org_temp,org_solar_rad,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_solar_rad,dest_wind_spd
0,UA,2022-01-01,1282,N4901U,IAD,23:10,0:01,70,76,51,...,0.0,13.8,11.0,1.5,100,991.2,0.0,5.4,8.6,2.8
1,UA,2023-01-01,604,N814UA,DEN,14:58,14:52,193,177,-6,...,0.0,0.4,50.0,2.4,92,998.4,0.0,6.8,37.5,4.1
2,UA,2023-01-01,2488,N38458,EWR,23:14,23:15,75,62,1,...,0.0,11.3,95.2,3.7,92,998.4,0.0,6.8,37.5,4.1
3,UA,2023-01-01,2645,N23721,ORD,23:57,23:47,107,100,-10,...,6.5,4.6,31.4,2.2,92,998.4,0.0,6.8,37.5,4.1
4,UA,2022-01-02,1282,N4901U,IAD,23:10,23:27,70,64,17,...,0.0,15.3,20.5,2.6,98,995.8,50.5,-0.7,17.5,4.2


In [266]:
Airline_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Carrier Code                              1361 non-null   object        
 1   Date                                      1361 non-null   datetime64[ns]
 2   Flight_Number                             1361 non-null   int64         
 3   Tail Number                               1342 non-null   object        
 4   Origin Airport                            1361 non-null   object        
 5   Scheduled Arrival Time                    1361 non-null   object        
 6   Actual Arrival Time                       1361 non-null   object        
 7   Scheduled Elapsed Time (Minutes)          1361 non-null   int64         
 8   Actual Elapsed Time (Minutes)             1361 non-null   int64         
 9   Arrival Delay (Minutes)       

In [267]:
Airline_Data.isna().sum()
print(Airline_Data.columns)

Carrier Code                                 0
Date                                         0
Flight_Number                                0
Tail Number                                 19
Origin Airport                               0
Scheduled Arrival Time                       0
Actual Arrival Time                          0
Scheduled Elapsed Time (Minutes)             0
Actual Elapsed Time (Minutes)                0
Arrival Delay (Minutes)                      0
Wheels-on Time                               0
Taxi-In time (Minutes)                       0
Delay Carrier (Minutes)                      0
Delay Weather (Minutes)                      0
Delay National Aviation System (Minutes)     0
Delay Security (Minutes)                     0
Delay Late Aircraft Arrival (Minutes)        0
month                                        0
org_clouds                                   0
org_pres                                     0
org_snow                                     0
org_temp     

Index(['Carrier Code', 'Date', 'Flight_Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)', 'month', 'org_clouds',
       'org_pres', 'org_snow', 'org_temp', 'org_solar_rad', 'org_wind_spd',
       'dest_clouds', 'dest_pres', 'dest_snow', 'dest_temp', 'dest_solar_rad',
       'dest_wind_spd'],
      dtype='object')


In [268]:
month_labels = list(Airline_Data['month'].unique())
print(month_labels)
Airline_Data['month'] = Airline_Data['month'].apply(lambda x : month_labels.index(x) +1 )
Airline_Data.info()

['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 30 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Carrier Code                              1361 non-null   object        
 1   Date                                      1361 non-null   datetime64[ns]
 2   Flight_Number                             1361 non-null   int64         
 3   Tail Number                               1342 non-null   object        
 4   Origin Airport                            1361 non-null   object        
 5   Scheduled Arrival Time                    1361 non-null   object        
 6   Actual Arrival Time                       1361 non-null   object        
 7   Scheduled Elapsed Time (Minutes)          1361 non-null   int64

In [269]:
Airline_Data['Day'] = Airline_Data['Date'].dt.strftime('%w')
Airline_Data['Day'] = Airline_Data['Date'].dt.strftime('%w').astype(int)

In [270]:
required_columns = [ 'Flight_Number', 'Origin Airport', 'Scheduled Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'month', 'Day', 'org_clouds',
       'org_pres', 'org_snow', 'org_temp', 'org_wind_spd',
       'dest_clouds', 'dest_pres', 'dest_snow', 'dest_temp',
       'dest_wind_spd', 'Arrival Delay (Minutes)' ]

In [271]:
data_before_processsing = Airline_Data[required_columns]

In [272]:
data_before_processsing.head()

Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Arrival Delay (Minutes)
0,1282,IAD,23:10,70,1,6,100,996.9,0.0,13.8,1.5,100,991.2,0.0,5.4,2.8,51
1,604,DEN,14:58,193,1,0,77,818.8,0.0,0.4,2.4,92,998.4,0.0,6.8,4.1,-6
2,2488,EWR,23:14,75,1,0,49,1011.6,0.0,11.3,3.7,92,998.4,0.0,6.8,4.1,1
3,2645,ORD,23:57,107,1,0,93,990.8,6.5,4.6,2.2,92,998.4,0.0,6.8,4.1,-10
4,1282,IAD,23:10,70,1,0,100,996.1,0.0,15.3,2.6,98,995.8,50.5,-0.7,4.2,17


In [273]:
data_before_processsing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Flight_Number                     1361 non-null   int64  
 1   Origin Airport                    1361 non-null   object 
 2   Scheduled Arrival Time            1361 non-null   object 
 3   Scheduled Elapsed Time (Minutes)  1361 non-null   int64  
 4   month                             1361 non-null   int64  
 5   Day                               1361 non-null   int64  
 6   org_clouds                        1361 non-null   int64  
 7   org_pres                          1361 non-null   float64
 8   org_snow                          1361 non-null   float64
 9   org_temp                          1361 non-null   float64
 10  org_wind_spd                      1361 non-null   float64
 11  dest_clouds                       1361 non-null   int64  
 12  dest_p

In [274]:
data_before_processsing['Flight_Number'] = data_before_processsing['Flight_Number'].astype(str)
data_before_processsing['Scheduled Arrival Time'] = data_before_processsing['Scheduled Arrival Time'].apply(lambda x : (float(x.split(":")[0]) + float(x.split(":")[1])/60.0)) 
data_before_processsing['org_clouds'] = data_before_processsing['org_clouds'].astype(float)
data_before_processsing['dest_clouds'] = data_before_processsing['dest_clouds'].astype(float)


data_before_processsing.info()
data_before_processsing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Flight_Number                     1361 non-null   object 
 1   Origin Airport                    1361 non-null   object 
 2   Scheduled Arrival Time            1361 non-null   float64
 3   Scheduled Elapsed Time (Minutes)  1361 non-null   int64  
 4   month                             1361 non-null   int64  
 5   Day                               1361 non-null   int64  
 6   org_clouds                        1361 non-null   float64
 7   org_pres                          1361 non-null   float64
 8   org_snow                          1361 non-null   float64
 9   org_temp                          1361 non-null   float64
 10  org_wind_spd                      1361 non-null   float64
 11  dest_clouds                       1361 non-null   float64
 12  dest_p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_before_processsing['Flight_Number'] = data_before_processsing['Flight_Number'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_before_processsing['Scheduled Arrival Time'] = data_before_processsing['Scheduled Arrival Time'].apply(lambda x : (float(x.split(":")[0]) + float(x.split(":")[1])/60.0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Arrival Delay (Minutes)
0,1282,IAD,23.17,70,1,6,100.0,996.9,0.0,13.8,1.5,100.0,991.2,0.0,5.4,2.8,51
1,604,DEN,14.97,193,1,0,77.0,818.8,0.0,0.4,2.4,92.0,998.4,0.0,6.8,4.1,-6
2,2488,EWR,23.23,75,1,0,49.0,1011.6,0.0,11.3,3.7,92.0,998.4,0.0,6.8,4.1,1
3,2645,ORD,23.95,107,1,0,93.0,990.8,6.5,4.6,2.2,92.0,998.4,0.0,6.8,4.1,-10
4,1282,IAD,23.17,70,1,0,100.0,996.1,0.0,15.3,2.6,98.0,995.8,50.5,-0.7,4.2,17


In [275]:
data_before_processsing['org_clouds'] = data_before_processsing['org_clouds'].astype(float)
data_before_processsing['dest_clouds'] = data_before_processsing['dest_clouds'].astype(float)


data_before_processsing.info()
data_before_processsing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Flight_Number                     1361 non-null   object 
 1   Origin Airport                    1361 non-null   object 
 2   Scheduled Arrival Time            1361 non-null   float64
 3   Scheduled Elapsed Time (Minutes)  1361 non-null   int64  
 4   month                             1361 non-null   int64  
 5   Day                               1361 non-null   int64  
 6   org_clouds                        1361 non-null   float64
 7   org_pres                          1361 non-null   float64
 8   org_snow                          1361 non-null   float64
 9   org_temp                          1361 non-null   float64
 10  org_wind_spd                      1361 non-null   float64
 11  dest_clouds                       1361 non-null   float64
 12  dest_p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_before_processsing['org_clouds'] = data_before_processsing['org_clouds'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_before_processsing['dest_clouds'] = data_before_processsing['dest_clouds'].astype(float)


Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Arrival Delay (Minutes)
0,1282,IAD,23.17,70,1,6,100.0,996.9,0.0,13.8,1.5,100.0,991.2,0.0,5.4,2.8,51
1,604,DEN,14.97,193,1,0,77.0,818.8,0.0,0.4,2.4,92.0,998.4,0.0,6.8,4.1,-6
2,2488,EWR,23.23,75,1,0,49.0,1011.6,0.0,11.3,3.7,92.0,998.4,0.0,6.8,4.1,1
3,2645,ORD,23.95,107,1,0,93.0,990.8,6.5,4.6,2.2,92.0,998.4,0.0,6.8,4.1,-10
4,1282,IAD,23.17,70,1,0,100.0,996.1,0.0,15.3,2.6,98.0,995.8,50.5,-0.7,4.2,17


In [276]:
def classify_status(delay):
    if delay <= -10:
        return 'Early'
    elif delay >= -10 and delay <= 10:
        return 'On-time'
    elif delay > 10 and delay <= 30:
        return 'Late'
    else:
        return 'Severely late'

In [277]:
data_before_processsing['Status'] = data_before_processsing['Arrival Delay (Minutes)'].apply(classify_status)

data_before_processsing.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_before_processsing['Status'] = data_before_processsing['Arrival Delay (Minutes)'].apply(classify_status)


Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Arrival Delay (Minutes),Status
0,1282,IAD,23.17,70,1,6,100.0,996.9,0.0,13.8,1.5,100.0,991.2,0.0,5.4,2.8,51,Severely late
1,604,DEN,14.97,193,1,0,77.0,818.8,0.0,0.4,2.4,92.0,998.4,0.0,6.8,4.1,-6,On-time
2,2488,EWR,23.23,75,1,0,49.0,1011.6,0.0,11.3,3.7,92.0,998.4,0.0,6.8,4.1,1,On-time
3,2645,ORD,23.95,107,1,0,93.0,990.8,6.5,4.6,2.2,92.0,998.4,0.0,6.8,4.1,-10,Early
4,1282,IAD,23.17,70,1,0,100.0,996.1,0.0,15.3,2.6,98.0,995.8,50.5,-0.7,4.2,17,Late


In [278]:
required_columns2 = [ 'Flight_Number', 'Origin Airport', 'Scheduled Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'month', 'Day', 'org_clouds',
       'org_pres', 'org_snow', 'org_temp', 'org_wind_spd',
       'dest_clouds', 'dest_pres', 'dest_snow', 'dest_temp',
       'dest_wind_spd', 'Status' ]

In [279]:
data_before_processsing = data_before_processsing[required_columns2]

data_before_processsing.head()
data_before_processsing.info()

Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Status
0,1282,IAD,23.17,70,1,6,100.0,996.9,0.0,13.8,1.5,100.0,991.2,0.0,5.4,2.8,Severely late
1,604,DEN,14.97,193,1,0,77.0,818.8,0.0,0.4,2.4,92.0,998.4,0.0,6.8,4.1,On-time
2,2488,EWR,23.23,75,1,0,49.0,1011.6,0.0,11.3,3.7,92.0,998.4,0.0,6.8,4.1,On-time
3,2645,ORD,23.95,107,1,0,93.0,990.8,6.5,4.6,2.2,92.0,998.4,0.0,6.8,4.1,Early
4,1282,IAD,23.17,70,1,0,100.0,996.1,0.0,15.3,2.6,98.0,995.8,50.5,-0.7,4.2,Late


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Flight_Number                     1361 non-null   object 
 1   Origin Airport                    1361 non-null   object 
 2   Scheduled Arrival Time            1361 non-null   float64
 3   Scheduled Elapsed Time (Minutes)  1361 non-null   int64  
 4   month                             1361 non-null   int64  
 5   Day                               1361 non-null   int64  
 6   org_clouds                        1361 non-null   float64
 7   org_pres                          1361 non-null   float64
 8   org_snow                          1361 non-null   float64
 9   org_temp                          1361 non-null   float64
 10  org_wind_spd                      1361 non-null   float64
 11  dest_clouds                       1361 non-null   float64
 12  dest_p

In [280]:
categorical_varibales = ['Flight_Number', 'Origin Airport']

cvar = ['Flight_Number', 'Origin Airport', 'Status']

non_categoricalvariables = list(set(data_before_processsing.columns) - set(cvar))

print(non_categoricalvariables)


['dest_clouds', 'Scheduled Elapsed Time (Minutes)', 'dest_pres', 'dest_temp', 'org_temp', 'Day', 'org_clouds', 'Scheduled Arrival Time', 'dest_snow', 'org_pres', 'org_snow', 'org_wind_spd', 'dest_wind_spd', 'month']


In [281]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
tempdata = pd.DataFrame(sc.fit_transform(data_before_processsing[non_categoricalvariables]), columns = non_categoricalvariables, index = data_before_processsing.index)

for nc in non_categoricalvariables :
    data_before_processsing[nc] = tempdata[nc]

In [282]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore',sparse=False)
dt = enc.fit(data_before_processsing[categorical_varibales])
values = dt.transform(data_before_processsing[categorical_varibales])
columnnames = enc.get_feature_names_out(categorical_varibales)
tempT = np.transpose(values)
   



In [283]:
for i in range(len(tempT)) :
    data_before_processsing[columnnames[i]] = pd.DataFrame(tempT[i], index = data_before_processsing.index)
    
data_before_processsing.head()

Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
0,1282,IAD,0.81,-1.14,-1.79,1.52,1.53,0.54,-0.17,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,604,DEN,-1.08,1.48,-1.79,-1.49,0.77,-1.77,-0.17,-1.31,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2488,EWR,0.82,-1.03,-1.79,-1.49,-0.15,0.73,-0.17,-0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2645,ORD,0.99,-0.35,-1.79,-1.49,1.3,0.46,0.09,-0.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1282,IAD,0.81,-1.14,-1.79,-1.49,1.53,0.53,-0.17,0.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [284]:
#data = pd.get_dummies(data_before_processsing.drop(columns=['Status']))
data = data_before_processsing.drop(columns=cvar)

data.head()
columns = data.columns

for v in categorical_varibales :
    if v in columns :
        print(v,'is present')

labels = data_before_processsing['Status']
labels,class_names = pd.factorize(labels)

labelsdf = pd.DataFrame(labels, index = data.index, columns = ['Status'])
print(class_names)

Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
0,0.81,-1.14,-1.79,1.52,1.53,0.54,-0.17,0.03,-1.39,1.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.08,1.48,-1.79,-1.49,0.77,-1.77,-0.17,-1.31,-0.79,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.82,-1.03,-1.79,-1.49,-0.15,0.73,-0.17,-0.22,0.09,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.99,-0.35,-1.79,-1.49,1.3,0.46,0.09,-0.89,-0.92,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.81,-1.14,-1.79,-1.49,1.53,0.53,-0.17,0.18,-0.65,1.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Index(['Severely late', 'On-time', 'Early', 'Late'], dtype='object')


In [285]:
X_train, X_test, y_train, y_test = train_test_split(data, labelsdf, test_size = 0.15, stratify=labelsdf, random_state = 42)

len(X_train)
len(X_test)

X_train.head()
y_train.head()
X_test.head()
y_test.head()

1156

205

Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
657,-1.05,1.61,0.0,-0.49,1.1,-1.64,-0.17,1.03,0.22,-0.89,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
982,0.32,-0.14,0.9,1.02,-0.71,0.57,-0.17,0.66,-1.06,-0.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
846,-1.03,1.61,0.6,1.52,-1.28,-1.56,-0.17,1.01,-0.05,0.21,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
558,0.91,-1.07,0.0,-0.49,1.2,0.65,-0.17,1.23,-0.25,0.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
907,0.94,-0.84,0.6,0.52,1.5,0.82,-0.17,1.16,-1.39,-0.74,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


Unnamed: 0,Status
657,2
982,2
846,3
558,1
907,2


Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
162,0.31,-0.31,-1.49,-0.49,1.53,0.41,11.66,-1.31,0.83,1.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1329,-1.08,1.48,1.49,1.02,-0.71,-1.61,-0.17,-3.12,-1.06,1.27,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
956,-1.03,1.63,0.6,-0.99,-1.7,-1.58,-0.17,0.57,-0.58,-0.51,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1236,0.28,-0.26,1.2,-0.99,0.47,0.49,-0.17,-1.16,0.56,0.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
878,0.94,-0.84,0.6,1.02,-1.31,0.77,-0.17,0.79,-0.18,-1.04,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


Unnamed: 0,Status
162,2
1329,3
956,1
1236,2
878,2


In [286]:
# Fitting Classifier to the Training Set
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#classifier = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)
classifier =  RandomForestClassifier(n_estimators=100,criterion='gini', max_depth=4, random_state=42)
classifier.fit(X_train, y_train)

  classifier.fit(X_train, y_train)


In [287]:
classifier.score(X_train,y_train)

0.5129757785467128

In [288]:
# Model performance on training set
y_pred_train =pd.DataFrame(classifier.predict(X_train), index = X_train.index, columns = ['rf_pred'])
y_pred_train.head()

Unnamed: 0,rf_pred
657,1
982,2
846,1
558,1
907,2


In [289]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [290]:
accuracy = metrics.accuracy_score(y_train, y_pred_train)
print("Accuracy: {:.2f}".format(accuracy))
cm=confusion_matrix(y_train,y_pred_train)
print('Confusion Matrix: \n', cm)
print(classification_report(y_train, y_pred_train, target_names=class_names))

Accuracy: 0.51
Confusion Matrix: 
 [[  1 139  13   0]
 [  0 474  22   0]
 [  0 251 118   0]
 [  0 131   7   0]]
               precision    recall  f1-score   support

Severely late       1.00      0.01      0.01       153
      On-time       0.48      0.96      0.64       496
        Early       0.74      0.32      0.45       369
         Late       0.00      0.00      0.00       138

     accuracy                           0.51      1156
    macro avg       0.55      0.32      0.27      1156
 weighted avg       0.57      0.51      0.42      1156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [291]:
# Model performance on training set
y_pred_test =pd.DataFrame(classifier.predict(X_test), index = X_test.index, columns = ['rf_pred'])
y_pred_test.head()

Unnamed: 0,rf_pred
162,1
1329,1
956,1
1236,1
878,2


In [292]:
accuracy = metrics.accuracy_score(y_test, y_pred_test)
print("Accuracy: {:.2f}".format(accuracy))

from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_test,y_pred_test)
print('Confusion Matrix: \n', cm)
print(classification_report(y_test, y_pred_test, target_names=class_names))

Accuracy: 0.47
Confusion Matrix: 
 [[ 0 25  2  0]
 [ 0 81  7  0]
 [ 0 50 15  0]
 [ 0 24  1  0]]
               precision    recall  f1-score   support

Severely late       0.00      0.00      0.00        27
      On-time       0.45      0.92      0.60        88
        Early       0.60      0.23      0.33        65
         Late       0.00      0.00      0.00        25

     accuracy                           0.47       205
    macro avg       0.26      0.29      0.23       205
 weighted avg       0.38      0.47      0.37       205



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [293]:
def addOutputRF(features,ouput) :
    x_train_with_op =  features.copy()
    x_train_with_op = x_train_with_op.merge(ouput, left_index = True, right_index = True)
    return x_train_with_op


In [294]:
xrf_train = addOutputRF(X_train,y_pred_train)
xrf_test = addOutputRF(X_test,y_pred_test)

xrf_train.head()
xrf_test.head()

Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD,rf_pred
657,-1.05,1.61,0.0,-0.49,1.1,-1.64,-0.17,1.03,0.22,-0.89,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
982,0.32,-0.14,0.9,1.02,-0.71,0.57,-0.17,0.66,-1.06,-0.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
846,-1.03,1.61,0.6,1.52,-1.28,-1.56,-0.17,1.01,-0.05,0.21,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
558,0.91,-1.07,0.0,-0.49,1.2,0.65,-0.17,1.23,-0.25,0.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
907,0.94,-0.84,0.6,0.52,1.5,0.82,-0.17,1.16,-1.39,-0.74,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2


Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD,rf_pred
162,0.31,-0.31,-1.49,-0.49,1.53,0.41,11.66,-1.31,0.83,1.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1329,-1.08,1.48,1.49,1.02,-0.71,-1.61,-0.17,-3.12,-1.06,1.27,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
956,-1.03,1.63,0.6,-0.99,-1.7,-1.58,-0.17,0.57,-0.58,-0.51,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
1236,0.28,-0.26,1.2,-0.99,0.47,0.49,-0.17,-1.16,0.56,0.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
878,0.94,-0.84,0.6,1.02,-1.31,0.77,-0.17,0.79,-0.18,-1.04,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2


In [295]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=15)

In [296]:
knn.fit(xrf_train, y_train)

  return self._fit(X, y)


In [297]:
knn.score(xrf_train, y_train)

0.5294117647058824

In [298]:
knn_pred = pd.DataFrame(knn.predict(xrf_test), index = xrf_test.index, columns = ['knn_pred'])

In [299]:
accuracy = metrics.accuracy_score(y_test, knn_pred)
print("Accuracy: {:.2f}".format(accuracy))

from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_test,knn_pred)
print('Confusion Matrix: \n', cm)
print(classification_report(y_test, knn_pred, target_names=class_names))

Accuracy: 0.40
Confusion Matrix: 
 [[ 3 18  6  0]
 [ 4 61 23  0]
 [ 3 45 17  0]
 [ 2 19  3  1]]
               precision    recall  f1-score   support

Severely late       0.25      0.11      0.15        27
      On-time       0.43      0.69      0.53        88
        Early       0.35      0.26      0.30        65
         Late       1.00      0.04      0.08        25

     accuracy                           0.40       205
    macro avg       0.51      0.28      0.26       205
 weighted avg       0.45      0.40      0.35       205



In [300]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 60, min_samples_leaf = 4, max_depth = 5, n_estimators = 70)

gb = gb.fit(xrf_train, y_train) 
gb.score(xrf_train, y_train) 


  y = column_or_1d(y, warn=True)


0.7923875432525952

In [301]:
test_output = pd.DataFrame(gb.predict(xrf_test), index = xrf_test.index, columns = ['pred_Y'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
gb.score(xrf_test, y_test) 

Unnamed: 0,pred_Y,Status
162,1,2
1329,1,3
956,1,1
1236,1,2
878,2,2


Fraction of correct classification 


0.3804878048780488

In [302]:
predict_Apr_Data = pd.read_csv('predict_Arrivals.csv',parse_dates = ["Date"])
predict_Apr_Data.head()
predict_Apr_Data.info()

Unnamed: 0,Date,Day,Origin Airport,Flight Number,Arrival Time,Scheduled Elapsed Time (Minutes),month,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd
0,2023-04-21,Friday,ORD,UA 3839,10:00 AM,110,April,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0
1,2023-04-21,Friday,ORD,UA 3524,4:50 PM,115,April,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0
2,2023-04-21,Friday,ORD,UA 538,9:34 PM,114,April,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0
3,2023-04-22,Saturday,ORD,UA 3839,10:00 AM,110,April,77,987.8,0.0,6.3,4.8,74,984.5,0,16.1,4.6
4,2023-04-22,Saturday,ORD,UA 3524,4:50 PM,115,April,77,987.8,0.0,6.3,4.8,74,984.5,0,16.1,4.6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              32 non-null     datetime64[ns]
 1   Day                               32 non-null     object        
 2   Origin Airport                    32 non-null     object        
 3   Flight Number                     32 non-null     object        
 4   Arrival Time                      32 non-null     object        
 5   Scheduled Elapsed Time (Minutes)  32 non-null     int64         
 6   month                             32 non-null     object        
 7   org_clouds                        32 non-null     int64         
 8   org_pres                          32 non-null     float64       
 9   org_snow                          32 non-null     float64       
 10  org_temp                          32 non-null     fl

In [303]:
print(predict_Apr_Data['Flight Number'].head())

0    UA 3839
1    UA 3524
2     UA 538
3    UA 3839
4    UA 3524
Name: Flight Number, dtype: object


In [304]:
predict_Apr_Data['Flight_Number'] = predict_Apr_Data['Flight Number'].apply(lambda x : x.split(" ")[1])
predict_Apr_Data['Scheduled Arrival Time'] = pd.to_datetime(predict_Apr_Data['Arrival Time']).dt.time
predict_Apr_Data['Scheduled Arrival Time'] = predict_Apr_Data['Scheduled Arrival Time'] .astype(str)

predict_Apr_Data['Scheduled Arrival Time'] = predict_Apr_Data['Scheduled Arrival Time'].apply(lambda x : (float(x.split(":")[0]) + float(x.split(":")[1])/60.0)) 

predict_Apr_Data['month'] = predict_Apr_Data['month'].apply(lambda x : month_labels.index(x) +1 )
predict_Apr_Data['Day'] = predict_Apr_Data['Date'].dt.strftime('%w')
predict_Apr_Data['Day'] = predict_Apr_Data['Date'].dt.strftime('%w').astype(int)

predict_Apr_Data.info()
predict_Apr_Data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 19 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              32 non-null     datetime64[ns]
 1   Day                               32 non-null     int64         
 2   Origin Airport                    32 non-null     object        
 3   Flight Number                     32 non-null     object        
 4   Arrival Time                      32 non-null     object        
 5   Scheduled Elapsed Time (Minutes)  32 non-null     int64         
 6   month                             32 non-null     int64         
 7   org_clouds                        32 non-null     int64         
 8   org_pres                          32 non-null     float64       
 9   org_snow                          32 non-null     float64       
 10  org_temp                          32 non-null     fl

Unnamed: 0,Date,Day,Origin Airport,Flight Number,Arrival Time,Scheduled Elapsed Time (Minutes),month,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd,Flight_Number,Scheduled Arrival Time
0,2023-04-21,5,ORD,UA 3839,10:00 AM,110,4,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0,3839,10.0
1,2023-04-21,5,ORD,UA 3524,4:50 PM,115,4,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0,3524,16.83
2,2023-04-21,5,ORD,UA 538,9:34 PM,114,4,82,991.3,0.0,11.0,4.0,54,997.3,0,17.5,3.0,538,21.57
3,2023-04-22,6,ORD,UA 3839,10:00 AM,110,4,77,987.8,0.0,6.3,4.8,74,984.5,0,16.1,4.6,3839,10.0
4,2023-04-22,6,ORD,UA 3524,4:50 PM,115,4,77,987.8,0.0,6.3,4.8,74,984.5,0,16.1,4.6,3524,16.83


In [305]:
predict_Apr_Data['org_clouds'] = predict_Apr_Data['org_clouds'].astype(float)
predict_Apr_Data['dest_clouds'] = predict_Apr_Data['dest_clouds'].astype(float)

In [306]:
selectf = [ 'Flight_Number', 'Origin Airport', 'Scheduled Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'month', 'Day', 'org_clouds',
       'org_pres', 'org_snow', 'org_temp', 'org_wind_spd',
       'dest_clouds', 'dest_pres', 'dest_snow', 'dest_temp',
       'dest_wind_spd' ]

In [307]:
databefore_scaling = predict_Apr_Data[selectf]
databefore_scaling.head()

Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd
0,3839,ORD,10.0,110,4,5,82.0,991.3,0.0,11.0,4.0,54.0,997.3,0,17.5,3.0
1,3524,ORD,16.83,115,4,5,82.0,991.3,0.0,11.0,4.0,54.0,997.3,0,17.5,3.0
2,538,ORD,21.57,114,4,5,82.0,991.3,0.0,11.0,4.0,54.0,997.3,0,17.5,3.0
3,3839,ORD,10.0,110,4,6,77.0,987.8,0.0,6.3,4.8,74.0,984.5,0,16.1,4.6
4,3524,ORD,16.83,115,4,6,77.0,987.8,0.0,6.3,4.8,74.0,984.5,0,16.1,4.6


In [308]:
tempdata = pd.DataFrame(sc.transform(databefore_scaling[non_categoricalvariables]), columns = non_categoricalvariables, index = databefore_scaling.index)

for nc in non_categoricalvariables :
    databefore_scaling[nc] = tempdata[nc]
    
databefore_scaling.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  databefore_scaling[nc] = tempdata[nc]


Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,dest_pres,dest_snow,dest_temp,dest_wind_spd
0,3839,ORD,-2.22,-0.29,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,-0.57,-0.19,0.57,-0.52
1,3524,ORD,-0.65,-0.18,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,-0.57,-0.19,0.57,-0.52
2,538,ORD,0.44,-0.2,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,-0.57,-0.19,0.57,-0.52
3,3839,ORD,-2.22,-0.29,-0.89,1.52,0.77,0.42,-0.17,-0.72,0.83,0.32,-2.49,-0.19,0.43,0.55
4,3524,ORD,-0.65,-0.18,-0.89,1.52,0.77,0.42,-0.17,-0.72,0.83,0.32,-2.49,-0.19,0.43,0.55


In [310]:
values2 = dt.transform(databefore_scaling[categorical_varibales])
tempT2 = np.transpose(values2)

In [311]:
for i in range(len(tempT2)) :
    databefore_scaling[columnnames[i]] = pd.DataFrame(tempT2[i], index = databefore_scaling.index)
    
databefore_scaling.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  databefore_scaling[columnnames[i]] = pd.DataFrame(tempT2[i], index = databefore_scaling.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  databefore_scaling[columnnames[i]] = pd.DataFrame(tempT2[i], index = databefore_scaling.index)


Unnamed: 0,Flight_Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
0,3839,ORD,-2.22,-0.29,-0.89,1.02,0.94,0.46,-0.17,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3524,ORD,-0.65,-0.18,-0.89,1.02,0.94,0.46,-0.17,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,538,ORD,0.44,-0.2,-0.89,1.02,0.94,0.46,-0.17,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3839,ORD,-2.22,-0.29,-0.89,1.52,0.77,0.42,-0.17,-0.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3524,ORD,-0.65,-0.18,-0.89,1.52,0.77,0.42,-0.17,-0.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [314]:
apr_pred_featues = databefore_scaling[X_train.columns]
apr_pred_featues.head(33)

Unnamed: 0,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),month,Day,org_clouds,org_pres,org_snow,org_temp,org_wind_spd,dest_clouds,...,Flight_Number_604,Flight_Number_652,Flight_Number_702,Flight_Number_776,Flight_Number_790,Flight_Number_794,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
0,-2.22,-0.29,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.65,-0.18,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.44,-0.2,-0.89,1.02,0.94,0.46,-0.17,-0.25,0.29,-0.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-2.22,-0.29,-0.89,1.52,0.77,0.42,-0.17,-0.72,0.83,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.65,-0.18,-0.89,1.52,0.77,0.42,-0.17,-0.72,0.83,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.44,-0.2,-0.89,1.52,0.77,0.42,-0.17,-0.72,0.83,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,-2.22,-0.29,-0.89,-1.49,1.23,0.5,-0.17,-0.79,0.43,0.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,-0.63,-0.18,-0.89,-1.49,1.23,0.5,-0.17,-0.79,0.43,0.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.44,-0.2,-0.89,-1.49,1.23,0.5,-0.17,-0.79,0.43,0.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,-2.22,-0.29,-0.89,-0.99,-0.55,0.55,-0.17,-0.68,-0.58,0.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [315]:
apr_pred_test =pd.DataFrame(classifier.predict(apr_pred_featues), index = apr_pred_featues.index, columns = ['Status (Early, On-time, Late, Severly Late)'])
apr_pred_test.head()

Unnamed: 0,pred
0,1
1,1
2,1
3,1
4,1


In [318]:
classnames = list(class_names)

apr_pred_test['Status (Early, On-time, Late, Severly Late)'] = apr_pred_test['Status (Early, On-time, Late, Severly Late)'].apply(lambda x : classnames[x])
apr_pred_test.head()

Unnamed: 0,pred
0,On-time
1,On-time
2,On-time
3,On-time
4,On-time


In [319]:
select_columns = ['Date','Day', 'Origin Airport', 'Flight Number', 'Arrival Time']
pred_data = predict_Apr_Data[select_columns]
pred_data = pred_data.merge(apr_pred_test, left_index = True, right_index = True)
pred_data.head()

Unnamed: 0,Date,Day,Origin Airport,Flight Number,Arrival Time,pred
0,2023-04-21,5,ORD,UA 3839,10:00 AM,On-time
1,2023-04-21,5,ORD,UA 3524,4:50 PM,On-time
2,2023-04-21,5,ORD,UA 538,9:34 PM,On-time
3,2023-04-22,6,ORD,UA 3839,10:00 AM,On-time
4,2023-04-22,6,ORD,UA 3524,4:50 PM,On-time


In [320]:
pred_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            32 non-null     datetime64[ns]
 1   Day             32 non-null     int64         
 2   Origin Airport  32 non-null     object        
 3   Flight Number   32 non-null     object        
 4   Arrival Time    32 non-null     object        
 5   pred            32 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 1.6+ KB


In [322]:
pred_data['Date'] = pred_data['Date'].dt.strftime('%m/%d/%Y')
pred_data.head()

Unnamed: 0,Date,Day,Origin Airport,Flight Number,Arrival Time,pred
0,04/21/2023,5,ORD,UA 3839,10:00 AM,On-time
1,04/21/2023,5,ORD,UA 3524,4:50 PM,On-time
2,04/21/2023,5,ORD,UA 538,9:34 PM,On-time
3,04/22/2023,6,ORD,UA 3839,10:00 AM,On-time
4,04/22/2023,6,ORD,UA 3524,4:50 PM,On-time


In [324]:
pred_data.to_csv("predicted_results.csv",index=False)