In [1]:
!pip install graphviz



In [2]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [3]:
# fetch data 
airline_data = pd.read_csv('flight_dataset_2022_23.csv')
#flight_data = pd.read_csv('On_Time_Marketing_Carrier_On_Time_Performance_(Beginning_January_2018)_2023_12.csv')
airline_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des
0,9E,2023-01-01,5190.0,N325PQ,JFK,22:41:00,22:13,91.0,68.0,-28.0,...,1672625700,0.0,12,804,c04n,Overcast clouds,300,2.2,0.9,SYR
1,9E,2022-01-02,5531.0,N678CA,JFK,14:12:00,17:12,77.0,84.0,180.0,...,1641146400,0.6,8,c04d,Overcast clouds,804,280,9.2,4.59,SYR
2,9E,2023-01-02,5190.0,N195PQ,JFK,22:41:00,22:46,91.0,77.0,5.0,...,1672712100,0.0,16,804,c04n,Overcast clouds,60,2.4,0.37,SYR
3,9E,2022-01-03,5531.0,N602LR,JFK,14:12:00,14:24,77.0,81.0,12.0,...,1641232800,0.8,16,c04d,Overcast clouds,804,310,7.6,3.1,SYR
4,9E,2023-01-03,5190.0,N303PQ,JFK,22:41:00,22:23,91.0,78.0,-18.0,...,1672798500,0.0,9,804,c04n,Overcast clouds,145,4.6,1.65,SYR


In [4]:
airline_data.shape

(5623, 90)

In [6]:
pd.set_option('display.max_columns', None)
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [7]:
OP_CARRIER_unique_values = airline_data['Carrier Code'].unique()
print(OP_CARRIER_unique_values)

['9E' 'MQ' 'B6' 'WN' 'UA']


In [8]:
airline_data.isna().sum()

Carrier Code          0
Date (MM/DD/YYYY)     0
Flight Number         0
Tail Number          11
Origin Airport        0
                     ..
weather.code_des      0
wind_dir_des          0
wind_gust_spd_des     0
wind_spd_des          0
Location_des          0
Length: 90, dtype: int64

In [9]:
airline_data.dropna(inplace=True)

In [10]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
weather.code_des     0
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Length: 90, dtype: int64

In [11]:
airline_data['Arrival Delay (Minutes)']=airline_data['Arrival Delay (Minutes)'].astype(np.int64)

In [12]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5612 entries, 0 to 5622
Data columns (total 90 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Carrier Code                                5612 non-null   object 
 1   Date (MM/DD/YYYY)                           5612 non-null   object 
 2   Flight Number                               5612 non-null   float64
 3   Tail Number                                 5612 non-null   object 
 4   Origin Airport                              5612 non-null   object 
 5   Scheduled Arrival Time                      5612 non-null   object 
 6   Actual Arrival Time                         5612 non-null   object 
 7   Scheduled Elapsed Time (Minutes)            5612 non-null   float64
 8   Actual Elapsed Time (Minutes)               5612 non-null   float64
 9   Arrival Delay (Minutes)                     5612 non-null   int64  
 10  Wheels-on Time   

In [14]:
def set_dependent_variable(time):
    if time<-5:
        return 'Early'
    elif (time>=-5 and time<=5):
        return 'On-time'
    else:
        return 'Late'

In [15]:
airline_data['Status']=airline_data['Arrival Delay (Minutes)'].apply(set_dependent_variable)

In [16]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [17]:
airline_data.isna().sum()

Carrier Code         0
Date (MM/DD/YYYY)    0
Flight Number        0
Tail Number          0
Origin Airport       0
                    ..
wind_dir_des         0
wind_gust_spd_des    0
wind_spd_des         0
Location_des         0
Status               0
Length: 91, dtype: int64

In [18]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [19]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [21]:
numeric_columns = airline_data.select_dtypes(include=['float64', 'int64'])

correl = numeric_columns.corr()

trace = go.Heatmap(z=correl.values,
                  x=correl.index.values,
                  y=correl.columns.values)
data=[trace]
plotly.offline.iplot(data, filename='Airline data heatmap')

In [22]:
airline_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',
       'Arrival_time', 'departure_time', 'app_temp',

In [25]:
airline_data.drop(columns=['Carrier Code',  'Tail Number','Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)_x', 'Delay Weather (Minutes)_x',
       'Delay National Aviation System (Minutes)_x',
       'Delay Security (Minutes)_x', 'Delay Late Aircraft Arrival (Minutes)_x', 'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)',
       'Delay Carrier (Minutes)_y', 'Delay Weather (Minutes)_y',
       'Delay National Aviation System (Minutes)_y',
       'Delay Security (Minutes)_y', 'Delay Late Aircraft Arrival (Minutes)_y',],inplace=True)
airline_data.head()

Unnamed: 0,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,Arrival_time,departure_time,app_temp,azimuth,clouds,dewpt,dhi,dni,elev_angle,ghi,pod,precip_rate,pres,revision_status,rh,slp,snow_rate,solar_rad,temp,timestamp_local,timestamp_utc,ts,uv,vis,weather.code,weather.icon,weather.description,wind_dir,wind_gust_spd,wind_spd,Location,app_temp_des,azimuth_des,clouds_des,dewpt_des,dhi_des,dni_des,elev_angle_des,ghi_des,pod_des,precip_rate_des,pres_des,revision_status_des,rh_des,slp_des,snow_rate_des,solar_rad_des,temp_des,timestamp_local_des,timestamp_utc_des,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Location_des,Status
0,2023-01-01,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-01 22:45:00,2023-01-01 21:15:00,4.8,314.5,100,3.3,0,0,-66.2,0,n,0.0,1016,final,76,1017,0.0,0,7.2,2023-01-01 22:45:00,2023-01-02 03:45:00,1672631100,0.0,16,Overcast clouds,804,c04n,230,5.2,3.6,JFK,3.4,285.8,100,1.8,0,0,-49.1,0,n,0.0,1000,final,88,1015,0.0,0,3.6,2023-01-01 21:15:00,2023-01-02 02:15:00,1672625700,0.0,12,804,c04n,Overcast clouds,300,2.2,0.9,SYR,Early
1,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,2022-01-02 14:15:00,2022-01-02 13:00:00,14.7,212.9,93,11.8,77,674,18.8,290,d,0.0,1002,final,83,1003,0.0,76,14.7,2022-01-02 14:15:00,2022-01-02 19:15:00,1641150900,0.6,16,804,c04d,Overcast clouds,255,9.0,4.84,JFK,-10.9,192.9,100,-6.2,86,736,22.9,365,d,0.0,994,final,91,1009,0.0,123,-5.0,2022-01-02 13:00:00,2022-01-02 18:00:00,1641146400,0.6,8,c04d,Overcast clouds,804,280,9.2,4.59,SYR,Late
2,2023-01-02,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-02 22:45:00,2023-01-02 21:15:00,9.4,314.4,100,8.2,0,0,-66.1,0,n,0.0,1020,final,92,1021,0.0,0,9.4,2023-01-02 22:45:00,2023-01-03 03:45:00,1672717500,0.0,16,Overcast clouds,804,c04n,160,3.4,2.5,JFK,5.8,285.8,100,0.9,0,0,-48.9,0,n,0.0,1005,final,78,1021,0.0,0,4.4,2023-01-02 21:15:00,2023-01-03 02:15:00,1672712100,0.0,16,804,c04n,Overcast clouds,60,2.4,0.37,SYR,On-time
3,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,2022-01-03 14:15:00,2022-01-03 13:00:00,-8.6,212.8,100,-10.7,78,676,19.0,292,d,0.88,1015,final,49,1016,14.88,107,-1.4,2022-01-03 14:15:00,2022-01-03 19:15:00,1641237300,0.5,16,600,s01d,Light snow,360,11.8,9.05,JFK,-13.0,192.8,87,-14.4,86,738,23.0,367,d,0.0,1009,final,59,1025,0.0,161,-7.8,2022-01-03 13:00:00,2022-01-03 18:00:00,1641232800,0.8,16,c04d,Overcast clouds,804,310,7.6,3.1,SYR,Late
4,2023-01-03,5190.0,JFK,22:41:00,SYR,21:10:00,2023-01-03 22:45:00,2023-01-03 21:15:00,8.3,314.4,100,7.7,0,0,-65.9,0,n,0.0,1009,final,96,1009,0.0,0,8.3,2023-01-03 22:45:00,2023-01-04 03:45:00,1672803900,0.0,8,Overcast clouds,804,c04n,180,9.2,2.1,JFK,3.8,285.8,100,3.8,0,0,-48.8,0,n,0.0,991,final,92,1006,0.0,0,5.0,2023-01-03 21:15:00,2023-01-04 02:15:00,1672798500,0.0,9,804,c04n,Overcast clouds,145,4.6,1.65,SYR,Early


In [26]:
airline_data.columns

Index(['Date (MM/DD/YYYY)', 'Flight Number', 'Origin Airport',
       'Scheduled Arrival Time', 'Destination Airport',
       'Scheduled departure time', 'Arrival_time', 'departure_time',
       'app_temp', 'azimuth', 'clouds', 'dewpt', 'dhi', 'dni', 'elev_angle',
       'ghi', 'pod', 'precip_rate', 'pres', 'revision_status', 'rh', 'slp',
       'snow_rate', 'solar_rad', 'temp', 'timestamp_local', 'timestamp_utc',
       'ts', 'uv', 'vis', 'weather.code', 'weather.icon',
       'weather.description', 'wind_dir', 'wind_gust_spd', 'wind_spd',
       'Location', 'app_temp_des', 'azimuth_des', 'clouds_des', 'dewpt_des',
       'dhi_des', 'dni_des', 'elev_angle_des', 'ghi_des', 'pod_des',
       'precip_rate_des', 'pres_des', 'revision_status_des', 'rh_des',
       'slp_des', 'snow_rate_des', 'solar_rad_des', 'temp_des',
       'timestamp_local_des', 'timestamp_utc_des', 'ts_des', 'uv_des',
       'vis_des', 'weather.icon_des', 'weather.description_des',
       'weather.code_des', 'wind_dir

In [27]:
len(airline_data.columns)

67

In [28]:
airline_data.drop(columns=['Arrival_time', 'departure_time', 'timestamp_local', 'timestamp_utc', 
                           'Location', 'timestamp_local_des', 'timestamp_utc_des', 'Location_des'], inplace=True)
airline_data.head()

Unnamed: 0,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Destination Airport,Scheduled departure time,app_temp,azimuth,clouds,dewpt,dhi,dni,elev_angle,ghi,pod,precip_rate,pres,revision_status,rh,slp,snow_rate,solar_rad,temp,ts,uv,vis,weather.code,weather.icon,weather.description,wind_dir,wind_gust_spd,wind_spd,app_temp_des,azimuth_des,clouds_des,dewpt_des,dhi_des,dni_des,elev_angle_des,ghi_des,pod_des,precip_rate_des,pres_des,revision_status_des,rh_des,slp_des,snow_rate_des,solar_rad_des,temp_des,ts_des,uv_des,vis_des,weather.icon_des,weather.description_des,weather.code_des,wind_dir_des,wind_gust_spd_des,wind_spd_des,Status
0,2023-01-01,5190.0,JFK,22:41:00,SYR,21:10:00,4.8,314.5,100,3.3,0,0,-66.2,0,n,0.0,1016,final,76,1017,0.0,0,7.2,1672631100,0.0,16,Overcast clouds,804,c04n,230,5.2,3.6,3.4,285.8,100,1.8,0,0,-49.1,0,n,0.0,1000,final,88,1015,0.0,0,3.6,1672625700,0.0,12,804,c04n,Overcast clouds,300,2.2,0.9,Early
1,2022-01-02,5531.0,JFK,14:12:00,SYR,12:55:00,14.7,212.9,93,11.8,77,674,18.8,290,d,0.0,1002,final,83,1003,0.0,76,14.7,1641150900,0.6,16,804,c04d,Overcast clouds,255,9.0,4.84,-10.9,192.9,100,-6.2,86,736,22.9,365,d,0.0,994,final,91,1009,0.0,123,-5.0,1641146400,0.6,8,c04d,Overcast clouds,804,280,9.2,4.59,Late
2,2023-01-02,5190.0,JFK,22:41:00,SYR,21:10:00,9.4,314.4,100,8.2,0,0,-66.1,0,n,0.0,1020,final,92,1021,0.0,0,9.4,1672717500,0.0,16,Overcast clouds,804,c04n,160,3.4,2.5,5.8,285.8,100,0.9,0,0,-48.9,0,n,0.0,1005,final,78,1021,0.0,0,4.4,1672712100,0.0,16,804,c04n,Overcast clouds,60,2.4,0.37,On-time
3,2022-01-03,5531.0,JFK,14:12:00,SYR,12:55:00,-8.6,212.8,100,-10.7,78,676,19.0,292,d,0.88,1015,final,49,1016,14.88,107,-1.4,1641237300,0.5,16,600,s01d,Light snow,360,11.8,9.05,-13.0,192.8,87,-14.4,86,738,23.0,367,d,0.0,1009,final,59,1025,0.0,161,-7.8,1641232800,0.8,16,c04d,Overcast clouds,804,310,7.6,3.1,Late
4,2023-01-03,5190.0,JFK,22:41:00,SYR,21:10:00,8.3,314.4,100,7.7,0,0,-65.9,0,n,0.0,1009,final,96,1009,0.0,0,8.3,1672803900,0.0,8,Overcast clouds,804,c04n,180,9.2,2.1,3.8,285.8,100,3.8,0,0,-48.8,0,n,0.0,991,final,92,1006,0.0,0,5.0,1672798500,0.0,9,804,c04n,Overcast clouds,145,4.6,1.65,Early


In [30]:
weather_data = pd.read_csv('weather_data.csv')
weather_data.head()


Columns (23) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0.1,Unnamed: 0,app_temp,azimuth,clouds,dewpt,dhi,dni,elev_angle,ghi,pod,precip_rate,pres,revision_status,rh,slp,snow_rate,solar_rad,temp,timestamp_local,timestamp_utc,ts,uv,vis,weather.code,weather.icon,weather.description,wind_dir,wind_gust_spd,wind_spd,Location
0,0,-8.9,3.6,25,-6.3,0,0,-71.0,0,n,0.0,984,final,84,1010,0.0,0,-4.0,2020-01-01 00:00:00,2020-01-01 06:00:00,1577858400,0.0,16,802,c02n,Scattered clouds,270,6.4,3.6,ORD
1,1,-9.0,12.9,25,-6.4,0,0,-69.9,0,n,0.0,984,final,84,1010,0.0,0,-4.2,2020-01-01 00:15:00,2020-01-01 06:15:00,1577859300,0.0,16,802,c02n,Scattered clouds,270,6.4,3.6,ORD
2,2,-9.2,22.1,25,-6.6,0,0,-68.9,0,n,0.0,984,final,84,1010,0.0,0,-4.3,2020-01-01 00:30:00,2020-01-01 06:30:00,1577860200,0.0,16,802,c02n,Scattered clouds,270,6.4,3.6,ORD
3,3,-9.4,31.4,25,-6.8,0,0,-67.8,0,n,0.0,984,final,83,1010,0.0,0,-4.4,2020-01-01 00:45:00,2020-01-01 06:45:00,1577861100,0.0,16,802,c02n,Scattered clouds,270,6.4,3.6,ORD
4,4,-9.3,40.6,25,-6.8,0,0,-66.7,0,n,0.0,984,final,84,1010,0.0,0,-4.5,2020-01-01 01:00:00,2020-01-01 07:00:00,1577862000,0.0,16,802,c02n,Scattered clouds,270,6.4,3.35,ORD


In [31]:
weather_data[weather_data['ts']== 1672631100]

Unnamed: 0.1,Unnamed: 0,app_temp,azimuth,clouds,dewpt,dhi,dni,elev_angle,ghi,pod,precip_rate,pres,revision_status,rh,slp,snow_rate,solar_rad,temp,timestamp_local,timestamp_utc,ts,uv,vis,weather.code,weather.icon,weather.description,wind_dir,wind_gust_spd,wind_spd,Location
105303,105303,2.6,295.4,100,2.1,0,0,-56.8,0,n,0.0,992,final,88,1017,0.0,0,3.9,2023-01-01 21:45:00,2023-01-02 03:45:00,1672631100,0.0,5,c04n,Overcast clouds,804,265,3.3,1.6,ORD
254967,254967,4.8,314.5,100,3.3,0,0,-66.2,0,n,0.0,1016,final,76,1017,0.0,0,7.2,2023-01-01 22:45:00,2023-01-02 03:45:00,1672631100,0.0,16,Overcast clouds,804,c04n,230,5.2,3.6,JFK
404627,404627,18.9,277.8,31,17.2,0,0,-66.0,0,n,0.0,1018,final,92,1022,0.0,0,18.6,2023-01-01 22:45:00,2023-01-02 03:45:00,1672631100,0.0,16,802,c02n,Scattered clouds,65,3.2,2.22,MCO
