In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Notebook-wide pandas display settings: floats are rendered with thousands
# separators and two decimals, and neither columns nor rows are truncated.
pd.set_option('display.float_format', '{:,.2f}'.format)
for display_limit in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_limit, None)
# Set up interactive notebook mode: with ast_node_interactivity = "all",
# every bare expression in a cell is displayed, not only the last one
# (this is why cells below can show .head(), .shape and .dtypes together).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

## Predictions for April 9th, 2024: 

In [2]:
# Load the flights CSV into the working DataFrame that the following cells transform.
initial_predictions_dataset = pd.read_csv("./test_data/Professor_CSV.csv")

In [3]:
# Preview the first 10 rows of the freshly loaded data.
initial_predictions_dataset.head(10)

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,,,,
1,4/10/24,WEDNESDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,,,
2,4/10/24,WEDNESDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,,,,
3,4/10/24,WEDNESDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,,,
4,4/10/24,WEDNESDAY,WN 5285,WN,WN,MCO,11:05 AM,1:45 PM,,,,
5,4/10/24,WEDNESDAY,B6 656,B6,B6,MCO,1:35 PM,4:25 PM,,,,
6,4/11/24,THURSDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,,,,
7,4/11/24,THURSDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,,,
8,4/11/24,THURSDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,,,,
9,4/11/24,THURSDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,,,


In [4]:
# List the column names available in the loaded data.
initial_predictions_dataset.columns

Index(['DATE', 'DAY', 'FLIGHT NUMBER', 'MKT_UNIQUE_CARRIER',
       'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEPARTURE TIME', 'ARRIVAL TIME',
       'ARRIVAL STATUS', 'ARRIVAL STATUS_Prev_flight_early',
       'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'],
      dtype='object')

In [5]:
# Inspect the dtype pandas inferred for each column.
initial_predictions_dataset.dtypes

DATE                                  object
DAY                                   object
FLIGHT NUMBER                         object
MKT_UNIQUE_CARRIER                    object
OP_UNIQUE_CARRIER                     object
ORIGIN                                object
DEPARTURE TIME                        object
ARRIVAL TIME                          object
ARRIVAL STATUS                       float64
ARRIVAL STATUS_Prev_flight_early     float64
ARRIVAL STATUS_Prev_flight_ontime    float64
ARRIVAL STATUS_Prev_flight_late      float64
dtype: object

In [6]:
def convert_to_24hr_format(date, time):
    """Combine a date and a 12-hour clock time into a single timestamp.

    Parameters
    ----------
    date : str
        Date written as month/day/two-digit-year, e.g. ``'4/10/24'``.
    time : str
        Clock time with an AM/PM marker, e.g. ``'6:52 PM'``.

    Returns
    -------
    The value returned by ``pd.to_datetime`` for ``"<date> <time>"`` parsed
    with the format ``'%m/%d/%y %I:%M %p'``.

    Notes
    -----
    The two inputs are joined with ``+``, so pandas Series of strings are
    combined element-wise in the same way as plain strings.
    """
    # Month/day/year followed by a 12-hour time with AM/PM.
    month_day_year_12h = '%m/%d/%y %I:%M %p'
    combined_text = date + ' ' + time
    return pd.to_datetime(combined_text, format=month_day_year_12h)

# Build the scheduled departure and arrival timestamps from the DATE column
# plus the corresponding 12-hour clock column.
#
# The helper is called once per column on whole Series instead of once per row
# through DataFrame.apply(axis=1): string concatenation and pd.to_datetime both
# work element-wise, so the result is the same datetime column, but a missing
# DATE/time cell becomes NaT instead of raising TypeError, and an empty frame
# is handled without special-casing.
initial_predictions_dataset['SCH_DEP_TIME'] = convert_to_24hr_format(
    initial_predictions_dataset['DATE'], initial_predictions_dataset['DEPARTURE TIME'])
initial_predictions_dataset['SCH_ARR_TIME'] = convert_to_24hr_format(
    initial_predictions_dataset['DATE'], initial_predictions_dataset['ARRIVAL TIME'])

initial_predictions_dataset.head(10)

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME
0,4/10/24,WEDNESDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00
1,4/10/24,WEDNESDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00
2,4/10/24,WEDNESDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00
3,4/10/24,WEDNESDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00
4,4/10/24,WEDNESDAY,WN 5285,WN,WN,MCO,11:05 AM,1:45 PM,,,,,2024-04-10 11:05:00,2024-04-10 13:45:00
5,4/10/24,WEDNESDAY,B6 656,B6,B6,MCO,1:35 PM,4:25 PM,,,,,2024-04-10 13:35:00,2024-04-10 16:25:00
6,4/11/24,THURSDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,,,,,2024-04-11 18:52:00,2024-04-11 21:47:00
7,4/11/24,THURSDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,,,,2024-04-11 19:59:00,2024-04-11 22:52:00
8,4/11/24,THURSDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,,,,,2024-04-11 13:33:00,2024-04-11 14:50:00
9,4/11/24,THURSDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,,,,2024-04-11 14:55:00,2024-04-11 16:21:00


In [7]:
# The raw text date/time columns are no longer needed once the combined
# SCH_DEP_TIME / SCH_ARR_TIME timestamps exist, so remove them.
raw_schedule_columns = ['DATE', 'DEPARTURE TIME', 'ARRIVAL TIME']
initial_predictions_dataset = initial_predictions_dataset.drop(columns=raw_schedule_columns)
initial_predictions_dataset.head()

Unnamed: 0,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME
0,WEDNESDAY,UA 1400,UA,UA,ORD,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00
1,WEDNESDAY,AA 3402,AA,MQ,ORD,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00
2,WEDNESDAY,B6 116,B6,B6,JFK,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00
3,WEDNESDAY,DL 5182,DL,9E,JFK,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00
4,WEDNESDAY,WN 5285,WN,WN,MCO,,,,,2024-04-10 11:05:00,2024-04-10 13:45:00


In [8]:
# Define a mapping from day names to numbers (Monday = 1 ... Sunday = 7)
day_to_number = {
    'MONDAY': 1,
    'TUESDAY': 2,
    'WEDNESDAY': 3,
    'THURSDAY': 4,
    'FRIDAY': 5,
    'SATURDAY': 6,
    'SUNDAY': 7
}

# Normalise case and surrounding whitespace before the lookup so that values
# such as 'Wednesday' or ' WEDNESDAY' are mapped correctly instead of silently
# turning into NaN. Anything that still has no entry in the mapping (including
# a missing DAY) ends up as NaN in DAY_OF_WEEK.
normalized_day = initial_predictions_dataset['DAY'].astype(str).str.strip().str.upper()

# Apply the mapping to the 'DAY' column to create 'DAY_OF_WEEK'
initial_predictions_dataset['DAY_OF_WEEK'] = normalized_day.map(day_to_number)
initial_predictions_dataset = initial_predictions_dataset.drop(columns=['DAY'])
# Verify the changes
initial_predictions_dataset.head()


Unnamed: 0,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME,DAY_OF_WEEK
0,UA 1400,UA,UA,ORD,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00,3
1,AA 3402,AA,MQ,ORD,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00,3
2,B6 116,B6,B6,JFK,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00,3
3,DL 5182,DL,9E,JFK,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00,3
4,WN 5285,WN,WN,MCO,,,,,2024-04-10 11:05:00,2024-04-10 13:45:00,3


In [9]:

# SCH_DEP_TIME and SCH_ARR_TIME are both built from the same DATE value, so a
# negative "diff_arrival_departure" indicates that the flight arrives on the
# next day. Such rows (and rows with a zero or missing difference) are removed
# here; only flights whose arrival is strictly after departure are kept.
diff_arrival_departure = initial_predictions_dataset['SCH_ARR_TIME'] - initial_predictions_dataset['SCH_DEP_TIME']

arrives_after_departure = diff_arrival_departure > pd.Timedelta(0)

# .copy() makes the filtered result an independent frame. Without it, the
# column assignments in later cells operate on a slice of the original frame
# and can emit SettingWithCopyWarning.
initial_predictions_dataset = initial_predictions_dataset.loc[arrives_after_departure].copy()

initial_predictions_dataset.shape
initial_predictions_dataset.dtypes

(51, 11)

FLIGHT NUMBER                                object
MKT_UNIQUE_CARRIER                           object
OP_UNIQUE_CARRIER                            object
ORIGIN                                       object
ARRIVAL STATUS                              float64
ARRIVAL STATUS_Prev_flight_early            float64
ARRIVAL STATUS_Prev_flight_ontime           float64
ARRIVAL STATUS_Prev_flight_late             float64
SCH_DEP_TIME                         datetime64[ns]
SCH_ARR_TIME                         datetime64[ns]
DAY_OF_WEEK                                   int64
dtype: object

In [10]:
# Label every flight with the destination airport code 'SYR'.
initial_predictions_dataset = initial_predictions_dataset.assign(DEST='SYR')

In [11]:
# Load the weather observations; the 'datetime' column is parsed into real
# timestamps while reading so it does not have to be converted afterwards.
weather_csv_path = './test_data/Test_Weather_Data.csv'
weather_data = pd.read_csv(weather_csv_path, parse_dates=['datetime'])
weather_data.head()
weather_data.shape
weather_data.dtypes

Unnamed: 0,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,Chicago,2024-04-08 00:00:00,48.5,45.4,41.4,76.37,0.0,0,,0.0,0.0,16.1,6.9,135,1010.7,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
1,Chicago,2024-04-08 01:00:00,49.2,46.2,40.2,71.16,0.0,0,,0.0,0.0,13.9,6.9,157,1010.3,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
2,Chicago,2024-04-08 02:00:00,48.7,46.0,40.9,74.21,0.0,0,,0.0,0.0,11.4,6.3,161,1010.0,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
3,Chicago,2024-04-08 03:00:00,47.7,44.9,40.9,76.95,0.0,0,,0.0,0.0,10.3,6.1,166,1009.7,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
4,Chicago,2024-04-08 04:00:00,47.7,43.8,41.8,79.87,0.0,0,,0.0,0.0,15.0,8.4,170,1009.5,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."


(1056, 24)

name                        object
datetime            datetime64[ns]
temp                       float64
feelslike                  float64
dew                        float64
humidity                   float64
precip                     float64
precipprob                   int64
preciptype                  object
snow                       float64
snowdepth                  float64
windgust                   float64
windspeed                  float64
winddir                      int64
sealevelpressure           float64
cloudcover                 float64
visibility                 float64
solarradiation               int64
solarenergy                float64
uvindex                      int64
severerisk                   int64
conditions                  object
icon                        object
stations                    object
dtype: object

In [12]:
# Show the distinct location names present in the weather data.
print(set(weather_data['name']))

{'New York', 'Syracuse', 'Chicago', 'Orlando'}


In [13]:
# Replacing city names with airport codes (lookup is case-insensitive)
dic = {'syracuse': 'SYR', 'new york': 'JFK', 'orlando': 'MCO', 'chicago': 'ORD'}

# Values that already are one of the airport codes are left untouched, which
# makes this cell safe to re-run: a plain dict lookup on the already-converted
# column would raise KeyError ('syr' is not a city name).
location_names = weather_data['name']
already_coded = location_names.isin(list(dic.values()))
converted = location_names.str.lower().map(dic)

# Fail loudly, as a direct dict lookup would, for any name that is neither a
# known city nor a known airport code, rather than storing NaN.
unrecognised = location_names[~already_coded & converted.isna()]
if not unrecognised.empty:
    raise KeyError(f"No airport code defined for: {sorted(set(unrecognised.astype(str)))}")

weather_data['name'] = converted.where(~already_coded, location_names)

weather_data.head()
print(set(weather_data['name']))

Unnamed: 0,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,ORD,2024-04-08 00:00:00,48.5,45.4,41.4,76.37,0.0,0,,0.0,0.0,16.1,6.9,135,1010.7,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
1,ORD,2024-04-08 01:00:00,49.2,46.2,40.2,71.16,0.0,0,,0.0,0.0,13.9,6.9,157,1010.3,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
2,ORD,2024-04-08 02:00:00,48.7,46.0,40.9,74.21,0.0,0,,0.0,0.0,11.4,6.3,161,1010.0,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
3,ORD,2024-04-08 03:00:00,47.7,44.9,40.9,76.95,0.0,0,,0.0,0.0,10.3,6.1,166,1009.7,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."
4,ORD,2024-04-08 04:00:00,47.7,43.8,41.8,79.87,0.0,0,,0.0,0.0,15.0,8.4,170,1009.5,0.0,9.9,0,0.0,0,10,Clear,clear-night,"72534014819,KORD,KMDW,72530094846,F1983,744665..."


{'SYR', 'MCO', 'JFK', 'ORD'}


In [14]:
# Weather fields that are not carried forward into the merged flight data.
weather_columns_to_drop = [
    'dew', 'humidity', 'snowdepth', 'sealevelpressure',
    'solarradiation', 'solarenergy', 'uvindex', 'icon', 'stations',
]
weather_data = weather_data.drop(columns=weather_columns_to_drop)

In [15]:
# Create two copies of the weather data frame, one for the origin airport and
# one for the destination, renaming every column with the matching prefix so
# the two sets of weather columns stay distinguishable after merging.
rename_origin = {col: 'ORGIN_WTH_' + col for col in weather_data.columns}
rename_dest = {col: 'DEST_WTH_' + col for col in weather_data.columns}

org_weather_data = weather_data.rename(columns=rename_origin)
dst_weather_data = weather_data.rename(columns=rename_dest)

In [16]:
# Adding join columns of the form "<timestamp><airport code>".
# Flight times are rounded to the nearest whole hour so they can be matched
# against the weather timestamps: departure time pairs with the origin
# airport, arrival time with the destination airport.
# The lowercase 'h' frequency alias is used because the uppercase 'H' alias is
# deprecated since pandas 2.2 and emits a FutureWarning there.
departure_hour = initial_predictions_dataset['SCH_DEP_TIME'].dt.round('h')
arrival_hour = initial_predictions_dataset['SCH_ARR_TIME'].dt.round('h')

initial_predictions_dataset['ORGIN_WTH_JOIN'] = departure_hour.astype(str) + initial_predictions_dataset['ORIGIN']
initial_predictions_dataset['DEST_WTH_JOIN'] = arrival_hour.astype(str) + initial_predictions_dataset['DEST']
org_weather_data['ORGIN_WTH_JOIN'] = org_weather_data['ORGIN_WTH_datetime'].astype(str) + org_weather_data['ORGIN_WTH_name']
dst_weather_data['DEST_WTH_JOIN'] = dst_weather_data['DEST_WTH_datetime'].astype(str) + dst_weather_data['DEST_WTH_name']
initial_predictions_dataset.head()
org_weather_data.head()
dst_weather_data.head()

Unnamed: 0,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME,DAY_OF_WEEK,DEST,ORGIN_WTH_JOIN,DEST_WTH_JOIN
0,UA 1400,UA,UA,ORD,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00,3,SYR,2024-04-10 19:00:00ORD,2024-04-10 22:00:00SYR
1,AA 3402,AA,MQ,ORD,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00,3,SYR,2024-04-10 20:00:00ORD,2024-04-10 23:00:00SYR
2,B6 116,B6,B6,JFK,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00,3,SYR,2024-04-10 14:00:00JFK,2024-04-10 15:00:00SYR
3,DL 5182,DL,9E,JFK,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00,3,SYR,2024-04-10 15:00:00JFK,2024-04-10 16:00:00SYR
4,WN 5285,WN,WN,MCO,,,,,2024-04-10 11:05:00,2024-04-10 13:45:00,3,SYR,2024-04-10 11:00:00MCO,2024-04-10 14:00:00SYR


Unnamed: 0,ORGIN_WTH_name,ORGIN_WTH_datetime,ORGIN_WTH_temp,ORGIN_WTH_feelslike,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_preciptype,ORGIN_WTH_snow,ORGIN_WTH_windgust,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,ORGIN_WTH_conditions,ORGIN_WTH_JOIN
0,ORD,2024-04-08 00:00:00,48.5,45.4,0.0,0,,0.0,16.1,6.9,135,0.0,9.9,10,Clear,2024-04-08 00:00:00ORD
1,ORD,2024-04-08 01:00:00,49.2,46.2,0.0,0,,0.0,13.9,6.9,157,0.0,9.9,10,Clear,2024-04-08 01:00:00ORD
2,ORD,2024-04-08 02:00:00,48.7,46.0,0.0,0,,0.0,11.4,6.3,161,0.0,9.9,10,Clear,2024-04-08 02:00:00ORD
3,ORD,2024-04-08 03:00:00,47.7,44.9,0.0,0,,0.0,10.3,6.1,166,0.0,9.9,10,Clear,2024-04-08 03:00:00ORD
4,ORD,2024-04-08 04:00:00,47.7,43.8,0.0,0,,0.0,15.0,8.4,170,0.0,9.9,10,Clear,2024-04-08 04:00:00ORD


Unnamed: 0,DEST_WTH_name,DEST_WTH_datetime,DEST_WTH_temp,DEST_WTH_feelslike,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_preciptype,DEST_WTH_snow,DEST_WTH_windgust,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,DEST_WTH_conditions,DEST_WTH_JOIN
0,ORD,2024-04-08 00:00:00,48.5,45.4,0.0,0,,0.0,16.1,6.9,135,0.0,9.9,10,Clear,2024-04-08 00:00:00ORD
1,ORD,2024-04-08 01:00:00,49.2,46.2,0.0,0,,0.0,13.9,6.9,157,0.0,9.9,10,Clear,2024-04-08 01:00:00ORD
2,ORD,2024-04-08 02:00:00,48.7,46.0,0.0,0,,0.0,11.4,6.3,161,0.0,9.9,10,Clear,2024-04-08 02:00:00ORD
3,ORD,2024-04-08 03:00:00,47.7,44.9,0.0,0,,0.0,10.3,6.1,166,0.0,9.9,10,Clear,2024-04-08 03:00:00ORD
4,ORD,2024-04-08 04:00:00,47.7,43.8,0.0,0,,0.0,15.0,8.4,170,0.0,9.9,10,Clear,2024-04-08 04:00:00ORD


In [17]:
# Join data set: attach the origin weather (matched on the departure key) and
# the destination weather (matched on the arrival key) to every flight.
#
# validate='many_to_one' makes pandas raise MergeError if a weather key occurs
# more than once, which would otherwise silently duplicate flight rows.
flights_before_merge = len(initial_predictions_dataset)
merged_flight_weather = pd.merge(initial_predictions_dataset, org_weather_data,
                                 on='ORGIN_WTH_JOIN', validate='many_to_one')
merged_flight_weather = pd.merge(merged_flight_weather, dst_weather_data,
                                 on='DEST_WTH_JOIN', validate='many_to_one')

# These are inner joins, so a flight with no matching weather row disappears
# from the result. Report that instead of letting it pass unnoticed.
flights_without_weather = flights_before_merge - len(merged_flight_weather)
if flights_without_weather:
    print(f"WARNING: {flights_without_weather} flight(s) had no matching weather row and were dropped by the merge")

merged_flight_weather.head()
merged_flight_weather.shape
merged_flight_weather.columns

Unnamed: 0,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME,DAY_OF_WEEK,DEST,ORGIN_WTH_JOIN,DEST_WTH_JOIN,ORGIN_WTH_name,ORGIN_WTH_datetime,ORGIN_WTH_temp,ORGIN_WTH_feelslike,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_preciptype,ORGIN_WTH_snow,ORGIN_WTH_windgust,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,ORGIN_WTH_conditions,DEST_WTH_name,DEST_WTH_datetime,DEST_WTH_temp,DEST_WTH_feelslike,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_preciptype,DEST_WTH_snow,DEST_WTH_windgust,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,DEST_WTH_conditions
0,UA 1400,UA,UA,ORD,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00,3,SYR,2024-04-10 19:00:00ORD,2024-04-10 22:00:00SYR,ORD,2024-04-10 19:00:00,62.5,62.5,0.0,0,,0.0,12.8,9.5,87,88.1,9.9,10,Partially cloudy,SYR,2024-04-10 22:00:00,58.7,58.7,0.0,100,rain,0.0,3.4,3.2,342,100.0,9.9,10,"Rain, Overcast"
1,AA 3402,AA,MQ,ORD,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00,3,SYR,2024-04-10 20:00:00ORD,2024-04-10 23:00:00SYR,ORD,2024-04-10 20:00:00,61.9,61.9,0.0,0,,0.0,11.4,9.7,86,88.1,9.9,10,Partially cloudy,SYR,2024-04-10 23:00:00,56.7,56.7,0.01,100,rain,0.0,2.2,0.0,4,100.0,9.9,10,"Rain, Overcast"
2,B6 116,B6,B6,JFK,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00,3,SYR,2024-04-10 14:00:00JFK,2024-04-10 15:00:00SYR,JFK,2024-04-10 14:00:00,60.3,60.3,0.0,0,,0.0,11.4,11.2,42,88.3,9.9,10,Partially cloudy,SYR,2024-04-10 15:00:00,62.8,62.8,0.0,100,rain,0.0,8.1,16.0,331,99.9,9.8,10,"Rain, Overcast"
3,DL 5182,DL,9E,JFK,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00,3,SYR,2024-04-10 15:00:00JFK,2024-04-10 16:00:00SYR,JFK,2024-04-10 15:00:00,64.1,64.1,0.0,0,,0.0,10.3,5.0,42,29.5,9.9,10,Partially cloudy,SYR,2024-04-10 16:00:00,61.0,61.0,0.09,100,rain,0.0,11.4,4.8,341,100.0,8.6,10,"Rain, Overcast"
4,B6 656,B6,B6,MCO,,,,,2024-04-10 13:35:00,2024-04-10 16:25:00,3,SYR,2024-04-10 14:00:00MCO,2024-04-10 16:00:00SYR,MCO,2024-04-10 14:00:00,84.0,84.1,0.0,0,,0.0,28.8,18.1,160,30.7,9.9,10,Partially cloudy,SYR,2024-04-10 16:00:00,61.0,61.0,0.09,100,rain,0.0,11.4,4.8,341,100.0,8.6,10,"Rain, Overcast"


(51, 44)

Index(['FLIGHT NUMBER', 'MKT_UNIQUE_CARRIER', 'OP_UNIQUE_CARRIER', 'ORIGIN',
       'ARRIVAL STATUS', 'ARRIVAL STATUS_Prev_flight_early',
       'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late',
       'SCH_DEP_TIME', 'SCH_ARR_TIME', 'DAY_OF_WEEK', 'DEST', 'ORGIN_WTH_JOIN',
       'DEST_WTH_JOIN', 'ORGIN_WTH_name', 'ORGIN_WTH_datetime',
       'ORGIN_WTH_temp', 'ORGIN_WTH_feelslike', 'ORGIN_WTH_precip',
       'ORGIN_WTH_precipprob', 'ORGIN_WTH_preciptype', 'ORGIN_WTH_snow',
       'ORGIN_WTH_windgust', 'ORGIN_WTH_windspeed', 'ORGIN_WTH_winddir',
       'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility', 'ORGIN_WTH_severerisk',
       'ORGIN_WTH_conditions', 'DEST_WTH_name', 'DEST_WTH_datetime',
       'DEST_WTH_temp', 'DEST_WTH_feelslike', 'DEST_WTH_precip',
       'DEST_WTH_precipprob', 'DEST_WTH_preciptype', 'DEST_WTH_snow',
       'DEST_WTH_windgust', 'DEST_WTH_windspeed', 'DEST_WTH_winddir',
       'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DEST_WTH_severeris

In [18]:
# Drop unnecessary columns: the join keys plus the weather frames' own
# timestamp and location-name columns.
merge_helper_columns = [
    'ORGIN_WTH_JOIN', 'DEST_WTH_JOIN',
    'ORGIN_WTH_datetime', 'DEST_WTH_datetime',
    'ORGIN_WTH_name', 'DEST_WTH_name',
]
merged_flight_weather_updated = merged_flight_weather.drop(columns=merge_helper_columns)

In [19]:
# Preview the merged flight + weather rows after the helper columns were removed.
merged_flight_weather_updated.head()

Unnamed: 0,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,SCH_DEP_TIME,SCH_ARR_TIME,DAY_OF_WEEK,DEST,ORGIN_WTH_temp,ORGIN_WTH_feelslike,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_preciptype,ORGIN_WTH_snow,ORGIN_WTH_windgust,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,ORGIN_WTH_conditions,DEST_WTH_temp,DEST_WTH_feelslike,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_preciptype,DEST_WTH_snow,DEST_WTH_windgust,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,DEST_WTH_conditions
0,UA 1400,UA,UA,ORD,,,,,2024-04-10 18:52:00,2024-04-10 21:47:00,3,SYR,62.5,62.5,0.0,0,,0.0,12.8,9.5,87,88.1,9.9,10,Partially cloudy,58.7,58.7,0.0,100,rain,0.0,3.4,3.2,342,100.0,9.9,10,"Rain, Overcast"
1,AA 3402,AA,MQ,ORD,,,,,2024-04-10 19:59:00,2024-04-10 22:52:00,3,SYR,61.9,61.9,0.0,0,,0.0,11.4,9.7,86,88.1,9.9,10,Partially cloudy,56.7,56.7,0.01,100,rain,0.0,2.2,0.0,4,100.0,9.9,10,"Rain, Overcast"
2,B6 116,B6,B6,JFK,,,,,2024-04-10 13:33:00,2024-04-10 14:50:00,3,SYR,60.3,60.3,0.0,0,,0.0,11.4,11.2,42,88.3,9.9,10,Partially cloudy,62.8,62.8,0.0,100,rain,0.0,8.1,16.0,331,99.9,9.8,10,"Rain, Overcast"
3,DL 5182,DL,9E,JFK,,,,,2024-04-10 14:55:00,2024-04-10 16:21:00,3,SYR,64.1,64.1,0.0,0,,0.0,10.3,5.0,42,29.5,9.9,10,Partially cloudy,61.0,61.0,0.09,100,rain,0.0,11.4,4.8,341,100.0,8.6,10,"Rain, Overcast"
4,B6 656,B6,B6,MCO,,,,,2024-04-10 13:35:00,2024-04-10 16:25:00,3,SYR,84.0,84.1,0.0,0,,0.0,28.8,18.1,160,30.7,9.9,10,Partially cloudy,61.0,61.0,0.09,100,rain,0.0,11.4,4.8,341,100.0,8.6,10,"Rain, Overcast"


In [20]:
# List the remaining columns before choosing which ones to export.
merged_flight_weather_updated.columns

Index(['FLIGHT NUMBER', 'MKT_UNIQUE_CARRIER', 'OP_UNIQUE_CARRIER', 'ORIGIN',
       'ARRIVAL STATUS', 'ARRIVAL STATUS_Prev_flight_early',
       'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late',
       'SCH_DEP_TIME', 'SCH_ARR_TIME', 'DAY_OF_WEEK', 'DEST', 'ORGIN_WTH_temp',
       'ORGIN_WTH_feelslike', 'ORGIN_WTH_precip', 'ORGIN_WTH_precipprob',
       'ORGIN_WTH_preciptype', 'ORGIN_WTH_snow', 'ORGIN_WTH_windgust',
       'ORGIN_WTH_windspeed', 'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover',
       'ORGIN_WTH_visibility', 'ORGIN_WTH_severerisk', 'ORGIN_WTH_conditions',
       'DEST_WTH_temp', 'DEST_WTH_feelslike', 'DEST_WTH_precip',
       'DEST_WTH_precipprob', 'DEST_WTH_preciptype', 'DEST_WTH_snow',
       'DEST_WTH_windgust', 'DEST_WTH_windspeed', 'DEST_WTH_winddir',
       'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DEST_WTH_severerisk',
       'DEST_WTH_conditions'],
      dtype='object')

In [21]:
# Columns excluded from the exported data set.
columns_to_drop = [
    # flight identifier and arrival-status columns
    'FLIGHT NUMBER',
    'ARRIVAL STATUS',
    'ARRIVAL STATUS_Prev_flight_early',
    'ARRIVAL STATUS_Prev_flight_ontime',
    'ARRIVAL STATUS_Prev_flight_late',
    # weather measurements left out for both airports
    'ORGIN_WTH_feelslike',
    'ORGIN_WTH_windgust',
    'DEST_WTH_feelslike',
    'DEST_WTH_windgust',
    # destination code (set to the same value for every row)
    'DEST',
    # text-valued weather descriptions
    'DEST_WTH_preciptype',
    'ORGIN_WTH_preciptype',
    'DEST_WTH_conditions',
    'ORGIN_WTH_conditions',
]
flight_weather_data_updated = merged_flight_weather_updated.drop(labels=columns_to_drop, axis='columns')
flight_weather_data_updated.shape
flight_weather_data_updated.columns

(51, 24)

Index(['MKT_UNIQUE_CARRIER', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'SCH_DEP_TIME',
       'SCH_ARR_TIME', 'DAY_OF_WEEK', 'ORGIN_WTH_temp', 'ORGIN_WTH_precip',
       'ORGIN_WTH_precipprob', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'ORGIN_WTH_severerisk', 'DEST_WTH_temp', 'DEST_WTH_precip',
       'DEST_WTH_precipprob', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility',
       'DEST_WTH_severerisk'],
      dtype='object')

In [22]:
# Write the final flight + weather table to CSV; index=False leaves the row index out of the file.
flight_weather_data_updated.to_csv('./test_data/Test_Merged_Data.csv', index=False)