In [1]:
# Note for the team: download the dataset zip from Kaggle.
# Create a folder called "dataset" in the same folder as this notebook.
# Unzip the download into "dataset" so the CSVs sit under dataset/archive/ (e.g. dataset/archive/Airlines.csv); everything else should then work.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Load the airline lookup table (columns: Code, Description) from the unzipped archive.
airlines_csv = "dataset/archive/Airlines.csv"
all_airlines = pd.read_csv(airlines_csv)

# Show the first rows, then the summary statistics, in that order.
for airlines_preview in (all_airlines.head(), all_airlines.describe()):
    print(airlines_preview)

  Code                    Description
0  02Q                  Titan Airways
1  04Q             Tradewind Aviation
2  05Q            Comlux Aviation, AG
3  06Q  Master Top Linhas Aereas Ltd.
4  07Q            Flair Airlines Ltd.
        Code    Description
count   1570           1571
unique  1570           1571
top      02Q  Titan Airways
freq       1              1


In [3]:
# Load the combined 2022 flights table; every later cell works on this frame.
flights_csv = "dataset/archive/Combined_Flights_2022.csv"
flight_data = pd.read_csv(flights_csv)

# Show the first rows, then the summary statistics, in that order.
for flights_preview in (flight_data.head(), flight_data.describe()):
    print(flights_preview)

   FlightDate                                    Airline Origin Dest  \
0  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    GJT  DEN   
1  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    HRL  IAH   
2  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
3  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    IAH  GPT   
4  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   

   Cancelled  Diverted  CRSDepTime  DepTime  DepDelayMinutes  DepDelay  ...  \
0      False     False        1133   1123.0              0.0     -10.0  ...   
1      False     False         732    728.0              0.0      -4.0  ...   
2      False     False        1529   1514.0              0.0     -15.0  ...   
3      False     False        1435   1430.0              0.0      -5.0  ...   
4      False     False        1135   1135.0              0.0       0.0  ...   

   WheelsOff  WheelsOn  TaxiIn  CRSArrTime  ArrDelay  ArrDel15  \
0     1140.0    1220.0    

In [4]:
# For every column, flag whether it contains at least one missing value.
# Despite the name, this holds one boolean per column rather than a count.
missing_values_per_column = flight_data.isnull().any()
print(missing_values_per_column)

FlightDate            False
Airline               False
Origin                False
Dest                  False
Cancelled             False
                      ...  
ArrDel15               True
ArrivalDelayGroups     True
ArrTimeBlk            False
DistanceGroup         False
DivAirportLandings    False
Length: 61, dtype: bool


In [5]:
# How many distinct airport codes appear as origin and as destination?
for code_column in ("Origin", "Dest"):
    print(flight_data[code_column].nunique())

# There are 375 distinct airport codes, which is too many to encode reliably.
# This dataset gives the geographic location (latitude, longitude) of each airport code:
# https://github.com/ip2location/ip2location-iata-icao/blob/master/iata-icao.csv
# so each code is mapped to its coordinates instead.
# Setup: create a folder called "airport-codes" inside the "archive" folder and put only the
# csv file in it (if you downloaded the zip, move the csv up one folder). The final path
# should be dataset/archive/airport-codes/iata-icao.csv.
airport_codes = pd.read_csv("dataset/archive/airport-codes/iata-icao.csv")
print(airport_codes.head())
airport_codes_indexed = airport_codes.set_index("iata")

# If this seems to take forever or looks broken, "Run" -> "Restart Kernel and Run All Cells"
# has fixed it before.
mapping_dict_lat = airport_codes_indexed["latitude"].to_dict()
mapping_dict_lon = airport_codes_indexed["longitude"].to_dict()

# (new column, airport-code column, lookup) -- created in this order so the column layout is stable.
# NOTE(review): codes that are absent from the lookup become NaN without any warning.
coordinate_columns = (
    ("Origin_Lat", "Origin", mapping_dict_lat),
    ("Origin_Lon", "Origin", mapping_dict_lon),
    ("Dest_Lat", "Dest", mapping_dict_lat),
    ("Dest_Lon", "Dest", mapping_dict_lon),
)
for new_column, source_column, lookup in coordinate_columns:
    flight_data[new_column] = flight_data[source_column].map(lookup)

print(flight_data.head())

# Target column: actual elapsed time minus scheduled (CRS) elapsed time.
# Negative when the flight took less time than scheduled.
# NOTE(review): this measures elapsed-time overrun only, so a late departure is not reflected
# here -- confirm this is the intended meaning of "total delay".
flight_data["TotalDelay"] = flight_data["ActualElapsedTime"].sub(flight_data["CRSElapsedTime"])
print(flight_data["TotalDelay"].head())

# Unknown delays are replaced with 0. This could be improved: other columns may let us
# derive a delay for the rows that are NaN here.
flight_data["TotalDelay"] = np.nan_to_num(flight_data["TotalDelay"])

375
375
  country_code region_name iata  icao                          airport  \
0           AE    Abu Zaby  AAN  OMAL     Al Ain International Airport   
1           AE    Abu Zaby  AUH  OMAA  Abu Dhabi International Airport   
2           AE    Abu Zaby  AYM   NaN         Yas Island Seaplane Base   
3           AE    Abu Zaby  AZI  OMAD      Al Bateen Executive Airport   
4           AE    Abu Zaby  DHF  OMAM               Al Dhafra Air Base   

   latitude  longitude  
0   24.2617    55.6092  
1   24.4330    54.6511  
2   24.4670    54.6103  
3   24.4283    54.4581  
4   24.2482    54.5477  
   FlightDate                                    Airline Origin Dest  \
0  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    GJT  DEN   
1  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    HRL  IAH   
2  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
3  2022-04-04  Commutair Aka Champlain Enterprises, Inc.    IAH  GPT   
4  2022-04-04  Commutair Aka Champlai

In [13]:
# Min-max scale the delay to the 0-1 range so it matches the normalized value used by the
# website backend, where the delay in minutes stands in for weather severity
# (clear skies, thunderstorm, etc.).
# NOTE(review): the column is called "DepDelay_Normalized" but it is computed from
# "TotalDelay", not from a departure-delay or weather-delay column -- confirm the naming.
delay_values = flight_data["TotalDelay"]

min_val = np.min(delay_values)
max_val = np.max(delay_values)
delay_span = max_val - min_val

# (value - min) / (max - min)
flight_data["DepDelay_Normalized"] = (delay_values - min_val) / delay_span

normalized_delay = flight_data["DepDelay_Normalized"]
print("Normalized data:", normalized_delay.head())
print("DepDelay_Normalized:", np.unique(normalized_delay, return_counts=True))


Normalized data: 0    0.336434
1    0.351938
2    0.365891
3    0.327132
4    0.356589
Name: DepDelay_Normalized, dtype: float64
DepDelay_Normalized: (array([0.        , 0.00310078, 0.04031008, 0.07906977, 0.08372093,
       0.08682171, 0.15503876, 0.16124031, 0.1751938 , 0.17674419,
       0.18604651, 0.1875969 , 0.19844961, 0.20310078, 0.20465116,
       0.20775194, 0.20930233, 0.21550388, 0.21705426, 0.22015504,
       0.22170543, 0.22325581, 0.2248062 , 0.22635659, 0.22790698,
       0.22945736, 0.23100775, 0.23255814, 0.23410853, 0.23565891,
       0.2372093 , 0.23875969, 0.24031008, 0.24186047, 0.24341085,
       0.24496124, 0.24651163, 0.24806202, 0.2496124 , 0.25116279,
       0.25271318, 0.25426357, 0.25581395, 0.25736434, 0.25891473,
       0.26046512, 0.2620155 , 0.26356589, 0.26511628, 0.26666667,
       0.26821705, 0.26976744, 0.27131783, 0.27286822, 0.2744186 ,
       0.27596899, 0.27751938, 0.27906977, 0.28062016, 0.28217054,
       0.28372093, 0.28527132, 0.28682171, 0.

In [6]:
# Final step before training: build the training, testing, and validation sets.
# TODO before training: remove any unwanted input columns from X and finish cleaning the data.
# NOTE(review): X still contains ActualElapsedTime and CRSElapsedTime (their difference is
# TotalDelay), plus DepDelay_Normalized if the normalization cell ran first. These look like
# target leakage -- confirm which columns are meant to be model inputs.
y = flight_data["TotalDelay"]
X = flight_data.drop(columns="TotalDelay")

# 80/20 shuffled split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=16,
)

# The held-out 20% is cut in half: first half for testing, second half for validation.
split_index = len(y_test) // 2

print(f"Original test data length: {len(X_test)}")

X_test_new, X_validation = X_test.iloc[:split_index], X_test.iloc[split_index:]
y_test_new, y_validation = y_test.iloc[:split_index], y_test.iloc[split_index:]

print(f"Final test data length x: {len(X_test_new)}")
print(f"Final test data length y: {len(y_test_new)}")
print(f"Final validation data length x: {len(X_validation)}")
print(f"Final validation data length y: {len(y_validation)}")

Original test data length: 815664
Final test data length x: 407832
Final test data length y: 407832
Final validation data length x: 407832
Final validation data length y: 407832


In [7]:
# Do training and validation here