# LOADING 2020 - 2025 DATA

In [75]:
import time
import requests
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go

In [None]:
# Load flight delay data from 2020 to 2025
COLUMNS_TO_SELECT = [
    "FlightDate", "DOT_ID_Reporting_Airline", "Tail_Number", 
    "Flight_Number_Reporting_Airline", "OriginAirportID", "Origin",
    "DestAirportID", "Dest", "CRSDepTime",
    "DepTime", "DepDelay", "TaxiOut",
    "WheelsOff", "WheelsOn", "TaxiIn",
    "CRSArrTime", "ArrDelay", "Cancelled",
    "CancellationCode", "Diverted", "CRSElapsedTime",
    "ActualElapsedTime", "AirTime", "Distance",
    "CarrierDelay", "WeatherDelay", "NASDelay",
    "SecurityDelay","LateAircraftDelay"
]

data_view = duckdb.read_csv(
    f"../data/raw/[0-9][0-9][0-9][0-9]-[0-9][0-9].csv",
    auto_detect = True
)

query = f"SELECT {", ".join(COLUMNS_TO_SELECT)} FROM data_view WHERE DEST = 'PHL'"

data_raw = duckdb.sql(query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [8]:
data_raw.shape

(488392, 29)

In [9]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488392 entries, 0 to 488391
Data columns (total 29 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   FlightDate                       488392 non-null  datetime64[us]
 1   DOT_ID_Reporting_Airline         488392 non-null  int64         
 2   Tail_Number                      484298 non-null  object        
 3   Flight_Number_Reporting_Airline  488392 non-null  int64         
 4   OriginAirportID                  488392 non-null  int64         
 5   Origin                           488392 non-null  object        
 6   DestAirportID                    488392 non-null  int64         
 7   Dest                             488392 non-null  object        
 8   CRSDepTime                       488392 non-null  object        
 9   DepTime                          476427 non-null  object        
 10  DepDelay                         476427 non-

In [10]:
data_raw.isna().sum()

FlightDate                              0
DOT_ID_Reporting_Airline                0
Tail_Number                          4094
Flight_Number_Reporting_Airline         0
OriginAirportID                         0
Origin                                  0
DestAirportID                           0
Dest                                    0
CRSDepTime                              0
DepTime                             11965
DepDelay                            11965
TaxiOut                             12194
WheelsOff                           12194
WheelsOn                            12239
TaxiIn                              12239
CRSArrTime                              0
ArrDelay                            13018
Cancelled                               0
CancellationCode                   476181
Diverted                                0
CRSElapsedTime                          0
ActualElapsedTime                   13018
AirTime                             13018
Distance                          

In [11]:
data_raw.describe()

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,DestAirportID,DepDelay,TaxiOut,TaxiIn,ArrDelay,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,488392,488392.0,488392.0,488392.0,488392.0,476427.0,476198.0,476153.0,475374.0,488392.0,488392.0,488392.0,475374.0,475374.0,488392.0,94035.0,94035.0,94035.0,94035.0,94035.0
mean,2022-12-18 09:04:10.677324,20065.29539,2673.708894,12619.935425,14100.0,13.075508,17.261234,7.514561,7.518152,0.025002,0.001652,149.178987,143.829379,119.064278,892.80055,28.343393,3.500388,12.907215,0.179508,35.9829
min,2020-01-01 00:00:00,19393.0,6.0,10154.0,14100.0,-56.0,1.0,1.0,-88.0,0.0,0.0,-56.0,35.0,18.0,80.0,0.0,0.0,0.0,0.0,0.0
25%,2021-08-27 00:00:00,19805.0,1300.0,11066.0,14100.0,-7.0,12.0,5.0,-16.0,0.0,0.0,102.0,98.0,72.0,453.0,0.0,0.0,0.0,0.0,0.0
50%,2023-01-11 00:00:00,19805.0,2301.0,12892.0,14100.0,-3.0,15.0,6.0,-7.0,0.0,0.0,129.0,127.0,101.0,690.0,3.0,0.0,1.0,0.0,3.0
75%,2024-05-20 00:00:00,20416.0,4508.0,13931.0,14100.0,7.0,19.0,9.0,8.0,0.0,0.0,170.0,168.0,141.0,1013.0,22.0,0.0,17.0,0.0,39.0
max,2025-07-31 00:00:00,20452.0,8815.0,15919.0,14100.0,3403.0,179.0,296.0,3407.0,1.0,1.0,397.0,584.0,555.0,2522.0,3403.0,1332.0,1217.0,277.0,2557.0
std,,335.343967,1715.869855,1561.446414,0.0,67.863894,8.774088,5.535872,69.242305,0.156133,0.040616,63.608243,63.931164,62.623689,588.519116,97.634385,27.082548,29.851321,2.960894,84.952342


In [12]:
# Load airport codes table
airport_view = duckdb.read_csv(
    f"../data/raw/airports.csv",
    auto_detect = True
)

query = f"SELECT * FROM airport_view"

airport_codes = duckdb.sql(query).df()

In [15]:
airport_codes.shape

(9803, 14)

In [16]:
airport_codes.head()

Unnamed: 0,code,icao,name,latitude,longitude,elevation,url,time_zone,city_code,country,city,state,county,type
0,AAA,NTGA,Anaa,-17.350665,-145.51112,36,,Pacific/Tahiti,AAA,PF,,,,AP
1,AAB,YARY,Arrabury Airport,-26.696783,141.049092,328,,Australia/Brisbane,AAB,AU,Tanbar,Queensland,Barcoo Shire,AP
2,AAC,HEAR,El Arish International Airport,31.074284,33.829172,85,,Africa/Cairo,AAC,EG,Arish,Muhafazat Shamal Sina',,AP
3,AAD,HCAD,Adado Airport,6.096286,46.637708,980,,Africa/Khartoum,AAD,SO,Adado,,,AP
4,AAE,DABB,Les Salines Airport,36.821392,7.811857,36,,Africa/Algiers,AAE,DZ,El Hadjar,Annaba,,AP


In [None]:
# Look at latitude and longitude for codes in delay data
delay_codes = list(data_raw["Origin"].unique())
print(len(delay_codes)) # This should match the # of rows in the output table
airport_codes[airport_codes["code"].isin(delay_codes)][["code", "name", "latitude", "longitude"]]

110


Unnamed: 0,code,name,latitude,longitude
61,ACK,Nantucket Memorial Airport,41.256351,-70.064227
152,AGS,Augusta Regional Airport,33.372302,-81.965064
237,ALB,Albany International Airport,42.745752,-73.809209
429,ATL,Hartsfield-Jackson Atlanta International Airport,33.637799,-84.429271
462,AUS,Austin-Bergstrom International Airport,30.193489,-97.665010
...,...,...,...,...
8182,TPA,Tampa International Airport,27.979165,-82.534928
8307,TVC,Cherry Capital Airport,44.743492,-85.584216
8349,TYS,McGhee Tyson Airport,35.809553,-83.998848
8729,VPS,Destin Fort Walton Beach Airport,30.495420,-86.549503


In [22]:
df = pd.merge(left = data_raw, right = airport_codes[["code", "name", "latitude", "longitude"]],
                     how = "left", left_on = "Origin", right_on="code")

In [23]:
df

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,...,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,code,name,latitude,longitude
0,2020-01-01,20409,N655JB,976,11697,FLL,14100,PHL,2152,2143,...,992.0,,,,,,FLL,Fort Lauderdale-Hollywood International Airport,26.072017,-80.150997
1,2020-01-02,20409,N591JB,976,11697,FLL,14100,PHL,2152,2152,...,992.0,,,,,,FLL,Fort Lauderdale-Hollywood International Airport,26.072017,-80.150997
2,2020-01-03,20409,N657JB,976,11697,FLL,14100,PHL,2152,2150,...,992.0,,,,,,FLL,Fort Lauderdale-Hollywood International Airport,26.072017,-80.150997
3,2020-01-04,20409,N709JB,976,11697,FLL,14100,PHL,2152,2215,...,992.0,,,,,,FLL,Fort Lauderdale-Hollywood International Airport,26.072017,-80.150997
4,2020-01-05,20409,N627JB,976,11697,FLL,14100,PHL,2152,2149,...,992.0,,,,,,FLL,Fort Lauderdale-Hollywood International Airport,26.072017,-80.150997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488387,2025-07-19,20416,N648NK,1617,13303,MIA,14100,PHL,1103,1052,...,1013.0,,,,,,MIA,Miami International Airport,25.794979,-80.286723
488388,2025-07-21,20416,N680NK,1617,13303,MIA,14100,PHL,1529,1522,...,1013.0,,,,,,MIA,Miami International Airport,25.794979,-80.286723
488389,2025-07-25,20416,N680NK,1617,13303,MIA,14100,PHL,1529,1522,...,1013.0,0.0,0.0,60.0,0.0,0.0,MIA,Miami International Airport,25.794979,-80.286723
488390,2025-07-26,20416,N905NK,1617,13303,MIA,14100,PHL,1103,1055,...,1013.0,,,,,,MIA,Miami International Airport,25.794979,-80.286723


In [None]:
# Work out most efficient way to call OpenMeteoAPI and get weather data

# Define variables using the data & API Website variable selector
date_start, date_end = min(pd.to_datetime(df["FlightDate"])), max(pd.to_datetime(df["FlightDate"]))

daily_vars = [
    "temperature_2m_mean",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "weather_code",
]

# Call API with params
API_URL = "https://archive-api.open-meteo.com/v1/archive"


params = {
    "latitude": "38.87,26.072017000000002",
    "longitude": "75.24,-80.15099673135214",
    "start_date": date_start.strftime("%Y-%m-%d"),
    "end_date": date_end.strftime("%Y-%m-%d"),
    "daily": daily_vars
}

response = requests.get(API_URL, params = params)
print(response.status_code) # Check that code is 200

#data_weather = pd.DataFrame(response.json()["daily"])

200


TypeError: list indices must be integers or slices, not str

In [77]:
len(pd.DataFrame(response.json()[1]["daily"]))

2039

In [79]:
len([(1,5)] * len(pd.DataFrame(response.json()[1]["daily"])))

2039

In [69]:
df["lat_long"] = df.apply(lambda x: (round(x["latitude"], 3), round(x["longitude"], 3)), axis = 1)

In [None]:
# Base API url
API_URL = "https://archive-api.open-meteo.com/v1/archive"

# Same start and end date for every airport
DATE_START, DATE_END = min(pd.to_datetime(df["FlightDate"])), max(pd.to_datetime(df["FlightDate"]))

# Same weather variables for every airport
WEATHER_VARS = [
    "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min", "apparent_temperature_mean",
    "apparent_temperature_max", "apparent_temperature_min", "wind_speed_10m_max", "wind_gusts_10m_max",
    "wind_direction_10m_dominant", "shortwave_radiation_sum", "et0_fao_evapotranspiration", "precipitation_sum", "rain_sum",
    "snowfall_sum", "precipitation_hours", "weather_code"
]

# To prevent hitting a rate limit, set a batch size to pass (lat,long) values at
BATCH_SIZE = 10

# Need a list of the unique (lat,long) pairs as a proxy for airport code to enter into API
lat_long_pairs = list(df["lat_long"].unique())

# Initialize an empty list to hold weather data for each (lat,long) pair across time range
weather_dfs = []

# For each unique (lat,long) pair, get weather data for set time period, store data frame with (lat,long) in a list
for i in range(0,len(lat_long_pairs), BATCH_SIZE):
    current_batch = lat_long_pairs[i : i + BATCH_SIZE]
    latitudes = ",".join(str(pair[0]) for pair in current_batch)
    longitudes = ",".join(str(pair[1]) for pair in current_batch)
    
    params = {
        "latitude":latitudes,
        "longitude":longitudes,
        "start_date":DATE_START.strftime("%Y-%m-%d"),
        "end_date":DATE_END.strftime("%Y-%m-%d"),
        "daily":WEATHER_VARS
    }
    
    print(f"Fetching batch {i//BATCH_SIZE + 1} / {((len(lat_long_pairs)-1) // BATCH_SIZE) + 1}")
    response = requests.get(API_URL, params = params)
    
    if response.status_code == 200:
        batch_len = len(current_batch)
        lat_long_weather_list = []
        
        for j in range(0, batch_len):
            weather_df_j = pd.DataFrame(response.json()[j]["daily"])
            weather_df_j["lat_long"] = [current_batch[j]] * len(weather_df_j)
            lat_long_weather_list.append(weather_df_j)
        
        lat_long_weather_df = pd.concat(lat_long_weather_list)
        weather_dfs.append(lat_long_weather_df)
        
    elif response.status_code == 429:
        print("429 Status -- Retrying after 60 seconds")
        time.sleep(60)
        response = requests.get(API_URL, params = params)
        if response.status_code != 200:
            print(reponse.text)
            raise Exception(f"Request still failed with status code: {response.status_code}")
   
    else:
        print(response.text)
        raise Exception(f"Request failed with status code: {response.status_code}")

    time.sleep(35)
            
# Old script that runs into minute rate limits
# for pair in lat_long_pairs:
#     lat = pair[0]
#     long = pair[1]
    
#     params = {
#         "latitude": lat,
#         "longitude": long, 
#         "start_date": DATE_START.strftime("%Y-%m-%d"),
#         "end_date": DATE_END.strftime("%Y-%m-%d"),
#         "daily": WEATHER_VARS
#     }
    
#     print(f"Fetching data for {df[df["lat_long"] == (lat, long)]["Origin"].iloc[0]}...")
#     response = requests.get(API_URL, params = params)
    
#     if response.status_code == 200:
#         lat_long_weather = pd.DataFrame(response.json())["daily"]
#         lat_long_weather["lat_long"] = (lat,long)
        
#         weather_dfs.append(lat_long_weather)
        
#     else:
#         raise Exception(f"Request failed with status code: {response.status_code}")


Fetching batch 1 / 11
Fetching batch 2 / 11
Fetching batch 3 / 11
Fetching batch 4 / 11
Fetching batch 5 / 11
Fetching batch 6 / 11
429 Status -- Retrying after 60 seconds
Fetching batch 7 / 11
429 Status -- Retrying after 60 seconds


NameError: name 'reponse' is not defined

In [82]:
response.text

'{"error":true,"reason":"Daily API request limit exceeded. Please try again tomorrow."}'

Since I hit a daily limit, I'll save what we have for now into a df, wait a day, and then rerun a modified script below that starts from batch 6.

In [99]:
# Collect the batches we did get into one df
batch_0_through_5_weather_df = pd.concat(weather_dfs) # Concat current collected batches

In [102]:
# Save collected batches as a csv in data/raw/
batch_0_through_5_weather_df.to_csv("../data/raw/batch_0_through_5_weather_data.csv")

In [101]:
batch_0_through_5_weather_df["lat_long"].nunique() # Looking to get to 110

50

In [None]:
# Rewritten API script that starts from batch 6

# Base API url
API_URL = "https://archive-api.open-meteo.com/v1/archive"

# Same start and end date for every airport
DATE_START, DATE_END = min(pd.to_datetime(df["FlightDate"])), max(pd.to_datetime(df["FlightDate"]))

# Same weather variables for every airport
WEATHER_VARS = [
    "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min", "apparent_temperature_mean",
    "apparent_temperature_max", "apparent_temperature_min", "wind_speed_10m_max", "wind_gusts_10m_max",
    "wind_direction_10m_dominant", "shortwave_radiation_sum", "et0_fao_evapotranspiration", "precipitation_sum", "rain_sum",
    "snowfall_sum", "precipitation_hours", "weather_code"
]

# To prevent hitting a rate limit, set a batch size to pass (lat,long) values at
BATCH_SIZE = 10

# Need a list of the unique (lat,long) pairs as a proxy for airport code to enter into API
lat_long_pairs = list(df["lat_long"].unique())

# Initialize an empty list to hold weather data for each (lat,long) pair across time range
weather_dfs = []

# For each unique (lat,long) pair, get weather data for set time period, store data frame with (lat,long) in a list
print("Starting from batch 7")
for i in range(0,len(lat_long_pairs), BATCH_SIZE):
    if i < 50:
        pass
        
    else:
        current_batch = lat_long_pairs[i : i + BATCH_SIZE]
        latitudes = ",".join(str(pair[0]) for pair in current_batch)
        longitudes = ",".join(str(pair[1]) for pair in current_batch)
        
        params = {
            "latitude":latitudes,
            "longitude":longitudes,
            "start_date":DATE_START.strftime("%Y-%m-%d"),
            "end_date":DATE_END.strftime("%Y-%m-%d"),
            "daily":WEATHER_VARS
        }
        
        print(f"Fetching batch {i//BATCH_SIZE + 1} / {((len(lat_long_pairs)-1) // BATCH_SIZE) + 1}")
        response = requests.get(API_URL, params = params)
        
        if response.status_code == 200:
            batch_len = len(current_batch)
            lat_long_weather_list = []
            
            for j in range(0, batch_len):
                weather_df_j = pd.DataFrame(response.json()[j]["daily"])
                weather_df_j["lat_long"] = [current_batch[j]] * len(weather_df_j)
                lat_long_weather_list.append(weather_df_j)
            
            lat_long_weather_df = pd.concat(lat_long_weather_list)
            weather_dfs.append(lat_long_weather_df)
            
        elif response.status_code == 429:
            print("429 Status -- Retrying after 60 seconds")
            time.sleep(60)
            response = requests.get(API_URL, params = params)
            
            if response.status_code != 200:
                print(response.text)
                raise Exception(f"Request still failed with status code: {response.status_code}")
            
            batch_len = len(current_batch)
            lat_long_weather_list = []
            
            for j in range(0, batch_len):
                weather_df_j = pd.DataFrame(response.json()[j]["daily"])
                weather_df_j["lat_long"] = [current_batch[j]] * len(weather_df_j)
                lat_long_weather_list.append(weather_df_j)
            
            lat_long_weather_df = pd.concat(lat_long_weather_list)
            weather_dfs.append(lat_long_weather_df)
            
        else:
            print(response.text)
            raise Exception(f"Request failed with status code: {response.status_code}")

        time.sleep(35)


Starting from batch 7
Fetching batch 6 / 11
Fetching batch 7 / 11
429 Status -- Retrying after 60 seconds
Fetching batch 8 / 11
Fetching batch 9 / 11
429 Status -- Retrying after 60 seconds


NameError: name 'reponse' is not defined

In [104]:
response.text

'{"error":true,"reason":"Hourly API request limit exceeded. Please try again in the next hour."}'

In [None]:
# 6 through 8 means length should be 3
len(weather_dfs)

3

In [None]:
# Load collected data into one df
batch_6_through_8_weather_df = pd.concat(weather_dfs)

In [None]:
# Should be 30 
batch_6_through_8_weather_df["lat_long"].nunique()

30

In [108]:
# Write this data to CSV
batch_6_through_8_weather_df.to_csv("../data/raw/batch_6_through_8_weather_data.csv")

In [111]:
# Script to collect 9th, 10th, and 11th batches - changed time.sleep() to be a bit longer

# Rewritten API script that starts from batch 7

# Base API url
API_URL = "https://archive-api.open-meteo.com/v1/archive"

# Same start and end date for every airport
DATE_START, DATE_END = min(pd.to_datetime(df["FlightDate"])), max(pd.to_datetime(df["FlightDate"]))

# Same weather variables for every airport
WEATHER_VARS = [
    "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min", "apparent_temperature_mean",
    "apparent_temperature_max", "apparent_temperature_min", "wind_speed_10m_max", "wind_gusts_10m_max",
    "wind_direction_10m_dominant", "shortwave_radiation_sum", "et0_fao_evapotranspiration", "precipitation_sum", "rain_sum",
    "snowfall_sum", "precipitation_hours", "weather_code"
]

# To prevent hitting a rate limit, set a batch size to pass (lat,long) values at
BATCH_SIZE = 10

# Need a list of the unique (lat,long) pairs as a proxy for airport code to enter into API
lat_long_pairs = list(df["lat_long"].unique())

# Initialize an empty list to hold weather data for each (lat,long) pair across time range
weather_dfs = []

# For each unique (lat,long) pair, get weather data for set time period, store data frame with (lat,long) in a list
print("Starting from batch 9")
for i in range(0,len(lat_long_pairs), BATCH_SIZE):
    # 0, 10, 20, 30, 40, 50, 60, 70 are first 8 batches
    if i < 80:
        pass
    
    # Start from 9th batch
    else:
        current_batch = lat_long_pairs[i : i + BATCH_SIZE]
        latitudes = ",".join(str(pair[0]) for pair in current_batch)
        longitudes = ",".join(str(pair[1]) for pair in current_batch)
        
        params = {
            "latitude":latitudes,
            "longitude":longitudes,
            "start_date":DATE_START.strftime("%Y-%m-%d"),
            "end_date":DATE_END.strftime("%Y-%m-%d"),
            "daily":WEATHER_VARS
        }
        
        print(f"Fetching batch {i//BATCH_SIZE + 1} / {((len(lat_long_pairs)-1) // BATCH_SIZE) + 1}")
        response = requests.get(API_URL, params = params)
        
        # If request goes through
        if response.status_code == 200:
            batch_len = len(current_batch)
            lat_long_weather_list = []
            
            for j in range(0, batch_len):
                weather_df_j = pd.DataFrame(response.json()[j]["daily"])
                weather_df_j["lat_long"] = [current_batch[j]] * len(weather_df_j)
                lat_long_weather_list.append(weather_df_j)
            
            lat_long_weather_df = pd.concat(lat_long_weather_list)
            weather_dfs.append(lat_long_weather_df)
        
        # If rate limit, assume it's minute rate limit and try again 
        elif response.status_code == 429:
            print("429 Status -- Retrying after 60 seconds")
            time.sleep(60)
            response = requests.get(API_URL, params = params)
            
            # If still rate limit, give up and print text
            if response.status_code != 200:
                print(response.text)
                raise Exception(f"Request still failed with status code: {response.status_code}")
            
            batch_len = len(current_batch)
            lat_long_weather_list = []
            
            for j in range(0, batch_len):
                weather_df_j = pd.DataFrame(response.json()[j]["daily"])
                weather_df_j["lat_long"] = [current_batch[j]] * len(weather_df_j)
                lat_long_weather_list.append(weather_df_j)
            
            lat_long_weather_df = pd.concat(lat_long_weather_list)
            weather_dfs.append(lat_long_weather_df)
        
        # Something else went wrong, print text  
        else:
            print(response.text)
            raise Exception(f"Request failed with status code: {response.status_code}")

        # Wait before next request
        time.sleep(60) # Test a longer gap between requests

Starting from batch 7
Fetching batch 9 / 11
Fetching batch 10 / 11
Fetching batch 11 / 11


In [None]:
# Should be 30
batch_9_through_11_weather_df = pd.concat(weather_dfs)
batch_9_through_11_weather_df["lat_long"].nunique()

30

In [113]:
# Save last batch to csv
batch_9_through_11_weather_df.to_csv("../data/raw/batch_9_through_11_weather_data.csv")

## Create intermediate data sets (Data to join with flight delays)

In [114]:
"hello word".startswith("hello")

True

In [121]:
weather_combined_df = pd.concat([pd.read_csv("../data/raw/"+ f) for f in sorted([file for file in os.listdir("../data/raw") if file.startswith("batch")])])

In [122]:
weather_combined_df

Unnamed: 0.1,Unnamed: 0,time,temperature_2m_mean,temperature_2m_max,temperature_2m_min,apparent_temperature_mean,apparent_temperature_max,apparent_temperature_min,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,weather_code,lat_long
0,0,2020-01-01,18.9,23.7,14.4,19.2,24.5,13.3,11.3,20.5,351,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)"
1,1,2020-01-02,21.4,25.5,17.2,22.5,27.2,17.4,16.3,31.3,111,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
2,2,2020-01-03,24.8,27.0,22.9,26.6,28.1,25.2,25.4,46.1,160,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
3,3,2020-01-04,25.3,29.0,23.3,27.5,30.0,25.5,24.1,43.9,189,14.09,3.30,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
4,4,2020-01-05,17.9,23.1,13.1,15.7,25.7,9.0,26.3,45.0,335,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61165,2034,2025-07-27,21.4,23.7,18.7,20.9,22.8,19.6,22.7,51.1,201,14.73,3.55,0.2,0.2,0.0,1.0,51,"(41.671, -70.284)"
61166,2035,2025-07-28,23.8,27.1,21.7,26.8,31.1,22.8,21.6,43.6,275,21.77,4.20,0.1,0.1,0.0,1.0,51,"(41.671, -70.284)"
61167,2036,2025-07-29,25.5,33.3,20.1,28.2,37.2,22.4,19.5,46.4,227,26.02,5.76,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)"
61168,2037,2025-07-30,27.6,31.9,22.5,30.6,36.9,26.4,17.8,36.0,249,25.85,5.95,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)"


In [123]:
weather_combined_df["lat_long"].nunique()

110

In [126]:
min(weather_combined_df["time"]), max(weather_combined_df["time"])

('2020-01-01', '2025-07-31')

In [None]:
weather_combined_df.drop(columns = ["Unnamed: 0"], inplace = True)

In [139]:
weather_combined_df = weather_combined_df.add_prefix("origin_")

In [140]:
weather_combined_df

Unnamed: 0,origin_time,origin_temperature_2m_mean,origin_temperature_2m_max,origin_temperature_2m_min,origin_apparent_temperature_mean,origin_apparent_temperature_max,origin_apparent_temperature_min,origin_wind_speed_10m_max,origin_wind_gusts_10m_max,origin_wind_direction_10m_dominant,origin_shortwave_radiation_sum,origin_et0_fao_evapotranspiration,origin_precipitation_sum,origin_rain_sum,origin_snowfall_sum,origin_precipitation_hours,origin_weather_code,origin_lat_long
0,2020-01-01,18.9,23.7,14.4,19.2,24.5,13.3,11.3,20.5,351,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)"
1,2020-01-02,21.4,25.5,17.2,22.5,27.2,17.4,16.3,31.3,111,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
2,2020-01-03,24.8,27.0,22.9,26.6,28.1,25.2,25.4,46.1,160,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
3,2020-01-04,25.3,29.0,23.3,27.5,30.0,25.5,24.1,43.9,189,14.09,3.30,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
4,2020-01-05,17.9,23.1,13.1,15.7,25.7,9.0,26.3,45.0,335,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61165,2025-07-27,21.4,23.7,18.7,20.9,22.8,19.6,22.7,51.1,201,14.73,3.55,0.2,0.2,0.0,1.0,51,"(41.671, -70.284)"
61166,2025-07-28,23.8,27.1,21.7,26.8,31.1,22.8,21.6,43.6,275,21.77,4.20,0.1,0.1,0.0,1.0,51,"(41.671, -70.284)"
61167,2025-07-29,25.5,33.3,20.1,28.2,37.2,22.4,19.5,46.4,227,26.02,5.76,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)"
61168,2025-07-30,27.6,31.9,22.5,30.6,36.9,26.4,17.8,36.0,249,25.85,5.95,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)"


In [141]:
weather_combined_df.to_csv("../data/intermediate/origin_weather_data.csv")

In [130]:
data_raw.to_csv("../data/intermediate/delays_PHL_2020_2025.csv")

In [132]:
airport_codes.to_csv("../data/intermediate/airport_codes.csv")

In [137]:
data_raw

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,...,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2020-01-01,20409,N655JB,976,11697,FLL,14100,PHL,2152,2143,...,0.0,158.0,166.0,130.0,992.0,,,,,
1,2020-01-02,20409,N591JB,976,11697,FLL,14100,PHL,2152,2152,...,0.0,158.0,147.0,128.0,992.0,,,,,
2,2020-01-03,20409,N657JB,976,11697,FLL,14100,PHL,2152,2150,...,0.0,158.0,143.0,124.0,992.0,,,,,
3,2020-01-04,20409,N709JB,976,11697,FLL,14100,PHL,2152,2215,...,0.0,158.0,134.0,119.0,992.0,,,,,
4,2020-01-05,20409,N627JB,976,11697,FLL,14100,PHL,2152,2149,...,0.0,158.0,153.0,131.0,992.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488387,2025-07-19,20416,N648NK,1617,13303,MIA,14100,PHL,1103,1052,...,0.0,176.0,166.0,140.0,1013.0,,,,,
488388,2025-07-21,20416,N680NK,1617,13303,MIA,14100,PHL,1529,1522,...,0.0,170.0,167.0,143.0,1013.0,,,,,
488389,2025-07-25,20416,N680NK,1617,13303,MIA,14100,PHL,1529,1522,...,0.0,170.0,237.0,187.0,1013.0,0.0,0.0,60.0,0.0,0.0
488390,2025-07-26,20416,N905NK,1617,13303,MIA,14100,PHL,1103,1055,...,0.0,176.0,167.0,138.0,1013.0,,,,,


I'll also quickly create a weather data set for PHL (Philly international airport) for the date range of the data.

In [None]:
%%time

import requests

date_start, date_end = min(pd.to_datetime(data_raw["FlightDate"])), max(pd.to_datetime(data_raw["FlightDate"]))

daily_vars = [
    "temperature_2m_mean",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "weather_code",
]

API_URL = "https://archive-api.open-meteo.com/v1/archive"


params = {
    "latitude": 39.87, # Googled
    "longitude": 75.24, # Googled
    "start_date": date_start.strftime("%Y-%m-%d"),
    "end_date": date_end.strftime("%Y-%m-%d"),
    "daily": daily_vars
}

response = requests.get(API_URL, params = params)
print(response.status_code) # Check that code is 200

dest_weather = pd.DataFrame(response.json()["daily"])

dest_weather = dest_weather.add_prefix("dest_")

dest_weather.to_csv("../data/raw/destination_weather_data.csv")