# DATA EXPLORATION & PREPROCESSING

In [36]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Load processed data set


In [11]:
%%time
delays_raw = pd.read_csv("../data/processed/delays_PHL_coord_weather_data.csv")

CPU times: user 809 ms, sys: 168 ms, total: 977 ms
Wall time: 996 ms


In [12]:
%%time
delays_raw = pd.read_parquet("../data/processed/delays_PHL_coord_weather_data.parquet", engine="pyarrow")

CPU times: user 240 ms, sys: 48.1 ms, total: 288 ms
Wall time: 92.8 ms


In [13]:
# Print column names, non-null counts and dtypes of the loaded frame.
delays_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488392 entries, 0 to 488391
Data columns (total 68 columns):
 #   Column                              Non-Null Count   Dtype         
---  ------                              --------------   -----         
 0   FlightDate                          488392 non-null  datetime64[us]
 1   DOT_ID_Reporting_Airline            488392 non-null  int64         
 2   Tail_Number                         484298 non-null  object        
 3   Flight_Number_Reporting_Airline     488392 non-null  int64         
 4   OriginAirportID                     488392 non-null  int64         
 5   Origin                              488392 non-null  object        
 6   DestAirportID                       488392 non-null  int64         
 7   Dest                                488392 non-null  object        
 8   CRSDepTime                          488392 non-null  object        
 9   DepTime                             476427 non-null  object        
 10  DepDelay

In [15]:
# Summary statistics (count, mean, quartiles, extremes, std) for the frame's columns.
delays_raw.describe()

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,DestAirportID,DepDelay,TaxiOut,TaxiIn,ArrDelay,Cancelled,...,dest_wind_speed_10m_max,dest_wind_gusts_10m_max,dest_wind_direction_10m_dominant,dest_shortwave_radiation_sum,dest_et0_fao_evapotranspiration,dest_precipitation_sum,dest_rain_sum,dest_snowfall_sum,dest_precipitation_hours,dest_weather_code
count,488392,488392.0,488392.0,488392.0,488392.0,476427.0,476198.0,476153.0,475374.0,488392.0,...,488392.0,488392.0,488392.0,488392.0,488392.0,488392.0,488392.0,488392.0,488392.0,488392.0
mean,2022-12-18 09:04:10.677324,20065.29539,2673.708894,12619.935425,14100.0,13.075508,17.261234,7.514561,7.518152,0.025002,...,9.80592,40.522073,266.005053,18.894176,2.321413,1.520064,0.733691,0.55188,3.64204,36.441948
min,2020-01-01 00:00:00,19393.0,6.0,10154.0,14100.0,-56.0,1.0,1.0,-88.0,0.0,...,3.9,15.8,0.0,4.99,0.21,0.0,0.0,0.0,0.0,0.0
25%,2021-08-27 00:00:00,19805.0,1300.0,11066.0,14100.0,-7.0,12.0,5.0,-16.0,0.0,...,8.3,30.2,174.0,12.69,1.02,0.0,0.0,0.0,0.0,3.0
50%,2023-01-11 00:00:00,19805.0,2301.0,12892.0,14100.0,-3.0,15.0,6.0,-7.0,0.0,...,9.5,38.2,333.0,19.0,2.08,0.1,0.0,0.0,1.0,51.0
75%,2024-05-20 00:00:00,20416.0,4508.0,13931.0,14100.0,7.0,19.0,9.0,8.0,0.0,...,11.0,47.9,345.0,24.58,3.5,1.6,0.4,0.21,6.0,71.0
max,2025-07-31 00:00:00,20452.0,8815.0,15919.0,14100.0,3403.0,179.0,296.0,3407.0,1.0,...,25.2,95.4,360.0,34.11,5.89,38.5,20.1,25.34,24.0,75.0
std,,335.343967,1715.869855,1561.446414,0.0,67.863894,8.774088,5.535872,69.242305,0.156133,...,2.275373,13.563238,110.529946,6.769281,1.37268,3.261084,1.849672,1.829776,5.229313,31.9088


In [22]:
# Create na overview: one row per column of delays_raw with its missing-value count.
na_counts = delays_raw.isna().sum()  # Series indexed by column name

na_df = pd.DataFrame({
    "Column": na_counts.index,
    "Number of NA Values": na_counts.to_numpy(),
    # Despite the label this is a fraction of rows (0-1), not a percentage.
    "Percent NA": na_counts.to_numpy() / len(delays_raw),
})
# Worst-affected columns first.
na_df.sort_values(by="Percent NA", ascending=False)

Unnamed: 0,Column,Number of NA Values,Percent NA
18,CancellationCode,476181,0.974998
25,WeatherDelay,394357,0.807460
24,CarrierDelay,394357,0.807460
27,SecurityDelay,394357,0.807460
28,LateAircraftDelay,394357,0.807460
...,...,...,...
23,Distance,0,0.000000
29,origin_code,0,0.000000
30,origin_name,0,0.000000
31,origin_latitude,0,0.000000


Looking at the NA values, we clearly missed some airport codes when collecting origin weather data. Let's take a closer look and fix that.

In [30]:
# Distinct origin airport codes among the rows that have no origin weather date.
delays_raw.loc[delays_raw["origin_time"].isna(), "Origin"].unique()

array(['ORF', 'GRR', 'DAY', 'MIA', 'BDL', 'STT', 'TPA', 'AUS', 'DEN',
       'JAX', 'MDW', 'DAL', 'OKC', 'HVN', 'AGS'], dtype=object)

Let's go back and look at the weather data set.

In [37]:
origin_weather = pd.read_csv("../data/intermediate/origin_weather_data.csv")

# Coordinate pairs come back from the CSV as text such as "(26.072, -80.151)";
# turn strings into tuples and pass any non-string value through unchanged.
origin_weather["origin_lat_long"] = origin_weather["origin_lat_long"].map(
    lambda pair: ast.literal_eval(pair) if isinstance(pair, str) else pair
)

# Spread each (lat, long) tuple over two columns, keeping rows aligned via the shared index.
coord_cols = ["origin_latitude", "origin_longitude"]
origin_weather[coord_cols] = pd.DataFrame(
    origin_weather["origin_lat_long"].tolist(),
    index=origin_weather.index,
)

# Round each coordinate column to 2 decimals.
for coord_col in coord_cols:
    origin_weather[coord_col] = origin_weather[coord_col].round(2)

origin_weather

Unnamed: 0.1,Unnamed: 0,origin_time,origin_temperature_2m_mean,origin_temperature_2m_max,origin_temperature_2m_min,origin_apparent_temperature_mean,origin_apparent_temperature_max,origin_apparent_temperature_min,origin_wind_speed_10m_max,origin_wind_gusts_10m_max,...,origin_shortwave_radiation_sum,origin_et0_fao_evapotranspiration,origin_precipitation_sum,origin_rain_sum,origin_snowfall_sum,origin_precipitation_hours,origin_weather_code,origin_lat_long,origin_latitude,origin_longitude
0,0,2020-01-01,18.9,23.7,14.4,19.2,24.5,13.3,11.3,20.5,...,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)",26.07,-80.15
1,1,2020-01-02,21.4,25.5,17.2,22.5,27.2,17.4,16.3,31.3,...,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.07,-80.15
2,2,2020-01-03,24.8,27.0,22.9,26.6,28.1,25.2,25.4,46.1,...,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.07,-80.15
3,3,2020-01-04,25.3,29.0,23.3,27.5,30.0,25.5,24.1,43.9,...,14.09,3.30,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.07,-80.15
4,4,2020-01-05,17.9,23.1,13.1,15.7,25.7,9.0,26.3,45.0,...,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)",26.07,-80.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224285,61165,2025-07-27,21.4,23.7,18.7,20.9,22.8,19.6,22.7,51.1,...,14.73,3.55,0.2,0.2,0.0,1.0,51,"(41.671, -70.284)",41.67,-70.28
224286,61166,2025-07-28,23.8,27.1,21.7,26.8,31.1,22.8,21.6,43.6,...,21.77,4.20,0.1,0.1,0.0,1.0,51,"(41.671, -70.284)",41.67,-70.28
224287,61167,2025-07-29,25.5,33.3,20.1,28.2,37.2,22.4,19.5,46.4,...,26.02,5.76,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)",41.67,-70.28
224288,61168,2025-07-30,27.6,31.9,22.5,30.6,36.9,26.4,17.8,36.0,...,25.85,5.95,0.0,0.0,0.0,0.0,3,"(41.671, -70.284)",41.67,-70.28


In [None]:
# Compile missing codes and extract their lats and longs
# Origins that have at least one flight row without origin weather.
missing_codes = list(delays_raw.loc[delays_raw["origin_time"].isna(), "Origin"].unique())

# Build one lookup table (first row seen per origin) instead of re-filtering the
# whole frame twice for every code.
first_coords = (
    delays_raw.loc[
        delays_raw["Origin"].isin(missing_codes),
        ["Origin", "origin_latitude", "origin_longitude"],
    ]
    .drop_duplicates(subset="Origin")  # keeps the first occurrence of each origin
    .set_index("Origin")
)

# Plain Python floats, in the same order as missing_codes.
missing_lats = [float(first_coords.at[code, "origin_latitude"]) for code in missing_codes]
missing_longs = [float(first_coords.at[code, "origin_longitude"]) for code in missing_codes]
lat_long_tuples = list(zip(missing_lats, missing_longs))

Let's check whether these codes are present in the airports dataset.

In [69]:
airports = pd.read_csv("../data/raw/airports.csv")

# Show the airport records whose code is one of the origins lacking weather data.
is_missing_code = airports["code"].isin(missing_codes)
airports.loc[is_missing_code]

Unnamed: 0,code,icao,name,latitude,longitude,elevation,url,time_zone,city_code,country,city,state,county,type
152,AGS,KAGS,Augusta Regional Airport,33.372302,-81.965064,101,,America/New_York,AGS,US,,,,AP
462,AUS,KAUS,Austin-Bergstrom International Airport,30.193489,-97.66501,505,http://www.ci.austin.tx.us/austinairport/,America/Chicago,AUS,US,Hornsby Bend,Texas,Travis County,AP
638,BDL,KBDL,Bradley International Airport,41.940584,-72.68502,173,http://www.bradleyairport.com/,America/New_York,BDL,US,Windsor Locks,Connecticut,Hartford County,AP
1775,DAL,KDAL,Dallas Love Field,32.845023,-96.849841,495,http://www.dallas-lovefield.com/,America/Chicago,DFW,US,University Park,Texas,Dallas County,AP
1786,DAY,KDAY,James M. Cox Dayton International Airport,39.905319,-84.220245,1010,,America/New_York,DAY,US,Vandalia,Ohio,Montgomery County,AP
1835,DEN,KDEN,Denver International Airport,39.860668,-104.685367,5347,http://www.flydenver.com/,America/Denver,DEN,US,Lochbuie,Colorado,Weld County,AP
2884,GRR,KGRR,Gerald R. Ford International Airport,42.876615,-85.534882,784,http://www.grr.org/,America/Detroit,GRR,US,Forest Hills,Michigan,Kent County,AP
3321,HVN,KHVN,New Haven Airport,41.265278,-72.888336,26,,America/New_York,HVN,US,East Haven,Connecticut,New Haven County,AP
3654,JAX,KJAX,Jacksonville International Airport,30.49473,-81.692451,19,http://www.jaa.aero/,America/New_York,JAX,US,Nassau Village-Ratliff,Florida,Nassau County,AP
5001,MDW,KMDW,Chicago Midway International Airport,41.785444,-87.750828,603,https://www.flychicago.com/midway/home/pages/d...,America/Chicago,CHI,US,Stickney,Illinois,Cook County,AP


Now that we know they exist in `airports`, let's call the Open-Meteo API and gather the weather data for the missing airports.

In [102]:
%%time

import requests

date_start, date_end = min(pd.to_datetime(delays_raw["FlightDate"])), max(pd.to_datetime(delays_raw["FlightDate"]))

daily_vars = [
    "temperature_2m_mean",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "weather_code",
]

API_URL = "https://archive-api.open-meteo.com/v1/archive"


params = {
    "latitude": ",".join([str(lat) for lat in missing_lats]),
    "longitude": ",".join([str(long) for long in missing_longs]),
    "start_date": date_start.strftime("%Y-%m-%d"),
    "end_date": date_end.strftime("%Y-%m-%d"),
    "daily": daily_vars
}

response = requests.get(API_URL, params = params)
print(response.status_code) # Check that code is 200

missing_dfs = []
for idx in range(0,len(missing_lats)):
    df = pd.DataFrame(response.json()[idx]["daily"])
    df["lat_long"] = [(missing_lats[idx], missing_longs[idx])] * len(df)
    # df["latitude"] = [missing_lats[idx]] * len(df)
    # df["longitude"] = [missing_longs[idx]] * len(df)
    missing_dfs.append(df)

missing_weather_df = pd.concat(missing_dfs)
# missing_weather_df = missing_weather_df.add_prefix("origin_")

missing_weather_df

200
CPU times: user 739 ms, sys: 24.9 ms, total: 764 ms
Wall time: 3.06 s


Unnamed: 0,time,temperature_2m_mean,temperature_2m_max,temperature_2m_min,apparent_temperature_mean,apparent_temperature_max,apparent_temperature_min,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,weather_code,lat_long
0,2020-01-01,8.4,11.4,5.4,3.7,6.2,0.6,21.6,41.4,269,10.77,2.36,0.0,0.0,0.0,0.0,0,"(36.89, -76.2)"
1,2020-01-02,7.9,14.7,3.0,4.0,10.0,-0.6,20.0,37.4,215,8.65,1.85,0.0,0.0,0.0,0.0,3,"(36.89, -76.2)"
2,2020-01-03,12.6,18.4,8.8,9.7,17.6,5.5,23.7,40.7,228,5.11,1.38,4.0,4.0,0.0,11.0,53,"(36.89, -76.2)"
3,2020-01-04,15.2,16.9,13.8,14.8,15.7,13.8,19.8,45.4,228,3.21,0.63,31.0,31.0,0.0,20.0,63,"(36.89, -76.2)"
4,2020-01-05,8.2,15.2,4.8,2.6,13.3,-1.4,35.4,64.1,305,10.86,1.96,6.0,6.0,0.0,9.0,61,"(36.89, -76.2)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,2025-07-27,32.8,39.3,26.7,35.7,42.3,29.8,12.6,31.3,286,25.91,7.36,0.0,0.0,0.0,0.0,1,"(33.37, -81.97)"
2035,2025-07-28,32.2,38.3,27.0,36.0,43.2,30.4,17.0,43.2,318,22.29,6.06,4.9,4.9,0.0,3.0,63,"(33.37, -81.97)"
2036,2025-07-29,29.7,36.4,24.9,35.2,40.9,30.5,13.0,29.9,26,23.84,5.32,0.2,0.2,0.0,1.0,51,"(33.37, -81.97)"
2037,2025-07-30,27.7,32.6,24.3,33.8,39.8,30.0,10.8,23.4,59,17.09,3.66,8.0,8.0,0.0,10.0,63,"(33.37, -81.97)"


Now we'll save that in the `../data/raw/` directory along with the other batches.

In [105]:
# The file name starts with "batch" so the later step that concatenates every
# "batch*" file in ../data/raw picks it up. The row index is written as well;
# the concatenation step drops it again as "Unnamed: 0".
missing_weather_df.to_csv("../data/raw/batch_missing_weather_data.csv")

## FIX DATA TO INCLUDE MISSING ORIGIN WEATHER
Now that we've collected the missing origin weather data, we'll recreate the "full" combined dataset and overwrite the old one. First, create the fixed origin weather dataset by concatenating the batched origin weather files.

In [128]:
# Reload every input needed to rebuild the combined dataset.
RAW_DIR = "../data/raw"

# Flight delays; the saved row index comes back as "Unnamed: 0" and is discarded.
delay_raw_df = pd.read_csv("../data/intermediate/delays_PHL_2020_2025.csv").drop(columns=["Unnamed: 0"])
airports_df = pd.read_csv("../data/intermediate/airport_codes.csv")

# Origin weather is stored as several "batch*" CSV files. Only CSVs are read,
# so a stray non-CSV file with the same prefix cannot corrupt the concat.
batch_files = sorted(
    file for file in os.listdir(RAW_DIR)
    if file.startswith("batch") and file.endswith(".csv")
)
origin_fixed_df = (
    pd.concat([pd.read_csv(os.path.join(RAW_DIR, file)) for file in batch_files])
    .drop(columns=["Unnamed: 0"])
    .add_prefix("origin_")  # every weather column becomes origin_<name>
)

dest_weather_df = pd.read_csv("../data/raw/destination_weather_data.csv")

First, merge on `airports_df` to get latitudes and longitudes.

In [129]:
# Airport lookup restricted to the fields we need, renamed to origin_code / origin_name / ...
origin_airports = airports_df[["code", "name", "latitude", "longitude"]].add_prefix("origin_")

# Left join keeps every flight, even when its origin code has no airport record.
delays_df1 = delay_raw_df.merge(
    origin_airports,
    how="left",
    left_on="Origin",
    right_on="origin_code",
)

In [130]:
# Display the concatenated origin weather batches.
origin_fixed_df

Unnamed: 0,origin_time,origin_temperature_2m_mean,origin_temperature_2m_max,origin_temperature_2m_min,origin_apparent_temperature_mean,origin_apparent_temperature_max,origin_apparent_temperature_min,origin_wind_speed_10m_max,origin_wind_gusts_10m_max,origin_wind_direction_10m_dominant,origin_shortwave_radiation_sum,origin_et0_fao_evapotranspiration,origin_precipitation_sum,origin_rain_sum,origin_snowfall_sum,origin_precipitation_hours,origin_weather_code,origin_lat_long
0,2020-01-01,18.9,23.7,14.4,19.2,24.5,13.3,11.3,20.5,351,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)"
1,2020-01-02,21.4,25.5,17.2,22.5,27.2,17.4,16.3,31.3,111,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
2,2020-01-03,24.8,27.0,22.9,26.6,28.1,25.2,25.4,46.1,160,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
3,2020-01-04,25.3,29.0,23.3,27.5,30.0,25.5,24.1,43.9,189,14.09,3.30,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
4,2020-01-05,17.9,23.1,13.1,15.7,25.7,9.0,26.3,45.0,335,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30580,2025-07-27,32.8,39.3,26.7,35.7,42.3,29.8,12.6,31.3,286,25.91,7.36,0.0,0.0,0.0,0.0,1,"(33.37, -81.97)"
30581,2025-07-28,32.2,38.3,27.0,36.0,43.2,30.4,17.0,43.2,318,22.29,6.06,4.9,4.9,0.0,3.0,63,"(33.37, -81.97)"
30582,2025-07-29,29.7,36.4,24.9,35.2,40.9,30.5,13.0,29.9,26,23.84,5.32,0.2,0.2,0.0,1.0,51,"(33.37, -81.97)"
30583,2025-07-30,27.7,32.6,24.3,33.8,39.8,30.0,10.8,23.4,59,17.09,3.66,8.0,8.0,0.0,10.0,63,"(33.37, -81.97)"


In [131]:
# Convert the "(lat, long)" text read from the batch CSVs into real tuples.
# Values that are not strings are passed through unchanged.
parsed_lat_long = origin_fixed_df["origin_lat_long"].apply(
    lambda pair: ast.literal_eval(pair) if isinstance(pair, str) else pair
)
origin_fixed_df["origin_lat_long"] = parsed_lat_long

# Split each tuple into its own latitude / longitude column, reusing the
# frame's index so rows stay aligned.
lat_long_frame = pd.DataFrame(
    origin_fixed_df["origin_lat_long"].tolist(),
    index=origin_fixed_df.index,
)
origin_fixed_df[["origin_latitude", "origin_longitude"]] = lat_long_frame
origin_fixed_df.head()

Unnamed: 0,origin_time,origin_temperature_2m_mean,origin_temperature_2m_max,origin_temperature_2m_min,origin_apparent_temperature_mean,origin_apparent_temperature_max,origin_apparent_temperature_min,origin_wind_speed_10m_max,origin_wind_gusts_10m_max,origin_wind_direction_10m_dominant,origin_shortwave_radiation_sum,origin_et0_fao_evapotranspiration,origin_precipitation_sum,origin_rain_sum,origin_snowfall_sum,origin_precipitation_hours,origin_weather_code,origin_lat_long,origin_latitude,origin_longitude
0,2020-01-01,18.9,23.7,14.4,19.2,24.5,13.3,11.3,20.5,351,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)",26.072,-80.151
1,2020-01-02,21.4,25.5,17.2,22.5,27.2,17.4,16.3,31.3,111,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.072,-80.151
2,2020-01-03,24.8,27.0,22.9,26.6,28.1,25.2,25.4,46.1,160,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.072,-80.151
3,2020-01-04,25.3,29.0,23.3,27.5,30.0,25.5,24.1,43.9,189,14.09,3.3,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)",26.072,-80.151
4,2020-01-05,17.9,23.1,13.1,15.7,25.7,9.0,26.3,45.0,335,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)",26.072,-80.151


In [132]:
# Round the coordinates of both frames to 2 decimals so the same precision is used on each side.
for frame in (delays_df1, origin_fixed_df):
    for coord_col in ("origin_latitude", "origin_longitude"):
        frame[coord_col] = frame[coord_col].round(2)

In [133]:
# Attach origin weather (origin_fixed_df) to each flight in delays_df1,
# matching on the date plus the rounded origin coordinates.
delays_df2 = pd.merge(
    left=delays_df1,
    right=origin_fixed_df,
    how="left",
    left_on=["FlightDate", "origin_latitude", "origin_longitude"],
    right_on=["origin_time", "origin_latitude", "origin_longitude"],
)
# A left join should only add columns. Extra rows would mean the weather data
# holds several records for one (date, latitude, longitude) key.
assert len(delays_df2) == len(delays_df1), "origin weather merge duplicated flight rows"

delays_df2.head()

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,...,origin_wind_gusts_10m_max,origin_wind_direction_10m_dominant,origin_shortwave_radiation_sum,origin_et0_fao_evapotranspiration,origin_precipitation_sum,origin_rain_sum,origin_snowfall_sum,origin_precipitation_hours,origin_weather_code,origin_lat_long
0,2020-01-01,20409,N655JB,976,11697,FLL,14100,PHL,2152,2143.0,...,20.5,351,15.43,2.79,0.0,0.0,0.0,0.0,1,"(26.072, -80.151)"
1,2020-01-02,20409,N591JB,976,11697,FLL,14100,PHL,2152,2152.0,...,31.3,111,14.55,2.88,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
2,2020-01-03,20409,N657JB,976,11697,FLL,14100,PHL,2152,2150.0,...,46.1,160,12.22,2.86,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
3,2020-01-04,20409,N709JB,976,11697,FLL,14100,PHL,2152,2215.0,...,43.9,189,14.09,3.3,0.0,0.0,0.0,0.0,3,"(26.072, -80.151)"
4,2020-01-05,20409,N627JB,976,11697,FLL,14100,PHL,2152,2149.0,...,45.0,335,14.94,3.15,0.5,0.5,0.0,3.0,51,"(26.072, -80.151)"


In [138]:
# Bring both join keys to datetime so FlightDate and dest_time compare equal.
dest_weather_df["dest_time"] = pd.to_datetime(dest_weather_df["dest_time"])
# errors="ignore" keeps the cell re-runnable after the saved index column has already been removed.
dest_weather_df = dest_weather_df.drop(columns=["Unnamed: 0"], errors="ignore")
delays_df2["FlightDate"] = pd.to_datetime(delays_df2["FlightDate"])

# Explicit left join, like the origin merges: flights on dates without
# destination weather are kept (with NaNs) instead of being silently dropped.
delays_df3 = pd.merge(left=delays_df2, right=dest_weather_df, how="left",
                      left_on="FlightDate", right_on="dest_time")
# Extra rows would mean dest_weather_df holds more than one record per date.
assert len(delays_df3) == len(delays_df2), "destination weather merge duplicated flight rows"

delays_df3.head()

Unnamed: 0,FlightDate,DOT_ID_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,...,dest_wind_speed_10m_max,dest_wind_gusts_10m_max,dest_wind_direction_10m_dominant,dest_shortwave_radiation_sum,dest_et0_fao_evapotranspiration,dest_precipitation_sum,dest_rain_sum,dest_snowfall_sum,dest_precipitation_hours,dest_weather_code
0,2020-01-01,20409,N655JB,976,11697,FLL,14100,PHL,2152,2143.0,...,8.0,28.8,338,10.44,0.69,0.0,0.0,0.0,0.0,3
1,2020-01-02,20409,N591JB,976,11697,FLL,14100,PHL,2152,2152.0,...,7.9,26.3,326,9.66,0.64,0.0,0.0,0.0,0.0,3
2,2020-01-03,20409,N657JB,976,11697,FLL,14100,PHL,2152,2150.0,...,7.1,35.3,332,8.78,0.65,0.0,0.0,0.0,0.0,3
3,2020-01-04,20409,N709JB,976,11697,FLL,14100,PHL,2152,2215.0,...,9.1,40.7,354,7.64,0.54,0.0,0.0,0.0,0.0,3
4,2020-01-05,20409,N627JB,976,11697,FLL,14100,PHL,2152,2149.0,...,8.8,25.9,13,10.25,0.54,0.2,0.0,0.14,2.0,71


In [139]:
# Create na overview - check if origin_ columns are fixed
na_counts = delays_df3.isna().sum()  # Series indexed by column name

na_df = pd.DataFrame({
    "Column": na_counts.index,
    "Number of NA Values": na_counts.to_numpy(),
    # Despite the label this is a fraction of rows (0-1), not a percentage.
    "Percent NA": na_counts.to_numpy() / len(delays_df3),
})
# Worst-affected columns first.
na_df.sort_values(by="Percent NA", ascending=False) #Fixed!

Unnamed: 0,Column,Number of NA Values,Percent NA
18,CancellationCode,476181,0.974998
27,SecurityDelay,394357,0.807460
28,LateAircraftDelay,394357,0.807460
26,NASDelay,394357,0.807460
25,WeatherDelay,394357,0.807460
...,...,...,...
36,origin_temperature_2m_min,0,0.000000
37,origin_apparent_temperature_mean,0,0.000000
38,origin_apparent_temperature_max,0,0.000000
39,origin_apparent_temperature_min,0,0.000000


In [140]:
# List the column names of the rebuilt combined frame.
delays_df3.columns

Index(['FlightDate', 'DOT_ID_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID', 'Origin',
       'DestAirportID', 'Dest', 'CRSDepTime', 'DepTime', 'DepDelay', 'TaxiOut',
       'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrDelay',
       'Cancelled', 'CancellationCode', 'Diverted', 'CRSElapsedTime',
       'ActualElapsedTime', 'AirTime', 'Distance', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'origin_code', 'origin_name', 'origin_latitude', 'origin_longitude',
       'origin_time', 'origin_temperature_2m_mean',
       'origin_temperature_2m_max', 'origin_temperature_2m_min',
       'origin_apparent_temperature_mean', 'origin_apparent_temperature_max',
       'origin_apparent_temperature_min', 'origin_wind_speed_10m_max',
       'origin_wind_gusts_10m_max', 'origin_wind_direction_10m_dominant',
       'origin_shortwave_radiation_sum', 'origin_et0_fao_evapotranspiration',
       'origi

In [141]:
# Overwrite the processed dataset. index=False keeps the CSV column-for-column
# identical to the parquet copy below; the default would prepend the row index
# as an extra unnamed column.
delays_df3.to_csv("../data/processed/delays_PHL_coord_weather_data.csv", index=False)
# Store the coordinate pair as text before the parquet write.
# NOTE(review): presumably the tuple-valued column cannot be written by pyarrow - confirm.
delays_df3["origin_lat_long"] = delays_df3["origin_lat_long"].astype(str)
# Also save as parquet to send to GitHub
delays_df3.to_parquet("../data/processed/delays_PHL_coord_weather_data.parquet", index=False, engine="pyarrow", compression="snappy")

In [143]:
# Also write fixed weather df to intermediate folder.
# This overwrites the origin weather file that was loaded earlier for inspection.
# The frame's index is written too, since to_csv keeps it by default.
origin_fixed_df.to_csv("../data/intermediate/origin_weather_data.csv")