# Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the processed CSV; its first column is used as the row index.
data = pd.read_csv(
    "../data/processed/delay_weather_coord.csv",
    index_col=0,
)

In [10]:
# Work on a copy so the freshly loaded `data` frame stays untouched,
# then list the available columns.
df = data.copy(deep=True)
df.columns

Index(['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'Unnamed: 27', 'time', 'temperature_2m_mean', 'temperature_2m_max',
       'temperature_2m_min', 'apparent_temperature_mean',
       'apparent_temperature_max', 'apparent_temperature_min',
       'wind_speed_10m_max', 'wind_gusts_10m_max',
       'wind_direction_10m_dominant', 'shortwave_radiation_sum',
       'et0_fao_evapotranspiration', 'precipitation_sum', 'rain_sum',
       'snowfall_sum', 'precipitation_hours', 'weather_code', 'code',
       'latitude', 'longitude'],
      dtype='object')

In [13]:
# Keep only the rows where both the CANCELLED and DIVERTED flags are 0.
df = df.loc[df["CANCELLED"].eq(0) & df["DIVERTED"].eq(0)]

In [15]:
# Summarise dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 417952 entries, 0 to 426975
Data columns (total 48 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   FL_DATE                      417952 non-null  object 
 1   OP_CARRIER                   417952 non-null  object 
 2   OP_CARRIER_FL_NUM            417952 non-null  int64  
 3   ORIGIN                       417952 non-null  object 
 4   DEST                         417952 non-null  object 
 5   CRS_DEP_TIME                 417952 non-null  float64
 6   DEP_TIME                     417952 non-null  float64
 7   DEP_DELAY                    417952 non-null  float64
 8   TAXI_OUT                     417952 non-null  float64
 9   WHEELS_OFF                   417952 non-null  float64
 10  WHEELS_ON                    417952 non-null  float64
 11  TAXI_IN                      417952 non-null  float64
 12  CRS_ARR_TIME                 417952 non-null  float64
 13  ARR_

In [17]:
# Per-column missing-value summary.
# "Percent NA" is stored as a fraction of the row count (0-1), not 0-100.
cols = list(df.columns)
nas = list(df.isna().sum())  # one NA count per column, in column order
pct_nas = [na_count / len(df) for na_count in nas]

na_dict = dict(
    zip(
        ("Column", "Number of NA Values", "Percent NA"),
        (cols, nas, pct_nas),
    )
)

na_df = pd.DataFrame(na_dict)

In [22]:
# Names of the columns whose NA fraction exceeds 0.70.
na_cols = na_df.loc[na_df["Percent NA"] > 0.70, "Column"].tolist()

In [23]:
# Drop the mostly-missing columns; `drop` returns a new frame, so `df` is left intact.
df1 = df.drop(columns=na_cols)

In [24]:
# Column counts before and after dropping the mostly-missing columns.
len(df.columns), len(df1.columns)

(48, 41)

In [25]:
# Columns that remain after dropping the mostly-missing ones.
df1.columns

Index(['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME',
       'AIR_TIME', 'DISTANCE', 'time', 'temperature_2m_mean',
       'temperature_2m_max', 'temperature_2m_min', 'apparent_temperature_mean',
       'apparent_temperature_max', 'apparent_temperature_min',
       'wind_speed_10m_max', 'wind_gusts_10m_max',
       'wind_direction_10m_dominant', 'shortwave_radiation_sum',
       'et0_fao_evapotranspiration', 'precipitation_sum', 'rain_sum',
       'snowfall_sum', 'precipitation_hours', 'weather_code', 'code',
       'latitude', 'longitude'],
      dtype='object')

In [58]:
# Normalise every header: trim whitespace, lowercase, spaces -> underscores.
df1.columns = [name.strip().lower().replace(" ", "_") for name in df1.columns]

# The coordinate columns are renamed with an `origin_` prefix for clarity.
df1 = df1.rename(
    columns={
        "latitude": "origin_latitude",
        "longitude": "origin_longitude",
    }
)

# Drop join artifacts ("time", "code") and the cancelled/diverted flags,
# which are all 0 after the earlier filter. `drop` returns a new frame.
df2 = df1.drop(columns=["time", "code", "cancelled", "diverted"])

In [62]:
# Preview the first rows of the renamed, trimmed frame.
df2.head()

Unnamed: 0,fl_date,op_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,...,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,weather_code,origin_latitude,origin_longitude
0,2009-01-01,XE,2514,CLE,PHL,1115.0,1108.0,-7.0,12.0,1120.0,...,225,10.84,0.52,0.0,0.0,0.0,0.0,3,41.406619,-81.851202
1,2009-01-01,XE,2538,CLE,PHL,1545.0,1542.0,-3.0,13.0,1555.0,...,225,10.84,0.52,0.0,0.0,0.0,0.0,3,41.406619,-81.851202
2,2009-01-01,YV,7144,IAD,PHL,832.0,833.0,1.0,14.0,847.0,...,225,10.84,0.52,0.0,0.0,0.0,0.0,3,38.952266,-77.453485
3,2009-01-01,YV,7218,IAD,PHL,1710.0,1706.0,-4.0,17.0,1723.0,...,225,10.84,0.52,0.0,0.0,0.0,0.0,3,38.952266,-77.453485
4,2009-01-01,NW,1762,DTW,PHL,1910.0,1905.0,-5.0,21.0,1926.0,...,225,10.84,0.52,0.0,0.0,0.0,0.0,3,42.205699,-83.352975


In [63]:
from geopy.distance import geodesic

# (latitude, longitude) of the fixed end point for every distance below.
# NOTE(review): presumably the PHL airport, which is the `dest` in the previewed rows - confirm.
PHL_coords = (39.8730, -75.2437)

# The distance depends only on the origin coordinates, which repeat across
# many rows. Compute it once per distinct (lat, lon) pair and look it up per
# row instead of running geopy's solver on every row via `df.apply(axis=1)`.
# The resulting values are the same; only the amount of repeated work changes.
origin_pairs = list(zip(df2["origin_latitude"], df2["origin_longitude"]))
km_by_origin = {
    pair: geodesic(pair, PHL_coords).km for pair in set(origin_pairs)
}

# Result is in kilometres (`.km`).
# NOTE(review): the existing `distance` column looks like miles (ratio of about
# 1.609 in the previewed rows) - convert one of them before comparing.
df2["geodesic_distance"] = [km_by_origin[pair] for pair in origin_pairs]

In [64]:
# Confirm that the new geodesic_distance column is present.
df2.columns

Index(['fl_date', 'op_carrier', 'op_carrier_fl_num', 'origin', 'dest',
       'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay',
       'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'distance',
       'temperature_2m_mean', 'temperature_2m_max', 'temperature_2m_min',
       'apparent_temperature_mean', 'apparent_temperature_max',
       'apparent_temperature_min', 'wind_speed_10m_max', 'wind_gusts_10m_max',
       'wind_direction_10m_dominant', 'shortwave_radiation_sum',
       'et0_fao_evapotranspiration', 'precipitation_sum', 'rain_sum',
       'snowfall_sum', 'precipitation_hours', 'weather_code',
       'origin_latitude', 'origin_longitude', 'geodesic_distance'],
      dtype='object')

In [65]:
# Side-by-side view of the reported distance and the computed geodesic distance.
# NOTE(review): geodesic_distance is in km; `distance` appears to be in miles
# (the displayed ratio is about 1.609) - confirm the units before comparing.
df2.loc[:, ["distance", "geodesic_distance"]]

Unnamed: 0,distance,geodesic_distance
0,363.0,584.130926
1,363.0,584.130926
2,134.0,216.015830
3,134.0,216.015830
4,453.0,729.048113
...,...,...
426971,666.0,1072.041223
426972,666.0,1072.041223
426973,861.0,1387.951879
426974,678.0,1091.018116
