## Notebook for preprocessing flight and weather data to standardize airport, time, date etc and then joining the datasets together

### Preprocess flight data

In [1]:
import pandas as pd

# Read flight data
flight_data = '/mnt/c/Users/Admin/Downloads/processed_flight_data_2022_and_2021.csv'
df_flight = pd.read_csv(flight_data)



In [2]:
df_flight.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_WAC,DEST_AIRPORT_ID,DEST,DEST_WAC,CRS_DEP_TIME,DEP_DELAY_NEW,CRS_ARR_TIME,DISTANCE
0,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,600,2.0,833,946.0
1,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,845,6.0,1121,946.0
2,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,959,2.0,1237,946.0
3,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,1045,34.0,1323,946.0
4,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,1135,0.0,1400,946.0


#### Fix date format

In [3]:
# Convert 'FL_DATE' column to datetime format if it's not already
df_flight['Flight_Date'] = pd.to_datetime(df_flight['FL_DATE'], format='%m/%d/%Y %I:%M:%S %p')

df_flight.drop(columns=['FL_DATE'], inplace=True)

In [4]:
df_flight.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_WAC,DEST_AIRPORT_ID,DEST,DEST_WAC,CRS_DEP_TIME,DEP_DELAY_NEW,CRS_ARR_TIME,DISTANCE,Flight_Date
0,2022,2,4,1,5,10397,ATL,34,10721,BOS,13,600,2.0,833,946.0,2022-04-01
1,2022,2,4,1,5,10397,ATL,34,10721,BOS,13,845,6.0,1121,946.0,2022-04-01
2,2022,2,4,1,5,10397,ATL,34,10721,BOS,13,959,2.0,1237,946.0,2022-04-01
3,2022,2,4,1,5,10397,ATL,34,10721,BOS,13,1045,34.0,1323,946.0,2022-04-01
4,2022,2,4,1,5,10397,ATL,34,10721,BOS,13,1135,0.0,1400,946.0,2022-04-01


In [5]:
df_flight['CRS_DEP_TIME']

0           600
1           845
2           959
3          1045
4          1135
           ... 
1954777    1630
1954778    1800
1954779    1910
1954780    2004
1954781    2100
Name: CRS_DEP_TIME, Length: 1954782, dtype: int64

### Preprocess weather data

In [6]:
# Read weather data
weather_data = '/mnt/c/Users/Admin/Downloads/us_airport_weather_2021_2022.csv'
df_weather = pd.read_csv(weather_data, low_memory=False)

In [7]:
 column_used = [
    "STATION",
    "NAME",
    "DATE",
    # "HourlyAltimeterSetting",
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyPrecipitation",
    "HourlyPresentWeatherType",
    # "HourlyPressureChange",
    # "HourlyPressureTendency",
    "HourlyRelativeHumidity",
    # "HourlySeaLevelPressure",
    "HourlySkyConditions",
    "HourlyStationPressure",
    "HourlyVisibility",
    # "HourlyWetBulbTemperature",
    "HourlyWindDirection",
    "HourlyWindGustSpeed",
    "HourlyWindSpeed",
]

df_weather= df_weather[column_used] 

In [8]:
df_weather.head()

Unnamed: 0,STATION,NAME,DATE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyRelativeHumidity,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed
0,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2021-01-01T00:54:00,-5.0,0.6,0.0,,66.0,SCT:04 76.20,1029.8,16.09,360,,3.0
1,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2021-01-01T01:00:00,-5.0,0.6,,,66.0,,1028.45,16.0,360,,3.0
2,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2021-01-01T01:54:00,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1030.48,16.09,20,,3.0
3,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2021-01-01T02:54:00,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1031.49,16.09,360,,3.0
4,72408013739,"PHILADELPHIA INTERNATIONAL AIRPORT, PA US",2021-01-01T03:54:00,-6.1,-1.1,0.0,,69.0,FEW:02 76.20,1031.16,16.09,360,,3.0


#### Airport code mapping

In [9]:
airport_codes = {
    "PHILADELPHIA INTERNATIONAL AIRPORT, PA US": "PHL",
    "SEATTLE TACOMA AIRPORT, WA US": "SEA",
    "JFK INTERNATIONAL AIRPORT, NY US": "JFK",
    "DENVER INTERNATIONAL AIRPORT, CO US": "DEN",
    "NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US": "EWR",
    "MCCARRAN INTERNATIONAL AIRPORT, NV US": "LAS",
    "ORLANDO INTERNATIONAL AIRPORT, FL US": "MCO",
    "ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPORT, GA US": "ATL",
    "FORT LAUDERDALE INTERNATIONAL AIRPORT, FL US": "FLL",
    "DETROIT METRO AIRPORT, MI US": "DTW",
    "WASHINGTON DULLES INTERNATIONAL AIRPORT, VA US": "IAD",
    "CHICAGO OHARE INTERNATIONAL AIRPORT, IL US": "ORD",
    "LOS ANGELES INTERNATIONAL AIRPORT, CA US": "LAX",
    "CHARLOTTE DOUGLAS AIRPORT, NC US": "CLT",
    "BOSTON LOGAN INTERNATIONAL AIRPORT, MA US": "BOS",
    "MIAMI INTERNATIONAL AIRPORT, FL US": "MIA",
    "HOUSTON INTERCONTINENTAL AIRPORT, TX US": "IAH",
    "SAN FRANCISCO INTERNATIONAL AIRPORT, CA US": "SFO",
}

In [10]:
# Function to map airport names to airport codes
def map_airport_names_to_codes(airport_name):
    return airport_codes.get(airport_name, airport_name)  # Return code or original name if not found


In [11]:

# Apply the mapping function to the weather data's airport names column
df_weather['NAME'] = df_weather['NAME'].apply(map_airport_names_to_codes)

In [12]:
df_weather.head()

Unnamed: 0,STATION,NAME,DATE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyRelativeHumidity,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed
0,72408013739,PHL,2021-01-01T00:54:00,-5.0,0.6,0.0,,66.0,SCT:04 76.20,1029.8,16.09,360,,3.0
1,72408013739,PHL,2021-01-01T01:00:00,-5.0,0.6,,,66.0,,1028.45,16.0,360,,3.0
2,72408013739,PHL,2021-01-01T01:54:00,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1030.48,16.09,20,,3.0
3,72408013739,PHL,2021-01-01T02:54:00,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1031.49,16.09,360,,3.0
4,72408013739,PHL,2021-01-01T03:54:00,-6.1,-1.1,0.0,,69.0,FEW:02 76.20,1031.16,16.09,360,,3.0


#### Fix date format and split time to separate column

In [13]:
# Convert 'DATE' column to datetime format if it's not already
df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])

# Create separate columns for date and time
df_weather['Weather_Date'] = df_weather['DATE'].dt.date
df_weather['Weather_Time'] = df_weather['DATE'].dt.time


In [14]:
df_weather.drop(columns=['DATE'], inplace=True)

In [15]:
df_weather.head()

Unnamed: 0,STATION,NAME,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyRelativeHumidity,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Weather_Date,Weather_Time
0,72408013739,PHL,-5.0,0.6,0.0,,66.0,SCT:04 76.20,1029.8,16.09,360,,3.0,2021-01-01,00:54:00
1,72408013739,PHL,-5.0,0.6,,,66.0,,1028.45,16.0,360,,3.0,2021-01-01,01:00:00
2,72408013739,PHL,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1030.48,16.09,20,,3.0,2021-01-01,01:54:00
3,72408013739,PHL,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1031.49,16.09,360,,3.0,2021-01-01,02:54:00
4,72408013739,PHL,-6.1,-1.1,0.0,,69.0,FEW:02 76.20,1031.16,16.09,360,,3.0,2021-01-01,03:54:00


#### change  time so it is in same 24 hour int64 format as flight data_crs_dep time and_crs_arr time

In [16]:
# Function to convert datetime.time to 24-hour format
def convert_time_to_24_hour_format(time_obj):
    hours = time_obj.hour
    minutes = time_obj.minute
    military_time = hours * 100 + minutes
    return military_time

# Apply the function to the 'Weather_Time' column
df_weather['Weather_Time'] = df_weather['Weather_Time'].apply(convert_time_to_24_hour_format)

In [17]:
df_weather.head()

Unnamed: 0,STATION,NAME,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyPresentWeatherType,HourlyRelativeHumidity,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,HourlyWindDirection,HourlyWindGustSpeed,HourlyWindSpeed,Weather_Date,Weather_Time
0,72408013739,PHL,-5.0,0.6,0.0,,66.0,SCT:04 76.20,1029.8,16.09,360,,3.0,2021-01-01,54
1,72408013739,PHL,-5.0,0.6,,,66.0,,1028.45,16.0,360,,3.0,2021-01-01,100
2,72408013739,PHL,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1030.48,16.09,20,,3.0,2021-01-01,154
3,72408013739,PHL,-5.6,0.0,0.0,,66.0,FEW:02 76.20,1031.49,16.09,360,,3.0,2021-01-01,254
4,72408013739,PHL,-6.1,-1.1,0.0,,69.0,FEW:02 76.20,1031.16,16.09,360,,3.0,2021-01-01,354


In [24]:
df_flight['Flight_Date'] = pd.to_datetime(df_flight['Flight_Date'])
df_weather['Weather_Date'] = pd.to_datetime(df_weather['Weather_Date'])

### Trying to join the data. Brute force attempt wokrs but takes way too long. Trying to figure out how to use pandas to filter. Right now leads to out of memory error

The aim is to filter for each row in df_flight and check if there is a df_weather[NAME] airport code row that matches either df_flight[ORIGIN] or df_flight[DEST] airport codes. 

If this is true I want to take the df_weather[Weather Date] and check which ones matches the df_flight row date. 

If this is true I want to check which df_weather rows have a df_weather[weather_time] that is closest in time to df_flight[crs_dep_time] if origin airport code matched or df_flight[crs_arr_time] if dest airport code matched. Then I want to add for each row in df_flight origin_weather columns which will be the df_weather data matching to origin airport, and dest_weather columns which will be the row matching the dest airport

#### brute force attempt works just seems inefficient and takes way too long

In [29]:
def find_closest_weather(row, weather_data, time_col):
    flight_time = row[time_col]
    valid_weather = weather_data.copy()
    
    if not valid_weather.empty:
        valid_weather['time_diff'] = abs(valid_weather['Weather_Time'] - flight_time)
        closest_weather = valid_weather.loc[valid_weather['time_diff'].idxmin()]
        return closest_weather
    else:
        print("ERROR NO VALID WEATHER")
        return None



In [30]:
result = []

for index, row_flight in df_flight.iterrows():
    mask_origin = (df_weather['NAME'] == row_flight['ORIGIN']) & (df_weather['Weather_Date'] == row_flight['Flight_Date'])
    matching_origin = df_weather[mask_origin]
    
    mask_dest = (df_weather['NAME'] == row_flight['DEST']) & (df_weather['Weather_Date'] == row_flight['Flight_Date'])
    matching_dest = df_weather[mask_dest]

    if not matching_origin.empty and not matching_dest.empty:
        closest_origin_weather = find_closest_weather(row_flight, matching_origin, 'CRS_DEP_TIME')
        closest_dest_weather = find_closest_weather(row_flight, matching_dest, 'CRS_ARR_TIME')

        if closest_origin_weather is not None and closest_dest_weather is not None:
            row_flight['closest_origin_weather'] = closest_origin_weather
            row_flight['closest_dest_weather'] = closest_dest_weather
            result.append(row_flight)

final_result = pd.concat(result)


KeyboardInterrupt: 

In [34]:
print(result[0]['closest_origin_weather'])
print(result[0])

STATION                              72219013874
NAME                                         ATL
HourlyDewPointTemperature                    3.9
HourlyDryBulbTemperature                     8.9
HourlyPrecipitation                         0.00
HourlyPresentWeatherType                     NaN
HourlyRelativeHumidity                      71.0
HourlySkyConditions                 FEW:02 76.20
HourlyStationPressure                     976.63
HourlyVisibility                           16.09
HourlyWindDirection                          280
HourlyWindGustSpeed                          NaN
HourlyWindSpeed                              4.0
Weather_Date                 2022-04-01 00:00:00
Weather_Time                                 552
time_diff                                     48
Name: 225786, dtype: object
YEAR                                                                   2022
QUARTER                                                                   2
MONTH                               

The code should be changed so that the data is not nested for closest_origin_weather and closest_dest_weather, if this approach is chosen.  Also the time difference is just subtraction which will lead to incorrect solutions. for example diff between 6:00 and 5:52 should be 8 minutes but above shows 48 since I used subtraction of 600 and 552. Possible solution would be to convert the times to time object format and take the difference with built in functions and then return it.

#### Attempts at using pandas to filter and merge. Out of memory error because of ineffficient merge. Not even close to a solution 

In [None]:
# Merge based on Origin and Flight_Date
merged_origin = pd.merge(df_flight, df_weather, left_on=['ORIGIN', 'Flight_Date'], right_on=['NAME', 'Weather_Date'], how='inner')
merged_dest = pd.merge(df_flight, df_weather, left_on=['DEST', 'Flight_Date'], right_on=['NAME', 'Weather_Date'], how='inner')


In [None]:

# Function to find the closest weather time
def find_closest_weather_time(row, time_col):
    return row.iloc[(row['Weather_Time'] - row[time_col]).abs().argsort()[0]]

# Filter and find closest weather for origin
origin_weather = merged_origin.groupby(['index']).apply(find_closest_weather_time, 'CRS_DEP_TIME')

# Filter and find closest weather for destination
dest_weather = merged_dest.groupby(['index']).apply(find_closest_weather_time, 'CRS_ARR_TIME')

# Merge the closest weather data with the original flight data
final_result = pd.concat([origin_weather, dest_weather], axis=1)