In [20]:
import pandas as pd
import numpy as np
import os
import json

In [21]:
# Load all available weather data into a dataframe.
# Only the first entry of each file's 'days' list is read, and the first three
# characters of the file name are taken as the airport code.
directory = "vc_api/weather-data/"
# Skip hidden entries (names starting with "."). os.listdir returns names in
# arbitrary order, so sort them to make the row order reproducible.
files = sorted(file for file in os.listdir(directory) if not file.startswith("."))

# Weather fields copied verbatim from the daily record into same-named columns.
weather_fields = ['precip', 'snow', 'windgust', 'cloudcover']

# Column-oriented accumulator; insertion order defines the DataFrame column order.
weather_columns = {'origin': [], 'fl_date': []}
for field in weather_fields:
    weather_columns[field] = []

for file in files:
    # os.path.join does not depend on `directory` ending with a separator.
    # Binary mode lets the json module detect the UTF-8/16/32 encoding itself
    # instead of relying on the platform's default text encoding.
    with open(os.path.join(directory, file), 'rb') as f:
        day = json.load(f)['days'][0]
    weather_columns['origin'].append(file[:3])  # airport code
    weather_columns['fl_date'].append(day['datetime'])
    for field in weather_fields:
        weather_columns[field].append(day[field])

# create DataFrame keyed by (fl_date, origin) so it can be joined onto flights
weather_df = pd.DataFrame(data=weather_columns).set_index(['fl_date', 'origin'])

In [24]:
# Read the table that the weather features will be added to.
# The first CSV column is used as the row index.
df = pd.read_csv(
    'data/model_train_rfeats.csv',
    index_col=0,
)

In [25]:
# Join the weather columns twice: once matched on (fl_date, origin) and once on
# (fl_date, dest). The weather columns are prefixed *before* joining, so:
#   * no blanket rename can touch an unrelated flight column that happens to be
#     called 'precip', 'snow', 'windgust' or 'cloudcover';
#   * re-running this cell raises a column-overlap error instead of silently
#     producing duplicate column names.
flights_with_weather = df
for airport_col in ('origin', 'dest'):
    flights_with_weather = flights_with_weather.join(
        weather_df.add_prefix(airport_col + '_'),
        on=['fl_date', airport_col],
        how='left',
    )

# A left join keeps one output row per flight only if every (fl_date, airport)
# key appears at most once in weather_df; fail loudly rather than duplicating
# flights. df is rebound only after the check passes.
assert len(flights_with_weather) == len(df), (
    "weather join changed the number of rows; "
    "weather_df has duplicate (fl_date, airport) keys"
)
df = flights_with_weather

Engineer weather features

In [26]:
# Bin weather into levels
def precip_bins(val):
    """Map a precipitation value to an ordinal level.

    Returns np.nan for NaN input, 3 when val > 79, 2 when val > 39,
    1 when val > 0, and 0 otherwise.
    """
    # Missing values stay missing.
    if np.isnan(val):
        return np.nan
    # Guard clauses ordered from the highest threshold down.
    if val > 79:
        return 3
    if val > 39:
        return 2
    return 1 if val > 0 else 0

def snow_bins(val):
    """Map a snow value to an ordinal level.

    Returns np.nan for NaN input, 2 when val > 1.8, 1 when val > 0,
    and 0 otherwise.
    """
    if np.isnan(val):
        return np.nan
    # (exclusive lower bound, level) pairs, checked from the highest bound down.
    for lower_bound, level in ((1.8, 2), (0, 1)):
        if val > lower_bound:
            return level
    return 0
    
def wind_bins(val):
    """Map a wind-gust value to an ordinal level.

    Returns np.nan for NaN input, 3 when val > 46, 2 when val > 35,
    1 when val > 0, and 0 otherwise.
    """
    if np.isnan(val):
        return np.nan
    # The cutoffs are increasing, so the level equals the number of cutoffs
    # that val strictly exceeds.
    level = 0
    for cutoff in (0, 35, 46):
        if val > cutoff:
            level += 1
    return level
    
def cloud_bins(val):
    """Map a cloud-cover value to an ordinal level.

    Returns np.nan for NaN input, 2 when val > 71, 1 when val > 47,
    and 0 otherwise.
    """
    if np.isnan(val):
        return np.nan
    # NaN is already excluded, so "not above 47" is the same as "<= 47".
    if val <= 47:
        return 0
    return 2 if val > 71 else 1
    
# Add the binned "*_cat" columns for both airports.
# Each entry is (source column suffix, new column suffix, binning function);
# the origin columns are created first, then the dest columns, in this order.
binning_plan = (
    ('precip', 'precip_cat', precip_bins),
    ('snow', 'snow_cat', snow_bins),
    ('windgust', 'windgust_cat', wind_bins),
    ('cloudcover', 'cloud_cat', cloud_bins),
)
for side in ('origin', 'dest'):
    for source_suffix, target_suffix, bin_func in binning_plan:
        df[side + '_' + target_suffix] = df[side + '_' + source_suffix].apply(bin_func)

In [27]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,tail_num,op_carrier_fl_num,fl_date,origin,dest,crs_elapsed_time,distance,delay_flag,month,day,...,dest_windgust,dest_cloudcover,origin_precip_cat,origin_snow_cat,origin_windgust_cat,origin_cloud_cat,dest_precip_cat,dest_snow_cat,dest_windgust_cat,dest_cloud_cat
0,1391,3494,2018-01-01,ATL,TYS,60.0,152.0,0,1,1,...,,,0,0,2.0,2,,,,
1,1473,3500,2018-01-01,ATL,AVL,59.0,164.0,0,1,1,...,,,0,0,2.0,2,,,,
2,1473,3513,2018-01-01,ATL,GSO,78.0,306.0,0,1,1,...,,,0,0,2.0,2,,,,
3,6010,1,2018-01-01,JFK,FLL,190.0,1069.0,1,1,1,...,27.7,67.9,0,0,3.0,0,0.0,0.0,1.0,1.0
4,999,18,2018-01-01,JFK,BOS,69.0,187.0,0,1,1,...,,,0,0,3.0,0,,,,


In [28]:
# Show the column labels of df to check that the weather and *_cat columns are present.
df.columns

Index(['tail_num', 'op_carrier_fl_num', 'fl_date', 'origin', 'dest',
       'crs_elapsed_time', 'distance', 'delay_flag', 'month', 'day', 'weekday',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest',
       'origin_delay', 'origin_precip', 'origin_snow', 'origin_windgust',
       'origin_cloudcover', 'dest_precip', 'dest_snow', 'dest_windgust',
       'dest_cloudcover', 'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat'],
      dtype='object')

In [29]:
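# Save file (left commented out so the notebook does not write to disk when run;
# uncomment the next line to write df to CSV)
# df.to_csv('data/model_train_final.csv')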