In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import json

In [11]:
# Read the model test set; the first CSV column becomes the row index.
MODEL_TEST_CSV = 'data/model_test.csv'
df = pd.read_csv(MODEL_TEST_CSV, index_col=0)

In [12]:
# df.drop(columns=['precip', 'snow', 'windgust', 'cloudcover'], inplace=True)

In [13]:
# add columns for hour of departure and arrival 
def hour(t):
    """Return the hour component of a time encoded as an HHMM number.

    The value is parsed numerically, so ints (530), integral floats (1530.0,
    which pandas produces when the column contains missing values) and numeric
    strings ("0530") are all accepted. Values below 100 map to hour 0, and an
    hour of 24 is folded to 0.

    Parameters
    ----------
    t : int, float or str
        Time in HHMM form, e.g. 5, 530, 1530, 2400.

    Returns
    -------
    int or float
        The hour, or ``np.nan`` when ``t`` is missing (None / NaN).

    Raises
    ------
    ValueError
        If ``t`` is a non-numeric string.
    """
    # Missing times propagate as NaN instead of crashing the whole .apply().
    if pd.isna(t):
        return np.nan

    # Go through float first so "1530.0" / 1530.0 parse; the previous
    # string-length logic silently returned None for such values.
    hhmm = int(float(t))
    hh = hhmm // 100

    # 2400 denotes midnight; report it as hour 0.
    return 0 if hh == 24 else hh

# Derive the hour columns from the crs_*_time columns.
for hour_col, time_col in (('dep_hour', 'crs_dep_time'), ('arr_hour', 'crs_arr_time')):
    df[hour_col] = df[time_col].apply(hour)

# Load the per-arrival-hour table, expose its arr_delay column as
# arr_hour_rank, and left-join it onto the flights by arr_hour.
hour_ranks = pd.read_csv('data/arr_hour_ranks.csv')
hour_ranks = hour_ranks.set_index('arr_hour')
hour_ranks = hour_ranks.rename(columns={'arr_delay': 'arr_hour_rank'})
df = df.join(hour_ranks, on='arr_hour', how='left')

In [14]:
# add month column
def month(datestring):
    """Return the month number of a date given as a ``YYYY-MM-DD`` string.

    ``datetime.strptime`` raises ValueError when the string does not match
    that format.
    """
    return datetime.strptime(datestring, "%Y-%m-%d").month

df['month'] = df['fl_date'].apply(month)

# Load the per-month table, expose its arr_delay column as month_rank,
# and left-join it onto the flights by month.
month_ranks = pd.read_csv('data/month_ranks.csv')
month_ranks = month_ranks.rename(columns={'arr_delay': 'month_rank'})
month_ranks = month_ranks.set_index('month')
df = df.join(month_ranks, on='month', how='left')

In [15]:
# Add flight-number delay ranks, keyed by (operating carrier, flight number).
# The 'fl_num_speek_rank' spelling is kept as-is because it is the column name
# the rest of the notebook sees.
fl_num_ranks = pd.read_csv(
    'data/fl_num_ranks.csv',
    index_col=['op_unique_carrier', 'op_carrier_fl_num'],
).rename(columns={'arr_delay': 'fl_num_speek_rank'})
df = df.join(fl_num_ranks, how='left', on=['op_unique_carrier', 'op_carrier_fl_num'])

# Flights with no entry in the ranks file get rank 2.
# Assign the filled column back rather than calling fillna(inplace=True) on
# df.fl_num_speek_rank: that form operates on a Series pulled out of the frame,
# which warns on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['fl_num_speek_rank'] = df['fl_num_speek_rank'].fillna(2)

In [16]:
# Preview the first row to check the newly added feature columns.
df.head(n=1)

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,dest_airport_id,dest,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,dep_hour,arr_hour,arr_hour_rank,month,month_rank,fl_num_speek_rank
4,2018-01-01,B6,3,B6,N965JB,3,12478,JFK,14843,SJU,...,0.0,4.0,0.0,6.0,11,15,1,1,0,1.0


## Weather features

In [17]:
# Load all available weather data into a DataFrame indexed by (fl_date, origin).
directory = "vc_api/weather-data/"

# Skip hidden files. os.listdir returns entries in arbitrary order, so sort
# them to make the row order of weather_df reproducible between runs.
files = sorted(file for file in os.listdir(directory) if not file.startswith("."))

# Daily fields copied from each file into weather_df.
weather_fields = ['precip', 'snow', 'windgust', 'cloudcover']

# Build one record per file and construct the frame once, instead of keeping
# several parallel lists in sync by hand.
weather_records = []
for file in files:
    with open(os.path.join(directory, file)) as f:
        # Only the first entry of the file's 'days' list is read.
        day = json.load(f)['days'][0]
    record = {
        'origin': file[:3],  # the first three characters of the filename are used as the airport code
        'fl_date': day['datetime'],
    }
    for field in weather_fields:
        record[field] = day[field]
    weather_records.append(record)

# Passing columns= keeps the expected schema even when no files are found,
# so set_index does not fail on an empty directory.
weather_df = pd.DataFrame(
    weather_records,
    columns=['origin', 'fl_date'] + weather_fields,
).set_index(['fl_date', 'origin'])

In [18]:
# Join the weather columns twice: once keyed on the origin airport and once on
# the destination airport. weather_df is indexed by (fl_date, origin); the join
# matches the listed columns against those index levels by position, so the
# same frame serves the 'dest' lookup too. Each pass renames the freshly
# joined columns before the next pass adds them again.
origin_weather_names = {
    'precip': 'origin_precip',
    'snow': 'origin_snow',
    'windgust': 'origin_windgust',
    'cloudcover': 'origin_cloudcover',
}
dest_weather_names = {
    'precip': 'dest_precip',
    'snow': 'dest_snow',
    'windgust': 'dest_windgust',
    'cloudcover': 'dest_cloudcover',
}
for airport_col, weather_names in (('origin', origin_weather_names), ('dest', dest_weather_names)):
    joined = df.join(weather_df, on=['fl_date', airport_col], how='left')
    df = joined.rename(columns=weather_names)

In [19]:
# Bin weather into levels
def precip_bins(val):
    """Bin a precipitation value into a level from 0 to 3.

    Returns NaN for NaN input; otherwise 3 if val > 79, 2 if val > 39,
    1 if val > 0, and 0 for anything else.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down; first match wins.
    for level, lower_bound in ((3, 79), (2, 39), (1, 0)):
        if val > lower_bound:
            return level
    return 0

def snow_bins(val):
    """Bin a snow value into a level from 0 to 2.

    Returns NaN for NaN input; otherwise 2 if val > 1.8, 1 if val > 0,
    and 0 for anything else.
    """
    if np.isnan(val):
        return np.nan
    if val > 1.8:
        return 2
    return 1 if val > 0 else 0
    
def wind_bins(val):
    """Bin a wind gust value into a level from 0 to 3.

    Returns NaN for NaN input; otherwise 3 if val > 46, 2 if val > 35,
    1 if val > 0, and 0 for anything else.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down; first match wins.
    for level, lower_bound in ((3, 46), (2, 35), (1, 0)):
        if val > lower_bound:
            return level
    return 0
    
def cloud_bins(val):
    """Bin a cloud cover value into a level from 0 to 2.

    Returns NaN for NaN input; otherwise 2 if val > 71, 1 if val > 47,
    and 0 for anything else.
    """
    if np.isnan(val):
        return np.nan
    if val > 71:
        return 2
    return 1 if val > 47 else 0
    
# (new category column, raw weather column, binning function), listed in the
# order the category columns are appended to df: origin first, then dest.
weather_bin_specs = (
    ('origin_precip_cat', 'origin_precip', precip_bins),
    ('origin_snow_cat', 'origin_snow', snow_bins),
    ('origin_windgust_cat', 'origin_windgust', wind_bins),
    ('origin_cloud_cat', 'origin_cloudcover', cloud_bins),
    ('dest_precip_cat', 'dest_precip', precip_bins),
    ('dest_snow_cat', 'dest_snow', snow_bins),
    ('dest_windgust_cat', 'dest_windgust', wind_bins),
    ('dest_cloud_cat', 'dest_cloudcover', cloud_bins),
)
for category_col, raw_col, bin_func in weather_bin_specs:
    df[category_col] = df[raw_col].apply(bin_func)