In [21]:
import pandas as pd
import numpy as np
import os
import json

from sklearn import preprocessing
import time
from datetime import datetime, date, time
import pickle

In [2]:
# Read the raw flights CSV; the file's first column is used as the row index.
df = pd.read_csv('data/raw_flights_test.csv', index_col=0)

In [3]:
# add columns for hour of departure and arrival 
def hour(t):
    """Return the hour of day encoded in an ``hhmm`` clock value.

    Parameters
    ----------
    t : int, float or str
        Clock time written as ``hhmm`` without a separator, e.g. ``5``
        (00:05), ``930`` (09:30) or ``1830`` (18:30). Float-typed values
        such as ``1830.0`` are accepted.

    Returns
    -------
    int or float
        The hour part of ``t``, with hour 24 mapped to 0. ``np.nan`` is
        returned when ``t`` is missing or cannot be read as a number.
    """
    try:
        # Go through float() so that '1830.0' / 1830.0 parse like 1830.
        hhmm = int(float(t))
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric text carry no hour.
        return np.nan

    hours = hhmm // 100
    # 2400 denotes midnight, i.e. hour 0.
    return 0 if hours == 24 else hours

# Derive the hour from the hhmm departure / arrival time columns.
df['dep_hour'] = df['crs_dep_time'].apply(hour)
df['arr_hour'] = df['crs_arr_time'].apply(hour)

# Look up the arrival-hour rank table and attach it by `arr_hour`.
hour_ranks = pd.read_csv('data/arr_hour_ranks.csv')
hour_ranks = hour_ranks.rename(columns={'arr_delay': 'arr_hour_rank'})
hour_ranks = hour_ranks.set_index('arr_hour')
df = df.join(hour_ranks, on='arr_hour', how='left')

In [4]:
# add month column
def month(datestring):
    """Return the month number (1-12) of a ``YYYY-MM-DD`` date string."""
    return datetime.strptime(datestring, "%Y-%m-%d").month

# Month number of each flight date.
df['month'] = df['fl_date'].apply(month)

# Look up the month rank table and attach it by `month`.
month_ranks = pd.read_csv('data/month_ranks.csv')
month_ranks = month_ranks.rename(columns={'arr_delay': 'month_rank'})
month_ranks = month_ranks.set_index('month')
df = df.join(month_ranks, on='month', how='left')

In [5]:
# add flight number delay ranks
# Rank table indexed by (operating carrier, flight number).
fl_num_ranks = pd.read_csv(
    'data/fl_num_ranks.csv',
    index_col=['op_unique_carrier', 'op_carrier_fl_num'],
).rename(columns={'arr_delay': 'fl_num_speek_rank'})
df = df.join(fl_num_ranks, how='left', on=['op_unique_carrier', 'op_carrier_fl_num'])

# Flights with no entry in the rank table get rank 2.
# The result is assigned back instead of calling `fillna(..., inplace=True)`
# on `df.fl_num_speek_rank`: that form operates on an intermediate Series,
# which triggers a chained-assignment warning on recent pandas and leaves
# `df` unchanged when Copy-on-Write is enabled.
df['fl_num_speek_rank'] = df['fl_num_speek_rank'].fillna(2)

In [6]:
df.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dup,crs_elapsed_time,flights,distance,dep_hour,arr_hour,arr_hour_rank,month,month_rank,fl_num_speek_rank
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,...,N,95,1,363,18,19,2,1,0,2.0


In [7]:
# add carrier rank
# Look up the marketing-carrier rank table and attach it by `mkt_unique_carrier`.
carr_rank = pd.read_csv('data/carrier_ranks.csv')
carr_rank = carr_rank.rename(columns={'arr_delay': 'carrier_rank'})
carr_rank = carr_rank.set_index('mkt_unique_carrier')
df = df.join(carr_rank, on='mkt_unique_carrier', how='left')

## Weather features

In [8]:
# Load all available weather data into a dataframe
directory = "vc_api/weather-data/"
# Ignore directory entries whose name starts with a dot.
files = [file for file in os.listdir(directory) if not file.startswith(".")]

# Column order of the weather table before indexing. Passing it explicitly
# keeps the frame well-formed (and `set_index` working) when no files exist.
weather_columns = ['origin', 'fl_date', 'precip', 'snow', 'windgust', 'cloudcover']

# One record per file, taken from the first entry of the file's 'days' list.
# Records are collected in a single list rather than one module-level list per
# field, so the notebook no longer rebinds `date` (imported from `datetime`)
# and no unused `snowdepth` / `icon` lists are created.
weather_records = []
for file in files:
    with open(os.path.join(directory, file)) as f:
        day = json.load(f)['days'][0]
    weather_records.append({
        'origin': file[:3],  # airport code = first three characters of the file name
        'fl_date': day['datetime'],
        'precip': day['precip'],
        'snow': day['snow'],
        'windgust': day['windgust'],
        'cloudcover': day['cloudcover'],
    })

# create DataFrame indexed by (date, airport code)
weather_df = pd.DataFrame(weather_records, columns=weather_columns).set_index(['fl_date', 'origin'])

In [9]:
# join weather columns by origin and destination
# Column names for the weather measurements at each end of the flight.
origin_weather_names = {'precip': 'origin_precip',
                        'snow': 'origin_snow',
                        'windgust': 'origin_windgust',
                        'cloudcover': 'origin_cloudcover'}
dest_weather_names = {'precip': 'dest_precip',
                      'snow': 'dest_snow',
                      'windgust': 'dest_windgust',
                      'cloudcover': 'dest_cloudcover'}

# Look the weather up once by (date, origin airport) and once by
# (date, destination airport); the weather table is renamed before each join.
df = df.join(weather_df.rename(columns=origin_weather_names),
             on=['fl_date', 'origin'], how='left')
df = df.join(weather_df.rename(columns=dest_weather_names),
             on=['fl_date', 'dest'], how='left')

In [10]:
# Bin weather into levels
def precip_bins(val):
    """Map a precipitation value to a level 0-3 (NaN stays NaN).

    Levels: 3 if val > 79, 2 if val > 39, 1 if val > 0, otherwise 0.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down.
    for level, lower_bound in ((3, 79), (2, 39), (1, 0)):
        if val > lower_bound:
            return level
    return 0

def snow_bins(val):
    """Map a snow value to a level 0-2 (NaN stays NaN).

    Levels: 2 if val > 1.8, 1 if val > 0, otherwise 0.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down.
    for level, lower_bound in ((2, 1.8), (1, 0)):
        if val > lower_bound:
            return level
    return 0
    
def wind_bins(val):
    """Map a wind-gust value to a level 0-3 (NaN stays NaN).

    Levels: 3 if val > 46, 2 if val > 35, 1 if val > 0, otherwise 0.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down.
    for level, lower_bound in ((3, 46), (2, 35), (1, 0)):
        if val > lower_bound:
            return level
    return 0
    
def cloud_bins(val):
    """Map a cloud-cover value to a level 0-2 (NaN stays NaN).

    Levels: 2 if val > 71, 1 if val > 47, otherwise 0.
    """
    if np.isnan(val):
        return np.nan
    # Thresholds are checked from the highest level down.
    for level, lower_bound in ((2, 71), (1, 47)):
        if val > lower_bound:
            return level
    return 0
    
# (new categorical column, raw weather column, binning function), listed in
# the order the new columns are appended to `df`.
weather_bin_plan = [
    ('origin_precip_cat', 'origin_precip', precip_bins),
    ('origin_snow_cat', 'origin_snow', snow_bins),
    ('origin_windgust_cat', 'origin_windgust', wind_bins),
    ('origin_cloud_cat', 'origin_cloudcover', cloud_bins),
    ('dest_precip_cat', 'dest_precip', precip_bins),
    ('dest_snow_cat', 'dest_snow', snow_bins),
    ('dest_windgust_cat', 'dest_windgust', wind_bins),
    ('dest_cloud_cat', 'dest_cloudcover', cloud_bins),
]
for cat_col, raw_col, bin_func in weather_bin_plan:
    df[cat_col] = df[raw_col].apply(bin_func)

In [11]:
# df.to_csv('data/my_features.csv')

## Robyn's features

In [31]:
# CRS_ELAPSED_TIME --> HAUL_LENGTH
def haul(df, col):
    '''Classify each flight as SHORT, MEDIUM or LONG haul from its elapsed time.

            Input:
            (0) df containing flight information,
            (1) column containing the elapsed flight time in minutes
            Output:   None; a 'haul_length' column is added to df in place with
                      0 = SHORT HAUL  (under 3 hours)
                      1 = MEDIUM HAUL (3 to under 6 hours)
                      2 = LONG HAUL   (6 hours or more)
                      NaN when the elapsed time is missing'''
    def _category(minutes):
        if minutes < (3*60):    # up to 3 hours
            return 0
        if minutes < (6*60):    # 3-6 hours
            return 1
        if minutes >= (6*60):   # 6+ hours
            return 2
        # Every comparison is False for NaN. Emit NaN so the result keeps one
        # entry per row; skipping the row would make the list shorter than df
        # and the column assignment below would raise.
        return np.nan

    # Built locally: no module-level `length` list is created or overwritten.
    df['haul_length'] = [_category(minutes) for minutes in df[col]]
# example of implementation: haul(flight10k, 'crs_elapsed_time')
# example of implementation: haul(flight10k, 'crs_elapsed_time')

# CRS_DEP_TIME (hhmm) --> CRS_DEP_TIME (hh) -- to be used within time_day function
def gethour(df,col):
    '''Replace an hhmm time column with the nearest whole hour.
            Input:
            (0) df containing flight information,
            (1) column containing the hhmm time
            Output:   None; df[col] is overwritten in place with round(hh + mm/60).
                      Values in the last half hour of the day come out as 24.'''
    # divmod(v, 100) splits hhmm into (hh, mm); the minutes are turned into a
    # fraction of an hour before rounding with the built-in round().
    df[col] = [
        round(hh + mm / 60)
        for hh, mm in (divmod(hhmm, 100) for hhmm in df[col])
    ]
# example of implementation: gethour(flight10k, 'crs_dep_time')

# CRS_DEP/ARR_TIME (hhmm) --> hot encoded categorical time of day 'morning, aft...' 
def time_day(df, col):
    ''' Input:
            (0) df containing flight information
            (1) corresponding column of time of flight (i.e. departure or arrival) (format hhmm)
        Output:   list with one time-of-day code per value of the column:
                  0 = OVERNIGHT (hour >= 23 or < 5), 1 = MORNING (5-11),
                  2 = AFTERNOON (12-17), 3 = EVENING (18-22)
        Side effect: df[col] is first overwritten with rounded hours by gethour.'''
    gethour(df, col)

    def _bucket(hr):
        if hr >= 23 or hr < 5:
            return 0 # 0 = OVERNIGHT
        if hr < 12:
            return 1 # 1 = MORNING
        if hr < 18:
            return 2 # 2 = AFTERNOON
        if hr < 23:
            return 3 # 3 = EVENING
        return None  # value matched no range: contributes no entry

    return [code for code in map(_bucket, df[col]) if code is not None]
# example of implementation: time_day(flight10k, 'crs_dep_time')

In [13]:
# Rating lookup tables, read in the order listed; the first column of each CSV
# is used as its index.
(airline_rating,
 origin_traffic,
 origin_delay,
 dest_traffic,
 delay_dep_h,
 delay_arr_h) = (
    pd.read_csv(rating_path, index_col=0)
    for rating_path in (
        'data/airline_delay_rating.csv',
        'data/origin_traffic_rating.csv',
        'data/origin_delay_rating.csv',
        'data/dest_traffic_rating.csv',
        'data/crs_dep_time_delay_rating.csv',
        'data/crs_arr_time_delay_rating.csv',
    )
)

In [14]:
# mask = df[df.crs_elapsed_time.isnull()].index
# df.drop(mask.values, inplace=True)

In [15]:
df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'dep_hour', 'arr_hour', 'arr_hour_rank', 'month', 'month_rank',
       'fl_num_speek_rank', 'carrier_rank', 'origin_precip', 'origin_snow',
       'origin_windgust', 'origin_cloudcover', 'dest_precip', 'dest_snow',
       'dest_windgust', 'dest_cloudcover', 'origin_precip_cat',
       'origin_snow_cat', 'origin_windgust_cat', 'origin_cloud_cat',
       'dest_precip_cat', 'dest_snow_cat', 'dest_windgust_cat',
       'dest_cloud_cat'],
      dtype='object')

In [33]:
# A - if this is a training dataset, we need arr_delay as our target variable so use this first block of code
# Work on a copy so the feature frame `df` built above is left untouched;
# this also makes the cell safe to re-run, since time_day() overwrites columns.
model_df = df.copy()
# B - if this is a testing dataset, we will not have arr_delay and cannot include it
#model_df = flights[['tail_num','op_carrier_fl_num','fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]

# first regression will be simple-- is the flight going to be delayed or not?
# (only runs when the target column is present; delay_flag is 1 for arr_delay > 0)
if 'arr_delay' in model_df:
    model_df['delay_flag'] = model_df['arr_delay'].map(lambda x: 0 if x <= 0 else 1)
    arr_delay = model_df['arr_delay']
#     model_df.drop(columns='arr_delay', inplace=True)

# label encode tail_num for identification of the flight later
# NOTE(review): the encoder is fitted on this frame only, so the integer codes
# are specific to this dataset — confirm nothing downstream expects codes from
# another fit.
le = preprocessing.LabelEncoder()
tail_num = model_df['tail_num'].values
model_df['tail_num'] = le.fit_transform(tail_num)

# convert date to datetime in order to grab the month
# ('month' already exists from an earlier cell and is overwritten here)
model_df['fl_date'] = pd.to_datetime(model_df['fl_date'])
#model_df['year'] = model_df['fl_date'].dt.year
model_df['month'] = model_df['fl_date'].dt.month
model_df['day'] = model_df['fl_date'].dt.day
model_df['weekday'] = model_df['fl_date'].dt.dayofweek
# model_df.drop(columns='fl_date', inplace=True) # this won't be needed after we got month

# set delay rating based on expected performance of the airline
# (left merge on the operating carrier; both key columns are dropped afterwards)
model_df = model_df.merge(airline_rating, left_on='op_unique_carrier', right_on='airline', how='left')
model_df.drop(columns=['op_unique_carrier','airline'],inplace=True) 

# obtain haul length of the flight using haul function defined above
# (adds the 'haul_length' column to model_df in place)
haul(model_df, 'crs_elapsed_time')
#model_df.drop(columns=['crs_elapsed_time'],inplace=True)

# new column of categorical time of day information using time_day function defined above as well as expected delays relating to the time of day departure
# time_day() overwrites crs_dep_time / crs_arr_time with rounded hours (via
# gethour), so the two merges below join on the hour, not on the hhmm value.
model_df['dep_timeday'] = time_day(model_df, 'crs_dep_time')
model_df['arr_timeday'] = time_day(model_df, 'crs_arr_time')
model_df = model_df.merge(delay_dep_h, left_on='crs_dep_time', right_on='crs_dep_time', how='left')
model_df = model_df.merge(delay_arr_h, left_on='crs_arr_time', right_on='crs_arr_time', how='left')
model_df.drop(columns=['crs_dep_time','crs_arr_time'],inplace=True)

# classify the expected traffic of the origin and destination airports
model_df = model_df.merge(origin_traffic, left_on='origin', right_on='origin', how='left')
model_df = model_df.merge(dest_traffic, left_on='dest', right_on='dest', how='left')
# NOTE(review): fillna with a scalar replaces every NaN in every column
# (weather, rank and rating columns included) with the mean of busy_origin,
# not only the traffic columns. Confirm this is intended and matches how the
# training data was prepared before narrowing it.
model_df = model_df.fillna(model_df['busy_origin'].mean())
# merged after the fillna above, so missing origin_delay values stay NaN
model_df = model_df.merge(origin_delay, left_on='origin', right_on='origin', how='left')
# model_df.drop(columns=['origin','dest'],inplace=True)

#if 'arr_delay' in model_df:
#    training_full = model_df.copy(deep=True)
#    model_df.drop(columns='arr_delay', inplace=True)

# have a look at the dataset
# (only the last expression of a cell is displayed, so head() shows nothing here)
model_df.head()
model_df.shape

(150623, 51)

In [17]:
# model_df.to_csv('data/model_df_full.csv')

In [20]:
# # save progress
# model_df.to_csv('data/flights_test_processed.csv')

# Predict

In [45]:
# load best model
# NOTE: unpickling can execute arbitrary code — only load model files that come
# from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open('models/model_04.sav', 'rb') as model_file:
    model = pickle.load(model_file)
model

LinearRegression()

In [42]:
# Feature columns, in the order they are laid out in the design matrix.
cols = [
    'arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank',
    'carrier_rank', 'weekday',
    'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay',
    'origin_precip_cat', 'origin_snow_cat', 'origin_windgust_cat',
    'origin_cloud_cat',
    'dest_precip_cat', 'dest_snow_cat', 'dest_windgust_cat',
    'dest_cloud_cat',
]

X = model_df.loc[:, cols].to_numpy()

# Expand the features with all polynomial terms up to degree 3.
poly = preprocessing.PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)

In [43]:
# Predict from the degree-3 polynomial feature matrix.
# NOTE(review): the predictions displayed for this run include magnitudes
# around 1e7 — worth checking the feature matrix and the fitted model.
y_pred = model.predict(X_poly)

In [44]:
y_pred

array([-2.11923725e+07, -2.11923883e+07, -2.11923625e+07, ...,
        2.41693014e+00,  4.96139689e+07,  1.08851661e+01])

In [48]:
# Sanity check: the number of predictions should equal the number of rows in df.
print(len(df))
print(len(y_pred))

150623
150623


In [51]:
# concat(axis=1) aligns rows on index labels. `df` keeps the index read from the
# CSV, whereas a bare pd.Series(y_pred) would be labelled 0..n-1; giving the
# predictions df's own index pairs them with the rows positionally.
# Assumes y_pred is in df's row order (the lengths are compared in the cell
# above) — TODO confirm the merges that built model_df did not reorder rows.
# The Series is left unnamed so its column label in df_sub stays 0.
pred = pd.Series(y_pred, index=df.index)
df_sub = pd.concat([df, pred], axis=1)

In [55]:
df_sub.columns

Index([            'fl_date',  'mkt_unique_carrier',  'branded_code_share',
               'mkt_carrier',  'mkt_carrier_fl_num',   'op_unique_carrier',
                  'tail_num',   'op_carrier_fl_num',   'origin_airport_id',
                    'origin',    'origin_city_name',     'dest_airport_id',
                      'dest',      'dest_city_name',        'crs_dep_time',
              'crs_arr_time',                 'dup',    'crs_elapsed_time',
                   'flights',            'distance',            'dep_hour',
                  'arr_hour',       'arr_hour_rank',               'month',
                'month_rank',   'fl_num_speek_rank',        'carrier_rank',
             'origin_precip',         'origin_snow',     'origin_windgust',
         'origin_cloudcover',         'dest_precip',           'dest_snow',
             'dest_windgust',     'dest_cloudcover',   'origin_precip_cat',
           'origin_snow_cat', 'origin_windgust_cat',    'origin_cloud_cat',
           '

In [56]:
# Keep only the flight identifiers and the prediction column; the prediction
# Series was unnamed, so its column label is the integer 0.
df_sub = df_sub[['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num','origin','dest',0]]

In [60]:
# Give the prediction column (label 0) a descriptive name.
df_sub = df_sub.rename(columns={0: 'predicted_delay'})

In [62]:
# df_sub.to_csv('submission.csv')