Import Packages

In [11]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import time
from datetime import datetime, date, time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

Completed Functions

In [12]:
# CRS_ELAPSED_TIME --> HAUL_LENGTH
def haul(df, col, medium_from=3 * 60, long_from=6 * 60):
    '''Classify each flight as SHORT, MEDIUM or LONG haul from its expected elapsed flight time.
            Input:
            (0) df containing flight information (modified in place),
            (1) column containing the elapsed flight time in minutes,
            (2) medium_from: minutes at which MEDIUM haul starts (default 180 = 3 hours),
            (3) long_from: minutes at which LONG haul starts (default 360 = 6 hours)
            Output:   None. Adds (or overwrites) the 'haul_length' column in df:
                      0 = SHORT HAUL, 1 = MEDIUM HAUL, 2 = LONG HAUL.
                      Rows with a missing elapsed time get NaN, so the new column
                      always has one value per row.'''
    elapsed = df[col]
    # Conditions are evaluated in order; a NaN elapsed time fails all three
    # comparisons and therefore falls through to the NaN default.
    codes = np.select(
        [elapsed < medium_from, elapsed < long_from, elapsed >= long_from],
        [0.0, 1.0, 2.0],
        default=np.nan,
    )
    haul_codes = pd.Series(codes, index=df.index)
    # Keep plain integer codes whenever nothing is missing.
    if not haul_codes.isna().any():
        haul_codes = haul_codes.astype('int64')
    df['haul_length'] = haul_codes
# example of implementation: haul(flight10k, 'crs_elapsed_time')

# CRS_DEP_TIME (hhmm) --> CRS_DEP_TIME (hh) -- to be used within time_day function
def gethour(df,col):
    '''Convert hhmm to hh (24-hr) hour-only output, rounded to the nearest hour
            Input: 
            (0) df containing flight information (modified in place), 
            (1) column containing the hhmm time                  
            Output:   None. Rewrites the input column in rounded hh format (0-23).
                      Half hours always round up (0430 -> 5, 2230 -> 23) and a time
                      that rounds up to 24 wraps to 0 (2345 -> 0). Missing times
                      stay NaN.
            Note:     the column is overwritten, so calling this twice on the same
                      column would treat the hours as hhmm values.'''
    hhmm = df[col]
    fractional_hours = hhmm // 100 + (hhmm % 100) / 60
    # floor(x + 0.5) rounds halves up consistently; the builtin round() rounds
    # halves to the nearest even hour (4.5 -> 4 but 11.5 -> 12).
    rounded = np.floor(fractional_hours + 0.5) % 24
    # Keep integer hours whenever nothing is missing.
    if not rounded.isna().any():
        rounded = rounded.astype('int64')
    df[col] = rounded
# example of implementation: gethour(flight10k, 'crs_dep_time')

# CRS_DEP/ARR_TIME (hhmm) --> integer-coded time-of-day category (0 = overnight, 1 = morning, 2 = afternoon, 3 = evening)
def time_day(df, col):
    ''' Input:
            (0) df containing flight information
            (1) corresponding column of time of flight (i.e. departure or arrival) (format hhmm)
        Output:   list with one time-of-day code per row of df, in row order, based on the
                  rounded hour: 0 = OVERNIGHT (hour >= 23 or < 5), 1 = MORNING (5-11),
                  2 = AFTERNOON (12-17), 3 = EVENING (18-22). A missing hour yields NaN so
                  the list always has one entry per row.
        Side effect: gethour(df, col) is applied first, so df[col] is overwritten with the
                  rounded hour. Do not call this twice on the same column.'''
    gethour(df, col)
    hours = df[col]
    # Conditions are checked in order, so each later one only sees hours not yet
    # matched; a NaN hour fails every comparison and gets the NaN default.
    codes = np.select(
        [(hours >= 23) | (hours < 5), hours < 12, hours < 18, hours < 23],
        [0.0, 1.0, 2.0, 3.0],
        default=np.nan,
    )
    if np.isnan(codes).any():
        return codes.tolist()
    # Nothing missing: return plain integer codes.
    return codes.astype('int64').tolist()
# example of implementation: time_day(flight10k, 'crs_dep_time')

Open CSVs of Pre-Evaluated Features

In [13]:
# Load the three pre-evaluated rating tables; the first CSV column becomes the index.
airline_rating, origin_traffic, dest_traffic = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/airline_delay_rating.csv',
        'data/origin_traffic_rating.csv',
        'data/dest_traffic_rating.csv',
    )
)

Open CSV of Flight Information to Model

In [14]:
# Flight records to investigate; the first CSV column is used as the index.
flights = pd.read_csv(
    filepath_or_buffer='data/weather_feature_transfer.csv',
    index_col=0,
)
# Preview a single row to check the load.
flights.head(n=1)

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,dest_airport_id,dest,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,diverted,crs_elapsed_time,actual_elapsed_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,precip,snow,windgust,cloudcover,carrier_speed_rank,flight_num_speed_rank,month,month_rank,dep_hour,arr_hour,hour_rank,dep_hour_rank,arr_hour_rank,precip_cat,snow_cat,windgust_cat,cloud_cat
0,2018-01-01,DL,3494,9E,N302PQ,3494,10397,ATL,15412,TYS,1910,1912.0,2.0,23.0,1935.0,2004.0,4.0,2010,2008.0,-2.0,0.0,60.0,56.0,152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.7,74.3,0,1,1,0,19,20,2,2,2,,,moderate,2


In [15]:
# Show the column labels of the loaded flights frame.
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'mkt_carrier_fl_num',
       'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
       'origin_airport_id', 'origin', 'dest_airport_id', 'dest',
       'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'precip', 'snow', 'windgust', 'cloudcover',
       'carrier_speed_rank', 'flight_num_speed_rank', 'month', 'month_rank',
       'dep_hour', 'arr_hour', 'hour_rank', 'dep_hour_rank', 'arr_hour_rank',
       'precip_cat', 'snow_cat', 'windgust_cat', 'cloud_cat'],
      dtype='object')

Build df based on columns we will use in transformation - Data Cleaning and Feature Implementation

**See option A or B at the top of the next cell to build the df from either a training or a test dataset.**

In [16]:
# A - if this is a training dataset, we need arr_delay as our target variable so use this first block of code
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `flights`.
model_df = flights[['arr_delay','fl_date','op_unique_carrier','origin',
                    'dest','crs_dep_time','crs_arr_time','crs_elapsed_time',
                   'precip', 'snow', 'windgust', 'cloudcover',
                   'carrier_speed_rank', 'flight_num_speed_rank', 'month', 'month_rank',
                   'dep_hour', 'arr_hour', 'hour_rank', 'dep_hour_rank', 'arr_hour_rank',
                   'precip_cat', 'snow_cat', 'windgust_cat', 'cloud_cat']].copy()

# B - if this is a testing dataset, we will not have arr_delay and cannot include it
#model_df = flights[['fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time']].copy()

# grab the month from the flight date; the date itself is not needed afterwards
model_df['fl_month'] = pd.to_datetime(model_df['fl_date']).dt.month
model_df = model_df.drop(columns='fl_date')

# set delay rating based on expected performance of the airline
model_df = model_df.merge(airline_rating, left_on='op_unique_carrier', right_on='airline', how='left')
model_df = model_df.drop(columns=['op_unique_carrier', 'airline'])

# obtain haul length of the flight using haul function defined above (adds 'haul_length' in place)
haul(model_df, 'crs_elapsed_time')
model_df = model_df.drop(columns=['crs_elapsed_time'])

# new column of categorical time of day information using time_day function defined above
# NOTE: time_day overwrites the hhmm column with rounded hours, so each column is
# passed exactly once and dropped straight afterwards.
model_df['dep_timeday'] = time_day(model_df, 'crs_dep_time')
model_df['arr_timeday'] = time_day(model_df, 'crs_arr_time')
model_df = model_df.drop(columns=['crs_dep_time', 'crs_arr_time'])

# classify the expected traffic of the origin and destination airports
model_df = model_df.merge(origin_traffic, on='origin', how='left')
model_df = model_df.merge(dest_traffic, on='dest', how='left')
# Airports without a traffic rating get the mean rating of their own column.
# Only the two traffic columns are filled: a frame-wide fillna would also write
# the busy_origin mean into unrelated columns such as arr_delay or the weather categories.
for traffic_col in ('busy_origin', 'busy_dest'):
    model_df[traffic_col] = model_df[traffic_col].fillna(model_df[traffic_col].mean())
model_df = model_df.drop(columns=['origin', 'dest'])

# have a look at the dataset
model_df.head()
model_df.shape

Unnamed: 0,arr_delay,precip,snow,windgust,cloudcover,carrier_speed_rank,flight_num_speed_rank,month,month_rank,dep_hour,arr_hour,hour_rank,dep_hour_rank,arr_hour_rank,precip_cat,snow_cat,windgust_cat,cloud_cat,fl_month,airline_delay,haul_length,dep_timeday,arr_timeday,busy_origin,busy_dest
0,-2.0,0.0,0.0,40.7,74.3,0,1,1,0,19,20,2,2,2,,,moderate,2,1,1,0,3,3,4,3.0
1,-6.0,0.0,0.0,40.7,74.3,0,0,1,0,22,22,2,2,2,,,moderate,2,1,1,0,3,0,4,2.0
2,-12.0,0.0,0.0,40.7,74.3,0,3,1,0,15,16,1,1,1,,,moderate,2,1,1,0,2,2,4,3.0
3,74.0,0.0,0.0,55.4,24.9,0,2,1,0,10,14,1,1,1,,,strong,0,1,3,1,1,2,3,3.0
4,32.0,0.0,0.0,55.4,24.9,0,1,1,0,11,15,1,1,1,,,strong,0,1,3,1,1,2,3,3.0


(3089145, 25)

In [18]:
# # save
# model_df.to_csv('data/model_df.csv')