In [2]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [3]:
# Load the flights table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so 'flights.pkl' should
# only ever come from a trusted source.
flights = pd.read_pickle('flights.pkl')

In [5]:
# 'no_name' holds nothing but NaN, so it is discarded up front.
flights = flights.drop(columns='no_name')

In [6]:
# Collapse the five delay-cause columns into a single integer `delay_code`.
#
# np.select returns the value belonging to the FIRST condition that is True,
# so a flight with several recorded causes gets the code of the cause listed
# earliest below. Rows matching no condition at all (no positive cause and
# arr_delay either > 0 or NaN) fall back to np.select's default of 0.
cause_to_code = {
    'carrier_delay': 1,
    'weather_delay': 2,
    'nas_delay': 3,
    'security_delay': 4,
    'late_aircraft_delay': 5,
}

conditions = [flights[cause] > 0 for cause in cause_to_code]
values = list(cause_to_code.values())

# a flight that arrived on time or early is coded 0
conditions.append(flights['arr_delay'] <= 0)
values.append(0)

flights['delay_code'] = np.select(conditions, values)


In [7]:
# Columns that are not carried forward, grouped by the reason for removal.
unused_columns = [
    # individual delay-cause columns (consolidated into delay_code earlier)
    'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
    # gtime columns: only calculated after a flight has already been cancelled
    'total_add_gtime', 'longest_add_gtime',
    # marketing-carrier columns: redundant
    'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num',
    # tail number plus origin/destination ids and city names: redundant info
    'tail_num', 'origin_city_name', 'origin_airport_id', 'dest_airport_id', 'dest_city_name',
    # nearly all NaN
    'first_dep_time',
    # same value in every row
    'dup',
    # flight number
    'op_carrier_fl_num',
]

flights = flights.drop(columns=unused_columns)

In [8]:
# Replace NaN with 0 for cancellation_code, actual_elapsed_time, air_time and
# crs_elapsed_time.
#
# The filled column is assigned back to the frame rather than calling
# `flights[col].fillna(0, inplace=True)`. That form mutates an intermediate
# object obtained by `flights[col]`; pandas 2.x emits a FutureWarning for it
# and, with Copy-on-Write enabled, the parent frame is no longer updated at
# all, leaving the NaN values in place.
zero_fill_columns = ['cancellation_code', 'actual_elapsed_time', 'air_time', 'crs_elapsed_time']

for col in zero_fill_columns:
    flights[col] = flights[col].fillna(0)

In [9]:
# change date to datetime instead of object
flights['fl_date'] = flights['fl_date'].astype('datetime64[ns]')

# Bin the *scheduled* (crs_) departure and arrival times into six groups of
# four hours each, labelled 1-6.
# presumably the crs_* times are hhmm-encoded numbers (0-2400) -- TODO confirm
bins = [0, 400, 800, 1200, 1600, 2000, 2400]
labels = [1,2,3,4,5,6]

# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin is (0, 400] and a scheduled time of exactly 0 would silently
# become NaN. include_lowest=True makes the first bin [0, 400]; every other
# value is binned exactly as before.
flights['crs_arr_binned'] = pd.cut(flights['crs_arr_time'], bins=bins, labels=labels, include_lowest=True)
# the 'crs_drp_binned' name is kept as-is because later cells refer to it
flights['crs_drp_binned'] = pd.cut(flights['crs_dep_time'], bins=bins, labels=labels, include_lowest=True)

# drop the raw scheduled time columns now that they are binned
flights.drop(['crs_dep_time','crs_arr_time'],axis=1, inplace=True)

In [10]:
# Convert the categorical variables to pandas' 'category' dtype.
categorical_columns = ['op_unique_carrier', 'origin', 'dest', 'cancellation_code']

flights = flights.astype({name: 'category' for name in categorical_columns})

In [11]:
# Split the flight date into separate numeric calendar features.
fl_date_parts = flights['fl_date'].dt

flights['year'] = fl_date_parts.year
flights['month'] = fl_date_parts.month
flights['day'] = fl_date_parts.day
# pandas numbers the days of the week Monday=0 ... Sunday=6
flights['day_of_week'] = fl_date_parts.dayofweek

# the original date column is no longer needed once it has been decomposed
flights = flights.drop(columns='fl_date')

In [12]:
# Display the dtype of every remaining column to check the conversions above.
flights.dtypes

op_unique_carrier      category
origin                 category
dest                   category
dep_time                float64
dep_delay               float64
taxi_out                float64
wheels_off              float64
wheels_on               float64
taxi_in                 float64
arr_time                float64
arr_delay               float64
cancelled                 int64
cancellation_code      category
diverted                  int64
crs_elapsed_time        float64
actual_elapsed_time     float64
air_time                float64
flights                   int64
distance                  int64
delay_code                int64
crs_arr_binned         category
crs_drp_binned         category
year                      int64
month                     int64
day                       int64
day_of_week               int64
dtype: object

In [13]:
# Bin distance into haul classes: 1 = short, 2 = medium, 3 = long.
#
# Intended definition:
#   Short Haul  - Distance <800 miles
#   Medium Haul - Distance 800-2200 miles
#   Long Haul   - Distance >2200miles
#
# pd.cut builds right-closed intervals by default, so the bins actually
# applied are (0, 800], (800, 2200] and (2200, 40000].
# NOTE(review): a flight of exactly 800 miles therefore lands in the
# short-haul class rather than medium as the definition above reads --
# confirm which boundary is intended.
dist_bins = [0, 800, 2200, 40000]
dist_labels = [1,2,3]

haul_class = pd.cut(flights['distance'], bins=dist_bins, labels=dist_labels)

# swap the raw distance for its binned version (appended as the last column)
flights = flights.drop(columns='distance').assign(dist_binned=haul_class)

In [40]:
# Bin departure and arrival delays into seven ordered classes (labels 0-6).
#
# The upper thresholds follow these compensation rules:
#   Large airlines must pay:
#   $400 if the passenger arrives three hours or more hours late, but less than six hours;
#   $700 if the passenger arrives six or more hours late, but less than nine hours; and
#   $1,000 if the passenger arrives nine or more hours late.
#
# pd.cut builds right-closed intervals by default, so the bins applied are
# (-3000, 0], (0, 30], (30, 60], (60, 180], (180, 360], (360, 540], (540, 5000].
# Values outside that range, and NaN, come out as NaN.
# presumably the delay columns are in minutes (180/360/540 = 3/6/9 hours) -- TODO confirm
# NOTE(review): the rules above say "three hours OR MORE", but a delay of
# exactly 180 falls in label 3, not 4 (likewise 360 and 540) -- confirm
# whether the boundaries should be left-closed.
delay_bins = [-3000, 0, 30, 60, 180, 360, 540, 5000]
delay_labels = [0,1,2,3,4,5,6]

dep_delay_class = pd.cut(flights['dep_delay'], bins=delay_bins, labels=delay_labels)
arr_delay_class = pd.cut(flights['arr_delay'], bins=delay_bins, labels=delay_labels)

# replace the raw delay columns with their binned versions
flights = (
    flights
    .drop(columns=['dep_delay', 'arr_delay'])
    .assign(dep_delay_bin=dep_delay_class, arr_delay_bin=arr_delay_class)
)

In [41]:
# Make two independent deep copies of the prepared frame: one feeds the delay
# prediction, the other the cancellation prediction.
delay_df = flights.copy(deep=True)
cancel_df = flights.copy(deep=True)

In [42]:
# Trim delay_df down to the columns used for delay prediction.
# NOTE(review): delay_code is kept here, but it was derived from arr_delay and
# the delay-cause columns. If arr_delay_bin is the prediction target, that
# would leak the outcome -- confirm.
delay_unused_columns = [
    # info from once the plane is in the air, not helpful in predicting
    'crs_elapsed_time', 'air_time', 'actual_elapsed_time',
    # cancellation / diversion columns do not belong in the delay frame
    'cancelled', 'diverted', 'cancellation_code',
    # actual departure/arrival times, taxi and wheels on/off info: judged not useful
    'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time',
]

delay_df = delay_df.drop(columns=delay_unused_columns)

In [43]:
# Keep only the rows in which both binned delay columns are present.
delay_df = delay_df.dropna(subset=['dep_delay_bin', 'arr_delay_bin'])


In [44]:
# Sanity check on delay_df after filtering: its shape, the NaN count of each
# column, and the first ten rows.
print('Dataframe Size:',delay_df.shape)
print(delay_df.isna().sum())
delay_df.head(10)

Dataframe Size: (15611152, 14)
op_unique_carrier    0
origin               0
dest                 0
flights              0
delay_code           0
crs_arr_binned       0
crs_drp_binned       0
year                 0
month                0
day                  0
day_of_week          0
dist_binned          0
dep_delay_bin        0
arr_delay_bin        0
dtype: int64


Unnamed: 0,op_unique_carrier,origin,dest,flights,delay_code,crs_arr_binned,crs_drp_binned,year,month,day,day_of_week,dist_binned,dep_delay_bin,arr_delay_bin
0,WN,DEN,PHL,1,0,6,5,2018,9,30,6,2,0,0
1,WN,DEN,PHX,1,0,5,5,2018,9,30,6,1,1,0
2,WN,DEN,PHX,1,0,3,3,2018,9,30,6,1,0,0
3,WN,DEN,PHX,1,0,6,5,2018,9,30,6,1,1,1
4,WN,DEN,PHX,1,0,6,6,2018,9,30,6,1,1,1
5,WN,DEN,PHX,1,0,2,2,2018,9,30,6,1,0,0
6,WN,DEN,PHX,1,0,5,5,2018,9,30,6,1,1,1
7,WN,DEN,PHX,1,0,5,4,2018,9,30,6,1,1,0
8,WN,DEN,PHX,1,0,3,3,2018,9,30,6,1,1,1
9,WN,DEN,PHX,1,0,4,4,2018,9,30,6,1,0,0


In [72]:
# For the cancellation model, discard every column that deals with delays.
# NOTE(review): cancellation_code stays in cancel_df. If `cancelled` is the
# prediction target, a column that is only filled for cancelled flights would
# leak the outcome -- confirm.
delay_related_columns = [
    'dep_time', 'dep_delay_bin',
    'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
    'arr_time', 'arr_delay_bin',
    'delay_code',
]

cancel_df = cancel_df.drop(columns=delay_related_columns)

In [75]:
# Discard the duration columns that describe the flight once it is in the air.
in_air_columns = ['crs_elapsed_time', 'actual_elapsed_time', 'air_time']

cancel_df = cancel_df.drop(columns=in_air_columns)

In [77]:
# Show the final shape of the cancellation frame and preview its first ten rows.
print(cancel_df.shape)
cancel_df.head(10)

(15927485, 14)


Unnamed: 0,op_unique_carrier,origin,dest,cancelled,cancellation_code,diverted,flights,crs_arr_binned,crs_drp_binned,year,month,day,day_of_week,dist_binned
0,WN,DEN,PHL,0,0,0,1,6,5,2018,9,30,6,2
1,WN,DEN,PHX,0,0,0,1,5,5,2018,9,30,6,1
2,WN,DEN,PHX,0,0,0,1,3,3,2018,9,30,6,1
3,WN,DEN,PHX,0,0,0,1,6,5,2018,9,30,6,1
4,WN,DEN,PHX,0,0,0,1,6,6,2018,9,30,6,1
5,WN,DEN,PHX,0,0,0,1,2,2,2018,9,30,6,1
6,WN,DEN,PHX,0,0,0,1,5,5,2018,9,30,6,1
7,WN,DEN,PHX,0,0,0,1,5,4,2018,9,30,6,1
8,WN,DEN,PHX,0,0,0,1,3,3,2018,9,30,6,1
9,WN,DEN,PHX,0,0,0,1,4,4,2018,9,30,6,1


In [78]:
# Show the final shape of the delay frame and preview its first ten rows.
print(delay_df.shape)
delay_df.head(10)

(15611152, 14)


Unnamed: 0,op_unique_carrier,origin,dest,flights,delay_code,crs_arr_binned,crs_drp_binned,year,month,day,day_of_week,dist_binned,dep_delay_bin,arr_delay_bin
0,WN,DEN,PHL,1,0,6,5,2018,9,30,6,2,0,0
1,WN,DEN,PHX,1,0,5,5,2018,9,30,6,1,1,0
2,WN,DEN,PHX,1,0,3,3,2018,9,30,6,1,0,0
3,WN,DEN,PHX,1,0,6,5,2018,9,30,6,1,1,1
4,WN,DEN,PHX,1,0,6,6,2018,9,30,6,1,1,1
5,WN,DEN,PHX,1,0,2,2,2018,9,30,6,1,0,0
6,WN,DEN,PHX,1,0,5,5,2018,9,30,6,1,1,1
7,WN,DEN,PHX,1,0,5,4,2018,9,30,6,1,1,0
8,WN,DEN,PHX,1,0,3,3,2018,9,30,6,1,1,1
9,WN,DEN,PHX,1,0,4,4,2018,9,30,6,1,0,0
