### Feature Engineering

Feature engineering will play a crucial role in this problem. We have very few attributes, so we need to create features that have some predictive power.

- weather: we can use a weather API to look up the weather at the time of the scheduled departure and scheduled arrival.
- statistics (mean, median, std, min, max...): we can look at previous delays and compute descriptive statistics.
- airports encoding: we need to think about what to do with the airports and other categorical variables
- time of the day: the delay probably depends on the airport traffic which varies during the day.
- airport traffic
- unsupervised learning as feature engineering?
- **what are the additional options?**: Think about what we could do more to improve the model.

In [1]:
# importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three auxiliary lookup tables that feed the engineered features.
# Departure/arrival delays together with their difference.
dep_delays = pd.read_csv(filepath_or_buffer='../DB/dep_delays.csv')
# Taxi-out / taxi-in durations keyed by scheduled departure / arrival time.
taxi_times = pd.read_csv(filepath_or_buffer='../DB/taxi_times.csv')
# Weather delay and a weather label per flight date and origin airport.
weather_delays = pd.read_csv(filepath_or_buffer='../DB/weather_delays.csv')

In [3]:
# Preview the first rows of the delay table to check its columns.
dep_delays.head()

Unnamed: 0.1,Unnamed: 0,dep_delay,arr_delay,difference
0,1,3.0,1.0,2.0
1,2,28.0,18.0,10.0
2,3,12.0,32.0,-20.0
3,8,21.0,3.0,18.0
4,12,51.0,28.0,23.0


In [4]:
# Preview the first rows of the taxi-time table to check its columns.
taxi_times.head()

Unnamed: 0.1,Unnamed: 0,crs_dep_time,taxi_out,crs_arr_time,taxi_in
0,0,1300,15.0,1444,13.0
1,1,630,18.0,854,8.0
2,2,1500,28.0,1709,8.0
3,3,2041,50.0,2159,6.0
4,4,2140,15.0,2257,17.0


In [5]:
# Preview the first rows of the weather table to check its columns.
weather_delays.head()

Unnamed: 0.1,Unnamed: 0,fl_date,origin,weather_delay,weather
0,17,2018-01-01,ORD,87.0,Clear
1,118,2018-01-01,ORD,1.0,Clear
2,121,2018-01-01,XNA,9.0,Clear
3,122,2018-01-01,LEX,8.0,Overcast
4,134,2018-01-01,ORD,26.0,Clear


In [6]:
# Give the header-less 'Unnamed: 0' column a usable name in every accessory
# table; 'order' is the key these tables are later joined on.
weather_delays = weather_delays.rename({'Unnamed: 0': 'order'}, axis='columns')
taxi_times = taxi_times.rename({'Unnamed: 0': 'order'}, axis='columns')
dep_delays = dep_delays.rename({'Unnamed: 0': 'order'}, axis='columns')

In [7]:
# loading main table
# One row per flight; every engineered feature below is merged onto this frame.
flights_df = pd.read_csv('../DB/flights_data.csv')
# viewing
flights_df.head()

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,733.0,,,,,,,,,
1,1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,...,1075.0,,,,,,,,,
2,2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,...,488.0,0.0,0.0,0.0,0.0,18.0,,,,
3,3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,...,199.0,12.0,0.0,20.0,0.0,0.0,,,,
4,4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,...,224.0,,,,,,,,,


In [8]:
# Name the header-less 'Unnamed: 0' column 'order' so the main table shares
# its join key with the accessory tables.
flights_df = flights_df.rename({'Unnamed: 0': 'order'}, axis='columns')
# Quick look to confirm the new column name.
flights_df.head()

Unnamed: 0,order,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,733.0,,,,,,,,,
1,1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,...,1075.0,,,,,,,,,
2,2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,...,488.0,0.0,0.0,0.0,0.0,18.0,,,,
3,3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,...,199.0,12.0,0.0,20.0,0.0,0.0,,,,
4,4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,...,224.0,,,,,,,,,


In [9]:
# Attach the weather label and the delay difference to each flight, matching
# rows on 'order'. Left joins keep every flight; flights with no match in a
# lookup table get NaN in the new column.
# NOTE(review): assumes 'order' is unique within each lookup table, otherwise
# flights would be duplicated by the join - confirm.
flights_df = (
    flights_df
    .merge(weather_delays[['order', 'weather']], how='left', on='order')
    .merge(dep_delays[['order', 'difference']], how='left', on='order')
)
flights_df.head()

Unnamed: 0,order,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,weather,difference
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,,,,,,,,,,
1,1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,...,,,,,,,,,,2.0
2,2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,...,0.0,0.0,0.0,18.0,,,,,,10.0
3,3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,...,0.0,20.0,0.0,0.0,,,,,,-20.0
4,4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,...,,,,,,,,,,


In [10]:
# Mean taxi-out time per scheduled departure time and mean taxi-in time per
# scheduled arrival time, looked up for every flight.
# Only the column being averaged is selected before aggregating, so 'order'
# and the other time column are not averaged along the way.
avg_out = taxi_times.groupby('crs_dep_time')['taxi_out'].mean().reset_index()
avg_in = taxi_times.groupby('crs_arr_time')['taxi_in'].mean().reset_index()

# flights_df already carries its own taxi_out / taxi_in columns, so pandas
# suffixes the originals with '_x' and the merged means with '_y'. The '_y'
# columns are renamed to their final feature names in a later cell.
flights_df = pd.merge(flights_df, avg_out, how='left', on='crs_dep_time')
flights_df = pd.merge(flights_df, avg_in, how='left', on='crs_arr_time')
flights_df.head()

Unnamed: 0,order,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,weather,difference,taxi_out_y,taxi_in_y
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,,,,,,,,,17.372642,8.39823
1,1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,...,,,,,,,,2.0,16.813246,9.232143
2,2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,...,0.0,18.0,,,,,,10.0,18.789406,8.639053
3,3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,...,0.0,0.0,,,,,,-20.0,21.924528,7.333333
4,4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,...,,,,,,,,,17.312977,8.170455


In [29]:
# Summarise column names, non-null counts and dtypes of the merged frame.
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165072 entries, 0 to 165071
Data columns (total 47 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order                165072 non-null  int64  
 1   fl_date              165072 non-null  object 
 2   mkt_unique_carrier   165072 non-null  object 
 3   branded_code_share   165072 non-null  object 
 4   mkt_carrier          165072 non-null  object 
 5   mkt_carrier_fl_num   165072 non-null  int64  
 6   op_unique_carrier    165072 non-null  object 
 7   tail_num             164593 non-null  object 
 8   op_carrier_fl_num    165072 non-null  int64  
 9   origin_airport_id    165072 non-null  int64  
 10  origin               165072 non-null  object 
 11  origin_city_name     165072 non-null  object 
 12  dest_airport_id      165072 non-null  int64  
 13  dest                 165072 non-null  object 
 14  dest_city_name       165072 non-null  object 
 15  crs_dep_time     

In [11]:
def _mean_delay_by(frame, delay_col, key_col, feature_name):
    """Average ``delay_col`` within each distinct value of ``key_col``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table containing both ``delay_col`` and ``key_col``.
    delay_col : str
        Column to average.
    key_col : str
        Column to group by.
    feature_name : str
        Name given to the averaged column in the result.

    Returns
    -------
    pd.DataFrame
        One row per ``key_col`` value, with columns ``[key_col, feature_name]``.
    """
    return (
        frame[[delay_col, key_col]]
        .groupby(key_col)
        .mean()
        .reset_index()
        .rename(columns={delay_col: feature_name})
    )


def _attach_feature(frame, feature_table, key_col):
    """Left-join ``feature_table`` onto ``frame`` using ``key_col``.

    Any feature column already present in ``frame`` is dropped first, so
    re-running the cell replaces the feature instead of creating
    ``_x`` / ``_y`` duplicates.
    """
    feature_cols = [col for col in feature_table.columns if col != key_col]
    return (
        frame
        .drop(columns=feature_cols, errors='ignore')
        .merge(feature_table, how='left', on=key_col)
    )


# All aggregates are computed from the same flights_df before any of them is
# merged back.
# NOTE(review): the means use every row of flights_df, including the rows they
# are attached to. If dep_delay / arr_delay are the modelling targets, confirm
# these encodings are fitted on the training split only.

# average delay for each departure time
avg_dep_delay = _mean_delay_by(flights_df, 'dep_delay', 'crs_dep_time', 'mean_dep_delay/time')

# average delay for each arrival time
avg_arr_delay = _mean_delay_by(flights_df, 'arr_delay', 'crs_arr_time', 'mean_arr_delay/time')

# average dep. delay for flight distance
avg_dist_dep = _mean_delay_by(flights_df, 'dep_delay', 'distance', 'mean_dep_delay/distance')

# avg arr. delay for distance
avg_dist_arr = _mean_delay_by(flights_df, 'arr_delay', 'distance', 'mean_arr_delay/distance')

# mean dep delay for each carrier
avg_dep_carrier = _mean_delay_by(flights_df, 'dep_delay', 'mkt_unique_carrier', 'mean_dep_delay/carrier')

# mean arr delay for each carrier
avg_arr_carrier = _mean_delay_by(flights_df, 'arr_delay', 'mkt_unique_carrier', 'mean_arr_delay/carrier')


# joining all to main df (same order as the feature columns should appear)
flights_df = _attach_feature(flights_df, avg_dep_delay, 'crs_dep_time')
flights_df = _attach_feature(flights_df, avg_arr_delay, 'crs_arr_time')
flights_df = _attach_feature(flights_df, avg_dist_dep, 'distance')
flights_df = _attach_feature(flights_df, avg_dist_arr, 'distance')
flights_df = _attach_feature(flights_df, avg_dep_carrier, 'mkt_unique_carrier')
flights_df = _attach_feature(flights_df, avg_arr_carrier, 'mkt_unique_carrier')

# viewing
flights_df.head()

Unnamed: 0,order,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,weather,difference,taxi_out_y,taxi_in_y,mean_dep_delay/time,mean_arr_delay/time,mean_dep_delay/distance,mean_arr_delay/distance,mean_dep_delay/carrier,mean_arr_delay/carrier
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,,,17.372642,8.39823,9.557783,16.20354,17.384397,13.644793,12.628894,8.687159
1,1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,...,,2.0,16.813246,9.232143,4.561346,5.321429,8.236994,0.011561,12.628894,8.687159
2,2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,...,,10.0,18.789406,8.639053,14.756792,10.242604,13.277929,7.323288,12.628894,8.687159
3,3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,...,,-20.0,21.924528,7.333333,11.867925,18.743961,14.227513,6.284946,12.628894,8.687159
4,4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,...,,,17.312977,8.170455,14.675573,3.227273,10.140485,8.314578,12.628894,8.687159


In [12]:
# Replace the merge-generated '_y' suffixes on the averaged taxi times with
# names that follow the 'mean_<measure>/<group>' pattern of the other features.
flights_df = flights_df.rename(
    {'taxi_out_y': 'mean_taxi_out/time', 'taxi_in_y': 'mean_taxi_in/time'},
    axis='columns',
)
flights_df.head(1)

Unnamed: 0,order,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,weather,difference,mean_taxi_out/time,mean_taxi_in/time,mean_dep_delay/time,mean_arr_delay/time,mean_dep_delay/distance,mean_arr_delay/distance,mean_dep_delay/carrier,mean_arr_delay/carrier
0,0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,...,,,17.372642,8.39823,9.557783,16.20354,17.384397,13.644793,12.628894,8.687159


In [13]:
# saving updated flights_df with created features
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column when read back. Pass index=False only after
# confirming no downstream reader relies on that column.
flights_df.to_csv('../DB/flights_updated.csv')

In [14]:
# Columns exported on their own: the row identifier and grouping keys first,
# followed by the engineered mean features.
created = (
    ['order', 'distance', 'crs_dep_time', 'crs_arr_time', 'mkt_unique_carrier']
    + ['mean_taxi_out/time', 'mean_taxi_in/time']
    + ['mean_dep_delay/time', 'mean_arr_delay/time']
    + ['mean_dep_delay/distance', 'mean_arr_delay/distance']
    + ['mean_dep_delay/carrier', 'mean_arr_delay/carrier']
)

# Slice those columns out of the main frame and write them to disk.
features_created = flights_df.loc[:, created]
features_created.to_csv('../DB/features_created.csv')