## Final Training Data

In [145]:
import pandas as pd
import numpy as np

In [146]:
# load the test sample and discard every row that contains a missing value
test_df = pd.read_csv('DB/test_sample.csv', index_col='Unnamed: 0').dropna()

# load the flights table that the training set is built from
flights_df = pd.read_csv(
    'DB/flights_data.csv',
    index_col='Unnamed: 0',
)

In [147]:
# The training frame uses exactly the columns available in the test sample,
# plus the 'arr_delay' column from the flights data.
feature_cols = list(test_df.columns)
feature_cols.append('arr_delay')

# Select those columns from the flights data and drop rows with missing values.
# (The previous version called dropna() on `feature_df`, a name that is never
# defined in this notebook, so the cell failed on a fresh kernel.)
X = flights_df[feature_cols].dropna()

In [148]:
# Split the columns into numeric and non-numeric (categorical) groups.
# select_dtypes is the public API; the private _get_numeric_data helper is avoided.
cols = X.columns
num_cols = X.select_dtypes(include=['number', 'bool']).columns
# A comprehension keeps the frame's column order; a set difference does not.
cat_cols = [col for col in cols if col not in num_cols]

# remove redundant numeric columns
final_num_cols = list(num_cols)
final_num_cols.remove('op_carrier_fl_num')
final_num_cols.remove('flights')

# keep only the categorical columns that are used as features
final_cat_cols = ['mkt_unique_carrier', 'fl_date', 'tail_num', 'branded_code_share']

# combine final features
final_features = final_num_cols + final_cat_cols

# Take an explicit copy so the column assignments below write to an independent
# frame instead of raising SettingWithCopyWarning on a slice of the old X.
X = X[final_features].copy()

# convert fl_date feature into datetime
X['fl_date'] = pd.to_datetime(X['fl_date'])

# separate datetime into date features
X['year'] = X['fl_date'].dt.year
X['month'] = X['fl_date'].dt.month
X['week'] = X['fl_date'].dt.isocalendar().week
X['day'] = X['fl_date'].dt.day
X['day_of_week'] = X['fl_date'].dt.dayofweek

# Replace the index with a fresh 0..n-1 range named 'order'; later cells merge
# other feature files onto this frame using 'order' as the key.
X = X.reset_index(drop=True)
X.index.name = 'order'

# Drop the raw fl_date column now that its parts have been extracted.
# arr_delay is kept on purpose: it is used as the target further down.
X = X.drop(columns=['fl_date'])

***

## Join New Features

In [149]:
# load the engineered feature files; the taxi/delay file is indexed by 'order'
taxi_and_delay = pd.read_csv(
    'James/features_created.csv',
    index_col='order',
)
avg_monthly_pas = pd.read_csv(
    'Riley/avg_monthly_pas.csv',
)

In [150]:
# Drop the leftover 'Unnamed: 0' column from the taxi/delay features.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
taxi_and_delay = taxi_and_delay.drop(columns=['Unnamed: 0'], errors='ignore')

In [151]:
# left-join the taxi and delay features onto the base training frame,
# matching rows on the shared 'order' key
X = X.merge(taxi_and_delay, on='order', how='left')

In [152]:
# left-join avg_monthly_pas onto the training frame, matching on
# origin airport and month; every row of X is kept
final = X.merge(
    avg_monthly_pas,
    on=['origin_airport_id', 'month'],
    how='left',
)

In [153]:
# discard every row that has at least one missing value after the merges
final = final.dropna(axis=0, how='any')

In [154]:
# display the merged training frame (the notebook shows a truncated preview)
final

Unnamed: 0,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,mkt_unique_carrier,tail_num,...,day_of_week,mean_taxi_out/time,mean_taxi_in/time,mean_dep_delay/time,mean_arr_delay/time,mean_dep_delay/distance,mean_arr_delay/distance,mean_dep_delay/air_time,mean_arr_delay/air_time,avg_monthly_pas
0,3501,12953,13930,1300,1444,164.0,733.0,-28.0,UA,N744YX,...,0,17.372642,8.398230,9.557783,16.203540,17.384397,13.644793,10.378079,5.599754,66470.5
1,3502,11433,12266,630,854,204.0,1075.0,1.0,UA,N640RW,...,0,16.813246,9.232143,4.561346,5.321429,8.236994,0.011561,10.037453,4.868914,108812.5
2,3503,11618,11433,1500,1709,129.0,488.0,18.0,UA,N641RW,...,0,18.789406,8.639053,14.756792,10.242604,13.277929,7.323288,10.255463,5.307982,94693.5
3,3504,11618,11278,2041,2159,78.0,199.0,32.0,UA,N722YX,...,0,21.924528,7.333333,11.867925,18.743961,14.227513,6.284946,9.990928,5.078103,94693.5
4,3505,12266,11298,2140,2257,77.0,224.0,-1.0,UA,N855RW,...,0,17.312977,8.170455,14.675573,3.227273,10.140485,8.314578,10.631954,6.373485,155988.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161997,2789,13487,14771,925,1143,258.0,1589.0,-20.0,DL,N886DN,...,2,16.813246,6.856410,4.561346,-4.579487,16.477011,12.132184,7.679787,2.613636,136111.0
161998,2790,10721,13487,1841,2101,200.0,1124.0,308.0,DL,N302DN,...,2,16.813246,7.857143,4.561346,-0.821429,9.118557,5.860104,11.231527,5.439655,84468.5
161999,2791,10397,11298,1000,1116,136.0,731.0,-6.0,DL,N375NC,...,2,17.097222,6.627760,30.958333,-2.760252,16.477011,12.132184,7.679787,2.613636,379869.5
162000,2791,11298,10397,1201,1512,131.0,731.0,-9.0,DL,N375NC,...,2,17.868852,7.225000,16.262295,-1.537815,10.369014,6.923944,9.211840,4.413695,262691.5


In [155]:
# persist the training frame to disk without writing the row index
final.to_csv(
    path_or_buf='final_training.csv',
    index=False,
)

***

## ML Setup

In [156]:
# reload the saved training frame from disk
df = pd.read_csv(filepath_or_buffer='final_training.csv')

In [157]:
# target is the arr_delay column; features are every remaining column
y = df['arr_delay']
X = df.drop('arr_delay', axis=1)

In [158]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for testing; random_state is fixed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [159]:
import category_encoders as ce

# columns that are ordinal-encoded
categorical_features = ['mkt_unique_carrier', 'tail_num', 'branded_code_share']
encoder = ce.OrdinalEncoder(cols=categorical_features)

# fit the encoder on the training split only, then apply it to the test split
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)