# Feature Selection Sensitivity Testing

## One airline in 2019

In [1]:
# One-time environment setup: uncomment to install the two third-party packages
# imported below. (`%pip install ...` is preferable to `!pip`, because it
# installs into the environment of the running kernel.)
#!pip install autofeatselect
#!pip install lazypredict


In [10]:
import pandas as pd
import sqlite3
import time
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.model_selection import train_test_split
from autofeatselect import CorrelationCalculator, FeatureSelector, AutoFeatureSelect
import seaborn as sns
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
import sys
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
# Show the working directory: the data files below are opened via relative paths.
os.getcwd()

'/home/ccecilia'

In [4]:
# Run-wide settings shared by the cells below.
n_jobs = 2  # parallel workers for the Boruta random forest; make sure to request 2 cores
random_state, seed = 42, 24  # train/test split + random-forest seed, FeatureSelector seed

In [5]:
# Load the full source table; it is narrowed to a single airline and year in the
# following cell and then released.
df = pd.read_parquet('bts_faa_coords.parquet')

In [6]:
# Keep only rows for airline 'DL' in 2019 that were neither cancelled nor diverted.
completed_dl_2019_mask = (
    df['Reporting_Airline'].eq('DL')
    & df['Year'].eq(2019)
    & df['Cancelled'].eq(0)
    & df['Diverted'].eq(0)
)
# Take an independent copy so the subset can be modified later without touching `df`.
df_dl_2019 = df.loc[completed_dl_2019_mask].copy()
# Free up memory: the full table (and the temporary mask) are no longer needed.
del df, completed_dl_2019_mask

### Test whether a different dataset with the same features changes model performance

In [7]:
# Load the feature list to reuse; its 'feature' column supplies the column names
# selected below. (Presumably written by an earlier feature-selection run — confirm.)
final_features = pd.read_csv('final_features.csv')

In [11]:
def labe_encoder(df):
    """Return a copy of ``df`` with every non-numeric column label-encoded.

    Each column whose dtype is not a NumPy numeric type is replaced by the
    integer codes produced by ``LabelEncoder.fit_transform``; numeric columns
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy: callers pass column subsets of a larger frame, and
    # assigning into such a slice triggers SettingWithCopyWarning and makes it
    # ambiguous which object actually gets modified.
    encoded = df.copy()
    # Separate numeric and object columns
    object_columns = encoded.select_dtypes(exclude=[np.number]).columns.tolist()
    for c in object_columns:
        # Every column is fitted independently, so use a fresh encoder each time.
        encoded[c] = LabelEncoder().fit_transform(encoded[c])
    return encoded

In [16]:
# Keep only the columns named in final_features['feature'] and label-encode the
# non-numeric ones; the result is the feature matrix used for the split below.
df_dl_2019_test = labe_encoder(df_dl_2019[final_features['feature']])

In [13]:
# Define the delay label: 1 when either ArrDel15 or DepDel15 equals 1, else 0.
# Computed column-wise; a row-by-row `apply(axis=1)` gives the same values but
# runs a Python lambda once per row, which is very slow on a frame this size.
df_dl_2019['delay_label'] = (
    df_dl_2019['ArrDel15'].eq(1) | df_dl_2019['DepDel15'].eq(1)
).astype(int)

In [18]:
# Hold out 20% of the label-encoded feature matrix for evaluation.
# The split is seeded for reproducibility and is not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    df_dl_2019_test,
    df_dl_2019['delay_label'],
    test_size=0.2,
    random_state=random_state,
)

In [19]:
# Restrict LazyClassifier to four estimators, looked up by name.
# Positional indices into CLASSIFIERS depend on the lazypredict version and on
# which optional packages are installed, and they make this cell fail with an
# IndexError when re-run (the list has already been shortened). A lookup by
# name is stable and can be re-run safely.
selected_classifier_names = [
    'LogisticRegression',
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
]
# Real copy of the (name, class) pairs currently registered.
classifiers_copy = list(lazypredict.Supervised.CLASSIFIERS)
classifiers_by_name = dict(classifiers_copy)
lazypredict.Supervised.CLASSIFIERS = [
    (name, classifiers_by_name[name]) for name in selected_classifier_names
]
lazypredict.Supervised.CLASSIFIERS

[('LogisticRegression', sklearn.linear_model._logistic.LogisticRegression),
 ('RandomForestClassifier', sklearn.ensemble._forest.RandomForestClassifier),
 ('XGBClassifier', xgboost.sklearn.XGBClassifier),
 ('LGBMClassifier', lightgbm.sklearn.LGBMClassifier)]

In [21]:
# Fit every classifier currently registered in lazypredict.Supervised.CLASSIFIERS
# on the training split and score it on the held-out split.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the per-model score table.
models

 75%|███████▌  | 3/4 [04:40<01:30, 90.13s/it] 

[LightGBM] [Info] Number of positive: 145045, number of negative: 658174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 803219, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.180580 -> initscore=-1.512425
[LightGBM] [Info] Start training from score -1.512425


100%|██████████| 4/4 [04:45<00:00, 71.29s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.94,0.86,0.86,0.94,275.39
XGBClassifier,0.93,0.84,0.84,0.93,4.02
LGBMClassifier,0.92,0.81,0.81,0.92,4.57
LogisticRegression,0.84,0.58,0.58,0.8,1.2


### Apply Boruta Rankings Method

In [22]:
# Numeric candidate features.
# Left out on purpose because they are highly correlated with the delay label:
#   DepDelay, DepDelayMinutes, ArrDelay, ArrDelayMinutes
num_feats = [
    'TaxiOut',
    'TaxiIn',
    'AirTime',
    'Distance',
    'HORSEPOWER',
    'THRUST',
    'NO-ENG',
    'NO-SEATS',
    'SPEED',
    'Origin_LATITUDE',
    'Origin_LONGITUDE',
    'Dest_LATITUDE',
    'Dest_LONGITUDE',
]

In [23]:
# Candidate features that are treated as categorical.
# Left out on purpose:
#   Year, Reporting_Airline                    (constant after the DL / 2019 filter)
#   DepartureDelayGroups, ArrivalDelayGroups   (highly correlated with the delay label)
cat_feats = [
    'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTimeBlk', 'WheelsOff', 'WheelsOn', 'ArrTimeBlk',
    'CRSElapsedTime', 'ActualElapsedTime', 'DistanceGroup',
    'SERIAL NUMBER', 'MFR MDL CODE', 'ENG MFR MDL', 'YEAR MFR',
    'TYPE REGISTRANT', 'REGION', 'COUNTY', 'COUNTRY',
    'TYPE AIRCRAFT', 'TYPE ENGINE', 'STATUS CODE', 'MODE S CODE',
    'FRACT OWNER', 'AIR WORTH DATE',
    'MFR', 'MODEL', 'TYPE',
    'MFR_aircraft', 'MODEL_aircraft',
    'TYPE-ACFT', 'TYPE-ENG', 'AC-WEIGHT',
]

In [24]:
# Convert the categorical columns to strings, with missing values as ''.
# The null mask has to be taken BEFORE the cast: `astype(str)` turns NaN into
# the literal string 'nan' (and None into 'None'), after which `fillna('')`
# no longer finds anything to replace.
cat_values = df_dl_2019[cat_feats]
df_dl_2019[cat_feats] = cat_values.astype(str).mask(cat_values.isna(), '')
del cat_values  # temporary view, not needed by later cells

In [25]:
# Hold out 20% of the rows for evaluation (seeded, not stratified).
# The whole frame is passed as X here; the columns actually used as features
# are chosen later through the numeric/categorical column lists.
X_train, X_test, y_train, y_test = train_test_split(
    df_dl_2019,
    df_dl_2019['delay_label'],
    test_size=0.2,
    random_state=random_state,
)

In [26]:
# Flag highly related features on the training split (threshold 0.8) so they can
# be dropped before ranking: Pearson correlation for the numeric candidates...
corr_df_num, num_remove_list = CorrelationCalculator.numeric_correlations(
    X_train,
    features=num_feats,
    static_features=None,
    corr_method='pearson',
    threshold=0.8,
)

# ...and the library's categorical association measure for the categorical ones.
corr_df_cat, cat_remove_list = CorrelationCalculator.categorical_correlations(
    X_train,
    features=cat_feats,
    static_features=None,
    threshold=0.8,
)

In [27]:
# Numeric features flagged for removal by the correlation filter.
num_remove_list

['AirTime']

In [28]:
# Categorical features flagged for removal by the correlation filter.
cat_remove_list

['Quarter',
 'DepTimeBlk',
 'ArrTimeBlk',
 'MFR MDL CODE',
 'ENG MFR MDL',
 'YEAR MFR',
 'TYPE REGISTRANT',
 'REGION',
 'COUNTY',
 'COUNTRY',
 'TYPE AIRCRAFT',
 'TYPE ENGINE',
 'STATUS CODE',
 'SERIAL NUMBER',
 'FRACT OWNER',
 'AIR WORTH DATE',
 'MFR',
 'MODEL',
 'TYPE',
 'MFR_aircraft',
 'MODEL_aircraft',
 'TYPE-ACFT',
 'TYPE-ENG',
 'AC-WEIGHT']

In [29]:
# Drop the features flagged by the correlation filters, preserving the original order.
num_feats_final = list(filter(lambda feat: feat not in num_remove_list, num_feats))
cat_feats_final = list(filter(lambda feat: feat not in cat_remove_list, cat_feats))

In [30]:
# Build the feature selector for the classification task, restricted to the
# numeric and categorical columns that survived the correlation filter.
feat_selector = FeatureSelector(
    modeling_type='classification',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    numeric_columns=num_feats_final,
    categorical_columns=cat_feats_final,
    seed=seed,
)

In [32]:
# Boruta ranking, with hyperparameters for its underlying random forest.
# Every entry of the dict must be a 'key': value pair; writing
# `n_estimators=200` inside the braces is a SyntaxError.
boruta_importance_df = feat_selector.boruta_rankings(
    rf_hyperparams={
        'n_jobs': n_jobs,
        'random_state': random_state,
        'n_estimators': 200,
    }
)

# Persist the ranking so it can be reused without re-running Boruta, then display it.
boruta_importance_df.to_csv('boruta_importance_df.csv', index=False)
boruta_importance_df

Unnamed: 0,feature,boruta_support,boruta_support_weak,boruta_ranking
0,TaxiOut,1,0,1
1,ActualElapsedTime,1,0,1
2,WheelsOn,1,0,1
3,WheelsOff,1,0,1
4,MODE S CODE,1,0,1
5,TaxiIn,1,0,1
6,CRSElapsedTime,0,0,2
7,DayofMonth,0,0,3
8,Distance,0,0,4
9,Month,0,0,5
