## Feature Selection

**Primary Goals:** Remove/reduce features based on several reduction/cleaning techniques and prepare data for use with regression models.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [5]:
# load the test sample; the CSV's saved 'Unnamed: 0' column becomes the row index
test_df = pd.read_csv('DB/test_sample.csv', index_col='Unnamed: 0')

In [6]:
# inspect column dtypes and non-null counts of the test sample
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197560 entries, 0 to 197559
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   fl_date             197560 non-null  object
 1   mkt_unique_carrier  197560 non-null  object
 2   branded_code_share  197560 non-null  object
 3   mkt_carrier         197560 non-null  object
 4   mkt_carrier_fl_num  197560 non-null  int64 
 5   op_unique_carrier   197560 non-null  object
 6   tail_num            197074 non-null  object
 7   op_carrier_fl_num   197560 non-null  int64 
 8   origin_airport_id   197560 non-null  int64 
 9   origin              197560 non-null  object
 10  origin_city_name    197560 non-null  object
 11  dest_airport_id     197560 non-null  int64 
 12  dest                197560 non-null  object
 13  dest_city_name      197560 non-null  object
 14  crs_dep_time        197560 non-null  int64 
 15  crs_arr_time        197560 non-null  int64 
 16  du

In [7]:
# drop every row that has at least one missing value (tail_num has missing entries)
test_df = test_df.dropna()

In [8]:
# confirm the remaining rows have no nulls after dropna
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197074 entries, 0 to 197559
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   fl_date             197074 non-null  object
 1   mkt_unique_carrier  197074 non-null  object
 2   branded_code_share  197074 non-null  object
 3   mkt_carrier         197074 non-null  object
 4   mkt_carrier_fl_num  197074 non-null  int64 
 5   op_unique_carrier   197074 non-null  object
 6   tail_num            197074 non-null  object
 7   op_carrier_fl_num   197074 non-null  int64 
 8   origin_airport_id   197074 non-null  int64 
 9   origin              197074 non-null  object
 10  origin_city_name    197074 non-null  object
 11  dest_airport_id     197074 non-null  int64 
 12  dest                197074 non-null  object
 13  dest_city_name      197074 non-null  object
 14  crs_dep_time        197074 non-null  int64 
 15  crs_arr_time        197074 non-null  int64 
 16  du

In [11]:
# load the flights data; the CSV's saved 'Unnamed: 0' column becomes the row index
flights_df = pd.read_csv('DB/flights_data.csv', index_col='Unnamed: 0')

In [12]:
# list the columns available in the flights data
flights_df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

In [15]:
# feature columns are those present in the test sample; arr_delay is appended
# because it is later split off as the regression target
feature_cols = [*test_df.columns, 'arr_delay']

In [16]:
# keep only the selected columns of the flights data, then discard incomplete rows
feature_df = flights_df[feature_cols].dropna()

In [17]:
# check dtypes and the remaining row count after dropna
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162002 entries, 0 to 165071
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fl_date             162002 non-null  object 
 1   mkt_unique_carrier  162002 non-null  object 
 2   branded_code_share  162002 non-null  object 
 3   mkt_carrier         162002 non-null  object 
 4   mkt_carrier_fl_num  162002 non-null  int64  
 5   op_unique_carrier   162002 non-null  object 
 6   tail_num            162002 non-null  object 
 7   op_carrier_fl_num   162002 non-null  int64  
 8   origin_airport_id   162002 non-null  int64  
 9   origin              162002 non-null  object 
 10  origin_city_name    162002 non-null  object 
 11  dest_airport_id     162002 non-null  int64  
 12  dest                162002 non-null  object 
 13  dest_city_name      162002 non-null  object 
 14  crs_dep_time        162002 non-null  int64  
 15  crs_arr_time        162002 non-nul

In [18]:
# preview the first rows of the selected feature columns plus arr_delay
feature_df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay
0,2018-01-01,UA,UA_CODESHARE,UA,3501,YX,N744YX,3501,12953,LGA,...,13930,ORD,"Chicago, IL",1300,1444,N,164.0,1.0,733.0,-28.0
1,2018-01-01,UA,UA_CODESHARE,UA,3502,YX,N640RW,3502,11433,DTW,...,12266,IAH,"Houston, TX",630,854,N,204.0,1.0,1075.0,1.0
2,2018-01-01,UA,UA_CODESHARE,UA,3503,YX,N641RW,3503,11618,EWR,...,11433,DTW,"Detroit, MI",1500,1709,N,129.0,1.0,488.0,18.0
3,2018-01-01,UA,UA_CODESHARE,UA,3504,YX,N722YX,3504,11618,EWR,...,11278,DCA,"Washington, DC",2041,2159,N,78.0,1.0,199.0,32.0
4,2018-01-01,UA,UA_CODESHARE,UA,3505,YX,N855RW,3505,12266,IAH,...,11298,DFW,"Dallas/Fort Worth, TX",2140,2257,N,77.0,1.0,224.0,-1.0


In [44]:
# columns judged to add no information beyond what another column already carries
to_drop = [
    'mkt_carrier',        # redundant with mkt_unique_carrier
    # NOTE(review): treated as redundant with mkt_unique_carrier, but the sample
    # rows show different values (UA vs YX) — confirm this drop is intended
    'op_unique_carrier',
    'op_carrier_fl_num',  # duplicate of other column
    'origin',             # duplicate of other column
    'origin_city_name',   # duplicate of other column
    'dest',               # duplicate of other column
    'dest_city_name',     # duplicate of other column
    'dup',                # all same value (N)
    'flights',            # all same value (1.0)
]

# build the reduced frame without touching feature_df
filtered_df = feature_df.drop(columns=to_drop)
# preview the result
filtered_df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,tail_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2018-01-01,UA,UA_CODESHARE,3501,N744YX,12953,13930,1300,1444,164.0,733.0,-28.0
1,2018-01-01,UA,UA_CODESHARE,3502,N640RW,11433,12266,630,854,204.0,1075.0,1.0
2,2018-01-01,UA,UA_CODESHARE,3503,N641RW,11618,11433,1500,1709,129.0,488.0,18.0
3,2018-01-01,UA,UA_CODESHARE,3504,N722YX,11618,11278,2041,2159,78.0,199.0,32.0
4,2018-01-01,UA,UA_CODESHARE,3505,N855RW,12266,11298,2140,2257,77.0,224.0,-1.0


In [30]:
# columns remaining after the drop
filtered_df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier_fl_num', 'tail_num', 'origin_airport_id',
       'dest_airport_id', 'crs_dep_time', 'crs_arr_time', 'crs_elapsed_time',
       'distance', 'arr_delay'],
      dtype='object')

In [31]:
# checking column dtypes and non-null counts of the reduced frame
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162002 entries, 0 to 165071
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fl_date             162002 non-null  object 
 1   mkt_unique_carrier  162002 non-null  object 
 2   branded_code_share  162002 non-null  object 
 3   mkt_carrier_fl_num  162002 non-null  int64  
 4   tail_num            162002 non-null  object 
 5   origin_airport_id   162002 non-null  int64  
 6   dest_airport_id     162002 non-null  int64  
 7   crs_dep_time        162002 non-null  int64  
 8   crs_arr_time        162002 non-null  int64  
 9   crs_elapsed_time    162002 non-null  float64
 10  distance            162002 non-null  float64
 11  arr_delay           162002 non-null  float64
dtypes: float64(3), int64(5), object(4)
memory usage: 16.1+ MB


In [45]:
# parse the fl_date strings into datetime64 values so the .dt accessor can be used
parsed_dates = pd.to_datetime(filtered_df['fl_date'])
filtered_df['fl_date'] = parsed_dates
# confirm the dtype change
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162002 entries, 0 to 165071
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             162002 non-null  datetime64[ns]
 1   mkt_unique_carrier  162002 non-null  object        
 2   branded_code_share  162002 non-null  object        
 3   mkt_carrier_fl_num  162002 non-null  int64         
 4   tail_num            162002 non-null  object        
 5   origin_airport_id   162002 non-null  int64         
 6   dest_airport_id     162002 non-null  int64         
 7   crs_dep_time        162002 non-null  int64         
 8   crs_arr_time        162002 non-null  int64         
 9   crs_elapsed_time    162002 non-null  float64       
 10  distance            162002 non-null  float64       
 11  arr_delay           162002 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(3)
memory usage: 16.1+ MB


In [46]:
# splitting datetime into year/month/week/day and creating day-of-week column
fl_date = filtered_df['fl_date']
filtered_df['fl_year'] = fl_date.dt.year
filtered_df['fl_month'] = fl_date.dt.month
# Series.dt.week is deprecated (it raises a FutureWarning and is removed in
# pandas 2.0). isocalendar().week gives the same ISO week number but as UInt32,
# so cast to int64 to keep the dtype consistent with the other date parts.
filtered_df['fl_week'] = fl_date.dt.isocalendar().week.astype('int64')
filtered_df['fl_day'] = fl_date.dt.day
# Monday=0 ... Sunday=6
filtered_df['day_of_week'] = fl_date.dt.dayofweek
# dropping original date column
filtered_df = filtered_df.drop(['fl_date'], axis=1)

  filtered_df['fl_week'] = filtered_df['fl_date'].dt.week


In [47]:
# viewing changes: fl_date is replaced by fl_year/fl_month/fl_week/fl_day/day_of_week
filtered_df.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,tail_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_year,fl_month,fl_week,fl_day,day_of_week
0,UA,UA_CODESHARE,3501,N744YX,12953,13930,1300,1444,164.0,733.0,-28.0,2018,1,1,1,0
1,UA,UA_CODESHARE,3502,N640RW,11433,12266,630,854,204.0,1075.0,1.0,2018,1,1,1,0
2,UA,UA_CODESHARE,3503,N641RW,11618,11433,1500,1709,129.0,488.0,18.0,2018,1,1,1,0
3,UA,UA_CODESHARE,3504,N722YX,11618,11278,2041,2159,78.0,199.0,32.0,2018,1,1,1,0
4,UA,UA_CODESHARE,3505,N855RW,12266,11298,2140,2257,77.0,224.0,-1.0,2018,1,1,1,0


In [48]:
# separate the regression target from the predictor columns
target_col = 'arr_delay'
y = filtered_df[target_col]
X = filtered_df.drop(columns=[target_col])

In [49]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; the fixed random_state keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [50]:
# checking for non-numeric (object dtype) features in the training split
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113401 entries, 42491 to 139208
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   mkt_unique_carrier  113401 non-null  object 
 1   branded_code_share  113401 non-null  object 
 2   mkt_carrier_fl_num  113401 non-null  int64  
 3   tail_num            113401 non-null  object 
 4   origin_airport_id   113401 non-null  int64  
 5   dest_airport_id     113401 non-null  int64  
 6   crs_dep_time        113401 non-null  int64  
 7   crs_arr_time        113401 non-null  int64  
 8   crs_elapsed_time    113401 non-null  float64
 9   distance            113401 non-null  float64
 10  fl_year             113401 non-null  int64  
 11  fl_month            113401 non-null  int64  
 12  fl_week             113401 non-null  int64  
 13  fl_day              113401 non-null  int64  
 14  day_of_week         113401 non-null  int64  
dtypes: float64(2), int64(10), obje

In [51]:
# list the feature column names of the training split
X_train.columns

Index(['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier_fl_num',
       'tail_num', 'origin_airport_id', 'dest_airport_id', 'crs_dep_time',
       'crs_arr_time', 'crs_elapsed_time', 'distance', 'fl_year', 'fl_month',
       'fl_week', 'fl_day', 'day_of_week'],
      dtype='object')

In [52]:
# encoder library for the string-valued columns
import category_encoders as ce

# object-dtype columns that must become numeric
categorical_cols = ['mkt_unique_carrier', 'tail_num', 'branded_code_share']
encoder = ce.OrdinalEncoder(cols=categorical_cols)

# fit the mapping on the training split only, then reuse that same mapping for
# the test split so no information from the test rows influences the encoding
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [53]:
# (rows, columns) of the training features after encoding
X_train.shape

(113401, 15)

In [54]:
# column names of the training features after the encoding step
X_train.columns

Index(['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier_fl_num',
       'tail_num', 'origin_airport_id', 'dest_airport_id', 'crs_dep_time',
       'crs_arr_time', 'crs_elapsed_time', 'distance', 'fl_year', 'fl_month',
       'fl_week', 'fl_day', 'day_of_week'],
      dtype='object')

In [56]:
# count how often each marketing flight number occurs in the feature set
X['mkt_carrier_fl_num'].value_counts()

64      95
832     85
65      82
61      80
788     79
        ..
6746     1
6819     1
6885     1
6988     1
6579     1
Name: mkt_carrier_fl_num, Length: 6805, dtype: int64

In [57]:
# peek at the first values of the distance column
X.distance.head()

0     733.0
1    1075.0
2     488.0
3     199.0
4     224.0
Name: distance, dtype: float64