# Feature Selection

https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172

In [1]:
import pandas as pd
import seaborn as sns
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler

import sys
sys.path.append("..")
from Functions.UNSW_DF import *

train, test = DF_original()

Reading Original CSV Files..
	 Train Shape:  	 (175341, 45)
	 Test Shape:  	 (82332, 45)
Dataset Loaded!


In [2]:
# Defining the columns that need to be label encoded.
cols = ['proto', 'service', 'state', 'attack_cat']
le = preprocessing.LabelEncoder()

In [3]:
# Label encoding the columns for the test and training set
test[cols] = test[cols].apply(le.fit_transform)
train[cols] = train[cols].apply(le.fit_transform)

In [4]:
# Splitting the dataset into inputs and outputs
X_train = train.iloc[:, 0:44]
X_test = test.iloc[:, 0:44]
y_train = train.iloc[:, 44]
y_test = test.iloc[:, 44]

In [None]:
# OPTIONAL 1: Applying Min Max Scaler on X
mm_scaler = preprocessing.MinMaxScaler()
X_train_minmax = mm_scaler.fit_transform(X_train)
X_test_minmax = mm_scaler.fit_transform(X_test)

In [5]:
# OPTIONAL 2: Applying StandardScaler on X
ss = StandardScaler()
X_train_ss = pd.DataFrame(ss.fit_transform(X_train),columns = X_train.columns)

## Recursive Feature Elimination (RFE)

In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [9]:
# Selecting the Best important features according to Logistic Regression
rfe_selector = RFE(estimator=LogisticRegression(solver='lbfgs', max_iter=1000), step = 1, verbose = 1)

In [10]:
rfe_fit = rfe_selector.fit(X_train_ss, y_train)

Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.


In [11]:
X_train_ss.columns[rfe_selector.get_support()]

Index(['id', 'proto', 'service', 'state', 'dpkts', 'sbytes', 'dbytes', 'sttl',
       'dttl', 'dload', 'dloss', 'sinpkt', 'swin', 'synack', 'smean', 'dmean',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_srv_dst',
       'is_sm_ips_ports', 'attack_cat'],
      dtype='object')

In [12]:
print("Num Features: %d" % rfe_fit.n_features_)
print("Selected Features: %s" % rfe_fit.support_)
print("Feature Ranking: %s" % rfe_fit.ranking_,)

Num Features: 22
Selected Features: [ True False  True  True  True False  True  True  True False  True  True
 False  True False  True  True False False False  True False False False
 False  True False  True  True False False False False  True  True  True
 False False False False False  True  True  True]
Feature Ranking: [ 1 17  1  1  1  8  1  1  1 10  1  1 15  1  5  1  1 18 14 13  1 21 19  2
  6  1  7  1  1 22 16  4  3  1  1  1 11 23 20 12  9  1  1  1]


## Univariate Feature Selection with SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [None]:
# Select top "all" features based on mutual info regression
selector = SelectKBest(mutual_info_regression, k = "all")
selector.fit(X_train_ss, y_train)
X_train_ss.columns[selector.get_support()]

## Sequential Feature Selection (SFS)

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

In [None]:
#Selecting the Best important features according to Logistic Regression
sfs_selector = SequentialFeatureSelector(estimator=LogisticRegression(max_iter=100, verbose=1, n_jobs=-1), cv =10, direction ='backward')

In [None]:
sfs_selector.fit(X_train_ss, y_train)

In [None]:
X_train_ss.columns[sfs_selector.get_support()]

## Testing

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# feature extraction
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, 3)
fit = rfe.fit(X_train, y_train)