# Feature Selection

https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172

In [1]:
import pandas as pd
import seaborn as sns
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading the training and testing datasets.
test = pd.read_csv("../Dataset/UNSW_NB15_testing-set.csv", sep=',', header=0)
train = pd.read_csv("../Dataset/UNSW_NB15_training-set.csv", sep=',', header=0)

In [3]:
# Defining the columns that need to be label encoded.
cols = ['proto', 'service', 'state', 'attack_cat']
le = preprocessing.LabelEncoder()

In [4]:
# Label encoding the columns for the test and training set
test[cols] = test[cols].apply(le.fit_transform)
train[cols] = train[cols].apply(le.fit_transform)

In [5]:
# Splitting the dataset into inputs and outputs
X_train = train.iloc[:, 0:44]
X_test = test.iloc[:, 0:44]
y_train = train.iloc[:, 44]
y_test = test.iloc[:, 44]

In [None]:
# OPTIONAL 1: Applying Min Max Scaler on X
mm_scaler = preprocessing.MinMaxScaler()
X_train_minmax = mm_scaler.fit_transform(X_train)
X_test_minmax = mm_scaler.fit_transform(X_test)

In [6]:
# OPTIONAL 2: Applying StandardScaler on X
ss = StandardScaler()
X_train_ss = pd.DataFrame(ss.fit_transform(X_train),columns = X_train.columns)

## Recursive Feature Elimination (RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Selecting the Best important features according to Logistic Regression
rfe_selector = RFE(estimator=LogisticRegression(solver='lbfgs', max_iter=100000), step = 1, verbose = 1)

In [None]:
rfe_fit = rfe_selector.fit(X_train_ss, y_train)

In [None]:
X_train_ss.columns[rfe_selector.get_support()]

In [None]:
print("Num Features: %d" % rfe_fit.n_features_)
print("Selected Features: %s" % rfe_fit.support_)
print("Feature Ranking: %s" % rfe_fit.ranking_,)

## Univariate Feature Selection with SelectKBest

In [7]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [8]:
# Select top "all" features based on mutual info regression
selector = SelectKBest(mutual_info_regression, k = "all")
selector.fit(X_train_ss, y_train)
X_train_ss.columns[selector.get_support()]

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat'],
      dtype='object')

## Sequential Feature Selection (SFS)

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

In [None]:
#Selecting the Best important features according to Logistic Regression
sfs_selector = SequentialFeatureSelector(estimator=LogisticRegression(max_iter=100, verbose=1, n_jobs=-1), cv =10, direction ='backward')

In [None]:
sfs_selector.fit(X_train_ss, y_train)

In [None]:
X_train_ss.columns[sfs_selector.get_support()]

## Testing

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# feature extraction
model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, 3)
fit = rfe.fit(X_train, y_train)