# Feature Selection with Recursive Feature Elimination (RFE) using Random Forest, Gradient Boosting, Logistic Regression, and SVM

In [90]:
import pandas as pd
import numpy as np

In [91]:
# Read the dataset, then strip every space from the column names so that
# columns can be referenced by compact identifiers.
data = pd.read_csv("datasets/cicids2017.csv")
data = data.rename(columns=lambda name: name.replace(' ', ''))
data.shape

(286467, 79)

In [92]:
# Treat +/-infinity as missing values, then discard every row that contains
# at least one missing value.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape


(286096, 79)

In [93]:
# Name of the column holding the class label.
target = "Label"
# Feature matrix: every column of `data` except the label column.
x = data.loc[:, ~data.columns.isin([target])]
x.head()


Unnamed: 0,DestinationPort,FlowDuration,TotalFwdPackets,TotalBackwardPackets,TotalLengthofFwdPackets,TotalLengthofBwdPackets,FwdPacketLengthMax,FwdPacketLengthMin,FwdPacketLengthMean,FwdPacketLengthStd,...,act_data_pkt_fwd,min_seg_size_forward,ActiveMean,ActiveStd,ActiveMax,ActiveMin,IdleMean,IdleStd,IdleMax,IdleMin
0,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,24,32,0.0,0.0,0,0,0.0,0.0,0,0
1,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,24,32,0.0,0.0,0,0,0.0,0.0,0,0
2,22,160,1,1,0,0,0,0,0.0,0.0,...,0,32,0.0,0.0,0,0,0.0,0.0,0,0
3,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,24,32,0.0,0.0,0,0,0.0,0.0,0,0
4,35396,77,1,2,0,0,0,0,0.0,0.0,...,0,32,0.0,0.0,0,0,0.0,0.0,0,0


In [94]:
# Target vector: the label column on its own, followed by the class balance.
y = data.loc[:, target]
y.value_counts()


PortScan    158804
BENIGN      127292
Name: Label, dtype: int64

In [95]:
# Encode the string class labels as integer codes. pd.factorize numbers the
# labels in order of first appearance and also returns the unique labels, so
# `label_names[code]` recovers the original string for any code in `y`.
# Factorizing from data[target] (instead of from `y` itself) keeps this cell
# idempotent: re-running it cannot replace the label names with integers.
y, label_names = pd.factorize(data[target])
# pd.value_counts() is deprecated (pandas >= 2.1); count through a Series.
pd.Series(y).value_counts()

1    158804
0    127292
dtype: int64

In [102]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Univariate Feature Selection
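For each feature, we measure the statistical significance of its relationship with the target independently of the other features (here, with the ANOVA F-test). Constant features are removed first, since they carry no information about the target. We then keep the features with the strongest statistical significance.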

In [104]:
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-variance filter on the training features only, then keep just
# the columns it marks as non-constant.
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
non_constant = constant_filter.get_support()  # boolean mask, one entry per column
# NOTE(review): x_test is not filtered here, so it keeps the columns dropped
# from x_train — confirm that later cells align the two before evaluation.
x_train = x_train.loc[:, non_constant]
x_train.shape


(228876, 68)

In [105]:
from sklearn.feature_selection import SelectKBest, f_classif

# Score every remaining feature against the target with f_classif.
# k="all" keeps all features, so this step only computes the scores.
select_k_best = SelectKBest(score_func=f_classif, k="all")
select_k_best.fit(x_train, y_train);  # trailing ';' suppresses the estimator repr

In [117]:
# Pair each feature name with its score, rank from highest to lowest score,
# save the ranking to disk, and display it.
f_test_scores = (
    pd.DataFrame({"feature": x_train.columns, "score": select_k_best.scores_})
    .sort_values("score", ascending=False)
)
f_test_scores.to_csv("f-scores of features.csv", index=False)
f_test_scores


Unnamed: 0,feature,score
43,PSHFlagCount,402748.513269
35,MinPacketLength,95775.872270
11,BwdPacketLengthMin,54978.622854
48,AveragePacketSize,53074.613015
7,FwdPacketLengthMin,50416.442630
...,...,...
14,FlowBytes/s,352.934315
19,FlowIATMin,271.036904
46,ECEFlagCount,19.938896
42,RSTFlagCount,19.938896


## Multivariate Analysis