In [4]:
# -*- coding: utf-8 -*- 
# This code is written in Python 3.7. To install the required packages, run the following commands:
# pip install pandas numpy matplotlib seaborn scikit-learn sympy
# This code is applicable to the CICIDS2017 dataset. 
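# This code gets the top K features using scikit-learn's SelectKBest with F-test scoring (f_classif / f_regression).
# Note: this is a univariate F-test ranking, not the Information Gain (IG) method.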

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Location of the combined CICIDS2017 CSV. Kept in one named constant so the
# path only has to be edited in a single place when the notebook is moved.
DATASET_CSV = '/home/ibibers@ads.iu.edu/IDS_Datasets/Combined_datasets/CICIDS2017_combined_dataset.csv'

df = pd.read_csv(DATASET_CSV)

# Remove exact duplicate rows, keeping the first occurrence of each.
main_df_1 = df.drop_duplicates(keep='first')

# Columns holding a single distinct value carry no information for selection.
one_value = main_df_1.columns[main_df_1.nunique() == 1]

# Drop the constant columns and strip leading whitespace from the headers
# BEFORE any column is looked up by name, so later lookups do not depend on
# how the raw header happened to be padded. Non-mutating (no inplace=True),
# which keeps this cell safe to re-run.
main_df_2 = (
    main_df_1
    .drop(columns=one_value)
    .rename(columns=lambda x: x.lstrip())
)

# Mean-impute missing 'Flow Bytes/s' values. +/-inf entries are turned into
# NaN first: otherwise the mean itself can be inf (or NaN), and the "imputed"
# values would be non-finite as well.
# NOTE(review): assumes the column can contain +/-inf, as in the public
# CICIDS2017 CSVs; if it cannot, this is a no-op -- confirm against the data.
flow_bytes = main_df_2['Flow Bytes/s'].replace([float('inf'), float('-inf')], float('nan'))
main_df_2['Flow Bytes/s'] = flow_bytes.fillna(flow_bytes.mean())

# Independent working copy used by the rest of the notebook.
sample_df_1 = main_df_2.copy()

# NOTE(review): no sub-sampling actually happens here despite the message;
# the text is kept unchanged so the recorded output still matches.
# NOTE(review): the recorded class counts show U+FFFD replacement characters
# in the 'Web Attack' labels, which suggests the CSV was decoded with the
# wrong encoding upstream -- confirm.
print (" Extract subsample of data: ")
print (sample_df_1['Label'].value_counts())


 Extract subsample of data: 
Label
BENIGN                        2096484
DoS Hulk                       172849
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [5]:
# Show every column name that survived cleaning (this still includes 'Label').
# Label encoding is performed in its own cell, not here.
feature_columns = sample_df_1.columns
print ("the Features are: ", feature_columns)


the Features are:  Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count'

In [6]:
# Encode the string class labels as integer codes, overwriting the 'Label'
# column in place. The original class names stay available in `le.classes_`.
# NOTE(review): re-running this cell on already-encoded labels refits the
# encoder on integers, so `le.classes_` would no longer hold the names.
le = LabelEncoder()
le.fit(sample_df_1['Label'])
sample_df_1['Label'] = le.transform(sample_df_1['Label'])

In [7]:
# Columns excluded from the feature matrix: the target itself plus the two
# flow-rate columns.
# NOTE(review): presumably the rate columns are excluded because they can hold
# non-finite values that the F-test scorers reject -- confirm.
dropped_df = ['Label', 'Flow Packets/s', 'Flow Bytes/s']

# Feature matrix (everything except the columns above) and encoded target.
X = sample_df_1.drop(columns=dropped_df)
y = sample_df_1['Label']

# Column labels of X, in the same order as the selector scores.
feature_names = X.columns

In [22]:
# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the feature selectors visible in this notebook are fit on the
# full X / y rather than on X_train / y_train -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [72]:
# ------------------------------------------------- K-best for Classification features selection --------------------------------------------
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature with f_classif and keep the 15 best-scoring ones.
selector = SelectKBest(f_classif, k=15)
selector.fit(X, y)

# Reduced feature matrix containing only the selected columns.
X_new = selector.transform(X)

# Per-feature scores (same order as X.columns) and the positional indices of
# the columns that were kept.
scores = selector.scores_
selected_indices = selector.get_support(indices=True)

print("Scores for each feature:", scores)
# Despite the label, this prints column positions, not column names.
print("Selected features:", selected_indices)




Scores for each feature: [5.88638116e+03 5.24036630e+04 1.30362483e+01 6.14664006e+00
 3.38446775e+03 8.13009752e+00 1.72791333e+03 3.29714118e+03
 1.66324958e+03 1.43680314e+03 2.27261900e+05 1.36392185e+04
 2.37422162e+05 2.38502985e+05 2.76907268e+04 1.10049705e+05
 1.79395042e+05 1.07700787e+04 5.30919966e+04 2.14882807e+04
 2.44600688e+05 1.78395451e+05 4.48069656e+03 5.59687431e+03
 7.77861899e+03 2.83372165e+04 2.01803609e+04 4.82913640e+03
 2.91607168e+03 1.16083260e+00 6.32795464e-02 1.12379148e-01
 1.19649667e+03 4.67460512e+03 1.52152459e+04 1.75743988e+05
 1.52015564e+05 1.94582480e+05 1.42593352e+05 5.24741308e+04
 2.91607168e+03 9.95701776e+00 2.52873172e+04 1.03023655e+04
 4.11474448e+03 1.16083260e+00 1.00005759e+01 3.37249367e+03
 1.53946135e+05 1.66324958e+03 2.37422162e+05 6.32795464e-02
 1.30362483e+01 3.39381646e+03 6.14664006e+00 8.13026364e+00
 6.36145810e+03 2.77003072e+03 5.32666801e+00 2.67309427e-01
 1.59873700e+04 6.93674637e+03 8.00808000e+03 1.70636032e+04

In [73]:
# Positions of the columns kept by the currently bound, fitted `selector`.
selected_indices = selector.get_support(indices=True)

# Translate the positions into column names (plain Python list).
selected_features = X.columns[selected_indices].tolist()

In [74]:
# Summary: all candidate features, then the subset kept by the selector.
print("The X features: ", X.columns)
print("the size of the X features: ", X.shape[1])
print("Selected features: ", selected_features)
print("The size of the selected features: ", len(selected_features))

The X features:  Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Mean', 'Flow IAT Std',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
       'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count'

In [23]:
# ------------------------------------------------- K-best for Regression features selection --------------------------------------------
from sklearn.feature_selection import SelectKBest, f_regression

# Fresh, not-yet-fitted selector that scores features with f_regression and
# keeps the best 15. Rebinding `selector` replaces any previously fitted
# selector held under that name.
# NOTE(review): y holds LabelEncoder integer codes for the classes;
# f_regression treats them as a continuous target, so the scores depend on the
# arbitrary code order -- confirm this is intended.
selector = SelectKBest(f_regression, k=15)


In [24]:
# Inspect the dtype of every column ('Label' is already integer-encoded here).
column_dtypes = sample_df_1.dtypes
print(column_dtypes)


Destination Port                 int64
Flow Duration                    int64
Total Fwd Packets                int64
Total Backward Packets           int64
Total Length of Fwd Packets      int64
                                ...   
Idle Mean                      float64
Idle Std                       float64
Idle Max                         int64
Idle Min                         int64
Label                            int64
Length: 71, dtype: object


In [25]:
# Fit the currently bound selector on the full feature matrix, then keep only
# the selected columns.
selector.fit(X, y)
X_new = selector.transform(X)



In [26]:


# Per-feature scores (ordered like X.columns) and the positions of the
# columns the fitted selector kept.
scores = selector.scores_
selected_indices = selector.get_support(indices=True)

print("Scores for each feature:", scores)
# Despite the label, this prints column positions, not column names.
print("Selected features:", selected_indices)

# Column names corresponding to the selected positions (plain Python list).
selected_features = X.columns[selected_indices].tolist()



Scores for each feature: [2.29278905e+04 5.45309553e+04 1.96787946e+01 1.85106811e+01
 3.46753109e+02 1.04981320e+01 6.51264308e+03 3.01623597e+04
 1.25086712e+04 2.87353661e+03 2.00832179e+05 1.18015898e+05
 1.86256665e+05 2.10927359e+05 3.62473465e+04 1.49031295e+05
 2.04018095e+05 1.30149504e+01 5.57558501e+04 2.93627376e+04
 2.48260673e+05 2.05002351e+05 9.01640886e+02 4.12129474e+02
 3.35203977e+02 3.90854493e+04 1.97390057e+04 1.16707976e+03
 9.77537041e+03 1.09956614e+01 5.98813053e-01 1.04323697e+00
 5.43133904e+03 2.19897391e+04 1.40809356e+05 1.62793301e+05
 1.17688887e+05 1.65880200e+05 1.42020652e+05 7.86673093e+04
 9.77537041e+03 9.43135707e+01 1.64295646e+05 1.35312242e+03
 3.61089105e+04 1.09956614e+01 9.47261489e+01 5.40559548e+02
 1.13853341e+05 1.25086712e+04 1.86256665e+05 5.98813053e-01
 1.96787946e+01 3.47669196e+02 1.85106811e+01 1.04978100e+01
 1.48602654e+04 1.13657262e+04 1.46457447e+01 2.53618747e+00
 9.92521605e+01 4.79157580e+02 6.53073726e+02 2.25347846e+02

In [27]:
# Summary: all candidate features, then the subset kept by the selector.
print("The X features: ", X.columns)
print("the size of the X features: ", X.shape[1])
print("Selected features: ", selected_features)
print("The size of the selected features: ", len(selected_features))

The X features:  Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Mean', 'Flow IAT Std',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
       'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count'

In [8]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif  # For classification tasks
# For regression tasks, you can use f_regression instead of f_classif



In [9]:
# Number of top-scoring features to keep.
k = 65

# Score every feature with f_classif and keep the k best.
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X, y)

# Names of the retained columns, in their original column order.
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
print("Selected Features:", selected_feature_names)

# Table of every feature with its score, best first. The row index keeps the
# feature's original position in X.
feature_scores = selector.scores_
feature_scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Score': feature_scores
}).sort_values(by='Score', ascending=False)

# Print the ranking once (it was previously emitted twice back to back).
print("Feature Scores:")
print(feature_scores_df)

Selected Features: Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Mean', 'Flow IAT Std',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
       'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
       'URG Flag Count', 'CWE Flag Count', 'E

In [10]:
# Names of the ten best-scoring features; the printed index is each feature's
# original column position in X.
# NOTE(review): the message says "top K" but a fixed count of 10 is shown.
top_features = feature_scores_df['Feature'].iloc[:10]
print ("The top K features are: ", top_features)

The top K features are:  20               Fwd IAT Std
13     Bwd Packet Length Std
12    Bwd Packet Length Mean
50      Avg Bwd Segment Size
10     Bwd Packet Length Max
67                  Idle Min
64                 Idle Mean
37         Packet Length Std
66                  Idle Max
16              Flow IAT Max
Name: Feature, dtype: object
