In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed IDS dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("../dataset/IDS_Dataset_processed.csv")

In [3]:
# Column-level cleaning. Each step rebinds `data` to a new, narrower frame.

# The label must never be removed by the sparsity filter in step (a): a boolean
# label counts False as 0, so on a dataset with fewer than 10% positive rows it
# would look like a ">90% zeros" column and be discarded, and the target check
# further down would then raise ValueError.
protected_columns = ['Label__Malicious']

# a. Drop columns with >90% zeros (mean of the boolean mask == fraction of zeros)
zero_ratio = (data == 0).mean()
columns_to_drop_zero = [
    col for col in zero_ratio[zero_ratio > 0.9].index
    if col not in protected_columns
]
print("Dropping columns with >90% zeros:", columns_to_drop_zero)
data = data.drop(columns=columns_to_drop_zero)

# b. Drop columns with any null values
data = data.dropna(axis=1)

# c. Drop columns that are constant (only one unique value)
data = data.loc[:, data.nunique() > 1]

# d. Drop obviously irrelevant features (e.g., source/destination IPs/ports)
columns_to_drop_irrelevant = [
    'SrcIP', 'DstIP', 'SrcPort', 'DstPort',
    'sVid_', 'dVid_', 'SrcTCPBase_', 'DstTCPBase_', 'Attack Type_'
]
# errors='ignore' already skips names that are absent (e.g. removed in steps a-c),
# so no manual membership filter is needed.
data = data.drop(columns=columns_to_drop_irrelevant, errors='ignore')

Dropping columns with >90% zeros: ['Loss', 'SrcLoss', 'DstLoss', 'pLoss', 'sTos_', 'dTos_', 'Proto_icmp', 'State_ECO', 'State_FIN', 'State_RST']


In [4]:
# Inspect the cleaned frame.
# NOTE(review): `data.head()` would give a shorter, richer display than printing the frame.
print(data)

         Seq       Dur  sHops  dHops  TotPkts  SrcPkts  DstPkts  TotBytes  \
0          1  0.000000    6.0   99.0        1        1        0        98   
1          2  0.000000    6.0   99.0        1        1        0        98   
2          3  4.998020   11.0    0.0      214      196       18    249093   
3          4  4.998037   11.0    0.0      184      163       21    221712   
4          5  4.999453   11.0    0.0      223      204       19    280216   
...      ...       ...    ...    ...      ...      ...      ...       ...   
1215885    1  0.000000    4.0    1.0        2        1        1       200   
1215886    3  0.235607    1.0    6.0        6        3        3      3056   
1215887  764  0.099927    0.0    0.0        3        2        1       252   
1215888    3  1.307852    1.0    6.0        6        3        3       596   
1215889    1  0.476803    4.0    1.0        4        2        2       392   

         SrcBytes  DstBytes  ...  dTtl_  SrcWin_  DstWin_  Proto_tcp  \
0  

In [5]:
# List the columns that survived cleaning.
print(data.columns)

Index(['Seq', 'Dur', 'sHops', 'dHops', 'TotPkts', 'SrcPkts', 'DstPkts',
       'TotBytes', 'SrcBytes', 'DstBytes', 'Offset', 'sMeanPktSz',
       'dMeanPktSz', 'Load', 'SrcLoad', 'DstLoad', 'Rate', 'SrcRate',
       'DstRate', 'TcpRtt', 'SynAck', 'AckDat', 'sTtl_', 'dTtl_', 'SrcWin_',
       'DstWin_', 'Proto_tcp', 'Proto_udp', 'Cause_Status', 'State_CON',
       'State_INT', 'State_REQ', 'Label__Malicious'],
      dtype='object')


In [6]:
# Label-encode every object-dtype column in place. One encoder per column is kept
# in `encoders` so the integer codes can be mapped back to the original values.
categorical_cols = list(data.select_dtypes(include=['object']).columns)
encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Split the frame into the label series `y` and the feature frame `X`.
# Fail loudly if the label column did not survive the earlier cleaning.
target_col = "Label__Malicious"
if target_col not in data:
    raise ValueError(f"Target column '{target_col}' not found in the dataset!")

y = data[target_col]
X = data.drop(columns=target_col)

In [7]:
# Rank every feature by its Pearson correlation with the label.
# NOTE(review): the correlations are computed on the full dataset, before the
# train/validation/test split performed further down, so held-out rows influence
# which features are kept.
pearson_corr = X.corrwith(y)
print("\nPearson Correlation with Target:")
print(pearson_corr.sort_values(ascending=False))

# Keep only the features whose |correlation| strictly exceeds the cut-off;
# features with an undefined (NaN) correlation are dropped as well.
threshold = 0.1
is_strong = pearson_corr.abs().gt(threshold)
selected_features = pearson_corr.loc[is_strong].index.tolist()
print(f"\nSelected Features based on Pearson correlation (|corr| > {threshold}):")
print(selected_features)
X = X.loc[:, selected_features]


Pearson Correlation with Target:
Proto_tcp       0.378910
AckDat          0.293770
TcpRtt          0.195211
Dur             0.162825
dTtl_           0.138945
dMeanPktSz      0.108514
Cause_Status    0.105866
State_CON       0.056073
State_REQ       0.025951
SynAck          0.023518
DstLoad         0.005740
Load            0.005732
SrcRate         0.005529
DstRate         0.005333
Rate            0.004231
SrcLoad         0.003917
DstWin_         0.001312
DstPkts        -0.032770
DstBytes       -0.034669
SrcWin_        -0.089969
TotPkts        -0.099134
SrcPkts        -0.111448
TotBytes       -0.117561
SrcBytes       -0.121247
sHops          -0.162490
sMeanPktSz     -0.175133
State_INT      -0.196258
dHops          -0.233123
Proto_udp      -0.289052
sTtl_          -0.419852
Offset         -0.456065
Seq            -0.527940
dtype: float64

Selected Features based on Pearson correlation (|corr| > 0.1):
['Seq', 'Dur', 'sHops', 'dHops', 'SrcPkts', 'TotBytes', 'SrcBytes', 'Offset', 'sMeanPkt

In [8]:
# Optional cross-check: chi-square score of each retained feature against the label.
# The chi-square test requires non-negative feature values, so X must not contain negatives.
# k='all' scores every column without discarding any.
chi2_selector = SelectKBest(score_func=chi2, k='all').fit(X, y)
chi2_scores, chi2_pvalues = chi2_selector.scores_, chi2_selector.pvalues_

# One row per feature, highest score first.
chi2_results = pd.DataFrame(
    {'Feature': X.columns, 'Chi2 Score': chi2_scores, 'p-value': chi2_pvalues}
)
chi2_results = chi2_results.sort_values(by='Chi2 Score', ascending=False)
print("\nChi-square Test Results:")
print(chi2_results)


Chi-square Test Results:
         Feature    Chi2 Score  p-value
7         Offset  2.339150e+12      0.0
0            Seq  1.198481e+10      0.0
6       SrcBytes  4.224510e+09      0.0
5       TotBytes  4.197740e+09      0.0
9     dMeanPktSz  1.070118e+07      0.0
8     sMeanPktSz  1.069171e+07      0.0
12         sTtl_  7.949852e+06      0.0
13         dTtl_  1.433668e+06      0.0
4        SrcPkts  1.383413e+06      0.0
3          dHops  1.305621e+06      0.0
2          sHops  2.024878e+05      0.0
14     Proto_tcp  1.345595e+05      0.0
1            Dur  6.756040e+04      0.0
17     State_INT  3.417022e+04      0.0
15     Proto_udp  2.613902e+04      0.0
16  Cause_Status  5.460666e+03      0.0
10        TcpRtt  3.013776e+03      0.0
11        AckDat  2.849205e+03      0.0


In [9]:
import joblib

In [10]:
# Split into Train (60%), Validation (20%), and Test (20%) sets, then scale.
# The split comes first so the scaler's mean/variance are learned from training
# rows only; fitting on the full data would leak validation/test statistics
# into preprocessing.

# First, split into 60% train and 40% temporary (stratified on the label)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
# Then, split the temporary set equally into validation and test (20% each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# X_train / X_val / X_test stay as unscaled DataFrames because later cells print
# and export X_test with its raw values. The standardized copies get their own names.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Kept so that any code still referring to `X_scaled` keeps working; it is now the
# full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

In [11]:
# Inspect the test-split features (unscaled values, row labels carried over from the split).
print(X_test)

           Seq       Dur  sHops  dHops  SrcPkts  TotBytes  SrcBytes    Offset  \
71792    18844  0.000000    1.0   99.0        1        42        42   1844360   
1000203  10308  0.000000    1.0   99.0        1        42        42  18947356   
844730   17916  2.589051    1.0   99.0        2        84        84   5842776   
1167840  44762  0.000000    7.0   99.0        1        74        74  17006620   
805712   28343  0.000000    1.0   99.0        1        42        42   2552292   
...        ...       ...    ...    ...      ...       ...       ...       ...   
575525    1939  0.000000    1.0    5.0        0        66         0   2281636   
1151237  34295  0.000000    1.0    5.0        1       132        66  14117200   
629888   22638  0.000000    1.0    5.0        1       132        66  13359684   
871497   23583  2.577277    1.0   99.0        2        84        84   8099956   
419810   98294  0.000000    1.0   99.0        1        42        42  31311072   

         sMeanPktSz  dMeanP

In [12]:
# Inspect the test-split labels.
print(y_test)

71792       True
1000203     True
844730      True
1167840    False
805712      True
           ...  
575525      True
1151237     True
629888      True
871497      True
419810     False
Name: Label__Malicious, Length: 243178, dtype: bool


In [13]:
# Sanity check: the test labels and test features should report the same row count.
for split_part in (y_test, X_test):
    print(len(split_part))

243178
243178


In [14]:
# Repeats the earlier print(X_test) cell; nothing has modified X_test in between.
print(X_test)

           Seq       Dur  sHops  dHops  SrcPkts  TotBytes  SrcBytes    Offset  \
71792    18844  0.000000    1.0   99.0        1        42        42   1844360   
1000203  10308  0.000000    1.0   99.0        1        42        42  18947356   
844730   17916  2.589051    1.0   99.0        2        84        84   5842776   
1167840  44762  0.000000    7.0   99.0        1        74        74  17006620   
805712   28343  0.000000    1.0   99.0        1        42        42   2552292   
...        ...       ...    ...    ...      ...       ...       ...       ...   
575525    1939  0.000000    1.0    5.0        0        66         0   2281636   
1151237  34295  0.000000    1.0    5.0        1       132        66  14117200   
629888   22638  0.000000    1.0    5.0        1       132        66  13359684   
871497   23583  2.577277    1.0   99.0        2        84        84   8099956   
419810   98294  0.000000    1.0   99.0        1        42        42  31311072   

         sMeanPktSz  dMeanP

In [15]:
# Alias only: `df` refers to the same object as X_test, not a copy.
df= X_test

In [16]:
# `df` is X_test, so this shows the same frame once more.
print(df)

           Seq       Dur  sHops  dHops  SrcPkts  TotBytes  SrcBytes    Offset  \
71792    18844  0.000000    1.0   99.0        1        42        42   1844360   
1000203  10308  0.000000    1.0   99.0        1        42        42  18947356   
844730   17916  2.589051    1.0   99.0        2        84        84   5842776   
1167840  44762  0.000000    7.0   99.0        1        74        74  17006620   
805712   28343  0.000000    1.0   99.0        1        42        42   2552292   
...        ...       ...    ...    ...      ...       ...       ...       ...   
575525    1939  0.000000    1.0    5.0        0        66         0   2281636   
1151237  34295  0.000000    1.0    5.0        1       132        66  14117200   
629888   22638  0.000000    1.0    5.0        1       132        66  13359684   
871497   23583  2.577277    1.0   99.0        2        84        84   8099956   
419810   98294  0.000000    1.0   99.0        1        42        42  31311072   

         sMeanPktSz  dMeanP

In [17]:
# Build the export frame: reset_index(drop=True) returns a new frame with a fresh
# 0..n-1 index and discards the shuffled row labels carried over from the split.
test_data = df.reset_index(drop=True)
print(test_data)

          Seq       Dur  sHops  dHops  SrcPkts  TotBytes  SrcBytes    Offset  \
0       18844  0.000000    1.0   99.0        1        42        42   1844360   
1       10308  0.000000    1.0   99.0        1        42        42  18947356   
2       17916  2.589051    1.0   99.0        2        84        84   5842776   
3       44762  0.000000    7.0   99.0        1        74        74  17006620   
4       28343  0.000000    1.0   99.0        1        42        42   2552292   
...       ...       ...    ...    ...      ...       ...       ...       ...   
243173   1939  0.000000    1.0    5.0        0        66         0   2281636   
243174  34295  0.000000    1.0    5.0        1       132        66  14117200   
243175  22638  0.000000    1.0    5.0        1       132        66  13359684   
243176  23583  2.577277    1.0   99.0        2        84        84   8099956   
243177  98294  0.000000    1.0   99.0        1        42        42  31311072   

        sMeanPktSz  dMeanPktSz    TcpRt

In [18]:
# Attach the test labels as a "label" column. The labels are re-indexed to 0..n-1
# first so they line up with test_data's own reset index.
test_labels = pd.Series(y_test).reset_index(drop=True)
test_data["label"] = test_labels

In [19]:
# Confirm the label column was attached.
print(test_data["label"])

0          True
1          True
2          True
3         False
4          True
          ...  
243173     True
243174     True
243175     True
243176     True
243177    False
Name: label, Length: 243178, dtype: bool


In [20]:
# Persist the labelled test split. With to_csv's default index=True the row index
# is written as an extra, unnamed first column.
# NOTE(review): pass index=False if downstream readers do not expect that column.
test_data.to_csv("../dataset/TestData2.csv")