In [1]:
import numpy as np
import pandas as pd
import os
import torch
from utils import CustomDataset
from torch.utils.data import Dataset, DataLoader

# Read datasets (2017: silo0, silo1; 2018: silo2, silo3)

In [2]:
pd0 = pd.read_csv("datasets/nids/silo0.csv")

In [3]:
pd1 = pd.read_csv("datasets/nids/silo1.csv")

In [4]:
pd2 = pd.read_csv("datasets/nids/silo2.csv")

In [5]:
pd3 = pd.read_csv("datasets/nids/silo3.csv")

In [6]:
extra_benign = pd.read_csv("datasets/nids/extra_benign.csv").dropna()

In [7]:
extra_benign.drop("Unnamed: 0", axis=1, inplace=True)

In [8]:
print(pd0.shape)
print(pd1.shape)
print(pd2.shape)
print(pd3.shape)
print(extra_benign.shape)

(378309, 1526)
(156972, 1526)
(1003038, 1526)
(935209, 1526)
(134472, 1526)


In [9]:
print(pd0["Label"].value_counts())
print(pd1["Label"].value_counts())
print(pd2["Label"].value_counts())
print(pd3["Label"].value_counts())
print(extra_benign["Label"].value_counts())

Benign       144865
Port Scan    113841
DoS           99442
DDoS          20161
Name: Label, dtype: int64
Benign          66240
Brute Force     60910
Infiltration    20917
Web Attack       8905
Name: Label, dtype: int64
Benign         494256
Brute Force    270701
DoS            238081
Name: Label, dtype: int64
Benign          463634
ddos            382379
Infiltration     60101
Web Attack       29095
Name: Label, dtype: int64
Benign    134472
Name: Label, dtype: int64


### Get samples equally for all silos

In [10]:
def get_subset_by_ratio(df, label_column, ratio):
    # Group the DataFrame by the label column
    grouped = df.groupby(label_column)

    # Initialize an empty DataFrame to store the subset
    subset = pd.DataFrame()

    # Iterate over the groups
    for label, group in grouped:
        # Determine the number of samples to keep for this class
        num_samples = int(len(group) * ratio)
        
        # Sample the required number of samples from this class
        sampled_group = group.sample(n=num_samples, random_state=42)
        
        # Append the sampled group to the subset DataFrame
        subset = subset.append(sampled_group)

    # Reset the index of the subset DataFrame
    subset = subset.reset_index(drop=True)
    
    return subset


In [11]:
pd0_sub = get_subset_by_ratio(pd0, "Label", 0.46)
pd0_sub["Label"].value_counts()

  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)


Benign       66637
Port Scan    52366
DoS          45743
DDoS          9274
Name: Label, dtype: int64

In [12]:
pd1_sub = get_subset_by_ratio(pd1, "Label", 1)
pd1_sub["Label"].value_counts()

  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)


Benign          66240
Brute Force     60910
Infiltration    20917
Web Attack       8905
Name: Label, dtype: int64

In [13]:
pd2_sub = get_subset_by_ratio(pd2, "Label", 0.135)
pd2_sub["Label"].value_counts()

  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)


Benign         66724
Brute Force    36544
DoS            32140
Name: Label, dtype: int64

In [14]:
pd3_sub = get_subset_by_ratio(pd3, "Label", 0.144)
pd3_sub["Label"].value_counts()

  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)
  subset = subset.append(sampled_group)


Benign          66763
ddos            55062
Infiltration     8654
Web Attack       4189
Name: Label, dtype: int64

In [15]:
pd0_sub['label_bin'] = pd0_sub['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd0_sub.drop('Label', axis=1, inplace=True)
pd1_sub['label_bin'] = pd1_sub['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd1_sub.drop('Label', axis=1, inplace=True)
pd2_sub['label_bin'] = pd2_sub['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd2_sub.drop('Label', axis=1, inplace=True)
pd3_sub['label_bin'] = pd3_sub['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd3_sub.drop('Label', axis=1, inplace=True)

In [16]:
print(pd0_sub["label_bin"].value_counts())
print(pd1_sub["label_bin"].value_counts())
print(pd2_sub["label_bin"].value_counts())
print(pd3_sub["label_bin"].value_counts())

1    107383
0     66637
Name: label_bin, dtype: int64
1    90732
0    66240
Name: label_bin, dtype: int64
1    68684
0    66724
Name: label_bin, dtype: int64
1    67905
0    66763
Name: label_bin, dtype: int64


In [17]:
pd0_sub_attack = pd0_sub[pd0_sub['label_bin'] == 1]
pd1_sub_attack = pd1_sub[pd1_sub['label_bin'] == 1]
pd2_sub_attack = pd2_sub[pd2_sub['label_bin'] == 1]
pd3_sub_attack = pd3_sub[pd3_sub['label_bin'] == 1]

In [18]:
print(pd0_sub_attack["label_bin"].value_counts())
print(pd1_sub_attack["label_bin"].value_counts())
print(pd2_sub_attack["label_bin"].value_counts())
print(pd3_sub_attack["label_bin"].value_counts())

1    107383
Name: label_bin, dtype: int64
1    90732
Name: label_bin, dtype: int64
1    68684
Name: label_bin, dtype: int64
1    67905
Name: label_bin, dtype: int64


# Label Encoding and make 2017 benign for all silos

In [19]:
pd00 = pd0.copy()
pd11 = pd1.copy()
pd22 = pd2.copy()
pd33 = pd3.copy()

In [20]:
pd00['label_bin'] = pd0['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd00.drop('Label', axis=1, inplace=True)
pd11['label_bin'] = pd1['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd11.drop('Label', axis=1, inplace=True)
pd22['label_bin'] = pd2['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd22.drop('Label', axis=1, inplace=True)
pd33['label_bin'] = pd3['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
pd33.drop('Label', axis=1, inplace=True)

In [21]:

extra_benign['label_bin'] = extra_benign['Label'].apply(lambda x: 0 if x == 'Benign' else 1)
extra_benign.drop('Label', axis=1, inplace=True)

In [22]:
print(pd00["label_bin"].value_counts())
print(pd11["label_bin"].value_counts())
print(pd22["label_bin"].value_counts())
print(pd33["label_bin"].value_counts())
print(extra_benign["label_bin"].value_counts())

1    233444
0    144865
Name: label_bin, dtype: int64
1    90732
0    66240
Name: label_bin, dtype: int64
1    508782
0    494256
Name: label_bin, dtype: int64
1    471575
0    463634
Name: label_bin, dtype: int64
0    134472
Name: label_bin, dtype: int64


### Get all benign data from silo0, silo1 and combine with extra_benign, then randomly distributed for 4 silos

In [23]:
pd00_benign = pd00[pd00['label_bin'] == 0]

In [24]:
pd11_benign = pd11[pd11['label_bin'] == 0]

In [25]:
print(pd00_benign.shape)
print(pd11_benign.shape)

(144865, 1526)
(66240, 1526)


In [26]:
combined_benign = pd.concat([pd00_benign, pd11_benign, extra_benign], ignore_index=True)

In [27]:
combined_benign = combined_benign.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [28]:
combined_benign.shape

(345577, 1526)

In [29]:
# combined_benign.to_csv('datasets/nids/combined_benign.csv', index=False)

# Make silos dataset

In [30]:
# combined_benign = pd.read_csv("datasets/nids/extra_benign.csv").dropna()

In [31]:
sample_size = 68000
# pd0_sub_benign = combined_benign[:sample_size]
# pd1_sub_benign = combined_benign[sample_size:sample_size*2]
# pd2_sub_benign = combined_benign[sample_size*2:sample_size*3]
# pd3_sub_benign = combined_benign[sample_size*3:sample_size*4]

shuffled_pd = combined_benign.sample(frac=1, random_state=42)  # Shuffle the DataFrame

# Step 2: Divide the DataFrame into three non-overlapping subsets
total_rows = len(shuffled_pd)
subset_size = total_rows // 4

subset0 = shuffled_pd.iloc[:subset_size]
subset1 = shuffled_pd.iloc[subset_size:2 * subset_size]
subset2 = shuffled_pd.iloc[2 * subset_size:3 * subset_size]
subset3 = shuffled_pd.iloc[3 * subset_size:]

pd0_sub_benign = subset0.sample(n=sample_size, random_state=42).reset_index(drop=True)
pd1_sub_benign = subset1.sample(n=sample_size, random_state=42).reset_index(drop=True)
pd2_sub_benign = subset2.sample(n=sample_size, random_state=42).reset_index(drop=True)
pd3_sub_benign = subset3.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [32]:
pd3_sub_benign.shape

(68000, 1526)

In [33]:
silo0 = pd.concat([pd0_sub_benign, pd0_sub_attack], ignore_index=True)
silo0.shape

(175383, 1526)

In [34]:
silo0["label_bin"].value_counts()

1    107383
0     68000
Name: label_bin, dtype: int64

In [35]:
# silo0 = silo0.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [36]:
silo0.head(20)

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.156863,0.45098,0.580392,0.25098,0.0,0.501961,0.666667,0.909804,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.156863,0.454902,0.545098,0.25098,0.0,0.501961,0.662745,0.945098,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.156863,0.411765,0.666667,0.25098,0.0,0.501961,0.705882,0.823529,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.156863,0.223529,0.976471,0.25098,0.0,0.501961,0.894118,0.509804,0.643137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.156863,0.337255,0.070588,0.25098,0.0,0.501961,0.784314,0.415686,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0.156863,0.027451,0.501961,0.25098,0.0,0.501961,0.086275,0.992157,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.203922,0.4,0.862745,0.25098,0.0,0.501961,0.690196,0.341176,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.156863,0.329412,0.458824,0.25098,0.0,0.501961,0.792157,0.027451,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.0,0.156863,0.058824,0.160784,0.25098,0.0,0.501961,0.058824,0.329412,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,0.156863,0.235294,0.545098,0.25098,0.0,0.501961,0.882353,0.941176,0.643137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [37]:
silo1 = pd.concat([pd1_sub_benign, pd1_sub_attack], ignore_index=True)
# silo1 = silo1.sample(frac=1.0, random_state=42).reset_index(drop=True)
silo2 = pd.concat([pd2_sub_benign, pd2_sub_attack], ignore_index=True)
# silo2 = silo2.sample(frac=1.0, random_state=42).reset_index(drop=True)
silo3 = pd.concat([pd3_sub_benign, pd3_sub_attack], ignore_index=True)
# silo3 = silo3.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [38]:
print(silo1["label_bin"].value_counts())
print(silo2["label_bin"].value_counts())
print(silo3["label_bin"].value_counts())

1    90732
0    68000
Name: label_bin, dtype: int64
1    68684
0    68000
Name: label_bin, dtype: int64
0    68000
1    67905
Name: label_bin, dtype: int64


In [52]:
from sklearn.model_selection import train_test_split
silo0_train, silo0_test = train_test_split(silo0, test_size=0.2, random_state=41)
silo1_train, silo1_test = train_test_split(silo1, test_size=0.2, random_state=42)
silo2_train, silo2_test = train_test_split(silo2, test_size=0.2, random_state=43)
silo3_train, silo3_test = train_test_split(silo3, test_size=0.2, random_state=44)

In [48]:
silo0_train.to_csv('client_data/nids/0/0_train_new.csv', index=False)
silo1_train.to_csv('client_data/nids/1/1_train_new.csv', index=False)
silo2_train.to_csv('client_data/nids/2/2_train_new.csv', index=False)
silo3_train.to_csv('client_data/nids/3/3_train_new.csv', index=False)


In [53]:
silo0_test.to_csv('client_data/nids/0/0_test_new.csv', index=False)
silo1_test.to_csv('client_data/nids/1/1_test_new.csv', index=False)
silo2_test.to_csv('client_data/nids/2/2_test_new.csv', index=False)
silo3_test.to_csv('client_data/nids/3/3_test_new.csv', index=False)


In [54]:
test_central_new = pd.concat([silo0_test, silo1_test, silo2_test, silo3_test], ignore_index=True)
# test_central_new = test_central_new.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [55]:
test_central_new["label_bin"].value_counts()

1    67190
0    54308
Name: label_bin, dtype: int64

In [56]:
test_central_new.to_csv('datasets/nids/test/test_central_new.csv', index=False)

In [None]:
test_central_new = CustomDataset('datasets/nids/test/test_central_new.csv')

In [46]:
torch.save(test_central_new,'./datasets/nids/test/test_central_new.pkl')

In [None]:
# train_central_new = pd.concat([silo0_train, silo0_train, silo0_train, silo0_train], ignore_index=True)
# train_central_new = train_central_new.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [None]:
# train_central_new.to_csv('datasets/nids/train/train_central_new.csv', index=False)

In [None]:
# train_central_new = CustomDataset('datasets/nids/train/train_central_new.csv')

In [1]:
# torch.save(train_central_new,'./datasets/nids/train/train_central_new.pkl')

### Adv samples

In [12]:
adv_truerandom = pd.read_excel("datasets/nids/adv/Adv_all_DNN_truerandom.xlsx").drop("Unnamed: 0", axis=1)

In [13]:
adv_truerandom["label_bin"] = 1
adv_truerandom

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.007843,0.686275,0.219608,0.941176,0.25098,0.0,0.501961,0.505882,0.690196,0.196078,...,0,0,0,0,0,0,0,0,0,1
1,0.000000,0.235294,0.682353,0.352941,0.25098,0.0,0.243137,0.090196,0.462745,0.039216,...,0,0,0,0,0,0,0,0,0,1
2,0.007843,0.109804,0.407843,0.898039,0.25098,0.0,0.501961,0.321569,0.305882,0.192157,...,0,0,0,0,0,0,0,0,0,1
3,0.011765,0.086275,0.145098,0.552941,0.25098,0.0,0.501961,0.580392,0.674510,0.196078,...,0,0,0,0,0,0,0,0,0,1
4,0.003922,0.701961,0.894118,0.317647,0.25098,0.0,0.243137,0.878431,0.027451,0.192157,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9402,0.015686,0.058824,0.498039,0.545098,0.25098,0.0,0.501961,0.223529,0.709804,0.196078,...,0,0,0,0,0,0,0,0,0,1
9403,0.003922,0.623529,0.427451,0.545098,0.25098,0.0,0.501961,0.305882,0.145098,0.196078,...,0,0,0,0,0,0,0,0,0,1
9404,0.000000,0.203922,0.709804,0.580392,0.25098,0.0,0.243137,0.062745,0.266667,0.400000,...,0,0,0,0,0,0,0,0,0,1
9405,0.007843,0.490196,0.239216,0.043137,0.25098,0.0,0.243137,0.525490,0.517647,0.200000,...,0,0,0,0,0,0,0,0,0,1


In [14]:
adv_truerandom.iloc[0:1800].to_csv('datasets/nids/adv/20_adv_truerandom_test.csv', index=False)

In [15]:
adv_truerandom.iloc[1800:].to_csv('datasets/nids/adv/80_adv_truerandom_test.csv', index=False)

In [16]:
adv_incremental = pd.read_excel("datasets/nids/adv/Adv_all_DNN_incremental.xlsx").drop("Unnamed: 0", axis=1)

In [18]:
adv_incremental["label_bin"] = 1
adv_incremental

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.007843,0.686275,0.219608,0.941176,0.25098,0.0,0.501961,0.505882,0.690196,0.196078,...,0,0,0,0,0,0,0,0,0,1
1,0.000000,0.235294,0.113725,0.478431,0.25098,0.0,0.243137,0.658824,0.337255,0.392157,...,0,0,0,0,0,0,0,0,0,1
2,0.000000,0.235294,0.682353,0.352941,0.25098,0.0,0.243137,0.090196,0.462745,0.039216,...,0,0,0,0,0,0,0,0,0,1
3,0.007843,0.109804,0.407843,0.898039,0.25098,0.0,0.501961,0.321569,0.305882,0.192157,...,0,0,0,0,0,0,0,0,0,1
4,0.011765,0.086275,0.145098,0.552941,0.25098,0.0,0.501961,0.580392,0.674510,0.196078,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5180,0.007843,0.490196,0.658824,0.286275,0.25098,0.0,0.243137,0.105882,0.274510,0.172549,...,0,0,0,0,0,0,0,0,0,1
5181,0.007843,0.490196,0.866667,0.670588,0.25098,0.0,0.243137,0.898039,0.890196,0.105882,...,0,0,0,0,0,0,0,0,0,1
5182,0.003922,0.701961,0.380392,0.160784,0.25098,0.0,0.243137,0.388235,0.188235,0.611765,...,0,0,0,0,0,0,0,0,0,1
5183,0.007843,0.682353,0.035294,0.796078,0.25098,0.0,0.501961,0.690196,0.839216,0.929412,...,0,0,0,0,0,0,0,0,0,1


In [19]:
adv_incremental.iloc[0:1000].to_csv('datasets/nids/adv/20_adv_incremental_test.csv', index=False)

In [20]:
adv_incremental.iloc[1000:].to_csv('datasets/nids/adv/80_adv_incremental_test.csv', index=False)

In [21]:
adv_all = pd.read_csv("datasets/nids/adv/adv_examples_all_allDNN_bigger.csv")

In [22]:
adv_all["label_bin"] = 1

In [23]:
adv_all

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.156863,0.482353,0.580392,0.25098,0.0,0.525490,0.007843,0.313725,0.952941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.235294,0.180392,0.184314,0.25098,0.0,0.282353,0.552941,0.631373,0.607843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.156863,0.207843,0.917647,0.25098,0.0,0.494118,0.309804,0.980392,0.780392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.156863,0.027451,0.850980,0.25098,0.0,0.517647,0.470588,0.043137,0.780392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.235294,0.254902,0.937255,0.25098,0.0,0.282353,0.474510,0.882353,0.917647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11557,0.0,0.156863,0.384314,0.752941,0.25098,0.0,0.494118,0.137255,0.141176,0.792157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
11558,0.0,0.235294,0.031373,0.541176,0.25098,0.0,0.274510,0.709804,0.274510,0.235294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
11559,0.0,0.235294,0.262745,0.670588,0.25098,0.0,0.290196,0.462745,0.145098,0.415686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
11560,0.0,0.203922,0.431373,0.411765,0.25098,0.0,0.541176,0.043137,0.435294,0.564706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [24]:
adv_all_20_sub = adv_all.sample(n=4000, random_state=42).reset_index(drop=True)

In [25]:
adv_all_20_sub = adv_all_20_sub.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
adv_all_20_sub

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.156863,0.407843,0.329412,0.25098,0.0,0.549020,0.058824,0.564706,0.254902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.156863,0.333333,0.819608,0.25098,0.0,0.517647,0.164706,0.074510,0.117647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.156863,0.462745,0.133333,0.25098,0.0,0.556863,1.000000,0.756863,0.721569,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.172549,0.090196,0.419608,0.00000,0.0,0.278431,0.898039,0.458824,0.611765,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.156863,0.243137,0.827451,0.25098,0.0,0.517647,0.254902,0.066667,0.533333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.235294,0.317647,0.282353,0.25098,0.0,0.305882,0.392157,0.533333,0.913725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3996,0.0,0.172549,0.603922,0.129412,0.00000,0.0,0.396078,0.266667,0.749020,0.490196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3997,0.0,0.235294,0.250980,0.262745,0.25098,0.0,0.258824,0.505882,0.552941,0.470588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3998,0.0,0.313725,0.294118,0.556863,0.25098,0.0,0.494118,0.227451,0.180392,0.784314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [27]:
adv_all_20_sub.to_csv('datasets/nids/adv/20_adv_all_test.csv', index=False)

In [28]:
adv_all_80_sub = adv_all.sample(n=7562, random_state=42).reset_index(drop=True)

In [29]:
adv_all_80_sub = adv_all_80_sub.sample(frac=1, random_state=42).reset_index(drop=True)

In [30]:
adv_all_80_sub.to_csv('datasets/nids/adv/80_adv_all_test.csv', index=False)

### Add more 80% adv samples into silo3 for training

In [31]:
silo3_train = pd.read_csv('client_data/nids/3/3_train_new.csv')
silo3_train

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.000000,0.235294,0.062745,0.588235,0.25098,0.0,0.243137,0.709804,0.227451,0.823529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.003922,0.584314,0.180392,0.709804,0.25098,0.0,0.501961,0.933333,0.352941,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.000000,0.156863,0.121569,0.647059,0.25098,0.0,0.501961,0.996078,0.843137,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.000000,0.172549,0.717647,0.945098,0.00000,0.0,0.215686,0.329412,0.937255,0.556863,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.000000,0.298039,0.450980,0.470588,0.25098,0.0,0.501961,0.666667,0.878431,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147863,0.000000,0.235294,0.380392,0.470588,0.25098,0.0,0.243137,0.392157,0.345098,0.380392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
147864,0.000000,0.203922,0.011765,0.600000,0.25098,0.0,0.243137,0.760784,0.247059,0.176471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
147865,0.000000,0.172549,0.694118,0.670588,0.00000,0.0,0.200000,0.372549,0.207843,0.611765,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
147866,0.000000,0.274510,0.486275,0.654902,0.12549,0.0,0.231373,0.423529,0.121569,0.462745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [32]:
adv_all_80_train = pd.read_csv('datasets/nids/adv/80_adv_all_test.csv')
adv_all_80_train

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.203922,0.109804,0.133333,0.25098,0.0,0.494118,0.411765,0.713725,0.262745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.313725,0.086275,0.627451,0.25098,0.0,0.494118,0.435294,0.109804,0.870588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.274510,0.192157,0.509804,0.12549,0.0,0.482353,0.466667,0.266667,0.117647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.172549,0.482353,0.329412,0.00000,0.0,0.298039,0.486275,0.549020,0.580392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.156863,0.262745,0.694118,0.25098,0.0,0.517647,0.235294,0.200000,0.494118,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7557,0.0,0.172549,0.588235,0.866667,0.00000,0.0,0.345098,0.333333,0.011765,0.796078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7558,0.0,0.156863,0.035294,0.172549,0.25098,0.0,0.494118,0.486275,0.721569,0.525490,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7559,0.0,0.156863,0.047059,0.490196,0.25098,0.0,0.494118,0.474510,0.403922,0.450980,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7560,0.0,0.235294,0.250980,0.262745,0.25098,0.0,0.258824,0.505882,0.552941,0.470588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [33]:
silo3_train_adv = pd.concat([silo3_train, adv_all_80_train], ignore_index=True)

In [34]:
silo3_train_adv = silo3_train_adv.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [61]:
silo3_train_adv

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.000000,0.235294,0.980392,0.290196,0.25098,0.0,0.243137,0.796078,0.521569,0.643137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.000000,0.235294,0.168627,0.439216,0.25098,0.0,0.243137,0.603922,0.376471,0.717647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.000000,0.203922,0.454902,0.458824,0.25098,0.0,0.243137,0.317647,0.384314,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.000000,0.172549,0.168627,0.784314,0.00000,0.0,0.145098,0.952941,0.094118,0.694118,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.000000,0.156863,0.200000,0.894118,0.25098,0.0,0.501961,0.917647,0.596078,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155425,0.000000,0.235294,0.227451,0.945098,0.25098,0.0,0.243137,0.541176,0.874510,0.290196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
155426,0.003922,0.137255,0.501961,0.949020,0.25098,0.0,0.243137,0.262745,0.968627,0.090196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
155427,0.000000,0.203922,0.505882,0.043137,0.25098,0.0,0.250980,0.141176,0.200000,0.231373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
155428,0.000000,0.156863,0.286275,0.145098,0.25098,0.0,0.494118,0.235294,0.749020,0.180392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
silo3_train_adv.to_csv('client_data/nids/3/3_train_new_more_80_adv.csv', index=False)

In [63]:
silo0_train = pd.read_csv('client_data/nids/0/0_train_new.csv')
silo0_train_adv = pd.concat([silo0_train, adv_all_80_train], ignore_index=True)
silo0_train_adv = silo0_train_adv.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [64]:
silo0_train_adv.to_csv('client_data/nids/0/0_train_new_more_80_adv.csv', index=False)

In [65]:
silo1_train = pd.read_csv('client_data/nids/1/1_train_new.csv')
silo1_train_adv = pd.concat([silo1_train, adv_all_80_train], ignore_index=True)
silo1_train_adv = silo1_train_adv.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [66]:
silo1_train_adv.to_csv('client_data/nids/1/1_train_new_more_80_adv.csv', index=False)

In [67]:
silo2_train = pd.read_csv('client_data/nids/2/2_train_new.csv')
silo2_train_adv = pd.concat([silo2_train, adv_all_80_train], ignore_index=True)
silo2_train_adv = silo2_train_adv.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [68]:
silo2_train_adv.to_csv('client_data/nids/2/2_train_new_more_80_adv.csv', index=False)

### add benign samples into adv data

In [42]:
combined_benign = pd.read_csv("datasets/nids/extra_benign.csv").dropna().drop("Unnamed: 0", axis=1)
combined_benign

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,Label
0,0.0,0.266667,0.098039,0.756863,0.25098,0.0,0.501961,0.015686,0.627451,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.0,0.203922,0.400000,0.462745,0.25098,0.0,0.501961,0.690196,0.741176,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,0.0,0.156863,0.294118,0.517647,0.25098,0.0,0.501961,0.823529,0.972549,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.0,0.156863,0.321569,0.254902,0.25098,0.0,0.501961,0.800000,0.231373,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.0,0.156863,0.035294,0.580392,0.25098,0.0,0.501961,0.078431,0.913725,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134467,0.0,0.156863,0.035294,0.764706,0.25098,0.0,0.501961,0.078431,0.729412,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
134468,0.0,0.156863,0.349020,0.929412,0.25098,0.0,0.501961,0.768627,0.560784,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
134469,0.0,0.156863,0.219608,0.274510,0.25098,0.0,0.501961,0.901961,0.211765,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
134470,0.0,0.156863,0.482353,0.133333,0.25098,0.0,0.501961,0.639216,0.352941,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [44]:
combined_benign['label_bin'] = combined_benign['Label'].apply(lambda x: 0 if x == 'Benign' else 1)

In [45]:
combined_benign

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,Label,label_bin
0,0.0,0.266667,0.098039,0.756863,0.25098,0.0,0.501961,0.015686,0.627451,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
1,0.0,0.203922,0.400000,0.462745,0.25098,0.0,0.501961,0.690196,0.741176,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
2,0.0,0.156863,0.294118,0.517647,0.25098,0.0,0.501961,0.823529,0.972549,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
3,0.0,0.156863,0.321569,0.254902,0.25098,0.0,0.501961,0.800000,0.231373,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
4,0.0,0.156863,0.035294,0.580392,0.25098,0.0,0.501961,0.078431,0.913725,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134467,0.0,0.156863,0.035294,0.764706,0.25098,0.0,0.501961,0.078431,0.729412,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
134468,0.0,0.156863,0.349020,0.929412,0.25098,0.0,0.501961,0.768627,0.560784,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
134469,0.0,0.156863,0.219608,0.274510,0.25098,0.0,0.501961,0.901961,0.211765,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0
134470,0.0,0.156863,0.482353,0.133333,0.25098,0.0,0.501961,0.639216,0.352941,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,0


In [46]:
combined_benign = combined_benign.drop('Label', axis=1)
combined_benign

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.266667,0.098039,0.756863,0.25098,0.0,0.501961,0.015686,0.627451,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.203922,0.400000,0.462745,0.25098,0.0,0.501961,0.690196,0.741176,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.156863,0.294118,0.517647,0.25098,0.0,0.501961,0.823529,0.972549,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.156863,0.321569,0.254902,0.25098,0.0,0.501961,0.800000,0.231373,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.156863,0.035294,0.580392,0.25098,0.0,0.501961,0.078431,0.913725,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134467,0.0,0.156863,0.035294,0.764706,0.25098,0.0,0.501961,0.078431,0.729412,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
134468,0.0,0.156863,0.349020,0.929412,0.25098,0.0,0.501961,0.768627,0.560784,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
134469,0.0,0.156863,0.219608,0.274510,0.25098,0.0,0.501961,0.901961,0.211765,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
134470,0.0,0.156863,0.482353,0.133333,0.25098,0.0,0.501961,0.639216,0.352941,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [50]:
adv_truerandom_benign_attack_20 = pd.concat([adv_truerandom.iloc[0:1800], combined_benign.iloc[0:1800]], ignore_index=True)

In [51]:
adv_truerandom_benign_attack_20

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.007843,0.686275,0.219608,0.941176,0.25098,0.0,0.501961,0.505882,0.690196,0.196078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.000000,0.235294,0.682353,0.352941,0.25098,0.0,0.243137,0.090196,0.462745,0.039216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.007843,0.109804,0.407843,0.898039,0.25098,0.0,0.501961,0.321569,0.305882,0.192157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.011765,0.086275,0.145098,0.552941,0.25098,0.0,0.501961,0.580392,0.674510,0.196078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.003922,0.701961,0.894118,0.317647,0.25098,0.0,0.243137,0.878431,0.027451,0.192157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,0.000000,0.156863,0.196078,0.207843,0.25098,0.0,0.501961,0.925490,0.278431,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3596,0.000000,0.156863,0.039216,0.498039,0.25098,0.0,0.501961,0.074510,0.996078,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3597,0.000000,0.156863,0.443137,0.262745,0.25098,0.0,0.501961,0.678431,0.223529,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3598,0.000000,0.203922,0.435294,0.725490,0.25098,0.0,0.501961,0.682353,0.717647,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [52]:
adv_truerandom_benign_attack_20 = adv_truerandom_benign_attack_20.sample(frac=1.0, random_state=42).reset_index(drop=True)
adv_truerandom_benign_attack_20

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.235294,0.368627,0.717647,0.25098,0.0,0.243137,0.389629,0.098039,0.921569,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.203922,0.815686,0.478431,0.25098,0.0,0.243137,0.960784,0.364706,0.952941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.156863,0.411765,0.105882,0.25098,0.0,0.501961,0.709804,0.380392,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.156863,0.219608,0.019608,0.25098,0.0,0.501961,0.901961,0.466667,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.203922,0.200000,0.564706,0.25098,0.0,0.243137,0.572549,0.282353,0.129412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,0.0,0.203922,0.831373,0.807843,0.25098,0.0,0.243137,0.945098,0.035294,0.352941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3596,0.0,0.203922,0.722886,0.635294,0.25098,0.0,0.243137,0.674510,0.211765,0.427451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3597,0.0,0.203922,0.733333,0.492921,0.25098,0.0,0.243137,0.039216,0.486275,0.388235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3598,0.0,0.156863,0.384314,0.741176,0.25098,0.0,0.501961,0.780392,0.109804,0.529412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [56]:
adv_truerandom_benign_attack_20.to_csv('datasets/nids/adv/adv_truerandom_benign_attack_20.csv', index=False)

In [53]:
adv_incremental_benign_attack_20 = pd.concat([adv_incremental.iloc[0:1000], combined_benign.iloc[1000:2000]], ignore_index=True)
adv_incremental_benign_attack_20 = adv_incremental_benign_attack_20.sample(frac=1.0, random_state=42).reset_index(drop=True)
adv_incremental_benign_attack_20

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.019608,0.862745,0.325490,0.305882,0.25098,0.0,0.501961,0.109804,0.462745,0.952941,...,0.447059,0.219608,0.666667,0.239216,0.168627,0.529412,0.254902,0.32549,0.415686,0
1,0.000000,0.203922,0.682353,0.219608,0.25098,0.0,0.243137,0.090196,0.627451,0.647059,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,1
2,0.000000,0.156863,0.121569,0.101961,0.25098,0.0,0.501961,1.000000,0.384314,0.858824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0
3,0.000000,0.203922,0.682353,0.560784,0.25098,0.0,0.243137,0.090196,0.286275,0.207843,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,1
4,0.000000,0.156863,0.070588,0.149020,0.25098,0.0,0.501961,0.047059,0.341176,0.858824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.156863,0.192157,0.172549,0.25098,0.0,0.501961,0.929412,0.313725,0.858824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0
1996,0.000000,0.156863,0.082353,0.541176,0.25098,0.0,0.501961,0.031373,0.952941,0.858824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0
1997,0.007843,0.705882,0.901961,0.666667,0.25098,0.0,0.243137,0.862745,0.678431,0.866667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,1
1998,0.000000,0.156863,0.047059,0.392157,0.25098,0.0,0.501961,0.070588,0.098039,0.858824,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0


In [57]:
adv_incremental_benign_attack_20.to_csv('datasets/nids/adv/adv_incremental_benign_attack_20.csv', index=False)

In [59]:
adv_all_benign_attack_20 = pd.concat([adv_all_20_sub, combined_benign.iloc[2000:6000]], ignore_index=True)
adv_all_benign_attack_20 = adv_all_benign_attack_20.sample(frac=1.0, random_state=42).reset_index(drop=True)
adv_all_benign_attack_20

Unnamed: 0,ip_header_byte_2,ip_header_byte_3,ip_header_byte_4,ip_header_byte_5,ip_header_byte_6,ip_header_byte_7,ip_header_byte_8,ip_header_byte_10,ip_header_byte_11,tcp_header_byte_4,...,tcp_segment_data_byte_1451,tcp_segment_data_byte_1452,tcp_segment_data_byte_1453,tcp_segment_data_byte_1454,tcp_segment_data_byte_1455,tcp_segment_data_byte_1456,tcp_segment_data_byte_1457,tcp_segment_data_byte_1458,tcp_segment_data_byte_1459,label_bin
0,0.0,0.156863,0.078431,0.329412,0.25098,0.0,0.494118,0.443137,0.564706,0.698039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.235294,0.011765,0.674510,0.25098,0.0,0.329412,0.674510,0.141176,0.407843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.274510,0.709804,0.050980,0.12549,0.0,0.254902,0.176471,0.725490,0.223529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.156863,0.109804,0.145098,0.25098,0.0,0.494118,0.411765,0.749020,0.698039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.156863,0.043137,0.019608,0.25098,0.0,0.501961,0.074510,0.470588,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.0,0.156863,0.098039,0.737255,0.25098,0.0,0.501961,0.015686,0.756863,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7996,0.0,0.156863,0.149020,0.141176,0.25098,0.0,0.501961,0.972549,0.345098,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7997,0.0,0.156863,0.105882,0.788235,0.25098,0.0,0.525490,0.384314,0.105882,0.062745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7998,0.0,0.156863,0.415686,0.988235,0.25098,0.0,0.501961,0.701961,0.501961,0.858824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [60]:
adv_all_benign_attack_20.to_csv('datasets/nids/adv/adv_all_benign_attack_20.csv', index=False)