In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the NSL-KDD records from the project's data directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests
# the CSV was written with its row index included - confirm whether it should
# be read as the index (index_col=0) rather than kept as a regular column.
nsl_kdd = pd.read_csv("data/nsl_kdd.csv")
# Preview the first five rows.
nsl_kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [3]:
# Summary of the label column: row count, number of distinct labels, and the most frequent label.
nsl_kdd['class'].describe()

count     125973
unique         5
top       normal
freq       67343
Name: class, dtype: object

In [4]:
# Number of rows carrying each label in the 'class' column.
nsl_kdd['class'].value_counts()

normal    67343
dos       45927
probe     11656
r2l         995
u2r          52
Name: class, dtype: int64

In [5]:
# One frame per label: every row of nsl_kdd whose 'class' equals that label.
dos_df, probe_df, r2l_df, u2r_df, normal_df = (
    nsl_kdd[nsl_kdd['class'].eq(label)]
    for label in ('dos', 'probe', 'r2l', 'u2r', 'normal')
)

In [6]:
# Row count of each attack frame, in the order dos, probe, r2l, u2r.
attack_samples = [len(frame) for frame in (dos_df, probe_df, r2l_df, u2r_df)]

print(attack_samples)

[45927, 11656, 995, 52]


In [7]:
def normalize(arr):
    """Scale a collection of numbers so that they sum to 1.

    Parameters
    ----------
    arr : iterable of numbers
        Values to rescale. Any iterable is accepted, including one-shot
        iterators such as generators.

    Returns
    -------
    list
        ``x / total`` for every ``x`` in ``arr``, in the original order, where
        ``total`` is the sum of all values. An empty input yields ``[]``.

    Raises
    ------
    ZeroDivisionError
        If the values are plain Python numbers that sum to zero.
    """
    # Materialise first: the values are traversed twice (once to total them,
    # once to divide), and a generator would be exhausted after the first pass.
    values = list(arr)
    total_sum = sum(values)
    return [x / total_sum for x in values]

In [8]:
# Share of each attack category among all attack rows, in the same order as
# attack_samples (dos, probe, r2l, u2r).
samples_ratio = normalize(attack_samples)
print(samples_ratio)

[0.7833361760191029, 0.1988060719768037, 0.016970834044004776, 0.0008869179600886918]


In [9]:
# Number of normal rows to pair with each attack category: the category's share
# of the normal rows, truncated to a whole number.
# NOTE(review): because of the truncation the quotas can add up to slightly
# fewer rows than normal_df holds, so a few trailing normal rows end up unused.
normal_samples = [int(len(normal_df) * share) for share in samples_ratio]
print(normal_samples)

[52752, 13388, 1142, 59]


In [10]:
# Running totals of normal_samples; each entry is the end offset of one chunk
# when normal_df is sliced into consecutive pieces.
cumulative_samples = np.cumsum(normal_samples)
print(cumulative_samples)

[52752 66140 67282 67341]


In [11]:
# Cut normal_df into four consecutive, non-overlapping positional chunks: each
# chunk starts where the previous one ended and stops at the matching entry of
# cumulative_samples.
n1, n2, n3, n4 = (
    normal_df.iloc[start:stop]
    for start, stop in zip([0] + list(cumulative_samples[:-1]), cumulative_samples)
)

In [12]:
# Show the (rows, columns) shape of every normal chunk, one per line.
print(*(chunk.shape for chunk in (n1, n2, n3, n4)), sep='\n')

(52752, 43)
(13388, 43)
(1142, 43)
(59, 43)


In [13]:
# Pair every attack frame with its chunk of normal rows ...
merges = [[dos_df, n1], [probe_df, n2], [r2l_df, n3], [u2r_df, n4]]

# ... and stack each pair into a single frame (attack rows first, then normal).
sets = [pd.concat(pair) for pair in merges]

In [14]:
# Label distribution of each of the four combined sets, printed one after another.
print(*(split['class'].value_counts() for split in sets), sep='\n')

normal    52752
dos       45927
Name: class, dtype: int64
normal    13388
probe     11656
Name: class, dtype: int64
normal    1142
r2l        995
Name: class, dtype: int64
normal    59
u2r       52
Name: class, dtype: int64


In [15]:
# Write every combined set to data/nsl-splits/set-<k>.csv, numbered from 1 in
# the order the sets were built.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout fails on this cell.
os.makedirs('data/nsl-splits', exist_ok=True)
for i, split in enumerate(sets, start=1):
    split.to_csv('data/nsl-splits/set-' + str(i) + '.csv')