In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer

In [2]:
# Load the dataset from a path relative to the notebook's working directory
# and preview the first rows to check the column layout.
nsl_kdd = pd.read_csv("data/nsl_kdd.csv")
nsl_kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [3]:
def preprocess(df_x, df_y, categorical_columns=("protocol_type", "service", "flag"), alpha=0.05):
    """One-hot encode, normalise and chi2-filter a feature table.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Feature columns. Must contain every column listed in
        ``categorical_columns``; the remaining columns are passed to the
        normaliser and the chi2 test as-is, so they need to be numeric and
        non-negative (chi2 rejects negative values).
    df_y : array-like
        Target labels, one per row of ``df_x``, used by the chi2 test.
    categorical_columns : sequence of str, optional
        Columns expanded into indicator columns with ``pd.get_dummies``.
        Defaults to the three categorical columns this notebook encodes.
    alpha : float, optional
        p-value threshold handed to ``SelectFpr``; features whose chi2
        p-value is below it are kept. Defaults to 0.05.

    Returns
    -------
    2-D array
        The selected feature columns, in their original relative order.
        Column names are not preserved in the returned array.
    """
    encoded = pd.get_dummies(df_x, columns=list(categorical_columns))

    # NOTE(review): Normalizer rescales each *row* (sample) to unit norm, not
    # each column. Large-valued columns therefore dominate every row. Confirm
    # this is intended rather than per-feature scaling.
    row_normalised = Normalizer().fit_transform(encoded)

    # Univariate filter: keep the features that pass the chi2 test at `alpha`.
    return SelectFpr(chi2, alpha=alpha).fit_transform(row_normalised, df_y)

def bool_attack(x):
    """Collapse a class label to "normal" or "attack".

    Any value that is not equal to the string "normal" is reported as
    "attack".
    """
    return "attack" if x != "normal" else "normal"
    
def convert_bool(x):
    """Encode a binary label as an integer: 1 for "attack", 0 for anything else."""
    return 1 if x == "attack" else 0

In [4]:
# Predictors: every column except the label and the difficulty score.
df_x = nsl_kdd.drop(columns=['class', 'difficulty_level'])

# Binary target used only for feature selection: 1 = any attack, 0 = normal.
df_y = nsl_kdd['class'].map(lambda label: convert_bool(bool_attack(label)))

x_new = preprocess(df_x, df_y)

In [5]:
# Number of feature columns kept by the selection step (second axis of the
# 2-D matrix). The shape is indexed directly instead of unpacking into `_`,
# because `_` is the name IPython uses for the most recent cell output and
# assigning to it in the notebook namespace overrides that shortcut.
n_columns = x_new.shape[1]
print(n_columns)

33


In [6]:
# Placeholder names f0, f1, ... - one per selected feature column.
column_names = ['f' + str(position) for position in range(n_columns)]

In [7]:
# Wrap the selected-feature matrix in a DataFrame, labelling its columns with
# the generated placeholder names, then preview it.
x_dataframe = pd.DataFrame(data=x_new, columns = column_names)
x_dataframe.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32
0,0.0,0.955213,0.0,0.0,0.0,0.003891,0.003891,0.0,0.0,0.0,...,0.001945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.496369,0.0,0.0,0.0,0.044197,0.0034,0.0,0.0,0.0,...,0.0,0.0034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.432517,0.021098,0.003516,0.003516,0.0,...,0.003516,0.0,0.0,0.0,0.0,0.003516,0.0,0.0,0.0,0.003516
3,0.0,0.02843,0.9991,0.0,0.000123,0.000613,0.000613,2.5e-05,2.5e-05,0.0,...,0.000123,0.0,0.0,0.0,0.000123,0.0,0.0,0.0,0.0,0.0
4,0.0,0.337346,0.711986,0.0,0.001695,0.050856,0.054247,0.0,0.0,0.0,...,0.001695,0.0,0.0,0.0,0.001695,0.0,0.0,0.0,0.0,0.0


In [8]:
# Labels for the exported dataset: the original string classes from the raw
# table, not the 0/1 target that was built for feature selection.
# Despite the name, this is a single column (a pandas Series).
y_dataframe = nsl_kdd['class']
y_dataframe.head()

0    normal
1    normal
2       dos
3    normal
4    normal
Name: class, dtype: object

In [9]:
# Attach the labels as the last column. Column-wise concat aligns rows on the
# index; both sides carry the default 0..n-1 index here (the raw table was
# read without an index column and the feature frame was built from an array),
# so each row keeps its own label.
normalized_dataset = pd.concat([x_dataframe, y_dataframe], axis=1)
normalized_dataset.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f24,f25,f26,f27,f28,f29,f30,f31,f32,class
0,0.0,0.955213,0.0,0.0,0.0,0.003891,0.003891,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
1,0.0,0.496369,0.0,0.0,0.0,0.044197,0.0034,0.0,0.0,0.0,...,0.0034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
2,0.0,0.0,0.0,0.0,0.0,0.432517,0.021098,0.003516,0.003516,0.0,...,0.0,0.0,0.0,0.0,0.003516,0.0,0.0,0.0,0.003516,dos
3,0.0,0.02843,0.9991,0.0,0.000123,0.000613,0.000613,2.5e-05,2.5e-05,0.0,...,0.0,0.0,0.0,0.000123,0.0,0.0,0.0,0.0,0.0,normal
4,0.0,0.337346,0.711986,0.0,0.001695,0.050856,0.054247,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001695,0.0,0.0,0.0,0.0,0.0,normal


In [10]:
# Summary statistics for the numeric feature columns; the string 'class'
# column is left out by describe()'s default behaviour.
normalized_dataset.describe()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,0.023171,0.209325,0.308326,8.5e-05,0.000385,0.221715,0.058968,0.001039,0.001038,0.00064,...,0.002183,0.000365,0.000221,0.000513,0.000493,0.00072,8.3e-05,0.000543,4.8e-05,0.001006
std,0.144371,0.315134,0.407845,0.001059,0.001201,0.280445,0.12488,0.003669,0.003769,0.00476,...,0.006379,0.001457,0.001074,0.003662,0.003557,0.003293,0.001899,0.004606,0.000612,0.003575
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.002717,0.002861,0.0,0.0,0.0,...,0.000105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.05272,0.0,0.0,0.0,0.020344,0.01217,0.0,0.0,0.0,...,0.001265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.276666,0.764164,0.0,0.000394,0.453999,0.046098,0.002653,0.00265,0.0,...,0.003085,0.0,0.0,0.0,0.000192,0.0,0.0,0.0,0.0,0.002649
max,0.999998,1.0,1.0,0.037842,0.063186,0.999776,0.999758,0.267261,0.267261,0.353553,...,0.353553,0.163846,0.057831,0.116248,0.353553,0.316228,0.258199,0.353553,0.107583,0.267261


In [11]:
# Label summary: row count, number of distinct classes, and the most frequent
# class with its frequency.
normalized_dataset['class'].describe()

count     125973
unique         5
top       normal
freq       67343
Name: class, dtype: object

In [12]:
# Persist the selected features together with the original labels.
# index=False keeps the row index out of the written file.
normalized_dataset.to_csv('data/nsl_kdd_normalized.csv', index=False)