# Introduction

This data set captures different types of network intrusions or attacks. We follow the preprocessing performed by Nguyen and Gopalkrishnan [1,2], but without removing the categorical attributes, treating U2R attacks as outliers and normal instances as inliers. U2R, as defined by Nguyen and Gopalkrishnan, comprises the following attacks: buffer_overflow, ftp_write, imap, load_module, multihop, nmap, perl, phf, pod, rootkit, and teardrop. These attacks account for 246 instances of the data set, which are defined as outliers. After preprocessing, the data set has 38 numerical attributes, 3 categorical attributes, and 60,839 instances, divided into 60,593 inliers and 246 outliers.

# Imports

In [16]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plotting defaults: figure size plus legend and axis-label font sizes.
plt.rcParams.update(
    {
        "figure.figsize": "25,10",
        "legend.fontsize": 16,
        "axes.labelsize": 16,
    }
)

import pandas_profiling as pp
import os


# Loading the data

In [7]:
# Directory holding the input file (a relative path).
data_dir = "../data/anomaly detection/"
# Name of the raw results file; later cells work with a copy named "updated_" + data_file.
data_file = "KDDCup99_withoutdupl_norm_idf.results.csv"

In [8]:
# One-off preprocessing, kept commented out for reference: copies the raw
# results file to "updated_" + data_file while dropping its first line.
# The following cell reads that "updated_" copy.
# with open(os.path.join(data_dir, data_file),'r') as f:
#     with open(os.path.join(data_dir, "updated_" + data_file), 'w') as f1:
#         next(f) # skip header line
#         for line in f:
#             f1.write(line)

In [9]:
# Path of the header-stripped copy of the results file.
updated_data_path = os.path.join(data_dir, "updated_" + data_file)

if os.path.exists(updated_data_path):
    # Space-separated file; its first line supplies the column names.
    data = pd.read_csv(updated_data_path, sep=" ")
else:
    # The "updated_" copy is only created by a commented-out preprocessing
    # step, so it may be missing on a fresh checkout. Reading the raw file
    # while skipping its first line yields the same table.
    data = pd.read_csv(os.path.join(data_dir, data_file), sep=" ", skiprows=1)

# NOTE(review): the line used as the header appears to be a row of values
# (the resulting column names look like "0.0", "0.0.1", ...) — confirm that
# treating it as column names is intended.

In [10]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,bylabel,0.0,0.0.1,0.0.2,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8,...,0.0.47903,0.0.47904,0.0.47905,0.0.47906,0.0.47907,0.0.47908,0.0.47909,0.0.47910,0.0.47911,0.0.47912
0,KNN-001,1.921902e-07,0.357846,1.921902e-07,0.004665,0.319914,1.921902e-07,0.080664,0.004841,0.004826,...,0.004376,2e-06,0.012239,0.010376,0.010376,0.013991,0.010088,0.008427,0.010088,0.006198
1,KNN-002,7.879798e-06,0.474082,7.879798e-06,0.004815,0.322647,7.879798e-06,0.164854,0.004847,0.004831,...,0.004376,3e-06,0.015777,0.015173,0.015173,0.01441,0.011092,0.010088,0.011092,0.010088
2,KNN-003,2.805977e-05,0.543009,2.805977e-05,0.005535,0.328407,2.805977e-05,0.165041,0.005268,0.005549,...,0.004376,2e-05,0.018172,0.015777,0.017197,0.018172,0.01393,0.011092,0.016638,0.011092
3,KNN-004,0.002767541,0.577367,0.002767541,0.006784,0.330008,0.002767541,0.166708,0.005791,0.005562,...,0.004376,0.001957,0.021087,0.017197,0.020628,0.020803,0.016638,0.016638,0.022184,0.016638
4,KNN-005,0.002767541,0.579861,0.002767541,0.006805,0.33018,0.002767541,0.167113,0.008305,0.006797,...,0.005535,0.001957,0.022924,0.020628,0.020628,0.025457,0.016638,0.022184,0.038822,0.022184


In [11]:
# (number of rows, number of columns) of the loaded table.
data.shape

(999, 48114)

In [12]:
# Summary statistics for every column (object and numeric alike), transposed
# so that each row of the summary describes one column of `data`.
data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
bylabel,999,999,LOF-041,1,,,,,,,
0.0,999,,,,0.746332,0.609514,0,0.361113,0.889333,0.997579,9.9999
0.0.1,999,,,,5.46506,13.9355,0,0.8371,1.15214,1.3411,86.9513
0.0.2,999,,,,0.709609,0.562079,0,0.192496,0.944658,1.00755,9.9999
0.0.3,999,,,,0.79481,0.613782,0,0.482437,0.98662,1.04853,6.65548
0.0.4,999,,,,4.59819,5.9478,0,0.597282,3.54832,5.54521,33.8297
0.0.5,999,,,,0.734379,0.561714,0,0.34405,0.884935,0.997183,9.9999
0.0.6,999,,,,2.0181,3.46568,0,0.269842,1.10954,1.60424,21.3175
0.0.7,999,,,,0.763525,0.57068,0,0.350236,0.950029,1.03557,7.13126
0.0.8,999,,,,0.75417,0.571369,0,0.116621,0.993007,1.03233,7.23971


In [17]:
# Inspect the "bylabel" column; the displayed values follow a "<NAME>-<NNN>"
# pattern (e.g. KNN-001, COF-100).
# NOTE(review): presumably a method name plus a parameter/run index — confirm.
data["bylabel"]

0      KNN-001
1      KNN-002
2      KNN-003
3      KNN-004
4      KNN-005
5      KNN-006
6      KNN-007
7      KNN-008
8      KNN-009
9      KNN-010
10     KNN-011
11     KNN-012
12     KNN-013
13     KNN-014
14     KNN-015
15     KNN-016
16     KNN-017
17     KNN-018
18     KNN-019
19     KNN-020
20     KNN-021
21     KNN-022
22     KNN-023
23     KNN-024
24     KNN-025
25     KNN-026
26     KNN-027
27     KNN-028
28     KNN-029
29     KNN-030
        ...   
969    COF-071
970    COF-072
971    COF-073
972    COF-074
973    COF-075
974    COF-076
975    COF-077
976    COF-078
977    COF-079
978    COF-080
979    COF-081
980    COF-082
981    COF-083
982    COF-084
983    COF-085
984    COF-086
985    COF-087
986    COF-088
987    COF-089
988    COF-090
989    COF-091
990    COF-092
991    COF-093
992    COF-094
993    COF-095
994    COF-096
995    COF-097
996    COF-098
997    COF-099
998    COF-100
Name: bylabel, Length: 999, dtype: object