In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
url_base = "http://kdd.ics.uci.edu/databases/kddcup99"
kdd_10_percent_url = url_base + '/' + 'kddcup.data_10_percent.gz'
kdd_colnames_url = url_base + '/' + 'kddcup.names'

In [3]:
df_colnames = pd.read_csv(kdd_colnames_url, skiprows=1, sep=':', names=['f_names', 'f_types'])

In [4]:
df_colnames

Unnamed: 0,f_names,f_types
0,duration,continuous.
1,protocol_type,symbolic.
2,service,symbolic.
3,flag,symbolic.
4,src_bytes,continuous.
5,dst_bytes,continuous.
6,land,symbolic.
7,wrong_fragment,continuous.
8,urgent,continuous.
9,hot,continuous.


In [5]:
df_colnames.loc[df_colnames.shape[0]] = ['status', ' symbolic.']

In [6]:
df_colnames.values

array([['duration', ' continuous.'],
       ['protocol_type', ' symbolic.'],
       ['service', ' symbolic.'],
       ['flag', ' symbolic.'],
       ['src_bytes', ' continuous.'],
       ['dst_bytes', ' continuous.'],
       ['land', ' symbolic.'],
       ['wrong_fragment', ' continuous.'],
       ['urgent', ' continuous.'],
       ['hot', ' continuous.'],
       ['num_failed_logins', ' continuous.'],
       ['logged_in', ' symbolic.'],
       ['num_compromised', ' continuous.'],
       ['root_shell', ' continuous.'],
       ['su_attempted', ' continuous.'],
       ['num_root', ' continuous.'],
       ['num_file_creations', ' continuous.'],
       ['num_shells', ' continuous.'],
       ['num_access_files', ' continuous.'],
       ['num_outbound_cmds', ' continuous.'],
       ['is_host_login', ' symbolic.'],
       ['is_guest_login', ' symbolic.'],
       ['count', ' continuous.'],
       ['srv_count', ' continuous.'],
       ['serror_rate', ' continuous.'],
       ['srv_serror_rate', '

In [7]:
df_colnames['f_names'].values

array(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'status'], dtype=object)

In [8]:
df = pd.read_csv(kdd_10_percent_url, header=None, names=df_colnames['f_names'].values)

In [9]:
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [10]:
df.shape

(494021, 42)

In [11]:
df_symbolic = df_colnames[df_colnames['f_types'].str.contains('symbolic.')]

In [12]:
df_symbolic['f_names'][:-1].values  # except status

array(['protocol_type', 'service', 'flag', 'land', 'logged_in',
       'is_host_login', 'is_guest_login'], dtype=object)

In [34]:
X = pd.get_dummies(df.iloc[:,:-1], columns=df_symbolic['f_names'][:-1])

In [35]:
X.shape

(494021, 121)

In [37]:
list(X)

['duration',
 'src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'protocol_type_icmp',
 'protocol_type_tcp',
 'protocol_type_udp',
 'service_IRC',
 'service_X11',
 'service_Z39_50',
 'service_auth',
 'service_bgp',
 'service_courier',
 'service_csnet_ns',
 'service_ctf',
 'service_daytime',
 'service_discard',
 'service_domain',
 'service_domain_u',
 'service_echo',
 'service_eco_i',
 'servi

In [38]:
y = np.where(df['status'] == 'normal.', 1, 0)

In [39]:
y.shape

(494021,)

In [40]:
# total normal/total ratio
normal_idx = np.where(y)
abnormal_idx = np.where(1 - y)
len(normal_idx[0])/ (len(abnormal_idx[0]) + len(normal_idx[0]))

0.19691065764410826

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=123)
print(len(X_train), len(X_test), len(y_train), len(y_test))

247010 247011 247010 247011


In [67]:
np.sum(np.equal(y_train[np.where(y_train)], y_train[y_train == 1]))

48398

In [64]:
len(y_train ==1)

247010

In [60]:
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]
print(len(X_train), len(y_train))

198612 198612
