In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sb
from sklearn.preprocessing import OneHotEncoder

<h1> Load the training data </h1>

In [2]:
# Read the training set from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Train.csv')
df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


<h1>Check the data consistency</h1>

We check for duplicate rows using the pandas <em>duplicated()</em> method. The assertion checks that the number of non-duplicate (unique) rows is equal to the total number of rows in the dataset.

In [3]:
# Count how many rows are flagged as duplicates (True) and how many are not (False).
number_of_duplicates = df.duplicated().value_counts()
# Look the count up by its boolean label. An integer key such as `[0]` against a
# boolean-labelled Series depends on pandas-version-specific fallback behaviour,
# and `.get` also covers the case where the False label is absent (e.g. empty frame).
assert number_of_duplicates.get(False, 0) == df.shape[0], 'There are duplicates in the dataset'

We check that the names of the columns are all unique.

In [4]:

# Summarise how many column labels are / are not repeats of an earlier label.
unique, counts = np.unique(df.columns.duplicated(), return_counts=True)
print(unique, counts)
# np.unique returns its values sorted, so unique[0] is False whenever at least one
# column is not a duplicate; testing it can never detect a duplicate. Assert on the
# duplicate mask itself instead.
assert not df.columns.duplicated().any(), 'There are duplicate columns in the dataset'

[False] [43]


We extract the categorical column names from the dataframe. Next, we will replace their string values with their one-hot encoding.

In [61]:
# A column is treated as categorical when its first value is a string.
# `.iloc[0]` reads the first row by position, so the check does not depend on the
# index containing the label 0 (e.g. after filtering or re-indexing).
categorical_columns = [
    column for column in df.columns if isinstance(df[column].iloc[0], str)
]
print(categorical_columns)


['protocol_type', 'service', 'flag', 'attack']


Then we encode the categorical columns using one-hot encoding. This expands our 4 categorical columns into 107 binary columns: 107 is the sum of the number of distinct categories over all categorical columns.

In [62]:
# Fit the encoder on the categorical columns and materialise the result as a dense array.
ohe = OneHotEncoder(handle_unknown='error')
features_encoded = ohe.fit_transform(df[categorical_columns]).toarray()
# ohe.categories_ holds one array of category values per input column; flatten them
# into a single sequence that lines up with the encoded output columns.
features_labels = np.concatenate(ohe.categories_)

print(features_encoded.shape)
print(features_labels.shape)

# Reuse df's index so that a column-wise concat with frames derived from df aligns
# row-for-row even when df does not carry a default 0..n-1 index.
ohe_df = pd.DataFrame(features_encoded, columns=features_labels, index=df.index)
ohe_df.head()

(125973, 107)
(107,)


Unnamed: 0,icmp,tcp,udp,IRC,X11,Z39_50,aol,auth,bgp,courier,...,phf,pod,portsweep,rootkit,satan,smurf,spy,teardrop,warezclient,warezmaster
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


To combine the one-hot encoded columns with our dataframe we first need to remove the original categorical columns from df.

In [64]:
# Copy of df without the original string-valued categorical columns.
df_wo_categories = df.drop(columns=categorical_columns)
df_wo_categories.head(n=5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag
0,0,491,0,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,20
1,0,146,0,0,0,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,15
2,0,0,0,0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,19
3,0,232,8153,0,0,0,0,0,1,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,21
4,0,199,420,0,0,0,0,0,1,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21


Then we concatenate the df without categorical columns with the one-hot encoded columns.

In [66]:
# Place the numeric columns and the one-hot columns side by side (aligned on the row index).
df_onehot = pd.concat((df_wo_categories, ohe_df), axis='columns')
df_onehot.head(n=5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,phf,pod,portsweep,rootkit,satan,smurf,spy,teardrop,warezclient,warezmaster
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
