><h1> Preprocessing of KDDCup '99 Dataset <h1>
><h3> Author: Shubham Manekar <br>
> Last Updated: December 27, 2021 <h3>

<h4> This notebook intends to develope the precessing pipeline of the KDDCup'99 Dataset. The analysis includes: <br>
1. 

<h4>
<hr>

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use( 'tkagg' )
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
df_raw = pd.read_csv('../Data/raw_df.csv')
df_raw.drop_duplicates(keep='first', inplace=True)
df_raw.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,deu_ruim_ou_nao
0,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal
1,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal
2,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal
3,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack
4,0,udp,private,SF,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack


In [17]:
params = {'axes.titlesize':'8',
          'xtick.labelsize':'4',
          'ytick.labelsize':'4'}
matplotlib.rcParams.update(params)
df_raw.hist(figsize=(50, 30), bins=20)

plt.savefig('../Figures/Data_Distribution.png', dpi=150)
plt.show()

In [3]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311029 entries, 0 to 311028
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     311029 non-null  int64  
 1   protocol_type                311029 non-null  object 
 2   service                      311029 non-null  object 
 3   flag                         311029 non-null  object 
 4   src_bytes                    311029 non-null  int64  
 5   dst_bytes                    311029 non-null  int64  
 6   land                         311029 non-null  int64  
 7   wrong_fragment               311029 non-null  int64  
 8   urgent                       311029 non-null  int64  
 9   hot                          311029 non-null  int64  
 10  num_failed_logins            311029 non-null  int64  
 11  logged_in                    311029 non-null  int64  
 12  num_compromised              311029 non-null  int64  
 13 

In [4]:
categorical_features = list(df_raw.select_dtypes(include=['object']).columns)
categorical_features

['protocol_type', 'service', 'flag', 'deu_ruim_ou_nao']

In [5]:
for c in categorical_features:
    df_raw[c] = df_raw[c].astype('category')
    globals()[f"enc_{c}"] = dict(enumerate(df_raw[c].cat.categories))   
    df_raw[c] = df_raw[c].astype('category').cat.codes

In [10]:
df_raw.skew()

duration                        27.825786
protocol_type                    1.458348
service                          0.480166
flag                            -1.131536
src_bytes                      217.306724
dst_bytes                       93.123576
land                            92.656474
wrong_fragment                  35.223495
urgent                         108.272079
hot                             78.899549
num_failed_logins               15.058245
logged_in                       -0.344962
num_compromised                169.323737
root_shell                      35.265821
su_attempted                   144.779374
num_root                       168.917259
num_file_creations             229.748183
num_shells                     121.585653
num_access_files                22.558352
num_outbound_cmds                0.000000
is_host_login                   80.238187
is_guest_login                  10.100917
count                            1.818084
srv_count                        7

In [11]:
df_raw.kurtosis()

duration                        1427.001619
protocol_type                      9.093474
service                           -1.269165
flag                              -0.504916
src_bytes                      50377.110197
dst_bytes                      11581.715009
land                            8583.444341
wrong_fragment                  1314.346998
urgent                         12877.959941
hot                            11315.360579
num_failed_logins                316.712234
logged_in                         -1.881050
num_compromised                30598.998990
root_shell                      1241.710238
su_attempted                   22353.527318
num_root                       30546.114746
num_file_creations             57940.586083
num_shells                     19664.337797
num_access_files                 693.630038
num_outbound_cmds                  0.000000
is_host_login                   6436.333262
is_guest_login                   100.031108
count                           