# ML Clustering - Network Intrusion Detection
## Part 2 - DATA ANALYSIS

### 1. INITIALIZATION

In [16]:
# Import necessary libraries and modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, TruncatedSVD

In [3]:
# Show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

### 2. LOADING DATASET

In [4]:
# Initialize required variables to read the cleaned data file.
# Forward slashes are used so the relative path resolves on Windows, Linux and macOS
# (a backslash-separated path is treated as a single literal file name on POSIX systems).
data_file_location = "../data/processed/"
data_file_name = "kddcup99_processed"
data_file_ext = ".csv"

# Read the dataset; the first CSV column is used as the DataFrame index
data = pd.read_csv(data_file_location + data_file_name + data_file_ext, index_col=0)

In [6]:
# Check the dataset and its shape.
# A bare expression on the last line of a cell is rendered as the cell's output.
data

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,labels,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_finger,service_ftp,service_ftp_data,service_gopher,service_http,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_mtp,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_nnsp,service_ntp_u,service_other,service_pop_2,service_pop_3,service_printer,service_private,service_red_i,service_remote_job,service_rje,service_shell,service_smtp,service_ssh,service_supdup,service_systat,service_telnet,service_tftp_u,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_whois,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.160754,-0.011425,1.224727,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,0.628588,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,-0.274787,-0.337961,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,-1.439350,-2.319201,-2.759730,-0.313559,-0.480427,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,normal,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,0.759518,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
1,-0.160754,-0.011905,0.038174,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,0.628588,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,-0.242768,-0.307668,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,-1.429681,-2.307718,0.508106,-0.313559,3.026272,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,normal,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,0.759518,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
2,-0.160754,-0.011234,-0.058394,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,0.628588,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,-0.274787,-0.337961,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,-1.420012,-2.296235,0.508106,-0.313559,1.272922,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,normal,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,0.759518,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
3,-0.160754,-0.011261,-0.034866,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,0.628588,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,-0.242768,-0.307668,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,-1.410343,-2.284752,0.508106,-0.313559,0.676784,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,normal,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,0.759518,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
4,-0.160754,-0.011207,-0.080107,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,0.628588,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,-0.210750,-0.277376,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,-1.400674,-2.273269,0.508106,-0.313559,0.396248,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,normal,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,0.759518,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976153,-0.160754,-0.004018,-0.094329,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,-1.590866,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,16.022758,15.080953,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,1.026235,0.608986,0.508106,-0.313559,3.026272,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,smurf,7.945850,-1.930158,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,12.821640,-0.071883,-0.062687,-0.201523,-0.001012,-1.316625,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
976154,-0.160754,-0.004018,-0.094329,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,-1.590866,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,16.054777,15.111246,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,1.026235,0.608986,0.508106,-0.313559,3.026272,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,smurf,7.945850,-1.930158,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,12.821640,-0.071883,-0.062687,-0.201523,-0.001012,-1.316625,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753
976155,-0.160754,-0.013374,-0.094329,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,-1.590866,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,7.281639,-0.186498,-0.057745,-0.065175,4.121972,4.143123,-9.844211,0.438069,-0.47852,1.026235,-2.250303,-2.694373,0.075629,-0.480427,-0.47886,-0.070496,-0.057190,4.184460,4.304894,neptune,-0.125852,0.518092,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,-0.077993,-0.071883,-0.062687,-0.201523,-0.001012,-1.316625,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,3.473987,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,4.146268,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,-4.035024,-0.001753
976156,-0.160754,-0.004018,-0.094329,-0.002678,-0.00128,-0.002245,-0.053068,-0.007497,-1.590866,-0.004429,-0.017592,-0.010144,-0.00734,-0.021222,-0.018913,-0.06445,-0.001431,-0.062316,16.054777,15.111246,-0.057745,-0.065175,-0.244300,-0.246787,0.160417,-0.157977,-0.47852,1.026235,0.608986,0.508106,-0.313559,3.026272,-0.47886,-0.070496,-0.057190,-0.256644,-0.255505,smurf,7.945850,-1.930158,-0.493782,-0.023086,-0.011496,-0.001012,-0.048904,-0.001012,-0.001431,-0.001431,-0.001431,-0.001012,-0.006321,-0.250813,-0.001753,-0.062341,12.821640,-0.071883,-0.062687,-0.201523,-0.001012,-1.316625,-0.002024,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012,-0.001753,-0.001012,-0.001012,-0.001431,-0.062786,-0.247967,-0.001012,-0.030748,-0.001012,-0.287854,-0.003036,-0.001431,-0.001753,-0.002263,-0.329058,-0.002863,-0.001431,-0.001012,-0.047829,-0.001753,-0.002678,-0.022863,-0.012314,-0.074409,-0.001012,-0.001012,-0.001753,-0.003649,-0.241181,-0.024841,-0.018803,-0.034134,-0.023263,-0.01252,-0.006865,0.247830,-0.001753


As discussed in the previous phase, the dataset is now ready for analysis using PCA and KMeans clustering.

### 3. PRINCIPAL COMPONENT ANALYSIS (PCA)

The original dataset, together with the preprocessing steps performed on it, has resulted in a relatively high number of variables, which may add unnecessary overhead to the clustering analysis. In particular, some variables might not be very useful and may contain no information relevant to the clustering.

The goal of PCA is to project the high-dimensional data distribution into a lower-dimensional, more compact space. It does so by calculating new variables (so-called principal components) that capture the most important information in the original dataset.

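Since PCA is driven by the variance of each attribute and is therefore sensitive to differences in scale, our dataset was prepared by converting all attributes into a standardized version, preserving the shape of each value distribution while placing all attributes on a comparable, normalized scale.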

In [11]:
# As the dataset contains the label class, we get rid of it in the dataset and keep it
# in a separate variable for later assessment.
# The removal is guarded so that re-running this cell is harmless: once the column has
# been moved into `labels`, a second run leaves both `data` and `labels` untouched
# instead of raising a KeyError.
if "labels" in data.columns:
    # pop() returns the column as a Series and removes it from `data` in place
    labels = data.pop("labels")
elif "labels" not in globals():
    # The column is absent and was never extracted: fail loudly rather than continue
    raise KeyError("labels")

In [28]:
# Fit PCA on the feature matrix. A fractional n_components (0.8) combined with the
# 'full' SVD solver asks scikit-learn to keep as many components as are needed to
# reach that share of explained variance.
pca = PCA(n_components=.8, svd_solver='full').fit(data)

# Report the number of retained components, then the variance ratio of each one
print(len(pca.explained_variance_ratio_), pca.explained_variance_ratio_, sep="\n")


57
[0.0674172  0.05807125 0.03956367 0.03062082 0.02662131 0.02490891
 0.02453054 0.01893166 0.0172115  0.01675292 0.01472814 0.01406813
 0.01297307 0.01255522 0.01172212 0.01142301 0.01122844 0.01067403
 0.01044353 0.01034596 0.01014833 0.01005076 0.01000319 0.00999091
 0.00989438 0.0098323  0.00981328 0.00980562 0.00980522 0.00980465
 0.00980456 0.00980428 0.00980407 0.00980402 0.00980395 0.00980395
 0.00980395 0.00980395 0.00980394 0.00980394 0.00980394 0.00980394
 0.00980394 0.00980393 0.00980393 0.00980393 0.00980393 0.00980393
 0.00980393 0.00980393 0.00980393 0.00980393 0.00980393 0.00980393
 0.00980393 0.00980393 0.00980357]


In [39]:
# List the output feature names of the fitted PCA transformer (one per retained component)
pca.get_feature_names_out()

array(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
       'pca8', 'pca9', 'pca10', 'pca11', 'pca12', 'pca13', 'pca14',
       'pca15', 'pca16', 'pca17', 'pca18', 'pca19', 'pca20', 'pca21',
       'pca22', 'pca23', 'pca24', 'pca25', 'pca26', 'pca27', 'pca28',
       'pca29', 'pca30', 'pca31', 'pca32', 'pca33', 'pca34', 'pca35',
       'pca36', 'pca37', 'pca38', 'pca39', 'pca40', 'pca41', 'pca42',
       'pca43', 'pca44', 'pca45', 'pca46', 'pca47', 'pca48', 'pca49',
       'pca50', 'pca51', 'pca52', 'pca53', 'pca54', 'pca55', 'pca56'],
      dtype=object)