In [None]:
"""
Visualization of non-benign connections data
"""

In [None]:
# Load the top modules that are used in multiple places
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline  

In [None]:
# Column name mapping from original data to compact form.
# Every raw CSV header below is assigned the compact name 'X<n>', where <n>
# is its 1-based position in this list; the label column becomes 'YY'.
# The headers are used as dictionary keys exactly as written here, so the
# leading blank that many of them carry is significant.
_raw_feature_columns = [
    ' Destination Port',
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    'Total Length of Fwd Packets',
    ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    'Bwd Packet Length Max',
    ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    'Flow Bytes/s',
    ' Flow Packets/s',
    ' Flow IAT Mean',
    ' Flow IAT Std',
    ' Flow IAT Max',
    ' Flow IAT Min',
    'Fwd IAT Total',
    ' Fwd IAT Mean',
    ' Fwd IAT Std',
    ' Fwd IAT Max',
    ' Fwd IAT Min',
    'Bwd IAT Total',
    ' Bwd IAT Mean',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Bwd IAT Min',
    'Fwd PSH Flags',
    ' Bwd PSH Flags',
    ' Fwd URG Flags',
    ' Bwd URG Flags',
    ' Fwd Header Length',
    ' Bwd Header Length',
    'Fwd Packets/s',
    ' Bwd Packets/s',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    'FIN Flag Count',
    ' SYN Flag Count',
    ' RST Flag Count',
    ' PSH Flag Count',
    ' ACK Flag Count',
    ' URG Flag Count',
    ' CWE Flag Count',
    ' ECE Flag Count',
    ' Down/Up Ratio',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size',
    ' Fwd Header Length.1',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets',
    ' Subflow Fwd Bytes',
    ' Subflow Bwd Packets',
    ' Subflow Bwd Bytes',
    'Init_Win_bytes_forward',
    ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd',
    ' min_seg_size_forward',
    'Active Mean',
    ' Active Std',
    ' Active Max',
    ' Active Min',
    'Idle Mean',
    ' Idle Std',
    ' Idle Max',
    ' Idle Min',
]

# All the X** are features ...
feature_map = {
    raw_name: 'X%d' % position
    for position, raw_name in enumerate(_raw_feature_columns, start=1)
}
# ... and the YY is the label. It is added last so that it stays the final
# entry of the mapping.
feature_map[' Label'] = 'YY'

In [None]:
# Label names (YY) as they appear in the data, listed in the order of the
# numerical value each one maps to (list position == numerical value).
_label_names = [
    'BENIGN',
    'FTP-Patator',
    'SSH-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'DoS Hulk',
    'DoS GoldenEye',
    'Heartbleed',
    # NOTE(review): the separator inside the three 'Web Attack' names is kept
    # byte-for-byte; presumably it has to match the labels as they were read
    # from the raw files -- confirm before normalising it.
    'Web Attack � Brute Force',
    'Web Attack � XSS',
    'Web Attack � Sql Injection',
    'Infiltration',
    'Bot',
    'PortScan',
    'DDoS',
]
label_map = {name: value for value, name in enumerate(_label_names)}

# Dataset dimensions: number of feature columns and number of classes
num_ids_features = 78
num_ids_classes = 15

# Short display names of the classes, in the same numerical order as label_map
ids_classes = [
    'BENIGN',
    'FTP-Patator',
    'SSH-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'DoS Hulk',
    'DoS GoldenEye',
    'Heartbleed',
    'Brute Force',
    'XSS',
    'Sql Injection',
    'Infiltration',
    'Bot',
    'PortScan',
    'DDoS',
]

In [None]:
# Input location: name of the CSV file to visualise (presumably the
# non-benign rows only -- confirm against the step that wrote it) and the
# directory it lives in. The directory string ends with '/' because the two
# are joined by plain string concatenation.
mal_data = 'mal.csv'
outdir = './MachineLearningCVE/restart/'

In [None]:
# Split the compact column names into the feature columns and the label
# column; the label is the last value of feature_map.
*feature_names, label = feature_map.values()

# Non-benign classes only: everything except class 0 (the first entry).
num_mal_classes = num_ids_classes - 1
mal_class_txt = ids_classes[1:]
# Numerical ids of the non-benign classes: 1 .. num_mal_classes
mal_class_num = np.arange(num_mal_classes) + 1

In [None]:
# Read the CSV into the working frame used by every cell below
mal_csv_path = outdir + mal_data
df = pd.read_csv(mal_csv_path)

In [None]:
# Quick look at the first five rows of the loaded frame
df.head(n=5)

In [None]:
"""
Some rows with 'YY' = 5 have the 'X15' column as NaN.
Set these to the average of the finite 'X15' values for 'YY' = 5
"""
# Columns containing nan. Printed explicitly: a bare expression that is not
# the last statement of a cell is evaluated but never displayed.
print ('columns with NaN:', df.columns[df.isna().any()].tolist())

# Only 'YY' = 5 is expected to have NaNs
print ('labels with NaN in X15:', df.loc[df['X15'].isna(), 'YY'].unique())

# X15 still holds inf values at this point (they are only replaced in the
# inf-handling cell that follows), and the mean of a column containing inf is
# inf. Average the finite values only, otherwise the NaNs would be filled with
# inf instead of the class mean.
is_class_5 = df['YY'] == 5
avg_x15 = df.loc[is_class_5 & np.isfinite(df['X15']), 'X15'].mean()

# Fill X15 only. A frame-wide fillna would also write the X15 average into
# any other column that happens to contain NaN.
df['X15'] = df['X15'].fillna(avg_x15)

if len(df[df['X15'].isna()]) == 0:
    print ('NaNs replaced with mean')
else:
    print ('something went wrong with NaN handling')

In [None]:
"""
Some rows with 'YY' = 12, 13, 14, 1, 5 have inf in the X15 and X16 columns
Replace these with the column max
"""
# Element-wise inf mask of the whole frame, computed once for the diagnostics
inf_mask = np.isinf(df)

# Columns containing inf. Printed explicitly: a bare expression that is not
# the last statement of a cell is evaluated but never displayed.
print ('columns with inf:', df.columns[inf_mask.any()].tolist())

# labels corresponding to rows with inf
# (boolean .loc selection works for any index; axis is passed by keyword
# because pandas 2.0 no longer accepts it positionally)
print ('labels with inf:', df.loc[inf_mask.any(axis=1), 'YY'].unique())

# replace infs in x15 column with max non-inf value
max_x15 = df.loc[df['X15'] != np.inf, 'X15'].max()
print (max_x15)
# Assign the result back instead of calling replace(inplace=True) on the
# column selection, which is not guaranteed to update df itself.
df['X15'] = df['X15'].replace(np.inf, max_x15)

# replace infs in x16 column with max non-inf value
max_x16 = df.loc[df['X16'] != np.inf, 'X16'].max()
print (max_x16)
df['X16'] = df['X16'].replace(np.inf, max_x16)

# Re-scan the frame: no column may contain inf any more
if not np.isinf(df).any().any():
    print ('infs replaced with max')
else:
    print ('something went wrong with inf handling')

In [None]:
"""
The dataset is badly skewed: show the number of rows per class.
"""
rows_per_class = df.groupby('YY')['YY'].count()
rows_per_class

In [None]:
"""
Get the medians of the 14 classes across all dimensions.
"""

# One row per class (indexed by 'YY'), one column per feature
per_class = df.groupby('YY')
df_median = per_class[feature_names].median()

print (df_median.shape, df_median, sep='\n')

In [None]:
"""
PCA
Plot them in 2D.
"""
from sklearn.decomposition import PCA

mod_pca = PCA(n_components=10)
X_pca = mod_pca.fit_transform(df_median)
print (X_pca.shape)

# Zero-based indices of the two principal components shown on the plot.
# NOTE(review): these are the 5th and 6th components rather than the first
# two -- confirm this is the intended view.
pc_x, pc_y = 4, 5

fig, ax = plt.subplots()
ax.scatter(X_pca[:, pc_x], X_pca[:, pc_y])
ax.set_xlabel('PC%d' % (pc_x + 1))
ax.set_ylabel('PC%d' % (pc_y + 1))
ax.set_title('PCA of the per-class feature medians')

# Label each point with the class id of its row in df_median. Taking the ids
# from the index keeps labels and points aligned even if a class is missing
# from the data, which a fixed 1..14 list would not.
for i, txt in enumerate(df_median.index):
    ax.annotate(str(txt), (X_pca[i, pc_x], X_pca[i, pc_y]))

In [None]:
"""
Multi Dimensional Scaling (MDS)
Plot them in 2D.
"""
from sklearn.manifold import MDS

# Embed directly into 2 dimensions. MDS coordinates are not ordered by
# importance, so fitting 3 dimensions and drawing only two of them would show
# an arbitrary projection of the 3-D solution rather than the best 2-D layout.
mod_mds = MDS(n_components=2, n_init=10, verbose=1, random_state=42)
X_mds = mod_mds.fit_transform(df_median)
print (X_mds.shape)

fig, ax = plt.subplots()
ax.scatter(X_mds[:, 0], X_mds[:, 1])
ax.set_xlabel('MDS dimension 1')
ax.set_ylabel('MDS dimension 2')
ax.set_title('MDS of the per-class feature medians')

# Label each point with the class id of its row in df_median. Taking the ids
# from the index keeps labels and points aligned even if a class is missing
# from the data, which a fixed 1..14 list would not.
for i, txt in enumerate(df_median.index):
    ax.annotate(str(txt), (X_mds[i, 0], X_mds[i, 1]))

In [None]:
"""
Logistic regression to understand the most important parameters
"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns: learn the scaling, then apply it to the
# same data.
mod_scaler = StandardScaler()
mod_scaler.fit(df[feature_names])
X_std = mod_scaler.transform(df[feature_names])

# NOTE(review): max_iter=10 is a very small iteration budget; the solver
# likely stops before converging, which would make the fitted coefficients
# unreliable as an importance measure -- confirm this cap is deliberate.
mod_log = LogisticRegression(max_iter=10).fit(X_std, df['YY'])

# Predictions on the training data itself (there is no held-out split here)
y_pred = mod_log.predict(X_std)


In [None]:
# XXX - needs to be updated to python 3.10 and the latest version of
# scikit-learn for a better API
from sklearn.metrics import ConfusionMatrixDisplay

# Rows and columns of the matrix follow the class order the model was fitted with
class_order = mod_log.classes_
cm = confusion_matrix(y_true=df['YY'], y_pred=y_pred, labels=class_order)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot()
plt.show()