#### Installations

In [None]:
pip install pandas

In [None]:
pip install --upgrade pip

In [None]:
pip install seaborn

In [None]:
pip install scikit-learn-extra

Notes: in comments, click to view
<!-- 
### Features: first - absolute value of loadings, second - weighted sum of loadings, third - weighted sum + limited components to capture 95% variance

# IoT23
# features = np.array(['conn_state', 'history', 'id.orig_p', 'id.resp_p', 'orig_bytes', 'orig_pkts', 'proto', 'resp_bytes', 'resp_pkts', 'service'])
# features = np.array(['conn_state', 'history', 'id.orig_p', 'id.resp_h', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_bytes', 'resp_ip_bytes', 'resp_pkts'])
features = np.array(['conn_state', 'history', 'id.orig_p', 'missed_bytes', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_bytes', 'resp_ip_bytes','resp_pkts'])
# NB15
#features = np.array(['Djit', 'Dload', 'ct_dst_sport_ltm', 'ct_src_dport_ltm', 'ct_srv_dst', 'ct_state_ttl', 'dmeansz', 'res_bdy_len', 'sloss', 'state'])
#features = np.array(['Djit', 'Dload', 'Sjit', 'ct_srv_dst', 'ct_srv_src', 'ct_state_ttl', 'dmeansz', 'proto', 'state', 'trans_depth'])
#features = np.array(['Djit', 'Dload', 'Sjit', 'Sload', 'ct_srv_dst', 'ct_srv_src','dur', 'proto', 'sloss', 'state'])
# KDD'99
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_same_src_port_rate', 'Dst_host_same_srv_rate', 'Dst_host_srv_count', 'Flag', 'Logged_in', 'Protocol_type', 'Serror_rate', 'Service'])
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_serror_rate', 'Dst_host_srv_count', 'Logged_in', 'Protocol_type', 'Same_srv_rate', 'Serror_rate', 'Service', 'Srv_serror_rate'])
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_srv_serror_rate', 'Duration', 'Logged_in', 'Protocol_type', 'Same_srv_rate', 'Serror_rate', 'Service', 'Srv_serror_rate'])
# NSL-KDD
# features = np.array(['Diff Srv Rate', 'Dst Host Count', 'Dst Host Diff Srv Rate', 'Dst Host Same Src Port Rate', 'Dst Host Same Srv Rate', 'Flag', 'Logged In', 'Protocol Type', 'Same Srv Rate', 'Service'])
# features = np.array(['Count', 'Diff Srv Rate', 'Dst Host Diff Srv Rate', 'Dst Host Rerror Rate', 'Dst Host Same Src Port Rate', 'Dst Host Srv Rerror Rate', 'Logged In', 'Rerror Rate', 'Service', 'Srv Rerror Rate'])
# features = np.array(['Count', 'Diff Srv Rate', 'Dst Host Count', 'Dst Host Diff Srv Rate', 'Dst Host Same Src Port Rate', 'Dst Host Srv Rerror Rate', 'Logged In', 'Rerror Rate', 'Service', 'Srv Rerror Rate'])
features = features.tolist()
print(features) -->

### Helper Functions: Algorithms & Clustering

**Algorithms**

In [2]:
# Code for KMeans

import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal
from sklearn.metrics import silhouette_score

"""
Performs KMeans clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def kmeans_clustering(selected_features,mode, n_clusters=2, max_iter=300):
    """
    Perform KMeans clustering on the input samples
    
    Parameters:
        samples: array-like, shape (n_samples, n_features)
        n_clusters: int, number of clusters (default=2)
        max_iter: int, maximum iterations (default=300)
    
    Returns:
        silhouette_coef: silhouette coefficient score
    """
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    best_k = n_clusters

    try:
        k_options = range(2, 6)
        best_k = max(k_options, key=lambda k: silhouette_score(X_scaled, KMeans(n_clusters=k).fit_predict(X_scaled)))
    except:
        best_k = 2

    k_means = KMeans(n_clusters=best_k, max_iter=max_iter)
    k_means.fit(X_scaled)
    if mode == 0:
        try:
            silhouette_coef = silhouette_score(X_scaled, k_means.labels_)
        except ValueError:
            silhouette_coef = -1  # Assigning lowest score if clustering fails
        return silhouette_coef, k_means.labels_
    if mode == 1:
        return k_means.labels_

In [3]:
# EM Clustering Code

from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


"""
Performs EM clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def em_clustering(selected_features, mode, n_clusters=2):
    """
    Perform EM Clustering on selected features and return silhouette score.
        
    Returns:
    --------
    float
        Silhouette score of the clustering (-1 if clustering fails)
    """
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize and fit the EM model
    em_model = GaussianMixture(
        n_components=n_clusters,
        #random_state=0, #THOUGHTS: We can improve this later to have an array of seeds to select from to observe variations
        n_init=10  # Multiple initializations to avoid local optima
    )
    
   
    try:
        # Fit the model and get cluster assignments
        em_model.fit(X_scaled)
        labels = em_model.predict(X_scaled)
        
        # Calculate silhouette score
        silhouette_coef = silhouette_score(X_scaled, labels)
    except Exception as e:
        #print(f"Clustering failed: {str(e)}")
        silhouette_coef = -1  # Assigning lowest score if clustering fails
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [4]:
# DBSCAN Detection method: 
# I put 'optimization part' in 'DBSCAN_Optimization_Code.ipynb' file. 
# We can use optimization after initial run to do a comparison and analysis in our paper to show improvements.

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# DONE: keep -1 --> they will be its own cluster
"""
Performs DBSCAN clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def dbscan_clustering(selected_features, mode, eps=0.5, min_samples=5):
    """
    Perform DBSCAN clustering on selected features
    
    Parameters:
    selected_features : pandas DataFrame
        The features selected for clustering
    eps : float
        The maximum distance between two samples for them to be considered neighbors
    min_samples : int
        The number of samples in a neighborhood for a point to be considered a core point
        
    Returns:
    float : silhouette coefficient
    dict : additional clustering information
    """

    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize and fit DBSCAN
    #min_samples = max(5, int(len(X) * 0.01)) # use 1% of the data as the size of the smallest sample, if this value is less than 5, default to 5
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(X_scaled)

    if -1 in labels:
        labels[labels == -1] = max(labels) + 1

    # Get number of clusters (excluding noise points which are labeled -1, K Medoids does not have noise points)
    n_clusters = len(set(labels))
    
    # calculate silhouette score if more than one cluster and  noise points
    if n_clusters > 1:
        silhouette_coef = silhouette_score(X_scaled, labels)
    else:
        silhouette_coef = -1  # Assign lowest score if clustering fails

    
    # NOTE: -- Uncomment when we analyze and optimize ---- Additional clustering information
    # info = {
    #     'n_clusters': n_clusters,
    #     'n_noise': list(labels).count(-1),
    #     'labels': labels,
    #     'cluster_sizes': pd.Series(labels).value_counts().to_dict()
    # }
    
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels


In [5]:
# Code for K Medoids
import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

"""
Performs KMediods clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def modified_kmedoids_clustering(selected_features, mode, n_clusters=2):
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    best_k = n_clusters
    try:
        k_options = range(2, 6)
        best_k = max(k_options, key=lambda k: silhouette_score(X_scaled, KMedoids(n_clusters=k, method='alternate', nit='k-medoids++', max_iter=1500).fit_predict(X_scaled)))
    except:
        best_k = 2

     # Initialize and fit the K-Medoids model
    kmedoids = KMedoids(n_clusters=best_k, method='alternate', init='k-medoids++', max_iter=1500)
    

    # Calculate silhouette score
    try:
        labels = kmedoids.fit_predict(X_scaled)
        if len(set(labels)) > 1:
            silhouette_coef = silhouette_score(X_scaled, labels)
        else:
            silhouette_coef = -1 # Assigning lowest score if there is only 1 cluster
    except Exception as e:
        silhouette_coef = -1  # Assigning lowest score if clustering fails
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [6]:
# Code for K Medoids
import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

"""
Performs KMediods clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def kmedoids_clustering(selected_features, mode, n_clusters=2):
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

     # Initialize and fit the K-Medoids model
    kmedoids = KMedoids(n_clusters=n_clusters, method='pam', max_iter=1500)
    

    # Calculate silhouette score
    try:
        labels = kmedoids.fit_predict(X_scaled)
        silhouette_coef = silhouette_score(X_scaled, labels)
    except ValueError:
        silhouette_coef = -1  # Assigning lowest score if clustering fails
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [7]:
# Code for Mean Shift
import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

"""
Performs Mean Shift clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def modified_meanshift_clustering(selected_features, mode, quantile=0.3, n_samples=500):
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Estimate optimal bandwidth
    bandwidth = estimate_bandwidth(X_scaled, quantile=quantile, n_samples=n_samples)
    if bandwidth <= 0:
        bandwidth = 1.0  # Fallback in case of extremely small bandwidth
        
    # Initialize and fit the Mean Shift model
    meanshift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # Calculate silhouette score
    try:
        # print("in try")
        labels = meanshift.fit_predict(X_scaled)
        n_clusters = len(set(labels))

        # if -1 in labels:
        #     labels[labels == -1] = n_clusters - 1
        # Check the number of clusters determined 
        #n_clusters = len(np.unique(labels))
        #print(f"Number of clusters found: {n_clusters}")
        if n_clusters > 1:
            silhouette_coef = silhouette_score(X_scaled, labels)
        else:
            silhouette_coef = -1
    except Exception as e:
        # print("in except")
        silhouette_coef = -1  # Assign lowest score if clustering fails
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [8]:
# Codes for Mean Shift
import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

"""
Performs Mean Shift clustering using the data from selected_features.
If mode = 0, the silhouette score of the clustering is returned.
If mode = 1, the labels of the clustering is returned.
"""
def meanshift_clustering(selected_features, mode, bandwidth=None):
    # Filter the selected features
    X = selected_features
    
    # Standardize selected features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize and fit the Mean Shift model
    meanshift = MeanShift(bandwidth=bandwidth)
    
    # Calculate silhouette score
    try:
        labels = meanshift.fit_predict(X_scaled)
        # Check the number of clusters determined 
        n_clusters = len(np.unique(labels))
        #print(f"Number of clusters found: {n_clusters}")
        silhouette_coef = silhouette_score(X_scaled, labels)
    except ValueError:
        silhouette_coef = -1  # Assign lowest score if clustering fails
    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [9]:
"""
Converts the binary value of state (which represents features selected) 
to both list features and string output res. If mode = 0, returns features.
If mode = 1, returns res.
"""
def bin_to_features(state, mode):
  state_bin = bin(state)
  #print(state_bin)
  state_bin_arr = np.array([b for b in state_bin[2:]])
  #pad with zeros
  diff = len(FEATURES) - len(state_bin_arr)
  padded_arr = np.insert(state_bin_arr, 0, ['0' for i in range(diff)])
  (padded_arr)

  # identify which indexes are 1
  idx = (np.where(padded_arr == '1')[0]).tolist()
  #print(idx)
  # select feature headings
  selected_features = original_features.iloc[:,idx]
  features = selected_features.columns.tolist()
  res = f"Features Used: {features}"
  if mode == 0: # return actual feature list
    return selected_features
  if mode == 1: # return string of feature list
    return res

  # # force length to always be == to length of features
  # state_bin = bin(state)[2:].zfill(len(FEATURES))
  # state_bin_arr = np.array(state_bin)

  # # identify which indexes are 1
  # idx = (np.where(state_bin_arr == '1')[0]).tolist()
  # #print(idx)
  # # select feature headings
  # selected_features = original_features.iloc[:,idx]
  # features = selected_features.columns.tolist()
  # res = f"Features Used: {features}"
 




### Main Code

In [10]:
# features = np.array(['action', 'availability', 'device_type:1', 'direction',
#        'event_type', 'interface_status', 'patch_description',
#        'patch_status', 'severity:1', 'traffic_direction'])

# Replace feature list input with selected features
# features = np.array(['action', 'behavior_name', 'event_id:4', 'exploit_available',
#  'file_signature_status', 'network_interface', 'patch_name', 'severity',
#  'tcp_flags', 'vulnerability_solution']) # features for generated data complete

#features = np.array(['avg_bytes_sent','avg_bytes_received','avg_packets_transferred','avg_flow_duration','recent_tcp_flags','recent_protocol','avg_cpu_usage','avg_memory_usage','avg_disk_usage','avg_uptime']) # features for generated data

### Features: first - absolute value of loadings, second - weighted sum of loadings, third - weighted sum + limited components to capture 95% variance

# IoT23
#features = np.array(['conn_state', 'history', 'id.orig_p', 'id.resp_p', 'orig_bytes', 'orig_pkts', 'proto', 'resp_bytes', 'resp_pkts', 'service'])
#features = np.array(['conn_state', 'history', 'id.orig_p', 'id.resp_h', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_bytes', 'resp_ip_bytes', 'resp_pkts'])
#features = np.array(['conn_state', 'history', 'id.orig_p', 'missed_bytes', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_bytes', 'resp_ip_bytes','resp_pkts'])
# NB15
#features = np.array(['Djit', 'Dload', 'ct_dst_sport_ltm', 'ct_src_dport_ltm', 'ct_srv_dst', 'ct_state_ttl', 'dmeansz', 'res_bdy_len', 'sloss', 'state'])
#features = np.array(['Djit', 'Dload', 'Sjit', 'ct_srv_dst', 'ct_srv_src', 'ct_state_ttl', 'dmeansz', 'proto', 'state', 'trans_depth'])
#features = np.array(['Djit', 'Dload', 'Sjit', 'Sload', 'ct_srv_dst', 'ct_srv_src','dur', 'proto', 'sloss', 'state'])
features = np.array(['Dpkts', 'Spkts', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_dport_ltm', 'dbytes', 'dloss', 'dmeansz', 'dwin', 'swin'])
# KDD'99
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_same_src_port_rate', 'Dst_host_same_srv_rate', 'Dst_host_srv_count', 'Flag', 'Logged_in', 'Protocol_type', 'Serror_rate', 'Service'])
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_serror_rate', 'Dst_host_srv_count', 'Logged_in', 'Protocol_type', 'Same_srv_rate', 'Serror_rate', 'Service', 'Srv_serror_rate'])
# features = np.array(['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_srv_serror_rate', 'Duration', 'Logged_in', 'Protocol_type', 'Same_srv_rate', 'Serror_rate', 'Service', 'Srv_serror_rate'])
# features = np.array(['Dst_host_rerror_rate', 'Dst_host_serror_rate', 'Dst_host_srv_rerror_rate', 'Dst_host_srv_serror_rate', 'Logged_in', 'Protocol_type', 'Rerror_rate', 'Serror_rate', 'Srv_rerror_rate', 'Srv_serror_rate'])
# NSL-KDD
# features = np.array(['Diff Srv Rate', 'Dst Host Count', 'Dst Host Diff Srv Rate', 'Dst Host Same Src Port Rate', 'Dst Host Same Srv Rate', 'Flag', 'Logged In', 'Protocol Type', 'Same Srv Rate', 'Service'])
# features = np.array(['Count', 'Diff Srv Rate', 'Dst Host Diff Srv Rate', 'Dst Host Rerror Rate', 'Dst Host Same Src Port Rate', 'Dst Host Srv Rerror Rate', 'Logged In', 'Rerror Rate', 'Service', 'Srv Rerror Rate'])
# features = np.array(['Count', 'Diff Srv Rate', 'Dst Host Count', 'Dst Host Diff Srv Rate', 'Dst Host Same Src Port Rate', 'Dst Host Srv Rerror Rate', 'Logged In', 'Rerror Rate', 'Service', 'Srv Rerror Rate'])
# features = np.array([' Serror Rate', 'Count', 'Dst Host Rerror Rate', 'Dst Host Srv Rerror Rate', 'Dst Host Srv Serror Rate', 'Flag', 'Logged In', 'Rerror Rate', 'Srv Rerror Rate', 'Srv Serror Rate'])
# Bot-IoT
# features = np.array(['AR_P_Proto_P_DstIP', 'AR_P_Proto_P_Sport', 'AR_P_Proto_P_SrcIP', 'flgs', 'ltime', 'mean', 'pkts', 'proto_number', 'rate', 'stime'])
# features = np.array(['AR_P_Proto_P_DstIP', 'AR_P_Proto_P_Sport', 'AR_P_Proto_P_SrcIP', 'flgs', 'pkts', 'proto', 'proto_number', 'rate', 'srate', 'state'])

features = features.tolist()
print(features)

['Dpkts', 'Spkts', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_dport_ltm', 'dbytes', 'dloss', 'dmeansz', 'dwin', 'swin']


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# FEATURES = {0: 'avg_bytes_sent', 1: 'avg_bytes_received', 2: 'avg_packets_transferred', 
#   3: 'avg_flow_duration', 4: 'recent_tcp_flags', 5: 'recent_protocol', 6: 'avg_cpu_usage', 
#   7: 'avg_memory_usage', 8: 'avg_disk_usage', 9: 'avg_uptime'}
# features = list(FEATURES.values())
FEATURES = {k:str(v) for k,v in zip(range(len(features)), features) }
#print(FEATURES)

data = pd.read_csv("../data/real-world/data-cleaning/cleaned_NB15_1_sub1.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_iot23_RW21.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_kdd_sample_1.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_nsl_kdd_sample_1.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_Bot-IoT_sample1.csv")


#print(data.head(10))


ALGORITHMS = {0: 'DBSCAN Clustering', 1: 'Mean Shift', 2: 'K-Mediods', 3: 'EM Clustering', 4: 'K-Means'}
NUM_ALG = len(ALGORITHMS)
original_features = data[features].copy(deep = True)
print(original_features.head(10))

scaler = StandardScaler()
original_features_scaled = scaler.fit_transform(original_features)
data['uid'] = data.index # comment out for IoT-23/BoT-IoT
# uid = "pkSeqID" # use only for Bot-IoT
uid = "uid" # comment out for Bot-IoT
ips = data[uid]
#print(original_features.columns)
#print(ips.head(10))

def algorithm_prep(state, action, mode):
  selected_features = bin_to_features(state, 0)
  
  # call algorithm function
  out = None
  #print('algorithm:',ALGORITHMS[action])

  # if mode = 0, output is the silhouette coefficient
  # if mode = 1, output is the cluster labelling
  match action:   
    case 0:
      #print('algorithm:',ALGORITHMS[action])
      out = dbscan_clustering(selected_features, mode)
    case 1: 
      #print('algorithm:',ALGORITHMS[action])
      out = modified_meanshift_clustering(selected_features, mode)
    case 2:
      #print('algorithm:',ALGORITHMS[action])
      out = modified_kmedoids_clustering(selected_features, mode)
    case 3: 
      out = em_clustering(selected_features, mode)
    case 4:
      out = kmeans_clustering(selected_features, mode)
      
  # if test == 1 and mode == 1: # calculate and return overall silhouette
  #   try:
  #     out = silhouette_score(original_features_scaled, out)
  #   except ValueError:
  #     out = -1
  #     sil_scores = silhouette_samples(data, out)
  #     unique_labels, counts = np.unique(out, return_counts=True)

  #     weighted_sum = sum(sil_scores[out == label].mean() * count for label, count in zip(unique_labels, counts))

  #     return (weighted_sum / len(data))
  #   except ValueError:
  #     return -1
    
  # return silhouette/cluster labels from algorithm function
  return out
    
    

"""
Converts the binary value of state (which represents features selected) 
to both list features and string output res. If mode = 0, returns features.
If mode = 1, returns res.
"""
def bin_to_features(state, mode):
  state_bin = bin(state)
  #print(state_bin)
  state_bin_arr = np.array([b for b in state_bin[2:]])
  #pad with zeros
  diff = len(FEATURES) - len(state_bin_arr)
  padded_arr = np.insert(state_bin_arr, 0, ['0' for i in range(diff)])
  (padded_arr)

  # identify which indexes are 1
  idx = (np.where(padded_arr == '1')[0]).tolist()
  #print(idx)
  # select feature headings
  selected_features = original_features.iloc[:,idx]
  features = selected_features.columns.tolist()
  res = f"Features Used: {features}"
  if mode == 0: # return actual feature list
    return selected_features
  if mode == 1: # return string of feature list
    return res
 



   Dpkts  Spkts  ct_dst_sport_ltm  ct_dst_src_ltm  ct_src_dport_ltm  dbytes  \
0      4      4                 1               2                 1     304   
1      2      2                 1               1                 1     178   
2      2      2                 1               1                 1     178   
3      4      4                 1               3                 1     304   
4      2      2                 1               1                 1     162   
5      6     14                 1               2                 1     320   
6     30     28                 1               3                 1    3080   
7     18     14                 1               1                 1   10168   
8     42     40                 1               1                 1   23508   
9      2      2                 1               1                 3     178   

   dloss  dmeansz  dwin  swin  
0      0       76     0     0  
1      0       89     0     0  
2      0       89     0     0  
3 

#### Reinforcement Learning

##### Older one, with comments

In [27]:
# Markov Decision Process (MDP) - The Bellman equations adapted to
# Q Learning.Reinforcement Learning with the Q action-value(reward) function.
# Copyright 2018 Denis Rothman MIT License. See LICENSE.
import numpy as ql
import random
# R is The Reward Matrix for each state
# 1024 configurations of the 10 features --> 2^10
# 5 algorithms
num_configs = 2 ** len(FEATURES)
R = ql.matrix(ql.zeros([num_configs,NUM_ALG]))

# Q is the Learning Matrix in which rewards will be learned/stored
Q = ql.matrix(ql.zeros([num_configs,NUM_ALG]))

cluster_labels_matrix = np.empty(Q.shape, dtype=object)
# Gamma : It's a form of penalty or uncertainty for learning
# If the value is 1 , the rewards would be too high.
# This way the system knows it is learning.
gamma = 0.8

# The possible "a" actions when the agent is in a given state
def possible_actions(state):
    # 2) DONE: we should check Q, not R because R is never modified
    current_state_row = Q[state,]
    # 3) DONE: this should pick valid actions based on what we have not visited
    possible_act = ql.where(current_state_row == 0)[1]
    return possible_act


# This function chooses at random which action to be performed within the range 
# of all the available actions.
def ActionChoice(available_actions_range, state):
    epsilon = 0.95 # 90% exploration
    if len(available_actions_range) > 0:
        if np.random.rand() < epsilon:  
            # Explore: Randomly pick from possible actions
            next_action = int(ql.random.choice(available_actions_range, 1)[0])
        else:
            # Exploit: Pick best action from Q matrix
            next_action = int(np.argmax(Q[state, :]))
    else:
    # If no valid actions, pick randomly from all possible algorithms
        next_action = int(np.random.choice(NUM_ALG, 1)[0])
    
    return next_action
    # if len(available_actions_range) > 0:
    #     next_action = int(ql.random.choice(PossibleAction,1)[0])
    # else:
    #     next_action = int(np.random.choice(NUM_ALG,1)[0])
    # return next_action


# A version of Bellman's equation for reinforcement learning using the Q function
# This reinforcement algorithm is a memoryless process
# The transition function T from one state to another
# is not in the equation below.  T is done by the random choice above

def reward(current_state, action, gamma):
    Max_State = ql.where(Q[action,] == ql.max(Q[action,]))[1]

    if Max_State.shape[0] > 1:
        Max_State = int(ql.random.choice(Max_State, size = 1)[0])
    else:
        Max_State = int(Max_State[0])

    # 5) DONE: we think this is a typo and action/Max_State should be switched. 
    # MaxValue = Q[action, Max_State]
    MaxValue = Q[Max_State, action]

    # 6) DONE: call function to run ML algorithm using the value of action. this will
    # run the algorithm using the features from current_state, create clusters,
    # and calculate the silhouette value.
    silhouette_co, labels = algorithm_prep(current_state, action, 0) 
    cluster_labels_matrix[current_state, action] = labels
   
    # get silhouette coefficient compared to all available features
    try:
        overall_silhouette_co = silhouette_score(original_features_scaled, labels)
    except ValueError: overall_silhouette_co = -1

    # calculate ratio of selected features 
    ratio = silhouette_co / (overall_silhouette_co + 1e-6)
    if silhouette_co < overall_silhouette_co:
        penalty = 0.1 * ratio
    else: penalty = 0

    # normalized silhouette score for better consistency in reinformcement learning
    if silhouette_co < 0: # ensures RL doesn't learn from bad clustering
        norm_silhouette = 0
    else: norm_silhouette = (silhouette_co + 1) / 2  # Scale from [-1,1] to [0,1]
    
    # Bellman's MDP based Q function
    #Q[current_state, action] = silhouette_co + gamma * MaxValue
    Q[current_state, action] = (norm_silhouette - penalty) + gamma * MaxValue


# Learning over n iterations depending on the convergence of the system
# A convergence function can replace the systematic repeating of the process
# by comparing the sum of the Q matrix to that of Q matrix n-1 in the
# previous episode

# agent_s_state. The agent the name of the system calculating
# s is the state the agent is going from and s' the state it's going to
# this state can be random or it can be chosen as long as the rest of the choices
# are not determined. Randomness is part of this stochastic process
# 1) DONE: decide if starting state is random or a specific state
#agent_s_state = 1

# Get available actions in the current state
#PossibleAction = possible_actions(agent_s_state)

# Sample next action to be performed
#action = ActionChoice(PossibleAction, agent_s_state)

# Rewarding Q matrix
#reward(agent_s_state,action,gamma)


state_epsilon = 0.95 # 5% exploration
visited_pairs = np.zeros(Q.shape, dtype=bool)
for a in range(NUM_ALG):
    visited_pairs[0,a] = True # to skip all null feature configs

convergence_threshold = 0.01  
previous_Q = Q.copy()
iteration_buffer = num_configs * NUM_ALG

for i in range(10000):
    print("Iteration:", i)
    
    # visit all states first, then allow full access to any state
    unvisited_pairs = np.argwhere(visited_pairs == False)
    state_epsilon = max(0.1, 0.95 * (0.99 ** i))

    if len(unvisited_pairs) > 0:
        current_pair = unvisited_pairs[ql.random.choice(len(unvisited_pairs))]
        current_state, action = current_pair
    else:
        if len(unvisited_pairs) == 0:
            print('all pairs visited')
        if ql.random.rand() < state_epsilon: # explore
            current_state = ql.random.randint(1, int(Q.shape[0]))
        else: # exploit past good states
            k = 10
            top_k_states = np.argsort(np.array(Q.sum(axis=1)).flatten())[-k:]
            current_state = np.random.choice(top_k_states)
        # Get available actions in the current state
        PossibleAction = possible_actions(current_state)
        # Sample next action to be performed
        action = ActionChoice(PossibleAction, current_state)
    
    visited_pairs[current_state, action] = True  
    
    print("Algorithm:", ALGORITHMS[action])
    # Rewarding Q matrix
    reward(current_state,action,gamma)

    if i > iteration_buffer: # make sure it doesn't stop too early
    # check for convergence in Q to stop updates
        Q_diff = np.abs(Q - previous_Q).sum()
        if Q_diff < convergence_threshold:
            print(f"Converged at iteration {i} with Q_diff={Q_diff:.4f}")
            break

    previous_Q = Q.copy() # update for comparison
    # 95% of the time, we choose the random action and state 
    
# Displaying Q before the norm of Q phase
print("Q  :")
print(Q)

# Norm of Q
print("Normed Q :")
print(Q/ql.max(Q)*100)

# DONE: get maximum value from Q-Learning Matrix
normed_Q = Q/ql.max(Q)*100
max_location = np.where(normed_Q==normed_Q.max())
print("\nmax value located at",max_location)
max_config = max_location[0][0]
max_algorithm = max_location[1][0]
final_feats = bin_to_features(max_config, 1)
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])
#DONE: print(f"Selected features:")

# DONE: get final cluster labels
cluster_labels = cluster_labels_matrix[max_config, max_algorithm]

# DONE: match data in clusters to IP addresses
labelled_data = data.copy()
labelled_data['cluster'] = cluster_labels

# DONE: return what IPs are likely anomalous
# see what clusters have < 5% of the data
# get unique values in cluster column
num_clusters = labelled_data['cluster'].nunique()

# for each unique value, get the count / len of data (aka percentage)
# num_clusters = labelled_data['cluster'].nunique()
cluster_array = labelled_data['cluster'].to_numpy()
perc_values = np.unique(cluster_array,return_counts = True)[-1]
percentages = perc_values / labelled_data.shape[0]

# keep cluster values with % < 5
idx = (np.where(percentages <= 0.1)[0]).tolist()
anomalies = labelled_data.loc[labelled_data['cluster'].isin(idx)]
# output IPs within those selected clusters

# for i in anomalies[uid]: # replace with unique ID here
#     print(f"\nIP {i} is a potential anomaly")

Iteration: 0
Algorithm: K-Means
Iteration: 1
Algorithm: DBSCAN Clustering
Iteration: 2
Algorithm: K-Mediods
Iteration: 3
Algorithm: EM Clustering
Iteration: 4
Algorithm: K-Mediods
Iteration: 5
Algorithm: DBSCAN Clustering
Iteration: 6
Algorithm: K-Mediods
Iteration: 7
Algorithm: EM Clustering
Iteration: 8
Algorithm: K-Means
Iteration: 9
Algorithm: K-Mediods
Iteration: 10
Algorithm: K-Means
Iteration: 11
Algorithm: Mean Shift
Iteration: 12
Algorithm: K-Mediods
Iteration: 13
Algorithm: DBSCAN Clustering
Iteration: 14
Algorithm: Mean Shift
Iteration: 15
Algorithm: EM Clustering
Iteration: 16
Algorithm: K-Mediods
Iteration: 17
Algorithm: EM Clustering
Iteration: 18
Algorithm: K-Means
Iteration: 19
Algorithm: EM Clustering
Iteration: 20
Algorithm: EM Clustering
Iteration: 21
Algorithm: K-Mediods
Iteration: 22
Algorithm: DBSCAN Clustering
Iteration: 23
Algorithm: K-Means
Iteration: 24
Algorithm: Mean Shift
Iteration: 25
Algorithm: K-Means
Iteration: 26
Algorithm: DBSCAN Clustering
Iteration:

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Iteration: 1313
Algorithm: DBSCAN Clustering
Iteration: 1314
Algorithm: Mean Shift
Iteration: 1315
Algorithm: DBSCAN Clustering
Iteration: 1316
Algorithm: EM Clustering
Iteration: 1317
Algorithm: K-Means
Iteration: 1318
Algorithm: Mean Shift
Iteration: 1319
Algorithm: Mean Shift
Iteration: 1320
Algorithm: DBSCAN Clustering
Iteration: 1321
Algorithm: Mean Shift
Iteration: 1322
Algorithm: K-Mediods
Iteration: 1323
Algorithm: EM Clustering
Iteration: 1324
Algorithm: K-Mediods
Iteration: 1325
Algorithm: DBSCAN Clustering
Iteration: 1326
Algorithm: DBSCAN Clustering
Iteration: 1327
Algorithm: K-Mediods
Iteration: 1328
Algorithm: EM Clustering
Iteration: 1329
Algorithm: EM Clustering
Iteration: 1330
Algorithm: EM Clustering
Iteration: 1331
Algorithm: Mean Shift
Iteration: 1332
Algorithm: Mean Shift
Iteration: 1333
Algorithm: Mean Shift
Iteration: 1334
Algorithm: EM Clustering
Iteration: 1335
Algorithm: K-Means
Iteration: 1336
Algorithm: K-Mediods
Iteration: 1337
Algorithm: Mean Shift
Iterati

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Iteration: 2718
Algorithm: K-Mediods
Iteration: 2719
Algorithm: K-Mediods
Iteration: 2720
Algorithm: Mean Shift
Iteration: 2721
Algorithm: Mean Shift
Iteration: 2722
Algorithm: Mean Shift
Iteration: 2723
Algorithm: K-Mediods
Iteration: 2724
Algorithm: K-Mediods
Iteration: 2725
Algorithm: Mean Shift
Iteration: 2726
Algorithm: EM Clustering
Iteration: 2727
Algorithm: EM Clustering
Iteration: 2728
Algorithm: K-Mediods
Iteration: 2729
Algorithm: Mean Shift
Iteration: 2730
Algorithm: EM Clustering
Iteration: 2731
Algorithm: K-Mediods
Iteration: 2732
Algorithm: Mean Shift
Iteration: 2733
Algorithm: K-Means
Iteration: 2734
Algorithm: K-Means
Iteration: 2735
Algorithm: EM Clustering
Iteration: 2736
Algorithm: Mean Shift
Iteration: 2737
Algorithm: EM Clustering
Iteration: 2738
Algorithm: K-Mediods
Iteration: 2739
Algorithm: Mean Shift
Iteration: 2740
Algorithm: Mean Shift
Iteration: 2741
Algorithm: DBSCAN Clustering
Iteration: 2742
Algorithm: Mean Shift
Iteration: 2743
Algorithm: K-Means
Iterat

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Iteration: 5055
Algorithm: Mean Shift
Iteration: 5056
Algorithm: Mean Shift
Iteration: 5057
Algorithm: EM Clustering
Iteration: 5058
Algorithm: Mean Shift
Iteration: 5059
Algorithm: K-Mediods
Iteration: 5060
Algorithm: K-Mediods
Iteration: 5061
Algorithm: K-Mediods
Iteration: 5062
Algorithm: K-Means
Iteration: 5063
Algorithm: Mean Shift
Iteration: 5064
Algorithm: K-Means
Iteration: 5065
Algorithm: Mean Shift
Iteration: 5066
Algorithm: DBSCAN Clustering
Iteration: 5067
Algorithm: EM Clustering
Iteration: 5068
Algorithm: K-Mediods
Iteration: 5069
Algorithm: DBSCAN Clustering
Iteration: 5070
Algorithm: K-Means
Iteration: 5071
Algorithm: Mean Shift
Iteration: 5072
Algorithm: K-Means
Iteration: 5073
Algorithm: K-Means
Iteration: 5074
Algorithm: EM Clustering
Iteration: 5075
Algorithm: DBSCAN Clustering
Iteration: 5076
Algorithm: DBSCAN Clustering
Iteration: 5077
Algorithm: EM Clustering
Iteration: 5078
Algorithm: K-Mediods
Iteration: 5079
Algorithm: K-Means
Iteration: 5080
Algorithm: K-Medi

In [29]:
set(cluster_labels)

{np.int64(0), np.int64(1)}

In [None]:
a = labelled_data.loc[labelled_data['cluster'] == 2]
len(a['Label'])

24

In [28]:
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])
anomalies


Using algorithm EM Clustering and Features Used: ['swin'], max value is: 100.0


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label,uid,cluster


In [336]:
num_clusters

5

In [129]:
algorithm_prep(current_state, 5, 0)
# alg prep: dbscan 0.2s, meanshift 10-16s (modified: 1s), kmediods 0.6s, em clustering 0.3s, kmeans 0.1s


np.float64(0.5744217212371902)

In [126]:
algorithm_prep(current_state, action, 0)




np.float64(0.5744217212371902)

selected_features

##### Current, cleaned up comments

In [13]:
import warnings
warnings.filterwarnings("ignore", message="Cluster .* is empty!")  # Suppresses warning

In [None]:
# Markov Decision Process (MDP) - The Bellman equations adapted to
# Q Learning.Reinforcement Learning with the Q action-value(reward) function.
# Copyright 2018 Denis Rothman MIT License. See LICENSE.
import numpy as ql
import random
from sklearn.metrics import silhouette_score

# R is The Reward Matrix for each state
# 1024 configurations of the 10 features --> 2^10
# 5 algorithms
num_configs = 2 ** len(FEATURES)
R = ql.matrix(ql.zeros([num_configs,NUM_ALG]))

# Q is the Learning Matrix in which rewards will be learned/stored
Q = ql.matrix(ql.zeros([num_configs,NUM_ALG]))

# used to save the labels of each (state, action) combination for later retrieval
cluster_labels_matrix = np.empty(Q.shape, dtype=object)

# Gamma : It's a form of penalty or uncertainty for learning
# If the value is 1 , the rewards would be too high.
# This way the system knows it is learning.
gamma = 0.8

# The possible "a" actions when the agent is in a given state
def possible_actions(state):
    # 2) DONE: we should check Q, not R because R is never modified
    current_state_row = Q[state,]
    # 3) DONE: this should pick valid actions based on what we have not visited
    possible_act = ql.where(current_state_row == 0)[1]
    return possible_act


# This function chooses at random which action to be performed within the range 
# of all the available actions.

def ActionChoice(available_actions_range, state):
    epsilon = 0.95 # 90% exploration
    if len(available_actions_range) > 0:
        if np.random.rand() < epsilon:  
            # Explore: Randomly pick from possible actions
            next_action = int(ql.random.choice(available_actions_range, 1)[0])
        else:
            # Exploit: Pick best action from Q matrix
            next_action = int(np.argmax(Q[state, :]))
    else:
    # If no valid actions, pick randomly from all possible algorithms
        next_action = int(np.random.choice(NUM_ALG, 1)[0])
    
    return next_action
    


# A version of Bellman's equation for reinforcement learning using the Q function
# This reinforcement algorithm is a memoryless process
# The transition function T from one state to another
# is not in the equation below.  T is done by the random choice above

def reward(current_state, action, gamma):
    Max_State = ql.where(Q[action,] == ql.max(Q[action,]))[1]

    if Max_State.shape[0] > 1:
        Max_State = int(ql.random.choice(Max_State, size = 1)[0])
    else:
        Max_State = int(Max_State[0])

    MaxValue = Q[Max_State, action]

    # call function to run ML algorithm using the value of action. this will
    # run the algorithm using the features from current_state, create clusters,
    # and calculate the silhouette value.
    selected_silhouette_co, labels = algorithm_prep(current_state, action, 0)
    cluster_labels_matrix[current_state, action] = labels
    try:
        overall_silhouette_co = silhouette_score(original_features_scaled, labels)
    except ValueError: overall_silhouette_co = -1

    # calculate ratio of selected features 
    ratio = selected_silhouette_co / (overall_silhouette_co + 1e-6)
    if selected_silhouette_co < overall_silhouette_co:
        penalty = 0.1 * ratio
    else: penalty = 0
    
    # Bellman's MDP based Q function

    # normalized silhouette score for better consistency in reinforcement learning
    if selected_silhouette_co < 0: # ensures RL doesn't learn from bad clustering
        norm_silhouette = 0
    else:
        norm_silhouette = (selected_silhouette_co + 1) / 2  # Scale from [-1,1] to [0,1]
    # norm_silhouette = (selected_silhouette_co + 1) / 2
    
    #norm_silhouette = (silhouette_co + 1) / 2  # Scale from [-1,1] to [0,1]
    # Q[current_state, action] = norm_silhouette + gamma * MaxValue
    Q[current_state, action] = (norm_silhouette - penalty) + gamma * MaxValue


# Learning over n iterations depending on the convergence of the system
# A convergence function can replace the systematic repeating of the process
# by comparing the sum of the Q matrix to that of Q matrix n-1 in the
# previous episode

# agent_s_state. The agent the name of the system calculating
# s is the state the agent is going from and s' the state it's going to
# this state can be random or it can be chosen as long as the rest of the choices
# are not determined. Randomness is part of this stochastic process
# 1) DONE: decide if starting state is random or a specific state
#agent_s_state = 1

# Get available actions in the current state
#PossibleAction = possible_actions(agent_s_state)

# Sample next action to be performed
#action = ActionChoice(PossibleAction, agent_s_state)

# Rewarding Q matrix
#reward(agent_s_state,action,gamma)


#state_epsilon = 0.95 # 5% exploration
visited_pairs = np.zeros(Q.shape, dtype=bool)
for a in range(NUM_ALG):
    visited_pairs[0,a] = True # to skip all null feature configs

convergence_threshold = 0.01  
previous_Q = Q.copy()
iteration_buffer = num_configs * NUM_ALG

for i in range(10000):
    print("Iteration:", i)
    
    # visit all states first, then allow full access to any state
    unvisited_pairs = np.argwhere(visited_pairs == False)
    state_epsilon = max(0.1, 0.95 * (0.99 ** i)) # starts at 5% exploration/95% exploitation. exploration increases over time but is capped at 90%. 

    if len(unvisited_pairs) > 0 and ql.random.rand() < state_epsilon:
        current_pair = unvisited_pairs[ql.random.choice(len(unvisited_pairs))]
        current_state, action = current_pair
    else:
        if len(unvisited_pairs) == 0:
            print('all pairs visited')
        if ql.random.rand() < state_epsilon: # explore
            current_state = ql.random.randint(1, int(Q.shape[0]))
        else: # exploit past good states
            k = 10
            top_k_states = np.argsort(np.array(Q.sum(axis=1)).flatten())[-k:]
            current_state = np.random.choice(top_k_states)
        PossibleAction = possible_actions(current_state)
        action = ActionChoice(PossibleAction, current_state)
    visited_pairs[current_state, action] = True  
    
    print("Algorithm:", ALGORITHMS[action])
    reward(current_state,action,gamma)
    #visited_states.add((current_state, action))

    if i > iteration_buffer: # make sure it doesn't stop too early
    # check for convergence in Q to stop updates
        Q_diff = np.abs(Q - previous_Q).sum()
        if Q_diff < convergence_threshold:
            print(f"Converged at iteration {i} with Q_diff={Q_diff:.4f}")
            break

    previous_Q = Q.copy() # update for comparison
    # 95% of the time, we choose the random action and state 
    
# Displaying Q before the norm of Q phase
print("Q:")
print(Q)

# Norm of Q
print("Normed Q:")
print(Q/ql.max(Q)*100)

# DONE: get maximum value from Q-Learning Matrix
normed_Q = Q/ql.max(Q)*100
max_location = np.where(normed_Q==normed_Q.max())
print("\nmax value located at",max_location)
max_config = max_location[0][0]
max_algorithm = max_location[1][0]
final_feats = bin_to_features(max_config, 1)
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])
#DONE: print(f"Selected features:")

# get final cluster labels
cluster_labels = cluster_labels_matrix[max_config, max_algorithm]

# match data to their clusters
labelled_data = data.copy()
labelled_data['cluster'] = cluster_labels

# get total number of clusters
num_clusters = labelled_data['cluster'].nunique()

### filter clusters based on percentage of data

# # for each unique value, get the count / len of data (aka percentage)
# cluster_array = labelled_data['cluster'].to_numpy()
# perc_values = np.unique(cluster_array,return_counts = True)[-1]
# percentages = perc_values / labelled_data.shape[0]

# # keep cluster values with % < 10 as anomalous
# idx = (np.where(percentages <= 0.1)[0]).tolist()
# anomalies = labelled_data.loc[labelled_data['cluster'].isin(idx)]


### filter anomalous clusters by size relative to the data statistics
cluster_sizes = labelled_data['cluster'].value_counts(normalize=True)
mean_size = cluster_sizes.mean()
std_dev = cluster_sizes.std()
flag_val = mean_size
anomalous_clusters = cluster_sizes[cluster_sizes < flag_val].index
anomalies = labelled_data[labelled_data['cluster'].isin(anomalous_clusters)]

# if none fall below the threshold, check if smallest two are statistically different
if len(anomalous_clusters) == 0:
    sorted_clusters = cluster_sizes.sort_values()
    sm, sec_sm = sorted_clusters.iloc[0], sorted_clusters.iloc[1]

    if sm < (0.8 * sec_sm):  # sm is at least 20% smaller than sec_sm
        anomalies = labelled_data[labelled_data['cluster'] == sorted_clusters.index[0]]

 



In [22]:
cluster_sizes = labelled_data['cluster'].value_counts(normalize=True)
cluster_sizes

cluster
0    0.701
1    0.299
Name: proportion, dtype: float64

In [25]:
# set(cluster_labels)
a = labelled_data.loc[labelled_data['cluster'] == 0]
set(a['Label'])

{0, 1}

In [16]:
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])
anomalies


Using algorithm DBSCAN Clustering and Features Used: ['ct_src_dport_ltm', 'dloss', 'swin'], max value is: 100.0


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label,uid,cluster
9,59.166.0.9,10607,0,53,0,0,0.001050,146,178,31,...,4,3,3,4,3,1,1,0,9,2
11,10.40.182.1,0,7,0,2,2,50.004372,384,0,1,...,2,4,4,2,2,4,2,0,11,3
19,59.166.0.3,14947,6,6881,1,1,0.591447,13558,548216,31,...,14,11,5,5,3,1,3,0,19,14
20,59.166.0.3,26273,5,22,1,1,1.436850,12472,12716,31,...,10,7,6,10,4,4,4,0,20,4
22,59.166.0.5,28059,8,53,0,0,0.000930,130,162,31,...,3,3,6,3,3,1,1,0,22,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,175.45.176.0,42272,19,520,0,3,0.000006,1144,0,254,...,5,5,3,2,2,1,2,1,1988,3
1989,59.166.0.9,22341,9,6881,1,1,0.638689,1540,1644,31,...,16,17,8,5,5,1,5,0,1989,11
1991,59.166.0.3,63109,8,80,1,0,1.162022,890,6216,31,...,1,3,10,6,2,2,4,0,1991,8
1996,59.166.0.4,58735,2,6881,1,1,5.848744,24848,1094788,31,...,10,12,4,7,1,1,2,0,1996,7


#### ignore

In [27]:
import numpy as np
# percentages = labelled_data['cluster'].value_counts().values
# percentages = percentages / labelled_data.shape[0]
# print(percentages)
# print(labelled_data['cluster'].value_counts())
cluster_array = labelled_data['cluster'].to_numpy()
perc_vals = np.unique(cluster_array,return_counts = True)[-1]
print(perc_vals)
percentages = perc_vals / labelled_data.shape[0]
print(percentages)
idx = (np.where(percentages <= 0.1)[0]).tolist()
anomalies = labelled_data.loc[labelled_data['cluster'].isin(idx)]
print(anomalies)

[296   1]
[0.996633 0.003367]
    device_name      source_ip  avg_bytes_sent  avg_bytes_received  \
142  Device-227  192.168.0.226          9973.5              4648.0   

     avg_packets_transferred  avg_flow_duration  recent_tcp_flags  \
142                   303.75            7694.75                 2   

     recent_protocol  avg_cpu_usage  avg_memory_usage  avg_disk_usage  \
142                1         29.545            58.905           58.96   

     avg_uptime  cluster  
142       426.0        1  


In [None]:
#print(max_location)
np.where(normed_Q==normed_Q.max())
#normed_Q[normed_Q > 99.]
#max_location = np.where(normed_Q==normed_Q.max())

matrix([[ 99.99998773, 100.        ,  99.99998111,  99.9512457 ,
          99.9512457 ,  99.95124659,  99.99917226,  99.99991577,
          99.9991523 ,  99.99991297,  99.95122339,  99.95121864]])

In [101]:
normed_Q = Q/ql.max(Q)*100
max_location = np.where(normed_Q==normed_Q.max())
print("max value located at",max_location)
max_config = max_location[0][0]
max_algorithm = ALGORITHMS[max_location[1][0]]
print(f"Using algorithm {max_algorithm} and feature configuration {max_config}, max value is:",normed_Q[278,0])



max value located at (array([278]), array([0]))
278
Using algorithm K-Means and feature configuration 278, max value is: 100.0


### Performance Metrics

In [17]:
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score

In [18]:
# NB15 = Label | IOT = label | KDD'99 = Label | NSL-KDD = Class_Bin | Bot-IoT = attak

labels = data['Label']
result_labels = pd.DataFrame(columns=["true", "pred"])
result_labels['uid'] = ips
result_labels['true'] = labels
#print(result_labels)
result_labels.loc[result_labels['uid'].isin(anomalies[uid]), 'pred'] = 1
result_labels.loc[~result_labels['uid'].isin(anomalies[uid]), 'pred'] = 0
print(result_labels.loc[result_labels['true'] == 1])


      true pred   uid
15       1    0    15
93       1    1    93
129      1    1   129
229      1    1   229
260      1    0   260
...    ...  ...   ...
1775     1    1  1775
1782     1    0  1782
1810     1    1  1810
1820     1    0  1820
1988     1    1  1988

[63 rows x 3 columns]


In [19]:
y_true = result_labels['true'].astype(int)
y_pred = result_labels['pred'].astype(int)

pr_auc = average_precision_score(y_true, y_pred)

roc_auc = roc_auc_score(y_true, y_pred)

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()

In [20]:
print("PR AUC:", pr_auc)
print("ROC AUC:", roc_auc)
print("Accuracy:", acc)
print("F1:", f1)
print(f"Precision: {precision}, Recall: {recall}")
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

PR AUC: 0.058444224393392606
ROC AUC: 0.688136621022527
Accuracy: 0.738
F1: 0.13245033112582782
Precision: 0.07393715341959335, Recall: 0.6349206349206349
TP: 40, TN: 1436, FP: 501, FN: 23


test

In [59]:
### filter anomalous clusters by size relative to the data statistics
cluster_sizes = labelled_data['cluster'].value_counts(normalize=True)
mean_size = cluster_sizes.mean()
std_dev = cluster_sizes.std()
flag_val = max(0, mean_size)

median_size = np.median(cluster_sizes)

Q1 = np.percentile(cluster_sizes, 25)
Q3 = np.percentile(cluster_sizes, 75)
IQR = Q3 - Q1 # less sensitive to outliers than mean
# flag_val = max(0, median_size - 1.5 * IQR)

anomalous_clusters = cluster_sizes[cluster_sizes <= flag_val].index
anomalies_2 = labelled_data[labelled_data['cluster'].isin(anomalous_clusters)]

# if none fall below the threshold, check if smallest two are statistically different
if len(anomalous_clusters) == 0:
    sorted_clusters = cluster_sizes.sort_values()
    sm, sec_sm = sorted_clusters.iloc[0], sorted_clusters.iloc[1]

    if sm < (0.8 * sec_sm):  # sm is at least 20% smaller than sec_sm
        anomalies_2 = labelled_data[labelled_data['cluster'] == sorted_clusters.index[0]]

In [87]:
flag_val

np.float64(0.16666666666666666)

In [69]:
cluster_sizes

cluster
0    0.999087
1    0.000304
2    0.000304
3    0.000304
Name: proportion, dtype: float64

In [464]:
# set(cluster_labels)
a = labelled_data.loc[labelled_data['cluster'] == 9]
set(a['Label'])


{0}

In [60]:
labels_2 = data['label']
result_labels_2 = pd.DataFrame(columns=["true", "pred"])
result_labels_2['uid'] = ips
result_labels_2['true'] = labels_2
#print(result_labels)
result_labels_2.loc[result_labels_2['uid'].isin(anomalies_2[uid]), 'pred'] = 1
result_labels_2.loc[~result_labels_2['uid'].isin(anomalies_2[uid]), 'pred'] = 0
print(result_labels_2.loc[result_labels_2['true'] == 1])

      true pred                 uid
20       1    1  CaFD7t4x11rNjJyXpe
22       1    1   C2gDegk8EvlOlAaJi
25       1    1  CY9Vox2T9RVyTvV5Zc
2034     1    1  CChUAn2y9k075ut2L4
2035     1    1  CPQreG2wGyBw1Fdqa2
2046     1    1  C5eeVN1ibcDUKaR0na
2048     1    1  CUpb0u1h3TX688LlAi
2049     1    1  CImwQs4If4Eyj29ZQ6
2050     1    1   CzVVJTZ4ssdqivqr1
2052     1    1  C0qgcP3QHZRIkuOkm8
2053     1    1  C50Is73ulQc4Xfpv5e
2055     1    1  Cqx3Nn1myl6TuT2YP6
3146     1    1   CeXTE32bb03yltkRV
3285     1    1  CJ0N5q2qFD0t1nCl3j


In [61]:
y_true_2 = result_labels_2['true'].astype(int)
y_pred_2 = result_labels_2['pred'].astype(int)

pr_auc_2 = average_precision_score(y_true_2, y_pred_2)

roc_auc_2 = roc_auc_score(y_true_2, y_pred_2)

acc_2 = accuracy_score(y_true_2, y_pred_2)
f1_2 = f1_score(y_true_2, y_pred_2)
precision_2 = precision_score(y_true_2, y_pred_2)
recall_2 = recall_score(y_true_2, y_pred_2)
cm_2 = confusion_matrix(y_true_2, y_pred_2)
tn_2, fp_2, fn_2, tp_2 = cm_2.ravel()

print("PR AUC:", pr_auc_2)
print("ROC AUC:", roc_auc_2)
print("Accuracy:", acc_2)
print("F1:", f1_2)
print(f"Precision: {precision_2}, Recall: {recall_2}")
print(f"TP: {tp_2}, TN: {tn_2}, FP: {fp_2}, FN: {fn_2}")

PR AUC: 0.00901481004507405
ROC AUC: 0.7648227383863081
Accuracy: 0.5316494217894097
F1: 0.017868538608806637
Precision: 0.00901481004507405, Recall: 1.0
TP: 14, TN: 1733, FP: 1539, FN: 0
