#### Installations

In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install scikit-learn-extra

Collecting scikit-learn-extra
  Using cached scikit-learn-extra-0.3.0.tar.gz (818 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: scikit-learn-extra
  Building wheel for scikit-learn-extra (pyproject.toml) ... [?25ldone
[?25h  Created wheel for scikit-learn-extra: filename=scikit_learn_extra-0.3.0-cp310-cp310-macosx_11_0_arm64.whl size=390035 sha256=a2811e2d97f38c2f962a6e68292fc25f26433d9142111d903fbd4340dd84f74b
  Stored in directory: /Users/s_gre1/Library/Caches/pip/wheels/89/a1/b9/758739c49b7f3e0a944e04247341258e15ce13016fbd23628b
Successfully built scikit-learn-extra
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0
Note: you may need to restart the kernel to use updated packages.


### Helper Functions: Algorithms & Clustering

**Algorithms**

DONE: each algorithm takes a `mode` argument. With `mode = 1` only the cluster labels are returned; with `mode = 0` the silhouette coefficient is returned together with the cluster labels.

In [1]:
import numpy as np
import random
import os
from sklearn.preprocessing import StandardScaler

# Single seed shared by every stochastic component in this notebook.
SEED = 42

# Seed Python's and NumPy's global random number generators.
random.seed(SEED)
np.random.seed(SEED)

# Export the hash seed and restrict OpenMP to one thread (prevents parallelism issues).
# NOTE(review): presumably PYTHONHASHSEED only affects interpreters started after
# this point, not the running kernel - confirm if hash ordering matters here.
os.environ.update(PYTHONHASHSEED=str(SEED), OMP_NUM_THREADS="1")


In [2]:
# Code for KMeans

from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal
from sklearn.metrics import silhouette_score

def kmeans_clustering(selected_features, mode, n_clusters=2, max_iter=300):
    """
    Standardize ``selected_features`` and cluster them with KMeans.

    The number of clusters is chosen automatically: every k in 2..5 is tried and
    the k with the best silhouette score is kept. ``n_clusters`` is only used as
    the fallback when that search fails (for example, too few samples).

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        n_clusters: int, fallback number of clusters (default=2)
        max_iter: int, maximum iterations of the final fit (default=300)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. silhouette_coef is -1 when it cannot be computed.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    def _silhouette_for(k):
        # Score one candidate k; the search uses KMeans' default max_iter.
        candidate = KMeans(n_clusters=k, random_state=SEED, init='random', n_init=1)
        return silhouette_score(X_scaled, candidate.fit_predict(X_scaled))

    try:
        best_k = max(range(2, 6), key=_silhouette_for)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts the run;
        # fall back to the caller-supplied cluster count.
        best_k = n_clusters

    k_means = KMeans(n_clusters=best_k, max_iter=max_iter, random_state=SEED, init='random', n_init=1)
    k_means.fit(X_scaled)

    if mode == 0:
        try:
            silhouette_coef = silhouette_score(X_scaled, k_means.labels_)
        except ValueError:
            silhouette_coef = -1  # Assigning lowest score if clustering fails
        return silhouette_coef, k_means.labels_
    if mode == 1:
        return k_means.labels_

In [3]:
# EM Clustering Code

from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score


def em_clustering(selected_features, mode, n_clusters=2):
    """
    Standardize ``selected_features`` and cluster them with a Gaussian mixture (EM).

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        n_clusters: int, number of mixture components (default=2)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. When fitting fails, silhouette_coef is -1 and labels
        is an all-zero array (every sample in one cluster), so callers always
        receive one label per sample.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    # Initialize the EM model
    em_model = GaussianMixture(
        n_components=n_clusters,
        random_state=SEED,  # THOUGHTS: We can improve this later to have an array of seeds to select from to observe variations
        n_init=1,  # forces one initialization to remove variability
    )

    # Defaults used when fitting or scoring fails. Previously `labels` was left
    # undefined in that case and the return statement raised UnboundLocalError.
    labels = np.zeros(X_scaled.shape[0], dtype=int)
    silhouette_coef = -1  # Assigning lowest score if clustering fails

    try:
        # Fit the model and get cluster assignments
        em_model.fit(X_scaled)
        labels = em_model.predict(X_scaled)

        # Calculate silhouette score
        silhouette_coef = silhouette_score(X_scaled, labels)
    except Exception:
        # Keep whatever labels were obtained; the score stays at -1.
        silhouette_coef = -1

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [5]:
# DBSCAN Detection method: 
# I put 'optimization part' in 'DBSCAN_Optimization_Code.ipynb' file. 
# We can use optimization after initial run to do a comparison and analysis in our paper to show improvements.

# import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# DONE: keep -1 --> they will be its own cluster
def dbscan_clustering(selected_features, mode, eps=0.5, min_samples=5):
    """
    Standardize ``selected_features`` and cluster them with DBSCAN.

    Points that DBSCAN marks as noise (label -1) are gathered into one extra
    cluster whose id is one above the largest regular cluster id.

    Parameters:
    selected_features : array-like, shape (n_samples, n_features)
        The features selected for clustering
    mode : int
        0 -> return (silhouette_coef, labels); 1 -> return labels only
    eps : float
        The maximum distance between two samples for them to be considered neighbors
    min_samples : int
        The number of samples in a neighborhood for a point to be considered a core point

    Returns:
    (silhouette_coef, labels) if mode == 0, labels if mode == 1, None otherwise.
    silhouette_coef is -1 when only a single cluster is found.
    """
    # Standardize, then fit DBSCAN in one pass
    X_scaled = StandardScaler().fit_transform(selected_features)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

    # Move noise points into their own cluster
    noise_mask = labels == -1
    if noise_mask.any():
        labels[noise_mask] = labels.max() + 1

    # Number of clusters, counting the relabelled noise cluster
    n_clusters = len(set(labels.tolist()))

    # The silhouette score is only defined for more than one cluster
    silhouette_coef = silhouette_score(X_scaled, labels) if n_clusters > 1 else -1

    # NOTE: -- Uncomment when we analyze and optimize ---- Additional clustering information
    # info = {
    #     'n_clusters': n_clusters,
    #     'n_noise': list(labels).count(-1),
    #     'labels': labels,
    #     'cluster_sizes': pd.Series(labels).value_counts().to_dict()
    # }

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels



In [6]:
# Code for K Medoids
# import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

def modified_kmedoids_clustering(selected_features, mode, n_clusters=2):
    """
    Standardize ``selected_features`` and cluster them with K-Medoids
    (alternate method, k-medoids++ initialisation).

    The number of clusters is chosen automatically: every k in 2..5 is tried and
    the k with the best silhouette score is kept. ``n_clusters`` is only used as
    the fallback when that search fails.

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        n_clusters: int, fallback number of clusters (default=2)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. silhouette_coef is 0 when a single cluster is found
        and -1 when clustering fails; in the failure case labels is an all-zero
        array so callers always receive one label per sample.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    def _silhouette_for(k):
        # Score one candidate k with the same settings as the final model.
        candidate = KMedoids(n_clusters=k, method='alternate', init='k-medoids++', max_iter=1500, random_state=SEED)
        return silhouette_score(X_scaled, candidate.fit_predict(X_scaled))

    try:
        best_k = max(range(2, 6), key=_silhouette_for)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts the run;
        # fall back to the caller-supplied cluster count.
        best_k = n_clusters

    # Initialize the K-Medoids model
    kmedoids = KMedoids(n_clusters=best_k, method='alternate', init='k-medoids++', max_iter=1500, random_state=SEED)

    # Default used when fitting fails. Previously `labels` was left undefined in
    # that case and the return statement raised UnboundLocalError.
    labels = np.zeros(X_scaled.shape[0], dtype=int)

    # Calculate silhouette score
    try:
        labels = kmedoids.fit_predict(X_scaled)
        if len(set(labels)) > 1:
            silhouette_coef = silhouette_score(X_scaled, labels)
        else:
            silhouette_coef = 0  # Neutral score if there is only 1 cluster
    except Exception:
        silhouette_coef = -1  # Assigning lowest score if clustering fails

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [7]:
# Code for K Medoids
# import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

def kmedoids_clustering(selected_features, mode, n_clusters=2):
    """
    Standardize ``selected_features`` and cluster them with K-Medoids (PAM).

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        n_clusters: int, number of clusters (default=2)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. silhouette_coef is -1 when fitting or scoring raises
        ValueError; if fitting itself failed, labels is an all-zero array so
        callers always receive one label per sample.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    # Initialize the K-Medoids model
    kmedoids = KMedoids(n_clusters=n_clusters, method='pam', max_iter=1500, random_state=SEED)

    # Default used when fit_predict raises. Previously `labels` was left undefined
    # in that case and the return statement raised UnboundLocalError.
    labels = np.zeros(X_scaled.shape[0], dtype=int)

    # Calculate silhouette score
    try:
        labels = kmedoids.fit_predict(X_scaled)
        silhouette_coef = silhouette_score(X_scaled, labels)
    except ValueError:
        silhouette_coef = -1  # Assigning lowest score if clustering fails

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [8]:
# Code for Mean Shift
# import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

def modified_meanshift_clustering(selected_features, mode, quantile=0.2, n_samples=500):
    """
    Standardize ``selected_features`` and cluster them with Mean Shift, using a
    bandwidth estimated from the data.

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        quantile: float, quantile passed to estimate_bandwidth (default=0.2)
        n_samples: int, number of samples passed to estimate_bandwidth (default=500)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. silhouette_coef is -1 when a single cluster is found
        or clustering fails; in the failure case labels is an all-zero array so
        callers always receive one label per sample.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    # Estimate bandwidth from the data
    bandwidth = estimate_bandwidth(X_scaled, quantile=quantile, n_samples=n_samples, random_state=SEED)
    if bandwidth <= 0:
        bandwidth = 1.0  # Fallback in case of extremely small bandwidth

    # Initialize the Mean Shift model
    meanshift = MeanShift(bandwidth=bandwidth, bin_seeding=True)

    # Defaults used when fitting fails. Previously `labels` was left undefined in
    # that case and the return statement raised UnboundLocalError.
    labels = np.zeros(X_scaled.shape[0], dtype=int)
    silhouette_coef = -1  # Assign lowest score if clustering fails

    try:
        labels = meanshift.fit_predict(X_scaled)
        # The silhouette score is only defined for more than one cluster
        if len(set(labels)) > 1:
            silhouette_coef = silhouette_score(X_scaled, labels)
    except Exception:
        silhouette_coef = -1

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [9]:
# Codes for Mean Shift
# import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

def meanshift_clustering(selected_features, mode, bandwidth=None):
    """
    Standardize ``selected_features`` and cluster them with Mean Shift.

    Parameters:
        selected_features: array-like, shape (n_samples, n_features)
        mode: int, 0 -> return (silhouette_coef, labels); 1 -> return labels only
        bandwidth: float or None, passed straight to MeanShift (default=None)

    Returns:
        (silhouette_coef, labels) if mode == 0, labels if mode == 1, and None
        for any other mode. silhouette_coef is -1 when fitting or scoring raises
        ValueError; if fitting itself failed, labels is an all-zero array so
        callers always receive one label per sample.
    """
    # Standardize selected features
    X_scaled = StandardScaler().fit_transform(selected_features)

    # Initialize the Mean Shift model
    meanshift = MeanShift(bandwidth=bandwidth)

    # Default used when fit_predict raises. Previously `labels` was left undefined
    # in that case and the return statement raised UnboundLocalError.
    labels = np.zeros(X_scaled.shape[0], dtype=int)

    # Calculate silhouette score
    try:
        labels = meanshift.fit_predict(X_scaled)
        silhouette_coef = silhouette_score(X_scaled, labels)
    except ValueError:
        silhouette_coef = -1  # Assign lowest score if clustering fails

    if mode == 0:
        return silhouette_coef, labels
    if mode == 1:
        return labels

In [10]:
def bin_to_features(state, mode):
  """
  Decode the integer ``state`` into the feature columns it selects.

  The binary digits of ``state`` are left-padded with zeros to len(FEATURES);
  digit i (counted from the left) being '1' selects column i of
  ``original_features``.

  mode == 0 returns the selected columns of ``original_features``;
  mode == 1 returns a "Features Used: [...]" string naming those columns.
  """
  # Binary digits of the state, zero-padded on the left to one digit per feature
  bits = bin(state)[2:].zfill(len(FEATURES))

  # Column positions whose digit is set
  chosen = [pos for pos, bit in enumerate(bits) if bit == '1']

  selected_features = original_features.iloc[:, chosen]
  if mode == 0: # return actual feature list
    return selected_features
  if mode == 1: # return string of feature list
    features = selected_features.columns.tolist()
    return f"Features Used: {features}"

### Main Code

In [None]:
# Exactly one `features` list should be active: the ten columns used for the
# dataset loaded in the next cell. Alternatives for the other datasets are kept
# below, commented out.
# NOTE: the saved output under this cell lists a different set of columns than
# the active KDD'99 list - re-run the cell to refresh it.

# features = np.array(['action', 'availability', 'device_type:1', 'direction',
#        'event_type', 'interface_status', 'patch_description',
#        'patch_status', 'severity:1', 'traffic_direction'])

# Replace feature list input with selected features
# features = np.array(['action', 'behavior_name', 'event_id:4', 'exploit_available',
#  'file_signature_status', 'network_interface', 'patch_name', 'severity',
#  'tcp_flags', 'vulnerability_solution']) # features for generated data complete

#features = np.array(['avg_bytes_sent','avg_bytes_received','avg_packets_transferred','avg_flow_duration','recent_tcp_flags','recent_protocol','avg_cpu_usage','avg_memory_usage','avg_disk_usage','avg_uptime']) # features for generated data

# IoT23
#features = np.array(['conn_state', 'history', 'id.orig_p', 'missed_bytes', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_bytes', 'resp_ip_bytes','resp_pkts'])
# NB15
#features = np.array(['Djit', 'Dload', 'ct_dst_sport_ltm', 'ct_src_dport_ltm', 'ct_srv_dst', 'ct_state_ttl', 'dmeansz', 'res_bdy_len', 'sloss', 'state'])
# KDD'99
features = [
    'Dst_host_count',
    'Dst_host_diff_srv_rate',
    'Dst_host_same_src_port_rate',
    'Dst_host_same_srv_rate',
    'Dst_host_srv_count',
    'Flag',
    'Logged_in',
    'Protocol_type',
    'Serror_rate',
    'Service',
]
# NSL-KDD
#features = np.array(['Diff Srv Rate', 'Dst Host Count', 'Dst Host Diff Srv Rate', 'Dst Host Same Src Port Rate', 'Dst Host Same Srv Rate', 'Flag', 'Logged In', 'Protocol Type', 'Same Srv Rate', 'Service'])

print(features)

['Dst_host_count', 'Dst_host_diff_srv_rate', 'Dst_host_serror_rate', 'Dst_host_srv_count', 'Logged_in', 'Protocol_type', 'Same_srv_rate', 'Serror_rate', 'Service', 'Srv_serror_rate']


In [None]:
import pandas as pd
# import numpy as np

# FEATURES = {0: 'avg_bytes_sent', 1: 'avg_bytes_received', 2: 'avg_packets_transferred', 
#   3: 'avg_flow_duration', 4: 'recent_tcp_flags', 5: 'recent_protocol', 6: 'avg_cpu_usage', 
#   7: 'avg_memory_usage', 8: 'avg_disk_usage', 9: 'avg_uptime'}
# features = list(FEATURES.values())
# Positional index -> feature name for the active feature list. Its length fixes
# how many binary digits a state has (see bin_to_features) and therefore the
# number of feature configurations explored.
FEATURES = {k:str(v) for k,v in zip(range(len(features)), features) }
#print(FEATURES)
# Exactly one dataset should be loaded; it must contain every column named in `features`.
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_NB15_1_sub1.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_iot23_RW21.csv")
data = pd.read_csv("../data/real-world/data-cleaning/cleaned_kdd_sample_1.csv")
#data = pd.read_csv("../data/real-world/data-cleaning/cleaned_nsl_kdd_sample_1.csv")

#print(data.head(10))


# Action code -> display name. The codes are the ones algorithm_prep dispatches on.
ALGORITHMS = {0: 'DBSCAN Clustering', 1: 'Mean Shift', 2: 'K-Mediods', 3: 'EM Clustering', 4: 'K-Means'}
NUM_ALG = len(ALGORITHMS)
# Independent copy of just the selected feature columns; bin_to_features slices it by position.
original_features = data[features].copy(deep = True)
print(original_features.head(10))

# Standardized version of all selected features, used in reward() to score a
# clustering against the full feature set.
scaler = StandardScaler()
original_features_scaled = scaler.fit_transform(original_features)
 
# Adds a row-identifier column to `data` in place, taken from its index.
data['uid'] = data.index # comment this line out for IoT-23
uid = "uid"
ips = data[uid]
#print(original_features.columns)
#print(ips.head(10))

def algorithm_prep(state, action, mode):
  """
  Run the clustering algorithm coded by ``action`` on the features encoded by ``state``.

  Action codes: 0 DBSCAN, 1 Mean Shift (estimated bandwidth), 2 K-Medoids
  (automatic k), 3 EM, 4 K-Means. ``mode`` is forwarded unchanged to the
  algorithm, whose result is returned as-is; unknown action codes yield None.
  """
  selected_features = bin_to_features(state, 0)

  if action == 0:
    return dbscan_clustering(selected_features, mode)
  if action == 1:
    return modified_meanshift_clustering(selected_features, mode)
  if action == 2:
    return modified_kmedoids_clustering(selected_features, mode)
  if action == 3:
    return em_clustering(selected_features, mode)
  if action == 4:
    return kmeans_clustering(selected_features, mode)

  # no algorithm is registered under this action code
  return None

# NOTE: bin_to_features is also defined in an earlier cell of this notebook; after
# a top-to-bottom run, this later definition is the one in effect.
def bin_to_features(state, mode):
  """
  Decode the integer ``state`` into the feature columns it selects.

  The binary digits of ``state`` are left-padded with zeros to len(FEATURES);
  digit i (counted from the left) being '1' selects column i of
  ``original_features``.

  mode == 0 returns the selected columns of ``original_features``;
  mode == 1 returns a "Features Used: [...]" string naming those columns.
  """
  # Binary digits of the state, zero-padded on the left to one digit per feature
  bits = bin(state)[2:].zfill(len(FEATURES))

  # Column positions whose digit is set
  chosen = [pos for pos, bit in enumerate(bits) if bit == '1']

  selected_features = original_features.iloc[:, chosen]
  if mode == 0: # return actual feature list
    return selected_features
  if mode == 1: # return string of feature list
    features = selected_features.columns.tolist()
    return f"Features Used: {features}"

          Djit         Dload  ct_dst_sport_ltm  ct_src_dport_ltm  ct_srv_dst  \
0     0.292512  1.024719e+06                 1                 1           5   
1     0.000000  7.213779e+05                 1                 1           3   
2     0.000000  7.170191e+05                 1                 1           4   
3     0.325278  1.017857e+06                 1                 1           8   
4     0.000000  6.512562e+05                 1                 1           1   
5    68.279484  9.480692e+03                 1                 1           1   
6    21.213361  6.184004e+04                 1                 1          14   
7  7735.661800  6.782899e+04                 1                 1           1   
8    74.834585  3.614229e+06                 1                 1          12   
9     0.000000  6.780952e+05                 1                 3           3   

   ct_state_ttl  dmeansz  res_bdy_len  sloss  state  
0             0       76            0      0      0  
1          

#### Reinforcement Learning

In [None]:
# Markov Decision Process (MDP) - The Bellman equations adapted to
# Q Learning.Reinforcement Learning with the Q action-value(reward) function.
# Copyright 2018 Denis Rothman MIT License. See LICENSE.
import numpy as ql

# One state per on/off combination of the selected features
# (2^10 = 1024 configurations for 10 features); one action per algorithm (5).
num_configs = 1 << len(FEATURES)

# R: reward matrix, one entry per (state, algorithm) pair.
R = ql.asmatrix(ql.zeros((num_configs, NUM_ALG)))

# Q: learning matrix in which the learned rewards are stored.
Q = ql.asmatrix(ql.zeros((num_configs, NUM_ALG)))

# Cluster labels produced for each (state, action) pair, kept for later retrieval;
# entries stay None until the pair has been evaluated.
cluster_labels_matrix = np.full(Q.shape, None, dtype=object)

# Gamma: a form of penalty or uncertainty for learning.
# With a value of 1 the rewards would be too high;
# this way the system knows it is learning.
gamma = 0.8

# The possible "a" actions when the agent is in a given state
def possible_actions(state):
    """
    Return the action (column) indices whose Q value for ``state`` is exactly 0.

    Q, not R, is inspected because R is never modified. A zero entry is treated
    as "not visited yet".
    NOTE(review): a pair that was evaluated but scored exactly 0 also shows up
    here - confirm whether that is intended.
    """
    # Q is a 2-D matrix, so a single row is still 1 x NUM_ALG
    row_values = ql.asarray(Q[state, ])
    _rows, zero_cols = ql.where(row_values == 0)
    return zero_cols


# Epsilon-greedy choice of the next action (clustering algorithm) for a state.

def ActionChoice(available_actions_range, state, epsilon=0.95):
    """
    Pick the next action for ``state``.

    Parameters:
        available_actions_range: sequence of candidate action indices
            (as returned by possible_actions)
        state: row index into the global Q matrix
        epsilon: probability of exploring when candidates exist
            (default 0.95, i.e. 95% exploration / 5% exploitation)

    Returns:
        int action index.
        - No candidates: uniformly random action among all NUM_ALG algorithms.
        - Explore (probability epsilon): uniformly random candidate.
        - Exploit: the action with the largest Q value for ``state``; this is
          taken over all actions, not only the candidates.
    """
    if len(available_actions_range) == 0:
        # If no valid actions, pick randomly from all possible algorithms
        return int(np.random.choice(NUM_ALG, 1)[0])

    if np.random.rand() < epsilon:
        # Explore: Randomly pick from possible actions
        return int(ql.random.choice(available_actions_range, 1)[0])

    # Exploit: Pick best action from Q matrix
    return int(np.argmax(Q[state, :]))
    


# A version of Bellman's equation for reinforcement learning using the Q function
# This reinforcement algorithm is a memoryless process
# The transition function T from one state to another
# is not in the equation below.  T is done by the random choice above

def reward(current_state, action, gamma):
    """
    Evaluate one (feature configuration, algorithm) pair and store its value in Q.

    Runs the algorithm coded by ``action`` on the features encoded by
    ``current_state`` (via algorithm_prep), saves the resulting labels in
    ``cluster_labels_matrix[current_state, action]`` and sets
    ``Q[current_state, action]`` to
    ``(normalised silhouette - penalty) + gamma * MaxValue``.

    Parameters:
        current_state: index of the feature configuration (row of Q)
        action: index of the clustering algorithm (column of Q)
        gamma: discount factor applied to MaxValue

    Returns:
        None. The globals Q and cluster_labels_matrix are modified in place.
    """
    # NOTE(review): Q has one row per feature configuration and one column per
    # algorithm, but this reads the ROW whose index equals `action` and then uses
    # the winning COLUMN index as a row index (Max_State) below. This looks like a
    # leftover from a square state-by-state Q matrix - confirm which bootstrap
    # term is intended before relying on the gamma * MaxValue contribution.
    Max_State = ql.where(Q[action,] == ql.max(Q[action,]))[1]

    # Break ties between equally good entries at random
    if Max_State.shape[0] > 1:
        Max_State = int(ql.random.choice(Max_State, size = 1)[0])
    else:
        Max_State = int(Max_State[0])

    MaxValue = Q[Max_State, action]

    # call function to run ML algorithm using the value of action. this will
    # run the algorithm using the features from current_state, create clusters,
    # and calculate the silhouette value.
    selected_silhouette_co, labels = algorithm_prep(current_state, action, 0)
    cluster_labels_matrix[current_state, action] = labels
    # Score the same labels against ALL standardized features (not just the selected ones)
    try:
        overall_silhouette_co = silhouette_score(original_features_scaled, labels)
    except ValueError: overall_silhouette_co = -1

    # calculate ratio of selected features 
    # (1e-6 guards against division by an overall score of exactly zero)
    ratio = selected_silhouette_co / (overall_silhouette_co + 1e-6)
    # A penalty applies only when the selected features score worse than the full set
    if selected_silhouette_co < overall_silhouette_co:
        penalty = 0.1 * ratio
    else: penalty = 0
    
    # Bellman's MDP based Q function

    # normalized silhouette score for better consistency in reinforcement learning
    if selected_silhouette_co < 0: # ensures RL doesn't learn from bad clustering
        norm_silhouette = 0
    else:
        norm_silhouette = (selected_silhouette_co + 1) / 2  # Scale from [-1,1] to [0,1]
    # norm_silhouette = (selected_silhouette_co + 1) / 2
    
    #norm_silhouette = (silhouette_co + 1) / 2  # Scale from [-1,1] to [0,1]
    # Q[current_state, action] = norm_silhouette + gamma * MaxValue
    Q[current_state, action] = (norm_silhouette - penalty) + gamma * MaxValue


# Learning over n iterations depending on the convergence of the system
# A convergence function can replace the systematic repeating of the process
# by comparing the sum of the Q matrix to that of Q matrix n-1 in the
# previous episode

# agent_s_state: the "agent" is the name of the system doing the calculation.
# s is the state the agent is coming from and s' is the state it is going to.
# The starting state can be random or chosen, as long as the remaining choices
# are not predetermined. Randomness is part of this stochastic process.
# 1) DONE: decide if starting state is random or a specific state
#agent_s_state = 1

# Get available actions in the current state
#PossibleAction = possible_actions(agent_s_state)

# Sample next action to be performed
#action = ActionChoice(PossibleAction, agent_s_state)

# Rewarding Q matrix
#reward(agent_s_state,action,gamma)


#state_epsilon = 0.95 # 5% exploration
# Which (state, action) pairs have been evaluated so far. State 0 selects no
# features at all, so it is marked as visited up front and never scheduled.
visited_pairs = np.zeros(Q.shape, dtype=bool)
visited_pairs[0, :] = True

convergence_threshold = 0.01
previous_Q = Q.copy()
iteration_buffer = num_configs * NUM_ALG  # earliest iteration at which convergence is checked
k = 10  # number of best-scoring states to sample from when exploiting

for i in range(10000):
    print("Iteration:", i)

    # visit all states first, then allow full access to any state
    unvisited_pairs = np.argwhere(~visited_pairs)
    # Exploration probability: starts at 0.95, shrinks by 1% per iteration and
    # never drops below 0.10.
    state_epsilon = max(0.1, 0.95 * (0.99 ** i))

    if len(unvisited_pairs) > 0 and ql.random.rand() < state_epsilon:
        # Explore: evaluate a pair that has never been tried
        current_pair = unvisited_pairs[ql.random.choice(len(unvisited_pairs))]
        current_state, action = current_pair
    else:
        if len(unvisited_pairs) == 0:
            print('all pairs visited')
        if ql.random.rand() < state_epsilon:  # explore
            # lower bound 1 keeps the empty feature set (state 0) out
            current_state = ql.random.randint(1, int(Q.shape[0]))
        else:  # exploit past good states
            state_totals = np.array(Q.sum(axis=1)).flatten()
            top_k_states = np.argsort(state_totals)[-k:]
            # State 0 (no features selected) cannot be clustered; it could slip into
            # the top k while most rows of Q are still zero or negative, so drop it.
            top_k_states = top_k_states[top_k_states != 0]
            current_state = np.random.choice(top_k_states)
        PossibleAction = possible_actions(current_state)
        action = ActionChoice(PossibleAction, current_state)
    visited_pairs[current_state, action] = True

    print("Algorithm:", ALGORITHMS[action])
    reward(current_state, action, gamma)

    if i > iteration_buffer:  # make sure it doesn't stop too early
        # check for convergence in Q to stop updates
        # NOTE(review): previous_Q is refreshed every iteration, so this measures
        # the change made by the latest single update only - confirm that is the
        # intended convergence criterion.
        Q_diff = np.abs(Q - previous_Q).sum()
        if Q_diff < convergence_threshold:
            print(f"Converged at iteration {i} with Q_diff={Q_diff:.4f}")
            break

    previous_Q = Q.copy()  # update for comparison
    
# Show the raw learning matrix first
print("Q:")
print(Q)

# Then Q rescaled so that its largest entry is 100
print("Normed Q:")
normed_Q = Q / ql.max(Q) * 100
print(normed_Q)

# Locate the best (feature configuration, algorithm) pair in the normalised matrix;
# the first match is used when several entries tie.
max_location = np.where(normed_Q == normed_Q.max())
print("\nmax value located at", max_location)
max_config, max_algorithm = max_location[0][0], max_location[1][0]

# Human-readable list of the features encoded by the winning configuration
final_feats = bin_to_features(max_config, 1)
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])

# get final cluster labels (saved by reward() when the winning pair was evaluated)
cluster_labels = cluster_labels_matrix[max_config, max_algorithm]

# match data to their clusters
labelled_data = data.copy()
labelled_data['cluster'] = cluster_labels

# get total number of clusters
num_clusters = labelled_data['cluster'].nunique()

### filter clusters based on percentage of data

# # for each unique value, get the count / len of data (aka percentage)
# cluster_array = labelled_data['cluster'].to_numpy()
# perc_values = np.unique(cluster_array,return_counts = True)[-1]
# percentages = perc_values / labelled_data.shape[0]

# # keep cluster values with % < 10 as anomalous
# idx = (np.where(percentages <= 0.1)[0]).tolist()
# anomalies = labelled_data.loc[labelled_data['cluster'].isin(idx)]


### filter anomalous clusters by size relative to the data statistics
# Share of rows in each cluster; clusters smaller than the mean share are flagged.
cluster_sizes = labelled_data['cluster'].value_counts(normalize=True)
mean_size = cluster_sizes.mean()
std_dev = cluster_sizes.std()
flag_val = mean_size
anomalous_clusters = cluster_sizes[cluster_sizes < flag_val].index
anomalies = labelled_data[labelled_data['cluster'].isin(anomalous_clusters)]

# if none fall below the threshold, check if smallest two are statistically different
# The comparison needs at least two clusters; with a single cluster (or no labels
# at all) `anomalies` stays empty instead of raising IndexError.
# NOTE(review): in exact arithmetic no cluster is below the mean only when all
# shares are equal, in which case the 20% rule below cannot fire - confirm the
# intended threshold (flag_val).
if len(anomalous_clusters) == 0 and len(cluster_sizes) >= 2:
    sorted_clusters = cluster_sizes.sort_values()
    sm, sec_sm = sorted_clusters.iloc[0], sorted_clusters.iloc[1]

    if sm < (0.8 * sec_sm):  # sm is at least 20% smaller than sec_sm
        anomalies = labelled_data[labelled_data['cluster'] == sorted_clusters.index[0]]


Iteration: 0
Algorithm: DBSCAN Clustering
Iteration: 1
Algorithm: EM Clustering
Iteration: 2
Algorithm: DBSCAN Clustering
Iteration: 3
Algorithm: DBSCAN Clustering
Iteration: 4
Algorithm: DBSCAN Clustering
Iteration: 5
Algorithm: DBSCAN Clustering
Iteration: 6
Algorithm: K-Means
Iteration: 7
Algorithm: DBSCAN Clustering
Iteration: 8
Algorithm: K-Mediods
Iteration: 9
Algorithm: K-Mediods
Iteration: 10
Algorithm: Mean Shift
Iteration: 11
Algorithm: K-Means
Iteration: 12
Algorithm: EM Clustering
Iteration: 13
Algorithm: EM Clustering
Iteration: 14
Algorithm: DBSCAN Clustering
Iteration: 15
Algorithm: DBSCAN Clustering
Iteration: 16
Algorithm: Mean Shift
Iteration: 17
Algorithm: DBSCAN Clustering
Iteration: 18
Algorithm: DBSCAN Clustering
Iteration: 19
Algorithm: Mean Shift
Iteration: 20
Algorithm: K-Means
Iteration: 21
Algorithm: K-Mediods
Iteration: 22
Algorithm: K-Means
Iteration: 23
Algorithm: K-Means
Iteration: 24
Algorithm: DBSCAN Clustering
Iteration: 25
Algorithm: EM Clustering
Ite

In [114]:
# Scratch inspection cell: counts the rows assigned to cluster 2.
# Only the last expression of a cell is displayed, so this bare `num_clusters`
# produces no output.
num_clusters

# Rows of the labelled data that fell into cluster 2 (hard-coded cluster id)
a = labelled_data.loc[labelled_data['cluster'] == 2]
# set(a['Label'])
len(a['Label'])

104

In [21]:
# Repeat the summary of the winning algorithm / feature subset, then display the
# rows flagged as anomalous.
# NOTE: the saved output under this cell names NB15 columns (e.g. 'Djit'), so it
# presumably comes from an earlier run on a different dataset - re-run to refresh.
print(f"\nUsing algorithm {ALGORITHMS[max_algorithm]} and {final_feats}, max value is:",normed_Q[max_config,max_algorithm])
anomalies


Using algorithm DBSCAN Clustering and Features Used: ['Djit', 'Dload', 'ct_dst_sport_ltm', 'ct_srv_dst', 'ct_state_ttl', 'dmeansz'], max value is: 100.0


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label,uid,cluster
11,10.40.182.1,0,7,0,2,2,50.004372,384,0,1,...,2,4,4,2,2,4,2,0,11,2
20,59.166.0.3,26273,5,22,1,1,1.436850,12472,12716,31,...,10,7,6,10,4,4,4,0,20,2
23,59.166.0.2,64156,6,53,0,0,0.000990,146,178,31,...,3,3,11,9,3,3,6,0,23,3
30,59.166.0.1,46078,0,21,1,1,1.881681,2934,3740,31,...,4,4,17,30,4,4,9,0,30,2
38,59.166.0.7,18544,9,6881,1,1,5.656051,23202,1052048,31,...,8,13,4,1,1,1,1,0,38,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1934,59.166.0.1,25295,8,53,0,0,0.001008,146,178,31,...,2,2,5,13,2,2,2,0,1934,6
1946,59.166.0.8,1764,11,22,1,1,0.203732,12752,13202,31,...,9,11,8,4,4,4,4,0,1946,2
1969,59.166.0.4,20206,5,49992,1,1,0.860602,320,1878,31,...,23,7,8,18,3,3,8,0,1969,3
1988,175.45.176.0,42272,19,520,0,3,0.000006,1144,0,254,...,5,5,3,2,2,1,2,1,1988,15


Using algorithm DBSCAN Clustering and Features Used: ['ct_dst_sport_ltm', 'ct_state_ttl', 'res_bdy_len', 'state'], max value is: 100.0

### Performance Metrics

In [17]:
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score

In [18]:
# Name of the ground-truth column per dataset:
# NB15 = Label | IOT = label | KDD'99 = Label | NSL-KDD = Class

# Ground-truth labels for the currently loaded dataset
labels = data['Label']
# One row per record: true label, predicted label, and the record's uid
result_labels = pd.DataFrame(columns=["true", "pred"])
result_labels['uid'] = ips
result_labels['true'] = labels
#print(result_labels)
# A record is predicted anomalous (1) when its uid is among the flagged anomalies, else 0
result_labels.loc[result_labels['uid'].isin(anomalies[uid]), 'pred'] = 1
result_labels.loc[~result_labels['uid'].isin(anomalies[uid]), 'pred'] = 0
# Show how the records whose true label is 1 were classified
print(result_labels.loc[result_labels['true'] == 1])


      true pred   uid
15       1    0    15
93       1    0    93
129      1    1   129
229      1    1   229
260      1    0   260
...    ...  ...   ...
1775     1    0  1775
1782     1    1  1782
1810     1    1  1810
1820     1    1  1820
1988     1    1  1988

[63 rows x 3 columns]


In [19]:
# Ground truth and predictions as plain integer 0/1 vectors
y_true = result_labels['true'].astype(int)
y_pred = result_labels['pred'].astype(int)

# NOTE: y_pred holds hard 0/1 predictions rather than scores, so PR AUC and
# ROC AUC are each computed from a single operating point.
pr_auc = average_precision_score(y_true, y_pred)

roc_auc = roc_auc_score(y_true, y_pred)

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

# Fix the label order so the matrix is always 2x2, even when one class is absent
# from both vectors; otherwise the four-way unpacking below raises ValueError.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

In [20]:
# Report the metrics computed in the previous cell.
print("PR AUC:", pr_auc)
print("ROC AUC:", roc_auc)
print("Accuracy:", acc)
print("F1:", f1)
print(f"Precision: {precision}, Recall: {recall}")
# Confusion-matrix counts, with label 1 (anomalous) as the positive class
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

PR AUC: 0.13510483573274273
ROC AUC: 0.7804820086699282
Accuracy: 0.902
F1: 0.2949640287769784
Precision: 0.19069767441860466, Recall: 0.6507936507936508
TP: 41, TN: 1763, FP: 174, FN: 22
