# MT in ML : K-Mean Clustering

# Import Libraries 

In [58]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [2]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Data

The dataset consist of `54 measures` and `650 binary files`

In [3]:
measures = pd.read_csv('../00_input/benchmark_measures.csv')
measures.head()

Unnamed: 0,files_name,CVE-CWE-915 Diagnostic,CVE-CWE-294 Diagnostic,Yara exploit_kits Diagnostic,CVE-CWE-261 Diagnostic,CVE-CWE-611 Diagnostic,CVE-CWE-91 Diagnostic,CWE-560 Weakness Diagnostic,CVE-CWE-201 Diagnostic,CVE-CWE-281 Diagnostic,...,CVE-CWE-540 Diagnostic,CVE-CWE-276 Diagnostic,CVE-CWE-295 Diagnostic,Yara maldocs Diagnostic,CVE-CWE-73 Diagnostic,CVE-CWE-310 Diagnostic,CVE-CWE-476 Diagnostic,Yara capabilities Diagnostic,CVE-CWE-266 Diagnostic,CVE-CWE-838 Diagnostic
0,3proxy,0.99,0.99,0.99,0.99,0.99,0.99,0.506862,0.99,0.99,...,0.99,0.99,0.547593,0.549027,0.99,0.619846,0.976276,0.610712,0.512519,0.617773
1,ab,0.99,0.99,0.99,0.99,0.99,0.99,0.506862,0.99,0.99,...,0.99,0.99,0.547593,0.549027,0.99,0.619846,0.532391,0.610712,0.512519,0.617773
2,ag,0.99,0.99,0.99,0.99,0.99,0.99,0.506862,0.99,0.99,...,0.99,0.99,0.547593,0.549027,0.99,0.619846,0.976276,0.610712,0.512519,0.617773
3,aircrack-ng,0.99,0.99,0.99,0.99,0.99,0.99,0.506862,0.99,0.99,...,0.99,0.99,0.547593,0.549027,0.99,0.619846,0.532391,0.03706,0.512519,0.617773
4,airdecap-ng,0.99,0.99,0.99,0.99,0.99,0.99,0.506862,0.99,0.99,...,0.99,0.99,0.547593,0.549027,0.99,0.619846,0.532391,0.610712,0.512519,0.617773


## Drop constance variables

In [4]:
# Select columns that don't contain constant values
nonConsMeasures_columns = measures.columns[measures.nunique() > 1]

# Create a new DataFrame with non-constant columns
nonConsMeasures = measures[nonConsMeasures_columns]
nonConsMeasures.head()

Unnamed: 0,files_name,CWE-560 Weakness Diagnostic,CVE-CWE-770 Diagnostic,CVE-CWE-908 Diagnostic,Yara antidebug_antivm Diagnostic,CVE-CWE-94 Diagnostic,CVE-CWE-502 Diagnostic,CWE-415 Weakness Diagnostic,CVE-CWE-125 Diagnostic,Yara malware Diagnostic,...,CWE-190 Weakness Diagnostic,CVE-CWE-120 Diagnostic,CVE-CWE-320 Diagnostic,CVE-CWE-295 Diagnostic,Yara maldocs Diagnostic,CVE-CWE-310 Diagnostic,CVE-CWE-476 Diagnostic,Yara capabilities Diagnostic,CVE-CWE-266 Diagnostic,CVE-CWE-838 Diagnostic
0,3proxy,0.506862,0.930134,0.546097,0.597516,0.912581,0.570389,0.678688,0.974818,0.570406,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.976276,0.610712,0.512519,0.617773
1,ab,0.506862,0.439933,0.546097,0.597516,0.504931,0.570389,0.678688,0.491205,0.570406,...,0.542089,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.610712,0.512519,0.617773
2,ag,0.506862,0.930134,0.546097,0.597516,0.912581,0.570389,0.678688,0.974818,0.570406,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.976276,0.610712,0.512519,0.617773
3,aircrack-ng,0.506862,0.439933,0.546097,0.01621,0.504931,0.570389,0.678688,0.491205,0.570406,...,0.375512,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.03706,0.512519,0.617773
4,airdecap-ng,0.506862,0.439933,0.546097,0.597516,0.504931,0.570389,0.678688,0.491205,0.570406,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.610712,0.512519,0.617773


## Drop `files name column`

In [7]:
noFileNoConsMeasures =  nonConsMeasures.copy()
noFileNoConsMeasures.drop('files_name', axis=1, inplace=True)
noFileNoConsMeasures.head()

Unnamed: 0,CWE-560 Weakness Diagnostic,CVE-CWE-770 Diagnostic,CVE-CWE-908 Diagnostic,Yara antidebug_antivm Diagnostic,CVE-CWE-94 Diagnostic,CVE-CWE-502 Diagnostic,CWE-415 Weakness Diagnostic,CVE-CWE-125 Diagnostic,Yara malware Diagnostic,CVE-CWE-399 Diagnostic,...,CWE-190 Weakness Diagnostic,CVE-CWE-120 Diagnostic,CVE-CWE-320 Diagnostic,CVE-CWE-295 Diagnostic,Yara maldocs Diagnostic,CVE-CWE-310 Diagnostic,CVE-CWE-476 Diagnostic,Yara capabilities Diagnostic,CVE-CWE-266 Diagnostic,CVE-CWE-838 Diagnostic
0,0.506862,0.930134,0.546097,0.597516,0.912581,0.570389,0.678688,0.974818,0.570406,0.975918,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.976276,0.610712,0.512519,0.617773
1,0.506862,0.439933,0.546097,0.597516,0.504931,0.570389,0.678688,0.491205,0.570406,0.490431,...,0.542089,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.610712,0.512519,0.617773
2,0.506862,0.930134,0.546097,0.597516,0.912581,0.570389,0.678688,0.974818,0.570406,0.975918,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.976276,0.610712,0.512519,0.617773
3,0.506862,0.439933,0.546097,0.01621,0.504931,0.570389,0.678688,0.491205,0.570406,0.782087,...,0.375512,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.03706,0.512519,0.617773
4,0.506862,0.439933,0.546097,0.597516,0.504931,0.570389,0.678688,0.491205,0.570406,0.490431,...,0.67996,0.506862,0.551166,0.547593,0.549027,0.619846,0.532391,0.610712,0.512519,0.617773


## Transpose the dataset

The dataset is transponsed because the `Measures` are the datapoint to be grouble int `Aspects` clusters

In [105]:
noFileNoConsMeasuresT = noFileNoConsMeasures.T
noFileNoConsMeasuresT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CWE-560 Weakness Diagnostic,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,...,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862
CVE-CWE-770 Diagnostic,0.930134,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,...,0.930134,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,0.439933
CVE-CWE-908 Diagnostic,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,...,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097,0.546097
Yara antidebug_antivm Diagnostic,0.597516,0.597516,0.597516,0.01621,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516,...,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516,0.597516
CVE-CWE-94 Diagnostic,0.912581,0.504931,0.912581,0.504931,0.504931,0.504931,0.504931,0.504931,0.504931,0.912581,...,0.912581,0.912581,0.087774,0.504931,0.504931,0.504931,0.504931,0.504931,0.912581,0.504931
CVE-CWE-502 Diagnostic,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,...,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389,0.570389
CWE-415 Weakness Diagnostic,0.678688,0.678688,0.678688,0.678688,0.678688,0.678688,0.678688,0.678688,0.678688,0.678688,...,0.678688,0.678688,0.678688,0.093179,0.13389,0.678688,0.13389,0.13389,0.678688,0.05474
CVE-CWE-125 Diagnostic,0.974818,0.491205,0.974818,0.491205,0.491205,0.491205,0.491205,0.491205,0.491205,0.974818,...,0.974818,0.974818,0.491205,0.491205,0.491205,0.491205,0.491205,0.491205,0.88515,0.491205
Yara malware Diagnostic,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,...,0.009151,0.009151,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406,0.570406
CVE-CWE-399 Diagnostic,0.975918,0.490431,0.975918,0.782087,0.490431,0.490431,0.490431,0.490431,0.490431,0.975918,...,0.975918,0.975918,0.120151,0.490431,0.490431,0.490431,0.490431,0.490431,0.886012,0.490431


In [9]:
initDataT = noFileNoConsMeasuresT

k = 6 stand for 6 Aspects: 

Aspects:

    - Availability
    - Authenticity
    - Authorization
    - Confidentiality
    - Non-repudiation
    - Integrity

**Os:** source/initial output

**Of:** followup output

## DEA

In [10]:
# Descriptive statistics
initDataT.shape

(54, 650)

In [11]:
initDataT.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54 entries, CWE-560 Weakness Diagnostic to CVE-CWE-838 Diagnostic
Columns: 650 entries, 0 to 649
dtypes: float64(650)
memory usage: 276.7+ KB


In [12]:
# Sample columns to display pairplot
sampleColumns = initDataT.iloc[:, 20:30]

In [104]:
# # Plot pair plot
# sns.pairplot(sampleColumns)
# plt.show()

## Initical clustering

In [62]:
def clustering(df, cluster_label):
    
    # Instantiate the KMeans model with constant centroids
    # Choose the number of clusters and set random_state for reproducibility
    kmeans = KMeans(n_clusters=6, init='random', random_state=42)  

    # Fit the model to your data
    kmeans.fit(df)

    # Get cluster labels for each data point
    cluster_labels = kmeans.labels_
    
    # Get cluster centroids
    centroids = kmeans.cluster_centers_
    
#     print('\n', centroids)

    # Add cluster labels to the DataFrame
    df_with_labels = df.copy()  # Create a copy of the original DataFrame
    df_with_labels['Cluster'] = cluster_labels
    
    
    def print_cluster_indices(dfWithClusters, cluster_label):
        """
        Prints the indices of data points belonging to a specific cluster.

        Parameters:
        - df: DataFrame containing the data points with cluster labels.
        - cluster_label: The label of the cluster whose indices you want to print.
        
         Returns:
        - cluster_indices: List of indices of data points in the specified cluster.
        """
        cluster_indices = dfWithClusters[dfWithClusters['Cluster'] == cluster_label].index.tolist()
        print(f"\nData points in Cluster {cluster_label}:")
        print('\n',cluster_indices)
    
    print_cluster_indices(df_with_labels, cluster_label)  
    
    # Count datapoint in each cluster
    clusterDPcount = df_with_labels['Cluster'].value_counts()

    # Create a new DataFrame from the unique value counts
    result_df = pd.DataFrame({'A': clusterDPcount.index, 'B': clusterDPcount.values})
    result_df = result_df.reset_index(drop=True)
    
    
    
    print('\n\n\n***********************************************************')
    print('Clusers and numbers of instances they contain')
    print('\n',clusterDPcount)
    
    # Create a DataFrame with indices and cluster labels
    init_cluster_df = pd.DataFrame({'Index': df_with_labels.index, 'Cluster': cluster_labels})
    # Sort by Index (measures) column
    init_cluster_df = init_cluster_df.sort_values(by='Index').reset_index(drop=True)
    
    print('\n\n\n***********************************************************')
    print('First 5 datapoint and their corresponsing clusers')
    print('\n',init_cluster_df.head())
    
#     return result_df

#     # Output the DataFrame with cluster labels
#     return df_with_labels

In [63]:
OriginalResults = clustering(initDataT, 1)
OriginalResults


Data points in Cluster 1:

 ['CVE-CWE-94 Diagnostic', 'CVE-CWE-399 Diagnostic', 'CVE-CWE-19 Diagnostic', 'CVE-CWE-59 Diagnostic', 'CVE-CWE-189 Diagnostic', 'CVE-Unknown-Other Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


# Implementation of Metamorphic Relations (MRs)

Steps:
    
    1. Copy a dataset
    2. Impement MR
    3. Check when MR is violated or not
    

MRs are obtain from the paper titled **MT4UML: Metamorphic Testing for Unsupervised
Machine Learning**

# MR1 - Duplicate of instances (single / multiple) -> VD (I would add VR)

**Idea** If we duplicate a single/multiple instance(s)in the follow-up input, the output should remain consistent 

**PIQUE:** benchmark repository with similar measure(s)

In [103]:
# copyt the oroginal transpose dataset
mrData = initDataT.copy()
mrData.shape

(54, 650)

### MR1.1 - Duplicate one instace 

In [65]:
# select a row to duplicate
dpToDuplicate = mrData.loc['CWE-782 Weakness Diagnostic']

# Add a duplicated row (single)
mr1DataIWithDuplicateS = mrData.append(dpToDuplicate)
mr1DataIWithDuplicateS.shape

(55, 650)

**Note:** I duplicated a row which is in cluster `0`. Since there was one datapoints, the cluser should have two and other clusters should not change

In [66]:
clustering(mr1DataIWithDuplicateS, 2)


Data points in Cluster 2:

 ['CWE-782 Weakness Diagnostic', 'CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 4    30
1    11
3     6
5     4
2     2
0     2
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        4
1  CVE-CWE-125 Diagnostic        1
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        1


## ********* FAIL ********

**The clusters changed**

## MR 1.2 Duplicate multiple datapoints

In [67]:
# List of indices to duplicate
# Select 4 indeces (measures) from different clusters -> These were randomly picked
indices_to_duplicate = ['CVE-CWE-120 Diagnostic', 
                        'CVE-CWE-125 Diagnostic', 
                        'CVE-CWE-189 Diagnostic', 
                        'CVE-CWE-190 Diagnostic']

# Select rows with the specified indices
rows_to_duplicate = mrData.loc[indices_to_duplicate]

# Add a duplicated rows
mr1DataIWithDuplicateM = mrData.append(rows_to_duplicate)
mr1DataIWithDuplicateM.shape


(58, 650)

In [68]:
clustering(mr1DataIWithDuplicateM, 0)


Data points in Cluster 0:

 ['CWE-416 Weakness Diagnostic', 'CWE-676 Weakness Diagnostic', 'CWE-119 Weakness Diagnostic', 'CWE-190 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 4    23
1    13
2     7
5     7
3     4
0     4
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        1
1  CVE-CWE-120 Diagnostic        1
2  CVE-CWE-125 Diagnostic        2
3  CVE-CWE-125 Diagnostic        2
4  CVE-CWE-189 Diagnostic        5


## ***** FAIL*****

**Clusters changed**

## MR 2 - Data Standardization: VR and VD

--> **To standardize a DataFrame**, subtract the mean and divide by the standard deviation for each column. This process ensures that each feature has a mean of 0 and a standard deviation of 1

**Idea:** If the existing standardized data is once again standardized, the output should remain the same

    i) it will not change the mean and variance of the data points, and
    ii) it will maintain the same distance among the data points (similar to the source-input); thus, it should not not change the results.
    
**PIQUE**: Rerun the model

### Standardize data


In [69]:
# Copy the data
mr2standardization1 = mrData.copy()

# Instantiate the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
mr2standardization1 = pd.DataFrame(scaler.fit_transform(mr2standardization1), columns=mr2standardization1.columns)

# Set the index of new dataFrame to be the same as the original dataframe
mr2standardization1.index = mrData.index

mr2standardization1.head(2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CWE-560 Weakness Diagnostic,-0.719508,-0.421745,-0.580955,0.139553,-0.324995,-0.354497,-0.410507,-0.14837,-0.71386,-0.656712,...,-0.780524,-0.780524,-0.027049,0.221222,0.18367,-0.166425,0.153177,0.140476,-0.753849,-0.235249
CVE-CWE-770 Diagnostic,1.358118,-1.141454,1.274503,-0.247061,-1.04482,-1.065868,-1.216996,-0.71244,-1.772968,1.243841,...,1.31563,1.31563,-0.427814,-0.190736,-0.24885,-0.852886,-0.303422,-0.310548,1.837099,-0.934302


In [70]:
clustering(mr2standardization1, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 2    25
3    14
1     6
4     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        2
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


### Standardize standardized data

In [71]:
# Copy the data
mr2standardization2 = mr2standardization1.copy()

# Instantiate the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data and transform it
mr2standardization2 = pd.DataFrame(scaler.fit_transform(mr2standardization2), columns=mr2standardization2.columns)

# Set the index of new dataFrame to be the same as the original dataframe
mr2standardization2.index = mrData.index

mr2standardization2.head(2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CWE-560 Weakness Diagnostic,-0.719508,-0.421745,-0.580955,0.139553,-0.324995,-0.354497,-0.410507,-0.14837,-0.71386,-0.656712,...,-0.780524,-0.780524,-0.027049,0.221222,0.18367,-0.166425,0.153177,0.140476,-0.753849,-0.235249
CVE-CWE-770 Diagnostic,1.358118,-1.141454,1.274503,-0.247061,-1.04482,-1.065868,-1.216996,-0.71244,-1.772968,1.243841,...,1.31563,1.31563,-0.427814,-0.190736,-0.24885,-0.852886,-0.303422,-0.310548,1.837099,-0.934302


In [72]:
clustering(mr2standardization2, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 2    25
3    14
1     6
4     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        2
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


In [106]:
## Origina Results
clustering(mrData, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ******PASS*******

**No changes**

# MR3 - Duplication of Features: VD

**Idea**: For a given source input we denote the output. For the follow-up input, if new features are added by duplicating existing features, the output should remain unchanged

**PIQUE**: Addition of software to benchamrk repository

In [74]:
# Copy data
mr3duplicateFeat = mrData.copy()

# Duplicate the columns
duplicated_columns = mr3duplicateFeat.iloc[:, :10] # first 10 columns

# Concatenate the original DataFrame with the duplicated columns
mr3duplicateFeat = pd.concat([mr3duplicateFeat, duplicated_columns], axis=1)

# Display the DataFrame with duplicated columns added
mr3duplicateFeat.head(2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1
CWE-560 Weakness Diagnostic,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,...,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862
CVE-CWE-770 Diagnostic,0.930134,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,...,0.930134,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134


In [75]:
# MR3 results
clustering(mr3duplicateFeat, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


In [76]:
# Original results
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ********PASS********

**No changes**

# MR 4 - Removal of Instance(s): VD

**PIQUE**: Removal of measures 

## MR 4.1 - Removal of instance from one cluster

If an instance from a cluster is removed for the follow-up input, it should not have any effect on changing the results for the remaining inputs

In [77]:
# Copy data
mr4removeInstanceS = mrData.copy()

# We remove instnce from cluster 5, which initially had 3 insances, now we expect 2 and no changes to other clusters
mr4removeInstanceS = mr4removeInstanceS.drop('CVE-CWE-502 Diagnostic')
mr4removeInstanceS.shape

(53, 650)

In [78]:
clustering(mr4removeInstanceS, 2)


Data points in Cluster 2:

 ['CWE-676 Weakness Diagnostic', 'CWE-190 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 0    35
3     6
1     5
5     3
4     2
2     2
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        0
1  CVE-CWE-125 Diagnostic        1
2  CVE-CWE-189 Diagnostic        3
3   CVE-CWE-19 Diagnostic        3
4  CVE-CWE-190 Diagnostic        1


In [79]:
clustering(initDataT, 5)


Data points in Cluster 5:

 ['CVE-CWE-502 Diagnostic', 'CVE-CWE-426 Diagnostic', 'CVE-CWE-266 Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ********* FAIL************

**Clusters changed**. The labes might have changed too, but the count of instances in each cluser changed. We expected a cluster with two instances but the ones that that contain two instances, they contain instances of the initial cluser. 

## MR4.2 - Removal of instance from different clusters

In [80]:
# copy data
mr4removeInstanceM = mrData.copy() #M: multiple

# List of indices to remove
# We remove one instance from each original cluster
indices_to_remove = ['CWE-782 Weakness Diagnostic', 
                     'CVE-CWE-94 Diagnostic', 
                     'CWE-415 Weakness Diagnostic', 
                     'CWE-560 Weakness Diagnostic',
                     'CVE-CWE-770 Diagnostic',
                     'CVE-CWE-502 Diagnostic']

# Remove rows with specified indices
mr4removeInstanceM = mr4removeInstanceM.drop(indices_to_remove)
mr4removeInstanceM.shape


(48, 650)

### MR4.2 results

In [81]:
clustering(mr4removeInstanceM, 0)


Data points in Cluster 0:

 ['CVE-CWE-125 Diagnostic', 'CVE-CWE-399 Diagnostic', 'CVE-CWE-787 Diagnostic', 'CVE-CWE-190 Diagnostic', 'CVE-CWE-19 Diagnostic', 'CVE-CWE-59 Diagnostic', 'CVE-CWE-189 Diagnostic', 'CVE-Unknown-Other Diagnostic', 'CVE-CWE-772 Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 1    19
4    12
0     9
5     6
3     1
2     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        4
1  CVE-CWE-125 Diagnostic        0
2  CVE-CWE-189 Diagnostic        0
3   CVE-CWE-19 Diagnostic        0
4  CVE-CWE-190 Diagnostic        0


### Original Results

In [82]:
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ************ FAIL ****

We removed one instance from each cluster. Expected:
    - The difference expected is a drop on one instance in each element
    - No changes on the clusters of the remaining instances

# MR5 - Addition of Uninformative Attribute: VR & VD

**Idea**: For the follow-up input, if a new uninformative feature (i.e., a feature having the same value for all the instances) is added, the output should remain unchanged.

**PIQUE**: Add a software to the benchmark repository with exactly same measures as one the existing software

In [83]:
# copy data
mr3AddUninfoAttr = mrData.copy()

# name the columns -> I'm naming the number if the columns in df
# Get the number of columns in the DataFrame
num_columns = mr3AddUninfoAttr.shape[1]

# Create a list of column names from 0 to num_columns - 1
column_names = [str(i) for i in range(num_columns)]

# Assign the list of column names to the DataFrame
mr3AddUninfoAttr.columns = column_names

# Remove the first column and store it in a separate variable
first_column = mr3AddUninfoAttr.iloc[:, 0]

# Remove the first column from the DataFrame
mr3AddUninfoAttr = mr3AddUninfoAttr.drop(mr3AddUninfoAttr.columns[0], axis=1)

# Rename the first column
new_column_name = 'UninfoAttr'
first_column = first_column.rename(new_column_name)

# Add the renamed first column back to the DataFrame
mr3AddUninfoAttr = pd.concat([first_column, mr3AddUninfoAttr], axis=1)

# Display the DataFrame with the renamed first column
print(mr3AddUninfoAttr.shape)
mr3AddUninfoAttr.head(2)

(54, 650)


Unnamed: 0,UninfoAttr,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CWE-560 Weakness Diagnostic,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,...,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862
CVE-CWE-770 Diagnostic,0.930134,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,...,0.930134,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,0.439933


### MR5 results

In [84]:
clustering(mr3AddUninfoAttr, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


### Original results

In [85]:
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ******** PASS ********

**No changes**

# MR6 - Deterministic Output Across Multiple Runs: VR & VD

**Idea**: If a new data point is added, it should be assigned to the same cluster no matter how many times the algorithm under test is executed

**PIQUE**: rerun the models to generate weights

In [86]:
# copy data
mr6DetermOutput = mrData.copy()

# name the columns -> I'm naming the number if the columns in df
# Get the number of columns in the DataFrame
num_columns = mr6DetermOutput.shape[1]

# Create a list of column names from 0 to num_columns - 1
column_names = [str(i) for i in range(num_columns)]

# Assign the list of column names to the DataFrame
mr6DetermOutput.columns = column_names

# reserve the row for test
# Select the last row
dataPointForTest = mr6DetermOutput.iloc[-1]

# Remove the last row from the original DataFrame
mr6DetermOutput = mr6DetermOutput.drop(mr6DetermOutput.index[-1])

# print(dataPointForTest)
print('df without last row', mr6DetermOutput.shape)
 

df without last row (53, 650)


In [87]:
# Fit the model

# Specify the number of runs
num_runs = 100

# Initialize a list to store the predicted clusters across runs
predicted_clusters = []

# initial_cluster = predicted_clusters[0]

for i in range(num_runs):
    # Instantiate the KMeans model
    kmeans = KMeans(n_clusters=6, init='random', random_state=42)

    kmeans.fit(mr6DetermOutput)

    # mr6DetermOutput['Cluster_Label'] = kmeans.fit_predict(mr6DetermOutput)

    #Predict the cluster for the new data point 
    dataPointForTest_cluster = kmeans.predict([dataPointForTest])[0]
#     print("Predicted cluster for the new data point:", dataPointForTest_cluster)
    
    # Store the predicted cluster
    predicted_clusters.append(dataPointForTest_cluster)
    
print(set(predicted_clusters))


{0}


## ******** PASS *******

The cluster of a new datapoint did not chance regardless if how manytime the algolith runs

# MR 7 (10 in P) - Changing the Location of Features: VR & VD

**Idea** If we change the order of features, the clustering result should remain unchanged for both the source and follow-up inputs

**PIQUE** Change in the order software in benchmarl repo


In [88]:
# copy data
mr7ChangFeatLoc = mrData.copy()

# Shuffle the order of columns
# frac=1 specifies that we want to sample the entire DataFrame.
# axis=1 indicates that we are shuffling the columns.
mr7ChangFeatLoc = mr7ChangFeatLoc.sample(frac=1, axis=1)
mr7ChangFeatLoc.head(2)

Unnamed: 0,59,401,283,20,461,375,379,567,239,158,...,277,53,70,483,272,404,580,591,100,389
CWE-560 Weakness Diagnostic,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,...,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862
CVE-CWE-770 Diagnostic,0.439933,0.930134,0.439933,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,...,0.439933,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134


### MR7 results

In [89]:
clustering(mr7ChangFeatLoc, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


### Original results

In [90]:
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ******* PASS *******

In [91]:
## me -  change order of a new software 

# MR 8 (11 in P) - Adding an Informative Attribute:  VD

**Idea**: If a new attribute whose value is strongly associated with each of the clusters i.e., value x1 with c1, x2 with c2,..., and xn with cn, is added to the original data instances, the clustering result should remain the same for both the source and follow-up inputs

Me: Association with two features

**PIQUE**: Add a software whih is associated with others in BR

In [92]:
# copy data
mr8addInformAttr = mrData.copy()

# name the columns -> I'm naming the number if the columns in df
# Get the number of columns in the DataFrame
num_columns = mr8addInformAttr.shape[1]
# Create a list of column names from 0 to num_columns - 1
column_names = [str(i) for i in range(num_columns)]
# Assign the list of column names to the DataFrame
mr8addInformAttr.columns = column_names

# Add a column
mr8addInformAttr['InformAttr'] = mr8addInformAttr['0']/mr8addInformAttr['1']
mr8addInformAttr.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,641,642,643,644,645,646,647,648,649,InformAttr
CWE-560 Weakness Diagnostic,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,...,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,0.506862,1.0
CVE-CWE-770 Diagnostic,0.930134,0.439933,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,...,0.930134,0.439933,0.439933,0.439933,0.439933,0.439933,0.439933,0.930134,0.439933,2.114261


Since K-Mean is sensitive to outliers, we will standardize the original dataset and the new dataset

In [93]:
# Scale original data

scaler = StandardScaler()
OGdata = scaler.fit_transform(mrData)
scaled_OGdata = pd.DataFrame(OGdata, columns=mrData.columns)
# add index
scaled_OGdata.index = mrData.index
scaled_OGdata.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CWE-560 Weakness Diagnostic,-0.719508,-0.421745,-0.580955,0.139553,-0.324995,-0.354497,-0.410507,-0.14837,-0.71386,-0.656712,...,-0.780524,-0.780524,-0.027049,0.221222,0.18367,-0.166425,0.153177,0.140476,-0.753849,-0.235249
CVE-CWE-770 Diagnostic,1.358118,-1.141454,1.274503,-0.247061,-1.04482,-1.065868,-1.216996,-0.71244,-1.772968,1.243841,...,1.31563,1.31563,-0.427814,-0.190736,-0.24885,-0.852886,-0.303422,-0.310548,1.837099,-0.934302


In [94]:
# Scale data with added infoAffr

scaler = StandardScaler()
mr8addInformAttr_data = scaler.fit_transform(mr8addInformAttr)
scaled_mr8addInformAttr_data = pd.DataFrame(mr8addInformAttr_data, columns=mr8addInformAttr.columns)
# add index
scaled_mr8addInformAttr_data.index = mr8addInformAttr.index
scaled_mr8addInformAttr_data.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,641,642,643,644,645,646,647,648,649,InformAttr
CWE-560 Weakness Diagnostic,-0.719508,-0.421745,-0.580955,0.139553,-0.324995,-0.354497,-0.410507,-0.14837,-0.71386,-0.656712,...,-0.780524,-0.027049,0.221222,0.18367,-0.166425,0.153177,0.140476,-0.753849,-0.235249,-0.374045
CVE-CWE-770 Diagnostic,1.358118,-1.141454,1.274503,-0.247061,-1.04482,-1.065868,-1.216996,-0.71244,-1.772968,1.243841,...,1.31563,-0.427814,-0.190736,-0.24885,-0.852886,-0.303422,-0.310548,1.837099,-0.934302,0.924316


### results of scaled original data

In [95]:
clustering(scaled_OGdata, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 2    25
3    14
1     6
4     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        2
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


### Results of scaled data with added infoAttr

In [96]:
clustering(scaled_mr8addInformAttr_data, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 2    25
3    14
1     6
4     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        2
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ******** PASS *****

**No changes on clusters**

# MR 9 (12 in P) - Rows Transformation: VD & VR

**Idea**: If we reverse the order of data points/rows, the clustering result should remain consistent

**PIQUE**: Shuffle measure in BR

In [97]:
# copy data
mr9rowsTransform = mrData.copy()

# Shuffle the rows
# sample(frac=1) shuffles the rows of the DataFrame randomly
mr9rowsTransform = mr9rowsTransform.sample(frac=1)
mr9rowsTransform.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
CVE-CWE-266 Diagnostic,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,...,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519,0.512519
CVE-CWE-787 Diagnostic,0.976588,0.447698,0.976588,0.447698,0.447698,0.447698,0.447698,0.447698,0.447698,0.976588,...,0.976588,0.976588,0.447698,0.447698,0.447698,0.447698,0.447698,0.447698,0.911103,0.447698


### Results of MR 9

In [98]:
clustering(mr9rowsTransform, 0)


Data points in Cluster 0:

 ['CVE-CWE-94 Diagnostic', 'CVE-Unknown-Other Diagnostic', 'CVE-CWE-59 Diagnostic', 'CVE-CWE-399 Diagnostic', 'CVE-CWE-189 Diagnostic', 'CVE-CWE-19 Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 2    17
4    16
1     9
0     6
3     5
5     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        2
1  CVE-CWE-125 Diagnostic        1
2  CVE-CWE-189 Diagnostic        0
3   CVE-CWE-19 Diagnostic        0
4  CVE-CWE-190 Diagnostic        1


In [99]:
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ****** FAIL *******

The clusters changed 

In [None]:
# ## ******** VR - PASS ***********
# When you shuffle the data points, their relative positions in the feature space change. Shuffling the data points can lead to different initializations of the cluster centroids

# ## ******** VD - FAIL ************
# if we change the order of rows/data points, it will not have any affect on existing relationship between the
# data points

# MR 10 (14 in P) - Addition of New Instance(s): VD

**Idea** If we add a new instance(s) to any of the clusters, this addition of new instance(s) may result in the change of centroids (different
from the one found during source execution); thus, changing the final output.

**PIQUE** Addition of new measures

In [100]:
# copy data
mr19addNewInstance = mrData.copy()

# Generate a new row with random values
new_instance = pd.Series(np.random.rand(mr19addNewInstance.shape[1]), index=mr19addNewInstance.columns)

# Add the new row to the DataFrame
mr19addNewInstance = mr19addNewInstance.append(new_instance, ignore_index=True)
print(mr19addNewInstance.shape)
mr19addNewInstance.tail(2)

(55, 650)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,640,641,642,643,644,645,646,647,648,649
53,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,...,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773,0.617773
54,0.704611,0.257289,0.604309,0.994977,0.59737,0.93415,0.005439,0.801282,0.894118,0.412376,...,0.759087,0.514833,0.216341,0.001098,0.972477,0.423917,0.588088,0.587374,0.918762,0.938833


### Results of MR 10

In [101]:
clustering(mr19addNewInstance, 0)


Data points in Cluster 0:

 [20, 44]



***********************************************************
Clusers and numbers of instances they contain

 4    31
1    11
3     6
5     4
0     2
2     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

    Index  Cluster
0      0        4
1      1        1
2      2        4
3      3        4
4      4        1


In [102]:
clustering(initDataT, 0)


Data points in Cluster 0:

 ['CWE-782 Weakness Diagnostic']



***********************************************************
Clusers and numbers of instances they contain

 3    30
4     9
1     6
2     5
5     3
0     1
Name: Cluster, dtype: int64



***********************************************************
First 5 datapoint and their corresponsing clusers

                     Index  Cluster
0  CVE-CWE-120 Diagnostic        3
1  CVE-CWE-125 Diagnostic        4
2  CVE-CWE-189 Diagnostic        1
3   CVE-CWE-19 Diagnostic        1
4  CVE-CWE-190 Diagnostic        4


## ******** PASS ********

The clusters are different as expected

In [None]:
# END