CNN train rule to cve without cluster

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Parameters
# max_words caps the tokenizer vocabulary and is also the row count of the embedding matrix.
max_words = 10000
# Every token sequence is padded (at the end) or truncated to max_len entries.
max_len = 100
# Width of one embedding row; word2vec vectors are copied into rows of this size below,
# so it has to match the dimensionality of the loaded vectors.
embedding_dim = 300
# Cutoff on the combined similarity score used when the 0/1 training labels are derived.
similarity_threshold = 0.6
# Weight of the keyword-overlap term; the embedding cosine term gets (1 - keyword_weight).
keyword_weight = 0.2

# Loading the datasets
train_data = pd.read_csv('combined_dataset_75training.csv')
cve_data = pd.read_csv('Processed_CVE_withSpace.csv')




# One text per rule: five columns joined with single spaces, missing cells replaced by ''.
# Only this derived Series is NaN-free; the columns inside train_data are left untouched.
train_text = (
    train_data['triggerChannelTitle'].fillna('') + " " +
    train_data['actionChannelTitle'].fillna('') + " " +
    train_data['Processed Title + Description'].fillna('') + " " +
    train_data['Generated Topic Name'].fillna('') + " " +
    train_data['Best Matched Keywords'].fillna('')
)

cve_text = cve_data['Processed_Text'].fillna('')

# Tokenization and Padding
# The tokenizer is fitted on the rule texts and then on the CVE texts, so both corpora
# share one vocabulary; words outside it map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_text.values)
tokenizer.fit_on_texts(cve_text.values)
train_sequences = tokenizer.texts_to_sequences(train_text.values)
cve_sequences = tokenizer.texts_to_sequences(cve_text.values)
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
cve_padded = pad_sequences(cve_sequences, maxlen=max_len, padding='post')

# Load the Word2Vec embeddings
word2vec_path = '/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Create embedding matrix
# Row i holds the vector for the word with tokenizer index i. Rows that are never assigned
# stay all-zero; words missing from word2vec get an unseeded random normal vector, so the
# matrix differs from run to run.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
        else:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))


# CNN definition: frozen pretrained embedding -> Conv1D -> BatchNorm -> global max pooling
# -> Dense(64) -> BatchNorm -> Dropout -> single sigmoid output.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
                            weights=[embedding_matrix], trainable=False)(input_layer)
conv_layer = Conv1D(128, 5, activation='relu')(embedding_layer)
batch_norm1 = BatchNormalization()(conv_layer)
global_pool = GlobalMaxPooling1D()(batch_norm1)
dense_layer = Dense(64, activation='relu')(global_pool)
batch_norm2 = BatchNormalization()(dense_layer)
dropout_layer = Dropout(0.5)(batch_norm2)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

final_model = Model(inputs=input_layer, outputs=output_layer)
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Second model over the same layers that stops at the pooled convolution features.
embedding_extractor = Model(inputs=final_model.input, outputs=global_pool)

# Generate embeddings
# No fit() call has run up to this point in the block, so these features come from the
# layers' initial weights.
train_embeddings = embedding_extractor.predict(train_padded)
cve_embeddings = embedding_extractor.predict(cve_padded)

# Computing keyword similarity
def keyword_similarity(row_keywords, cve_keywords):
    """Return the Jaccard overlap of two whitespace-separated keyword strings.

    Both inputs are lower-cased and split on whitespace; the score is
    ``|intersection| / |union|`` of the resulting token sets.

    Args:
        row_keywords: Keyword text for a rule. ``None`` and NaN (what pandas
            yields for an empty CSV cell) are treated as an empty string;
            other non-string values are converted with ``str``.
        cve_keywords: Text for a CVE, handled the same way.

    Returns:
        A float in [0, 1], or 0 when both inputs contain no tokens.
    """
    def token_set(value):
        # Empty CSV cells reach this function as float NaN (NaN != NaN); calling
        # .lower() on them would raise AttributeError.
        if value is None or (isinstance(value, float) and value != value):
            return set()
        if not isinstance(value, str):
            value = str(value)
        return set(value.lower().split())

    train_keywords_set = token_set(row_keywords)
    cve_keywords_set = token_set(cve_keywords)
    union = train_keywords_set | cve_keywords_set
    if not union:
        return 0
    return len(train_keywords_set & cve_keywords_set) / len(union)


def compute_keyword_similarities(train_data, cve_data):
    """Build the rule-by-CVE matrix of keyword Jaccard similarities.

    Args:
        train_data: DataFrame with a 'Best Matched Keywords' column (one row per rule).
        cve_data: DataFrame with a 'Processed_Text' column (one row per CVE).

    Returns:
        A float array of shape ``(len(train_data), len(cve_data))`` where entry
        ``[i, j]`` is ``|A & B| / |A | B|`` for the lower-cased, whitespace-split
        token sets of rule ``i`` and CVE ``j`` (0 when there is no overlap).
        Missing cells (None / NaN) count as empty text.
    """
    def token_set(value):
        # Empty CSV cells arrive as float NaN; treat them as "no keywords".
        if value is None or (isinstance(value, float) and value != value):
            return frozenset()
        if not isinstance(value, str):
            value = str(value)
        return frozenset(value.lower().split())

    # Tokenize each text once instead of once per (rule, CVE) pair.
    rule_sets = [token_set(value) for value in train_data['Best Matched Keywords']]
    cve_sets = [token_set(value) for value in cve_data['Processed_Text']]

    keyword_similarities = np.zeros((len(rule_sets), len(cve_sets)))
    for i, rule_tokens in enumerate(rule_sets):
        if not rule_tokens:
            continue  # an empty rule set overlaps with nothing; the row stays 0
        for j, cve_tokens in enumerate(cve_sets):
            shared = len(rule_tokens & cve_tokens)
            if shared:
                # |A | B| = |A| + |B| - |A & B|
                keyword_similarities[i, j] = shared / (len(rule_tokens) + len(cve_tokens) - shared)
    return keyword_similarities

# Pairwise keyword-overlap scores between every training rule and every CVE text.
keyword_similarities = compute_keyword_similarities(train_data, cve_data)

# Combine embedding and keyword similarity
# Weighted sum of the CNN-embedding cosine similarity and the keyword-overlap matrix.
combined_similarity = (1 - keyword_weight) * cosine_similarity(train_embeddings, cve_embeddings) + keyword_weight * keyword_similarities
# Per rule (row): position of the highest-scoring CVE and that score.
best_matches = combined_similarity.argmax(axis=1)
best_scores = combined_similarity.max(axis=1)

# Generate train labels based on the best scores
# Label is 1 when the rule's best combined score reaches similarity_threshold, else 0.
train_labels = (best_scores >= similarity_threshold).astype(int)

# Train the CNN model on the train data
print("Training the CNN model...")
final_model.fit(train_padded, train_labels, epochs=8 , batch_size=32)

# Save the trained CNN model
final_model.save("final_cnn_model_for_cve_mapping_0.2w.keras")
print("Final trained model saved as 'final_cnn_model_for_cve_mapping_0.2w.keras'.")

# Calculate the "accuracy"
# NOTE(review): this counts the rows whose best score reaches the threshold, i.e. the same
# condition that produced train_labels. It is therefore the percentage of rows labelled 1,
# not agreement with an independent ground truth.
correct_predictions = (best_scores >= similarity_threshold).sum()
accuracy = correct_predictions / len(best_scores) * 100
print(f"Accuracy based on similarity threshold ({similarity_threshold}): {accuracy:.2f}")

# Add predictions to output data
# best_matches holds positional indices into cve_data, hence .iloc.
train_output_data = train_data.copy()
train_output_data['CVE Name'] = cve_data['Name'].iloc[best_matches].values
train_output_data['CVE Processed Text'] = cve_data['Processed_Text'].iloc[best_matches].values
train_output_data['Similarity Score'] = best_scores
train_output_data['Label'] = train_labels

# Columns to include in the final output dataset; every other column is dropped.
meaningful_columns = [
    'triggerTitle', 'triggerChannelTitle', 'actionTitle',
    'title', 'desc', 'target', 'Best Matched Keywords', 'Generated Topic Name',
    'CVE Name', 'CVE Processed Text', 'Similarity Score', 'Label'
]
train_output_data = train_output_data[meaningful_columns]

# Save results in the output dataset
train_output_data.to_csv("training_mapping_results_cnn_model_0.2w.csv", index=False)
print("Final mapping results saved to 'training_mapping_results_cnn_model_0.2w.csv'.")

cnn test rule to cve without cluster

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score

# Parameters
max_words = 10000
max_len = 100
# Cutoff on the normalised combined score for a predicted label of 1.
similarity_threshold = 0.8
# Weight of the keyword-overlap term; the embedding cosine term gets (1 - keyword_weight).
keyword_weight = 0.6

# Load datasets
test_data = pd.read_csv('combined_dataset_25testing.csv')
cve_data = pd.read_csv('Processed_CVE_withSpace.csv')
expert_data = pd.read_csv('Complete_Rule_to_CVE_Mapping_with_Full_Details.csv')


def _rule_text(frame):
    """Join the five rule columns into one text per row, with missing cells as ''."""
    return (
        frame['triggerChannelTitle'].fillna('') + " " +
        frame['actionChannelTitle'].fillna('') + " " +
        frame['Processed Title + Description'].fillna('') + " " +
        frame['Generated Topic Name'].fillna('') + " " +
        frame['Best Matched Keywords'].fillna('')
    )


# Preprocess text data
test_text = _rule_text(test_data)
cve_text = cve_data['Processed_Text'].fillna('')

# Tokenization and Padding
# The saved model's Embedding layer is indexed by the word ids of the tokenizer used at
# training time. Fitting a fresh tokenizer on the test text would assign different ids to
# the same words and feed the network the wrong embedding rows, so the training vocabulary
# is rebuilt here from the same inputs, in the same order: training rule text, then CVE text.
training_text = _rule_text(pd.read_csv('combined_dataset_75training.csv'))
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(training_text.values)
tokenizer.fit_on_texts(cve_text.values)
test_sequences = tokenizer.texts_to_sequences(test_text.values)
cve_sequences = tokenizer.texts_to_sequences(cve_text.values)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')
cve_padded = pad_sequences(cve_sequences, maxlen=max_len, padding='post')

# Load trained CNN model
model_path = "final_cnn_model_for_cve_mapping_0.2w.keras"
final_model = load_model(model_path)

# Extract embeddings
# Training used the GlobalMaxPooling1D output as the text embedding. A fixed index such as
# layers[-3] lands on a different layer when the head is Dense -> BatchNorm -> Dropout ->
# Dense, so the pooling layer is looked up by type instead.
pooling_layer = next(
    (layer for layer in final_model.layers if layer.__class__.__name__ == 'GlobalMaxPooling1D'),
    None,
)
if pooling_layer is None:
    raise ValueError(f"No GlobalMaxPooling1D layer found in '{model_path}'.")
embedding_extractor = Model(inputs=final_model.input, outputs=pooling_layer.output)
test_embeddings = embedding_extractor.predict(test_padded)
cve_embeddings = embedding_extractor.predict(cve_padded)

# Compute keyword similarity
def keyword_similarity(row_keywords, cve_keywords):
    """Return the Jaccard overlap of two whitespace-separated keyword strings.

    Each input is lower-cased and split on whitespace; the result is the size
    of the shared token set divided by the size of the combined token set.

    Args:
        row_keywords: Keyword text of a rule. ``None`` / NaN (an empty CSV cell
            as read by pandas) counts as empty text; other non-strings are
            converted with ``str``.
        cve_keywords: Text of a CVE, handled the same way.

    Returns:
        A float in [0, 1], or 0 when neither input has any token.
    """
    def token_set(value):
        # NaN is the only float that differs from itself; .lower() on it would raise.
        if value is None or (isinstance(value, float) and value != value):
            return set()
        if not isinstance(value, str):
            value = str(value)
        return set(value.lower().split())

    train_keywords_set = token_set(row_keywords)
    cve_keywords_set = token_set(cve_keywords)
    union = train_keywords_set | cve_keywords_set
    if not union:
        return 0
    return len(train_keywords_set & cve_keywords_set) / len(union)

# Calculate keyword similarities
def compute_keyword_similarities(test_data, cve_data):
    """Build the rule-by-CVE matrix of keyword Jaccard similarities.

    Args:
        test_data: DataFrame with a 'Best Matched Keywords' column (one row per rule).
        cve_data: DataFrame with a 'Processed_Text' column (one row per CVE).

    Returns:
        A float array of shape ``(len(test_data), len(cve_data))``; entry
        ``[i, j]`` is the shared-token count divided by the combined-token
        count of rule ``i`` and CVE ``j`` (0 when nothing is shared).
        Missing cells (None / NaN) count as empty text.
    """
    def token_set(value):
        # Empty CSV cells arrive as float NaN; treat them as "no keywords".
        if value is None or (isinstance(value, float) and value != value):
            return frozenset()
        if not isinstance(value, str):
            value = str(value)
        return frozenset(value.lower().split())

    # Tokenize each text once rather than once per (rule, CVE) pair.
    rule_sets = [token_set(value) for value in test_data['Best Matched Keywords']]
    cve_sets = [token_set(value) for value in cve_data['Processed_Text']]

    keyword_similarities = np.zeros((len(rule_sets), len(cve_sets)))
    for i, rule_tokens in enumerate(rule_sets):
        if not rule_tokens:
            continue  # nothing can overlap with an empty set; the row stays 0
        for j, cve_tokens in enumerate(cve_sets):
            shared = len(rule_tokens & cve_tokens)
            if shared:
                # |A | B| = |A| + |B| - |A & B|
                keyword_similarities[i, j] = shared / (len(rule_tokens) + len(cve_tokens) - shared)
    return keyword_similarities

# Pairwise keyword-overlap scores between every test rule and every CVE text.
keyword_similarities = compute_keyword_similarities(test_data, cve_data)

# Combine embedding and keyword similarity
combined_similarity = (1 - keyword_weight) * cosine_similarity(test_embeddings, cve_embeddings) + keyword_weight * keyword_similarities

# Normalize combined similarity to 0-1 range
# Min-max scaling over the whole matrix. A constant matrix has zero range; dividing by it
# would fill the scores with NaN, so that case is mapped to all zeros instead.
score_min = combined_similarity.min()
score_range = combined_similarity.max() - score_min
if score_range > 0:
    combined_similarity = (combined_similarity - score_min) / score_range
else:
    combined_similarity = np.zeros_like(combined_similarity)
# Per rule (row): position of the highest-scoring CVE and that score.
best_matches = combined_similarity.argmax(axis=1)
best_scores = combined_similarity.max(axis=1)

# Generate labels based on similarity threshold
test_labels = (best_scores >= similarity_threshold).astype(int)

# Map test data to CVE dataset
# best_matches holds positional indices into cve_data, hence .iloc.
test_data['CVE Name'] = cve_data['Name'].iloc[best_matches].values
test_data['CVE Processed Text'] = cve_data['Processed_Text'].iloc[best_matches].values
test_data['Similarity Score'] = best_scores
test_data['Predicted Label'] = test_labels

# Save the results
# The message is built from the same variable as the write so it cannot name another file.
results_path = "test_mapping_results_cnn_model_expert_without_cluster.csv"
test_data.to_csv(results_path, index=False)
print(f"Test mapping results saved to '{results_path}'.")

# Use the expert labels
# NOTE(review): the comparison below is positional; it assumes row i of the expert file
# describes row i of the test file. Only the row counts are verified here.
expert_labels = expert_data['Logical Match']


if len(test_labels) != len(expert_labels):
    raise ValueError("Mismatch in row counts between test data and expert-labeled data.")


print("Comparison with Expert Labels:")
# scikit-learn metrics take (y_true, y_pred): the expert labels are the ground truth.
print(classification_report(expert_labels, test_labels))

# Calculate and print accuracy
accuracy = accuracy_score(expert_labels, test_labels)
print(f"Accuracy Compared to Expert Labels: {accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Support-weighted averages over the label classes.
precision = precision_score(expert_labels, test_labels, average='weighted')
recall = recall_score(expert_labels, test_labels, average='weighted')
f1 = f1_score(expert_labels, test_labels, average='weighted')
accuracy = accuracy_score(expert_labels, test_labels)

print(f"Precision based on labels: {precision:.2f}")
print(f"Recall based on labels: {recall:.2f}")
print(f"F1-Score based on labels: {f1:.2f}")
print(f"Accuracy based on labels: {accuracy:.2f}")

cnn train rule to cve with cluster

In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pickle

# Load the datasets
combined_df = pd.read_csv('combined_dataset_75training.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')

# Ensure all entries are strings
# Missing cells become "" so the column concatenation below never produces NaN.
combined_df['Best Matched Keywords'] = combined_df['Best Matched Keywords'].fillna("").astype(str)
combined_df['Processed Title + Description'] = combined_df['Processed Title + Description'].fillna("").astype(str)
combined_df['actionChannelTitle'] = combined_df['actionChannelTitle'].fillna("").astype(str)
combined_df['triggerChannelTitle'] = combined_df['triggerChannelTitle'].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# Combine columns for rule text
# One text per rule: four columns joined with single spaces, in this order.
combined_texts = (
    combined_df['Best Matched Keywords'] + " " +
    combined_df['Processed Title + Description'] + " " +
    combined_df['actionChannelTitle'] + " " +
    combined_df['triggerChannelTitle']
).tolist()

cve_texts = cve_df['Processed_Text'].tolist()

# Parameters
# max_words caps the tokenizer vocabulary and is the row count of the embedding matrix.
max_words = 10000
# Length every token sequence is padded or truncated to (the model's input width).
max_len = 100
# Width of one embedding row; must match the loaded word2vec vectors copied in below.
embedding_dim = 300

# Tokenizer
# Fitted on rule texts and CVE texts together so both share one vocabulary.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(combined_texts + cve_texts)
word_index = tokenizer.word_index

# Load Word2Vec embeddings
word2vec_path = '/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Create embedding matrix
# Row i holds the vector for the word with tokenizer index i. Rows never assigned stay
# all-zero; words missing from word2vec get an unseeded random normal vector.
# (word_index is re-read here although it was already assigned above.)
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
        else:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))


# Define the CNN model
# Pretrained embedding (left trainable) -> Conv1D -> BatchNorm -> global max pooling
# -> Dense(128) -> BatchNorm -> Dropout -> single sigmoid output.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
                            weights=[embedding_matrix], trainable=True)(input_layer)
conv_layer = Conv1D(128, 5, activation='relu')(embedding_layer)
batch_norm1 = BatchNormalization()(conv_layer)
global_pool = GlobalMaxPooling1D()(batch_norm1)
dense_layer = Dense(128, activation='relu')(global_pool)
batch_norm2 = BatchNormalization()(dense_layer)
dropout_layer = Dropout(0.5)(batch_norm2)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

cnn_model = Model(inputs=input_layer, outputs=output_layer)


# Compile the CNN model
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




# Define the embedding extractor model
# Second model over the same layers that stops at the pooled convolution features.
embedding_extractor = Model(inputs=cnn_model.input, outputs=global_pool)

# Function to compute embeddings using CNN
def compute_cnn_embeddings(texts, model, max_length=100):
    """Encode texts with the module-level tokenizer and run them through ``model``.

    Args:
        texts: Iterable of strings to embed.
        model: Object whose ``predict`` is called on the padded id matrix.
        max_length: Length each id sequence is post-padded or truncated to.

    Returns:
        Whatever ``model.predict`` returns for the padded batch.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post')
    return model.predict(model_input)

# Compute embeddings
print("Computing embeddings using CNN...")
combined_embeddings = compute_cnn_embeddings(combined_texts, embedding_extractor)
cve_embeddings = compute_cnn_embeddings(cve_texts, embedding_extractor)
print("Embeddings computed successfully.")

# Save the CNN model
# The message is built from the same variable as the save call so it cannot drift from it.
cnn_model_file = "cnn_model_with_clustering_final28.keras"
cnn_model.save(cnn_model_file)
print(f"CNN model saved to '{cnn_model_file}'.")

# Perform clustering using KMeans
# Clusters are fitted on the CVE embeddings only; rules are assigned to them afterwards.
optimal_clusters = 20
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
cve_clusters = kmeans.fit_predict(cve_embeddings)
print(f"Clustering completed with {optimal_clusters} clusters.")



# Save the KMeans model
kmeans_model_file = "kmeans_model_final28.pkl"
with open(kmeans_model_file, "wb") as file:
    pickle.dump(kmeans, file)
print(f"KMeans model saved to '{kmeans_model_file}'.")

# Map rules to CVEs and compute evaluation metrics
predicted_labels = []
true_labels = []
output_data = {
    'combined_text': [],
    'title': [],
    'desc': [],
    'cve_name': [],
    'cve_text': [],
    'assigned_cluster': [],
    'similarity_score': [],
    'correctly_mapped_cluster': []
}

# Loop invariants, computed once instead of on every iteration.
optimal_threshold = 0.8
has_cve_name = 'Name' in cve_df.columns
# Cluster of every rule in a single batch call.
combined_clusters = kmeans.predict(combined_embeddings)

correct_matches = 0
for i, combined_embedding in enumerate(combined_embeddings):

    # Predicted cluster for this rule
    combined_cluster = combined_clusters[i]

    # Compute similarities against every CVE and keep the closest one
    similarities = cosine_similarity(combined_embedding.reshape(1, -1), cve_embeddings).flatten()
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]

    # Get the best matching CVE and its cluster
    true_cve_cluster = cve_clusters[best_match_idx]
    cve_name = cve_df.iloc[best_match_idx]['Name'] if has_cve_name else 'N/A'
    cve_text = cve_df.iloc[best_match_idx]['Processed_Text']

    # A mapping counts as correct when the best CVE is similar enough AND lies in the
    # cluster the rule itself was assigned to.
    is_correct = int(best_similarity >= optimal_threshold and combined_cluster == true_cve_cluster)
    correct_matches += is_correct
    predicted_labels.append(combined_cluster)
    true_labels.append(true_cve_cluster)

    # Save details for analysis
    output_data['combined_text'].append(combined_texts[i])
    output_data['title'].append(combined_df.iloc[i]['title'])
    output_data['desc'].append(combined_df.iloc[i]['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['assigned_cluster'].append(combined_cluster)
    output_data['similarity_score'].append(best_similarity)
    output_data['correctly_mapped_cluster'].append(is_correct)



# Save results to CSV
output_df = pd.DataFrame(output_data)
output_file = 'cnn_based_cluster_results_final28.csv'
output_df.to_csv(output_file, index=False)
print(f"Results saved to '{output_file}'.")

cnn test rule to cve with cluster

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


testing_data_path = 'combined_dataset_25testing.csv'
cve_data_path = 'Processed_CVE_withSpace.csv'
# NOTE(review): confirm these two artefact names match the files the training run wrote.
cnn_model_path = 'cnn_model_with_clustering_final128.keras'
kmeans_model_path = 'kmeans_model_final128.pkl'
output_file_path = 'testing_mapping_results_with_clusters_try2.csv'
# Training split; its text (together with the CVE text) defines the model's vocabulary.
training_data_path = 'combined_dataset_75training.csv'

# Load datasets
testing_df = pd.read_csv(testing_data_path)
cve_df = pd.read_csv(cve_data_path)

# Load models
cnn_model = load_model(cnn_model_path)
# NOTE: unpickling runs code stored in the file; only load a pickle from a trusted run.
with open(kmeans_model_path, 'rb') as file:
    kmeans_model = pickle.load(file)

# Parameters
max_words = 10000
max_len = 100

# Rule columns, in the order they are concatenated into one text per rule.
RULE_TEXT_COLUMNS = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]


def _rule_texts(frame):
    """Return one space-joined text per row of ``frame`` built from RULE_TEXT_COLUMNS."""
    parts = [frame[column].fillna("").astype(str) for column in RULE_TEXT_COLUMNS]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + " " + part
    return joined.tolist()


# Preprocess text data
for column in RULE_TEXT_COLUMNS:
    testing_df[column] = testing_df[column].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

testing_texts = _rule_texts(testing_df)

cve_texts = cve_df['Processed_Text'].tolist()

# Tokenizer setup
# The CNN's Embedding layer is indexed by the word ids assigned when the model was built.
# Fitting on the test text would assign different ids to the same words, so the vocabulary
# is rebuilt from the training rule texts plus the CVE texts instead.
training_texts = _rule_texts(pd.read_csv(training_data_path))
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(training_texts + cve_texts)

# Embedding extractor
# The embedding is the GlobalMaxPooling1D output. A fixed index such as layers[-3] picks a
# different layer when the head is Dense -> BatchNorm -> Dropout -> Dense, which would put
# the vectors in another feature space than the one KMeans was fitted on. Look it up by type.
pooling_layer = next(
    (layer for layer in cnn_model.layers if layer.__class__.__name__ == 'GlobalMaxPooling1D'),
    None,
)
if pooling_layer is None:
    raise ValueError(f"No GlobalMaxPooling1D layer found in '{cnn_model_path}'.")
embedding_extractor = Model(inputs=cnn_model.input, outputs=pooling_layer.output)

# Compute embeddings
def compute_cnn_embeddings(texts, tokenizer, model, max_length=100):
    """Turn texts into padded id sequences and return the model's output for them.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        model: Object whose ``predict`` is called on the padded id matrix.
        max_length: Length each id sequence is post-padded or truncated to.

    Returns:
        Whatever ``model.predict`` returns for the padded batch.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return model.predict(pad_sequences(token_ids, maxlen=max_length, padding='post'))

print("Computing embeddings...")
testing_embeddings = compute_cnn_embeddings(testing_texts, tokenizer, embedding_extractor, max_len)
cve_embeddings = compute_cnn_embeddings(cve_texts, tokenizer, embedding_extractor, max_len)
print("Embeddings computed successfully.")

# Mapping
output_data = {
    'testing_text': [],
    'title': [],
    'desc': [],
    'cve_name': [],
    'cve_text': [],
    'assigned_cluster': [],
    'similarity_score': [],
    'correctly_mapped_cluster': []
}
correct_matches = 0
optimal_threshold = 0.8

# Collect true and predicted clusters
predicted_clusters = []
true_clusters = []

# Cluster assignments do not depend on the loop, so they are predicted once per set rather
# than with two single-row predict calls on every iteration.
test_cluster_ids = kmeans_model.predict(testing_embeddings)
cve_cluster_ids = kmeans_model.predict(cve_embeddings)

for i, test_embedding in enumerate(testing_embeddings):
    predicted_cluster = test_cluster_ids[i]

    # Closest CVE by cosine similarity of the embeddings
    similarities = cosine_similarity(test_embedding.reshape(1, -1), cve_embeddings).flatten()
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]
    cve_name = cve_df.iloc[best_match_idx]['Name']
    cve_text = cve_df.iloc[best_match_idx]['Processed_Text']

    # The "true" cluster is the cluster of that closest CVE.
    true_cve_cluster = cve_cluster_ids[best_match_idx]
    # Correct when the match is similar enough AND both fall into the same cluster.
    is_correct = int(best_similarity >= optimal_threshold and predicted_cluster == true_cve_cluster)
    correct_matches += is_correct

    predicted_clusters.append(predicted_cluster)
    true_clusters.append(true_cve_cluster)

    output_data['testing_text'].append(testing_texts[i])
    output_data['title'].append(testing_df.iloc[i]['title'])
    output_data['desc'].append(testing_df.iloc[i]['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['assigned_cluster'].append(predicted_cluster)
    output_data['similarity_score'].append(best_similarity)
    output_data['correctly_mapped_cluster'].append(is_correct)

# Calculate metrics
# These compare each rule's cluster with the cluster of its nearest CVE; the similarity
# threshold is not part of them. Averages are weighted by class support.
accuracy = accuracy_score(true_clusters, predicted_clusters)
precision = precision_score(true_clusters, predicted_clusters, average='weighted')
recall = recall_score(true_clusters, predicted_clusters, average='weighted')
f1 = f1_score(true_clusters, predicted_clusters, average='weighted')

print(f"Cluster-Based Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Save results
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file_path, index=False)
print(f"Results saved to '{output_file_path}'.")


cnn test rule to cve with cluster and expert

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


testing_data_path = 'combined_dataset_25testing.csv'
cve_data_path = 'Processed_CVE_withSpace.csv'
# NOTE(review): confirm these two artefact names match the files the training run wrote.
cnn_model_path = 'cnn_model_with_clustering_final128.keras'
kmeans_model_path = 'kmeans_model_final128.pkl'
output_file_path = 'cnn_testing_mapping_results_with_clusters_expert.csv'
# Training split; its text (together with the CVE text) defines the model's vocabulary.
training_data_path = 'combined_dataset_75training.csv'

# Load expert data
# 'Logical Match' is cast to int and used as the ground-truth label per test rule.
expert_data = pd.read_csv('Complete_Rule_to_CVE_Mapping_with_Full_Details.csv')
expert_labels = expert_data['Logical Match'].astype(int).values

# Load datasets
testing_df = pd.read_csv(testing_data_path)
cve_df = pd.read_csv(cve_data_path)

# Load models
cnn_model = load_model(cnn_model_path)
# NOTE: unpickling runs code stored in the file; only load a pickle from a trusted run.
with open(kmeans_model_path, 'rb') as file:
    kmeans_model = pickle.load(file)

# Parameters
max_words = 10000
max_len = 100

# Rule columns, in the order they are concatenated into one text per rule.
RULE_TEXT_COLUMNS = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]


def _rule_texts(frame):
    """Return one space-joined text per row of ``frame`` built from RULE_TEXT_COLUMNS."""
    parts = [frame[column].fillna("").astype(str) for column in RULE_TEXT_COLUMNS]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + " " + part
    return joined.tolist()


# Preprocess text data
for column in RULE_TEXT_COLUMNS:
    testing_df[column] = testing_df[column].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

testing_texts = _rule_texts(testing_df)

cve_texts = cve_df['Processed_Text'].tolist()

# Tokenizer setup
# The CNN's Embedding layer is indexed by the word ids assigned when the model was built.
# Fitting on the test text would assign different ids to the same words, so the vocabulary
# is rebuilt from the training rule texts plus the CVE texts instead.
training_texts = _rule_texts(pd.read_csv(training_data_path))
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(training_texts + cve_texts)

# Embedding extractor
# The embedding is the GlobalMaxPooling1D output. A fixed index such as layers[-3] picks a
# different layer when the head is Dense -> BatchNorm -> Dropout -> Dense, which would put
# the vectors in another feature space than the one KMeans was fitted on. Look it up by type.
pooling_layer = next(
    (layer for layer in cnn_model.layers if layer.__class__.__name__ == 'GlobalMaxPooling1D'),
    None,
)
if pooling_layer is None:
    raise ValueError(f"No GlobalMaxPooling1D layer found in '{cnn_model_path}'.")
embedding_extractor = Model(inputs=cnn_model.input, outputs=pooling_layer.output)

# Compute embeddings
def compute_cnn_embeddings(texts, tokenizer, model, max_length=100):
    """Tokenize and pad ``texts``, then return ``model.predict`` on the result.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        model: Object whose ``predict`` is called on the padded id matrix.
        max_length: Length each id sequence is post-padded or truncated to.

    Returns:
        Whatever ``model.predict`` returns for the padded batch.
    """
    id_sequences = tokenizer.texts_to_sequences(texts)
    batch = pad_sequences(id_sequences, maxlen=max_length, padding='post')
    return model.predict(batch)

print("Computing embeddings...")
testing_embeddings = compute_cnn_embeddings(testing_texts, tokenizer, embedding_extractor, max_len)
cve_embeddings = compute_cnn_embeddings(cve_texts, tokenizer, embedding_extractor, max_len)
print("Embeddings computed successfully.")

# Mapping
output_data = {
    'testing_text': [],
    'title': [],
    'desc': [],
    'cve_name': [],
    'cve_text': [],
    'assigned_cluster': [],
    'similarity_score': [],
    'correctly_mapped_cluster': []
}
correct_matches = 0
optimal_threshold = 0.9

# Collect true and predicted clusters
predicted_clusters = []
true_clusters = []
correctly_mapped_clusters = []

# Cluster assignments do not depend on the loop, so they are predicted once per set rather
# than with two single-row predict calls on every iteration.
test_cluster_ids = kmeans_model.predict(testing_embeddings)
cve_cluster_ids = kmeans_model.predict(cve_embeddings)

for i, test_embedding in enumerate(testing_embeddings):
    predicted_cluster = test_cluster_ids[i]

    # Closest CVE by cosine similarity of the embeddings
    similarities = cosine_similarity(test_embedding.reshape(1, -1), cve_embeddings).flatten()
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]
    cve_name = cve_df.iloc[best_match_idx]['Name']
    cve_text = cve_df.iloc[best_match_idx]['Processed_Text']

    # The "true" cluster is the cluster of that closest CVE.
    true_cve_cluster = cve_cluster_ids[best_match_idx]

    # Check if the mapping is correct: similar enough AND in the same cluster.
    is_correct = int(best_similarity >= optimal_threshold and predicted_cluster == true_cve_cluster)
    correctly_mapped_clusters.append(is_correct)

    correct_matches += is_correct
    predicted_clusters.append(predicted_cluster)
    true_clusters.append(true_cve_cluster)

    output_data['testing_text'].append(testing_texts[i])
    output_data['title'].append(testing_df.iloc[i]['title'])
    output_data['desc'].append(testing_df.iloc[i]['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['assigned_cluster'].append(predicted_cluster)
    output_data['similarity_score'].append(best_similarity)
    output_data['correctly_mapped_cluster'].append(is_correct)

# Save results
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file_path, index=False)
print(f"Results saved to '{output_file_path}'.")

# The comparison is positional (row i of the expert file vs. row i of the test file), so
# fail with a clear message when the two files do not even have the same number of rows.
# NOTE(review): equal counts do not prove the rows are aligned - confirm the file order.
if len(correctly_mapped_clusters) != len(expert_labels):
    raise ValueError("Mismatch in row counts between test data and expert-labeled data.")

print("Comparison with Expert Labels:")
print(classification_report(expert_labels, correctly_mapped_clusters))

# Calculate metrics
# The 0/1 "correctly mapped" flag is scored against the expert's 0/1 label; averages are
# weighted by class support.
accuracy = accuracy_score(expert_labels, correctly_mapped_clusters)
precision = precision_score(expert_labels, correctly_mapped_clusters, average='weighted')
recall = recall_score(expert_labels, correctly_mapped_clusters, average='weighted')
f1 = f1_score(expert_labels, correctly_mapped_clusters, average='weighted')

print(f"Cluster-Based Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")



cnn train cve to mitre without cluster

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
# train_data is the per-rule CSV with the columns concatenated below (including
# 'CVE Processed Text'); mitre_data holds one row per MITRE technique entry.
train_data = pd.read_csv('training_mapping_results_cnn_model_0.2w.csv')
mitre_data = pd.read_csv('Processed_mitre_recommidtaion.csv')

# Parameters
# max_words caps the tokenizer vocabulary and is the row count of the embedding matrix.
max_words = 10000
# Length every token sequence is padded or truncated to (the model's input width).
max_len = 100
# Width of one embedding row; must match the loaded word2vec vectors copied in below.
embedding_dim = 300
# Cutoff on the similarity score used for labelling.
similarity_threshold = 0.6
# Weight of the keyword-overlap term; the embedding cosine term gets (1 - keyword_weight).
keyword_weight = 0.3
# Number of distinct technique names (not referenced again in this block).
num_techniques = mitre_data['Technique'].nunique()

# Combine relevant columns into text for tokenization
# One text per row: matched CVE text plus the rule's own fields, missing cells as ''.
train_text =(train_data['CVE Processed Text'].fillna('')+" "+
             train_data['triggerChannelTitle'].fillna('') + " " +
             train_data['triggerTitle'].fillna('') + " " +
             train_data['actionTitle'].fillna('') + " " +
             train_data['title'].fillna('') + " " +
             train_data['desc'].fillna('') + " " +
             train_data['Generated Topic Name'].fillna('') + " " +
             train_data['Best Matched Keywords'].fillna('')
)


# One text per MITRE row: processed technique, tactic and mitigation joined by spaces.
mitre_text = (mitre_data['Processed_Technique'].fillna('')+" "+
             mitre_data['Tactic'].fillna('')+" "+
             mitre_data['mitigation'].fillna(''))

# Tokenizer setup
# Fitted on the rule/CVE texts and then on the MITRE texts so both share one vocabulary.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_text.values)
tokenizer.fit_on_texts(mitre_text.values)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_text.values)
mitre_sequences = tokenizer.texts_to_sequences(mitre_text.values)

# Pad sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
mitre_padded = pad_sequences(mitre_sequences, maxlen=max_len, padding='post')

# Load Word2Vec embeddings
word2vec_path = '/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Create embedding matrix
# Row i holds the vector for the word with tokenizer index i. Rows never assigned stay
# all-zero; words missing from word2vec get an unseeded random normal vector.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
        else:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

# Define the model architecture
# Pretrained embedding (left trainable) -> Conv1D -> BatchNorm -> global max pooling
# -> Dropout -> single sigmoid output. There is no hidden Dense layer in this model.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
                            weights=[embedding_matrix], trainable=True)(input_layer)
conv_layer = Conv1D(100, 4, activation='relu')(embedding_layer)
batch_norm1 = BatchNormalization()(conv_layer)
global_pool = GlobalMaxPooling1D()(batch_norm1)
dropout_layer = Dropout(0.5)(global_pool)
output_layer = Dense(1 , activation='sigmoid')(dropout_layer)

final_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Second model over the same layers that stops at the pooled convolution features.
embedding_extractor = Model(inputs=final_model.input, outputs=global_pool)

# Generate embeddings for both the train and MITRE datasets
# No fit() call has run up to this point in the block, so these features come from the
# layers' initial weights.
train_embeddings = embedding_extractor.predict(train_padded)
mitre_embeddings = embedding_extractor.predict(mitre_padded)


# Compute keyword similarity
def keyword_similarity(row_keywords, mitre_keywords):
    """Return the Jaccard overlap of two whitespace-separated keyword strings.

    Each input is lower-cased and split on whitespace; the result is the size
    of the shared token set divided by the size of the combined token set.

    Args:
        row_keywords: Text for a training row. ``None`` / NaN (an empty CSV
            cell as read by pandas) counts as empty text; other non-strings
            are converted with ``str``.
        mitre_keywords: Text for a MITRE technique, handled the same way.

    Returns:
        A float in [0, 1], or 0 when neither input has any token.
    """
    def token_set(value):
        # NaN is the only float that differs from itself; .lower() on it would raise.
        if value is None or (isinstance(value, float) and value != value):
            return set()
        if not isinstance(value, str):
            value = str(value)
        return set(value.lower().split())

    train_keywords_set = token_set(row_keywords)
    mitre_keywords_set = token_set(mitre_keywords)
    union = train_keywords_set | mitre_keywords_set
    if not union:
        return 0
    return len(train_keywords_set & mitre_keywords_set) / len(union)

# Calculate keyword similarities
def compute_keyword_similarities(train_data, mitre_data):
    """Build the matrix of Jaccard word-overlap scores between two tables.

    Args:
        train_data: Table with a 'CVE Processed Text' column.
        mitre_data: Table with a 'Processed_Technique' column.

    Returns:
        A float array of shape (len(train_data), len(mitre_data)) where entry
        [i, j] is |A & B| / |A | B| for the lower-cased, whitespace-split word
        sets of training row i and MITRE row j (0.0 when both are empty).
        Non-string cells (NaN / None) count as empty.
    """
    def _words(text):
        # Missing cells arrive as float NaN / None, which have no .lower().
        return set(text.lower().split()) if isinstance(text, str) else set()

    # Tokenise every text once up front rather than once per (i, j) pair.
    train_sets = [_words(text) for text in train_data['CVE Processed Text']]
    mitre_sets = [_words(text) for text in mitre_data['Processed_Technique']]

    keyword_similarities = np.zeros((len(train_sets), len(mitre_sets)))
    for i, train_set in enumerate(train_sets):
        for j, mitre_set in enumerate(mitre_sets):
            shared = len(train_set & mitre_set)
            total = len(train_set) + len(mitre_set) - shared  # size of the union
            if total:
                keyword_similarities[i, j] = shared / total
    return keyword_similarities

# Word-overlap score for every (training row, MITRE row) pair.
keyword_similarities = compute_keyword_similarities(train_data, mitre_data)

# Combine embedding and keyword similarity: a weighted average in which the
# keyword overlap contributes keyword_weight and the cosine similarity the rest.
embedding_similarity = cosine_similarity(train_embeddings, mitre_embeddings)
combined_similarity = (1 - keyword_weight) * embedding_similarity + keyword_weight * keyword_similarities
best_matches = combined_similarity.argmax(axis=1)
best_scores = combined_similarity.max(axis=1)

# Generate train labels: 1 when a row's best combined score reaches the threshold.
train_labels = (best_scores >= similarity_threshold).astype(int)

# Train the final model
final_model.fit(train_padded, train_labels, epochs=5, batch_size=32)

# Save the final trained model and report the file name that was actually written.
model_path = "final_cnn_model_for_mitre_mapping_with.keras"
final_model.save(model_path)
print(f"Final model saved as '{model_path}'.")

# Cosine similarity between every training embedding and every MITRE embedding.
# NOTE(review): train_embeddings / mitre_embeddings were produced earlier in this
# cell, before final_model.fit ran, so they do not reflect the trained weights -
# confirm whether they should be regenerated here.
similarity_matrix = cosine_similarity(train_embeddings, mitre_embeddings)

# Find the top three matching techniques for each CVE
top_three_matches = []
for i, row_scores in enumerate(similarity_matrix):
    # Positions of the three highest scores, best first.
    top_indices = np.argsort(row_scores)[-3:][::-1]
    top_scores = row_scores[top_indices]

    match = {
        "CVE Name": train_data['CVE Name'].iloc[i],
        "CVE text": train_data['CVE Processed Text'].iloc[i],
    }
    for rank, (mitre_idx, score) in enumerate(zip(top_indices, top_scores), start=1):
        # A candidate below the threshold keeps its score but reports no
        # technique / tactic / mitigation.
        accepted = score >= similarity_threshold
        match[f"Top {rank} Technique"] = mitre_data['Technique'].iloc[mitre_idx] if accepted else None
        match[f"Top {rank} Tactic"] = mitre_data['Tactic'].iloc[mitre_idx] if accepted else None
        match[f"Top {rank} Mitigation"] = mitre_data['mitigation'].iloc[mitre_idx] if accepted else None
        match[f"Top {rank} Similarity Score"] = score
    top_three_matches.append(match)

# Convert the best matches to a DataFrame
top_three_matches_df = pd.DataFrame(top_three_matches)

print(top_three_matches_df.head())

# Write the results and report the file name that was actually written.
output_path = "training_mapping_results_mitre_text.csv"
top_three_matches_df.to_csv(output_path, index=False)

print(f"Final mapping results with top three matched techniques, tactics, mitigations, and similarity scores saved to '{output_path}'.")


cnn test cve to mitre without cluster

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _joined_text(frame, columns):
    """Concatenate the given columns row by row, single-space separated, NaN as ''."""
    text = frame[columns[0]].fillna('')
    for column in columns[1:]:
        text = text + " " + frame[column].fillna('')
    return text


# Restore the CNN written by the training cell.
model = load_model('final_cnn_model_for_mitre_mapping_with.keras')
print("Model loaded successfully.")

# Input tables: test rows, the MITRE catalogue, and the expert labels.
test_data = pd.read_csv('test_mapping_results_cnn_model_expert_without_cluster.csv')
mitre_data = pd.read_csv('Processed_mitre_recommidtaion.csv')
expert_data = pd.read_csv('CVE_Dataset_with_Binary_Logical_Match_Column.csv')

# Parameters
max_words, max_len = 10000, 100
similarity_threshold = 0.9

# One text per row, assembled from the columns below in this order.
test_text = _joined_text(test_data, [
    'CVE Processed Text',
    'triggerChannelTitle',
    'triggerTitle',
    'actionTitle',
    'title',
    'desc',
    'Generated Topic Name',
    'Best Matched Keywords',
])
mitre_text = _joined_text(mitre_data, ['Processed_Technique', 'Tactic', 'mitigation'])

# Tokenizer fitted on the test text first, then on the MITRE text.
# NOTE(review): this is a new Tokenizer fitted on the data in this cell; its
# word -> index mapping may not match the indices the loaded model's Embedding
# layer was trained on - confirm, or persist and reload the training tokenizer.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
for corpus in (test_text, mitre_text):
    tokenizer.fit_on_texts(corpus.values)

# Texts -> integer sequences -> arrays of width max_len (padding at the end).
test_sequences = tokenizer.texts_to_sequences(test_text.values)
mitre_sequences = tokenizer.texts_to_sequences(mitre_text.values)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_len)
mitre_padded = pad_sequences(mitre_sequences, padding='post', maxlen=max_len)

# Sub-model that stops at the third layer from the end of the loaded network;
# its output is used as the embedding of a text.
embedding_extractor = Model(inputs=model.input, outputs=model.layers[-3].output)
test_embeddings = embedding_extractor.predict(test_padded)
mitre_embeddings = embedding_extractor.predict(mitre_padded)

# Entry [i, k] is the cosine similarity of test row i and MITRE row k.
similarity_matrix = cosine_similarity(test_embeddings, mitre_embeddings)

# Collect, for each test CVE, its three most similar MITRE rows (best first).
top_three_matches = []
for i in range(len(test_data)):
    top_indices = np.argsort(similarity_matrix[i])[-3:][::-1]
    top_scores = similarity_matrix[i, top_indices]

    record = {
        "CVE Name": test_data['CVE Name'].iloc[i],
        "CVE Processed Text": test_data['CVE Processed Text'].iloc[i],
    }
    for j in range(3):
        rank = j + 1
        # Below the threshold the score is still recorded, but the technique,
        # tactic and mitigation are left empty.
        passes = top_scores[j] >= similarity_threshold
        record[f"Top {rank} Technique"] = mitre_data['Technique'].iloc[top_indices[j]] if passes else None
        record[f"Top {rank} Tactic"] = mitre_data['Tactic'].iloc[top_indices[j]] if passes else None
        record[f"Top {rank} Mitigation"] = mitre_data['mitigation'].iloc[top_indices[j]] if passes else None
        record[f"Top {rank} Similarity Score"] = top_scores[j]

    top_three_matches.append(record)

# Convert the best matches to a DataFrame
top_three_matches_df = pd.DataFrame(top_three_matches)

# Predicted label per CVE: 1 when the best similarity score reaches the threshold.
top_three_matches_df['Label'] = (
    top_three_matches_df['Top 1 Similarity Score'] >= similarity_threshold
).astype(int)

# Save to CSV file and report the file name that was actually written.
output_path = "cnn_mapping_results_with_top3_techniques_test_final1.csv"
top_three_matches_df.to_csv(output_path, index=False)

print(f"Final mapping results with top three matched techniques, tactics, mitigations, similarity scores, and assigned labels saved to '{output_path}'.")

# NOTE(review): expert labels and predictions are paired purely by row position;
# this assumes both CSV files list the same CVEs in the same order - confirm.
true_labels = expert_data['Logical Match']
predicted_labels = top_three_matches_df['Label']

# Calculate metrics (precision / recall / F1 are support-weighted over the classes)
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


cnn train cve to mitre with cluster

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import joblib
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Input tables: the training CVEs and the MITRE catalogue.
train_data = pd.read_csv('cnn_based_cluster_results_final128.csv')
mitre_data = pd.read_csv('Processed_mitre_recommidtaion.csv')

# Parameters
max_words, max_len, embedding_dim = 10000, 100, 300
similarity_threshold = 0.7
num_clusters = 23
# One output unit per distinct technique name.
num_techniques = mitre_data['Technique'].nunique()

# Raw texts, with missing cells replaced by the empty string.
train_text = train_data['cve_text'].fillna('')
mitre_text = mitre_data['Processed_Technique'].fillna('')

# Tokenizer fitted on the training text first, then on the MITRE text.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
for corpus in (train_text, mitre_text):
    tokenizer.fit_on_texts(corpus.values)

# Texts -> integer sequences for the CNN input.
train_sequences, mitre_sequences = (
    tokenizer.texts_to_sequences(corpus.values) for corpus in (train_text, mitre_text)
)

# Integer sequences -> arrays of width max_len (padding at the end).
train_padded, mitre_padded = (
    pad_sequences(sequences, padding='post', maxlen=max_len)
    for sequences in (train_sequences, mitre_sequences)
)

# Pretrained Word2Vec vectors, stored in the binary word2vec format.
word2vec_path = '/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Initial Embedding weights: row k holds the vector for tokenizer index k.
# Indices at or beyond max_words are ignored; rows no word maps to stay zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        # Pretrained vector when available, Gaussian noise otherwise.
        embedding_matrix[i] = (
            word2vec[word]
            if word in word2vec
            else np.random.normal(scale=0.6, size=(embedding_dim,))
        )


# Model architecture: token ids -> frozen embedding -> 1-D convolution
# (100 filters, width 4) -> batch norm -> global max pool -> dropout ->
# one sigmoid unit per technique.
input_layer = Input(shape=(max_len,))
# trainable=False keeps the embedding fixed at the values in embedding_matrix.
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
                            weights=[embedding_matrix], trainable=False)(input_layer)
conv_layer = Conv1D(100, 4, activation='relu')(embedding_layer)
batch_norm1 = BatchNormalization()(conv_layer)
global_pool = GlobalMaxPooling1D()(batch_norm1)
dropout_layer = Dropout(0.5)(global_pool)
output_layer = Dense(num_techniques, activation='sigmoid')(dropout_layer)

final_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the final model
# NOTE(review): the targets are an all-zero matrix, so this fit can only push
# every output towards 0 - confirm whether real technique labels were intended.
train_labels = np.zeros((len(train_data), num_techniques))
final_model.fit(train_padded, train_labels, epochs=5, batch_size=32)

# Save the final trained model and report the file name that was actually written.
model_path = "final_cnn_model_for_mitre_mapping_23cluster7.keras"
final_model.save(model_path)
print(f"Final model saved as '{model_path}'.")

# Embedding extractor: same layers as final_model, stopping at the
# global-max-pool output (one 100-component vector per text).
embedding_extractor = Model(inputs=final_model.input, outputs=global_pool)

# Generate embeddings for the train dataset
train_embeddings = embedding_extractor.predict(train_padded)

# Apply K-Means clustering on the (unscaled) train embeddings and store each
# row's cluster id in the 'cluster' column.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
train_data['cluster'] = kmeans.fit_predict(train_embeddings)

# Save the KMeans clustering model and report the file name that was actually written.
kmeans_path = 'kmeans_model_for_cve2mitre_23clustering7.pkl'
joblib.dump(kmeans, kmeans_path)
print(f"KMeans clustering model saved as '{kmeans_path}'.")

# Calculate cosine similarity between CVE and MITRE techniques
mitre_embeddings = embedding_extractor.predict(mitre_padded)
similarity_matrix = cosine_similarity(train_embeddings, mitre_embeddings)

# Cluster id of every MITRE row, predicted once rather than once per CVE.
mitre_clusters = kmeans.predict(mitre_embeddings)

# Find the top three matching techniques for each CVE
top_three_matches = []
correctly_mapped_cluster = 0
for i, row_scores in enumerate(similarity_matrix):
    # Positions of the three highest scores, best first.
    top_indices = np.argsort(row_scores)[-3:][::-1]
    top_scores = row_scores[top_indices]

    # A CVE counts as correctly mapped when its best match reaches the
    # threshold and lies in the same K-Means cluster as the CVE itself.
    # The reference cluster is that of the best-matching MITRE row
    # (top_indices[0]); the CVE's own row number is not a MITRE row index.
    predicted_mitre_cluster = train_data['cluster'].iloc[i]
    true_mitre_cluster = mitre_clusters[top_indices[0]]
    is_correct_mitre_cluster = int(
        top_scores[0] >= similarity_threshold
        and predicted_mitre_cluster == true_mitre_cluster
    )
    correctly_mapped_cluster += is_correct_mitre_cluster

    match = {
        "CVE Name": train_data['cve_name'].iloc[i],
        "CVE processed text": train_data['cve_text'].iloc[i],
        "Cluster": predicted_mitre_cluster,
    }
    for rank, (mitre_idx, score) in enumerate(zip(top_indices, top_scores), start=1):
        match[f"Top {rank} Technique"] = mitre_data['Technique'].iloc[mitre_idx]
        match[f"Top {rank} Tactic"] = mitre_data['Tactic'].iloc[mitre_idx]
        match[f"Top {rank} Mitigation"] = mitre_data['mitigation'].iloc[mitre_idx]
        match[f"Top {rank} Similarity Score"] = score
    match["Correctly Mapped Cluster"] = is_correct_mitre_cluster
    top_three_matches.append(match)

# Convert the best matches to a DataFrame
top_three_matches_df = pd.DataFrame(top_three_matches)

print(top_three_matches_df.head())

# Save to a new CSV file with the top three match details and cluster information
top_three_matches_df.to_csv("training_mapping_results_mitre_top_three_matches_with_23clusters7.csv", index=False)

print("Final mapping results with top three matched techniques, tactics, mitigations, similarity scores, and cluster information saved.")

# Share of training CVEs flagged as correctly mapped, as a percentage.
correctly_mapped_percentage = (correctly_mapped_cluster / len(train_data)) * 100

# Print results
print(f"Correctly Mapped Cluster Percentage: {correctly_mapped_percentage:.2f}%")



cnn test cve to mitre with cluster

In [None]:
import numpy as np
import pandas as pd
import joblib
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report

# Test CVEs; missing text becomes the empty string.
test_data = pd.read_csv('testing_mapping_results_with_clusters_try2.csv')
test_data['cve_text'] = test_data['cve_text'].fillna('')

# Expert labels, with an integer copy of the 'Logical Match' column.
expert_data = pd.read_csv('CVE_Dataset_with_Binary_Logical_Match_Column.csv')
expert_data['Logical_Match'] = expert_data['Logical Match'].astype(int)

# Trained CNN written by the training cell.
cnn_model = load_model('final_cnn_model_for_mitre_mapping_23cluster7.keras')
print("CNN model loaded")

# Saved KMeans model.
# NOTE(review): the training cell that writes this file fits KMeans on the
# embeddings of the training CVEs, not on the MITRE data.
kmeans = joblib.load('kmeans_model_for_cve2mitre_23clustering7.pkl')
print("KMeans model loaded")

# Tokenizer fitted on the MITRE technique text only.
# NOTE(review): a tokenizer fitted here may assign different indices than the
# one used when the CNN was trained - confirm, or persist and reload that one.
mitre_data = pd.read_csv('Processed_mitre_recommidtaion.csv')
mitre_text = mitre_data['Processed_Technique'].fillna('')
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(mitre_text)

# Define parameters
max_len = 100
embedding_dim = 300

# Test texts -> integer sequences.
test_sequences = tokenizer.texts_to_sequences(test_data['cve_text'].values)

# Stop early when there are no sequences at all or any row produced no tokens.
has_empty_sequence = any(not sequence for sequence in test_sequences)
if has_empty_sequence or not test_sequences:
    raise ValueError("Test data tokenization resulted in empty sequences.")

# Integer sequences -> array of width max_len (padding at the end).
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_len)

# Generate embeddings for the test dataset using the trained CNN model.
# layers[-2] is the layer feeding the output layer; in the training cell that is
# the Dropout layer sitting on top of the global max pool.
embedding_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.layers[-2].output)
test_embeddings = embedding_extractor.predict(test_padded)

# No longer used in this step; the import is kept in case later code relies on the name.
from sklearn.preprocessing import StandardScaler

# Assign each test CVE to a cluster with the saved KMeans model.
# The model was fitted on raw (unscaled) CNN embeddings, and the MITRE
# embeddings are also passed to it unscaled further down, so the test
# embeddings must be passed in the same raw form. Standardising them first would
# place them in a different feature space than the cluster centres.
test_clusters = kmeans.predict(test_embeddings)

# Minimum cosine similarity used in the matching step below.
similarity_threshold = 0.63

# MITRE technique texts -> integer sequences -> array of width max_len.
mitre_text = mitre_data['Processed_Technique'].fillna('')
mitre_sequences = tokenizer.texts_to_sequences(mitre_text.values)
mitre_padded = pad_sequences(mitre_sequences, padding='post', maxlen=max_len)

# Entry [i, k] is the cosine similarity of test CVE i and MITRE row k.
mitre_embeddings = embedding_extractor.predict(mitre_padded)
similarity_matrix = cosine_similarity(test_embeddings, mitre_embeddings)

# Cluster id of every MITRE row, predicted once rather than once per CVE.
mitre_clusters = kmeans.predict(mitre_embeddings)

# Find the top three matching techniques for each CVE in the test dataset
top_three_matches = []
correctly_mapped_cluster = 0

# Expert label and model decision per CVE, paired by row position.
true_labels = []
predicted_labels = []

for i in range(len(test_data)):
    # Positions of the three highest scores, best first.
    top_indices = np.argsort(similarity_matrix[i])[-3:][::-1]
    top_scores = similarity_matrix[i, top_indices]

    # A CVE counts as correctly mapped when its best match reaches the
    # threshold and lies in the same K-Means cluster as the CVE. The threshold
    # is applied to the best match's own score (index 0), the same match whose
    # cluster is compared, rather than to an index left over from a loop.
    predicted_mitre_cluster = test_clusters[i]
    true_mitre_cluster = mitre_clusters[top_indices[0]]
    is_correct_mitre_cluster = int(
        top_scores[0] >= similarity_threshold
        and predicted_mitre_cluster == true_mitre_cluster
    )
    correctly_mapped_cluster += is_correct_mitre_cluster

    true_labels.append(expert_data['Logical_Match'].iloc[i])
    predicted_labels.append(is_correct_mitre_cluster)

    match = {
        "CVE Name": test_data['cve_name'].iloc[i],
        "CVE processed text": test_data['cve_text'].iloc[i],
        "Cluster": predicted_mitre_cluster,
    }
    for rank, (mitre_idx, score) in enumerate(zip(top_indices, top_scores), start=1):
        match[f"Top {rank} Technique"] = mitre_data['Technique'].iloc[mitre_idx]
        match[f"Top {rank} Tactic"] = mitre_data['Tactic'].iloc[mitre_idx]
        match[f"Top {rank} Mitigation"] = mitre_data['mitigation'].iloc[mitre_idx]
        match[f"Top {rank} Similarity Score"] = score
    match["Correctly Mapped Cluster"] = is_correct_mitre_cluster
    top_three_matches.append(match)

# Convert the best matches to a DataFrame
top_three_matches_df = pd.DataFrame(top_three_matches)

# Save to CSV file
top_three_matches_df.to_csv("testing_mapping_cnn_results_mitre_cluster_final_one.csv", index=False)
print("Final mapping results with top three matched techniques, tactics, mitigations, similarity scores, and cluster information saved.")

# Share of test CVEs flagged as correctly mapped, as a percentage.
correctly_mapped_percentage = (correctly_mapped_cluster / len(test_data)) * 100

# Print results
print(f"Correctly Mapped Cluster Percentage: {correctly_mapped_percentage:.2f}%")

# Evaluate results against the expert labels collected in the matching loop.
# accuracy_score returns a fraction in [0, 1]; scale it so that the value
# printed next to the '%' sign really is a percentage.
accuracy_expert = accuracy_score(true_labels, predicted_labels) * 100
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Cluster-Based Accuracy: {correctly_mapped_percentage:.2f}%")
print(f"Accuracy Compared to Expert Labels: {accuracy_expert:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Compare with the expert dataset: count the rows where the saved decision
# equals the expert's 'Logical Match' value.
correct_matches_expert = (top_three_matches_df['Correctly Mapped Cluster'] == expert_data['Logical Match']).sum()
accuracy_expert = correct_matches_expert / len(top_three_matches_df) * 100

print(f"Accuracy Compared to Expert Labels: {accuracy_expert:.2f}%")
print(classification_report(expert_data['Logical Match'], top_three_matches_df['Correctly Mapped Cluster']))

