# **Rule to CVE BERT (WITH CLUSTER)**
Uses the BERT model trained for 5 epochs.

# Trained Model

In [None]:

import os
import pandas as pd
import torch
import random
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import transformers
transformers.logging.set_verbosity_error()


# Prefer the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Rule dataset (training split) and the processed CVE corpus.
combined_df = pd.read_csv('combined_dataset_75training.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')

# Rule text columns, listed in the order they are joined into one string below.
rule_text_columns = [
    'triggerTitle',
    'triggerChannelTitle',
    'actionTitle',
    'actionChannelTitle',
    'Processed Title + Description',
    'Best Matched Keywords',
]

# Missing values become empty strings so every cell is a plain str.
combined_df[rule_text_columns] = combined_df[rule_text_columns].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# One space-separated string per rule, and one text per CVE.
combined_texts = [
    " ".join(parts)
    for parts in zip(*(combined_df[column] for column in rule_text_columns))
]
cve_texts = cve_df['Processed_Text'].tolist()

# Define a custom Dataset class
class TextPairDataset(Dataset):
    """Dataset of ``(text1, text2, label)`` triples tokenized as sentence pairs.

    Each item is a dict holding the tokenizer's output tensors for the pair
    (with the leading batch axis removed) plus a scalar float ``labels`` tensor.
    """

    def __init__(self, pairs, tokenizer, max_length=64):
        self.pairs, self.tokenizer, self.max_length = pairs, tokenizer, max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        first_text, second_text, label = self.pairs[idx]
        encoded = self.tokenizer(
            first_text,
            second_text,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        # The tokenizer returns tensors with a leading axis of size 1; drop it
        # so the DataLoader can stack individual samples into a batch.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(float(label), dtype=torch.float)
        return sample

# Load the pretrained uncased BERT tokenizer and encoder, then move the
# encoder onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

# Define the model with an added dense layer
class SimilarityModel(nn.Module):
    """BERT encoder followed by a small regression head.

    The encoder's pooled output (768 features) is projected to 256 features,
    passed through ReLU, and reduced to a single score per example.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dense = nn.Linear(in_features=768, out_features=256)
        self.regressor = nn.Linear(in_features=256, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one score per example, shaped ``(batch, 1)``."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden = self.dense(encoded.pooler_output)
        hidden = self.relu(hidden)
        return self.regressor(hidden)

# Instantiate the model, criterion, optimizer, and scheduler.
# The optimizer is built after the model is moved to `device`, so it tracks
# the parameters that are actually trained.
model = SimilarityModel(bert_model).to(device)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=2e-5)
# NOTE(review): StepLR with step_size=1 and gamma=0.1 multiplies the learning
# rate by 0.1 on every scheduler.step(). The training loop steps it once per
# epoch, so over 5 epochs the rate falls from 2e-5 to 2e-9 and the later epochs
# barely update the weights. Confirm this aggressive decay is intended.
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

# Training parameters
epochs = 5
batch_size = 32

# Prepare data for training.
# Positives are the index-aligned (rule i, CVE i) pairs. Negatives are pairs
# (rule i, CVE j) with i != j, limited to twice the number of positives.
#
# The negatives are sampled as index pairs instead of first materialising the
# full rules x CVEs cross product, whose size grows with the product of both
# dataset lengths even though only 2 x positives of it is ever kept.
def _sample_negative_index_pairs(num_rules, num_cves, count):
    """Return ``count`` distinct ``(i, j)`` index pairs with ``i != j``.

    ``i`` ranges over rule indices and ``j`` over CVE indices. Raises
    ``ValueError`` when fewer than ``count`` such pairs exist.
    """
    # Only indices below min(num_rules, num_cves) have an excluded i == j pair.
    available = num_rules * num_cves - min(num_rules, num_cves)
    if count > available:
        raise ValueError("not enough negative candidates to sample from")
    if count * 2 > available:
        # Small candidate pool (fewer than 2 * count pairs): enumerate it and
        # sample exactly, since rejection sampling would retry too often.
        candidates = [
            (i, j) for i in range(num_rules) for j in range(num_cves) if i != j
        ]
        return random.sample(candidates, count)
    # Large pool: draw random index pairs and discard diagonal or repeated ones.
    chosen = set()
    while len(chosen) < count:
        i, j = random.randrange(num_rules), random.randrange(num_cves)
        if i != j:
            chosen.add((i, j))
    return sorted(chosen)


# zip() stops at the shorter list, matching range(min(len(...), len(...))).
positive_pairs = [
    (rule_text, cve_text, 1) for rule_text, cve_text in zip(combined_texts, cve_texts)
]
num_positive_samples = len(positive_pairs)
num_negative_samples = min(
    num_positive_samples * 2,
    len(combined_texts) * len(cve_texts) - num_positive_samples,
)
negative_pairs = [
    (combined_texts[i], cve_texts[j], 0)
    for i, j in _sample_negative_index_pairs(
        len(combined_texts), len(cve_texts), num_negative_samples
    )
]
pairs = positive_pairs + negative_pairs
random.shuffle(pairs)

# Create the DataLoader
train_dataset = TextPairDataset(pairs, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop with gradient clipping and a per-epoch learning rate scheduler
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass. The model returns (batch, 1); drop only that trailing
        # axis. A bare squeeze() would also remove the batch axis when the
        # final batch holds a single sample, leaving a 0-d prediction that no
        # longer matches the (1,)-shaped labels.
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    # Advance the learning rate schedule once per epoch and report the mean
    # batch loss for the epoch.
    scheduler.step()
    print(f"Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}")

# Persist the fine-tuned encoder and its tokenizer. Only `model.bert` is
# written; the dense/regressor head layers are not saved.
model_dir = 'final_trained_model_similarity5'
os.makedirs(model_dir, exist_ok=True)
model.bert.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print("Model and tokenizer saved.")

# Read both back from disk so later code uses the saved artefacts.
print("Loading model from local path...")
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_model = BertModel.from_pretrained(model_dir)

# Optimal Number of Clusters (Using the Elbow Method)

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Directory holding the fine-tuned encoder and tokenizer.
model_dir = 'final_trained_model_similarity5'

# Reload both and switch the encoder to evaluation mode.
print("Reloading the model and tokenizer from local path...")
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_model = BertModel.from_pretrained(model_dir)
bert_model = bert_model.eval()
print("Model and tokenizer reloaded successfully.")

# Rule dataset (training split) and the processed CVE corpus.
combined_df = pd.read_csv('combined_dataset_75training.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')

# Rule text columns, in the order they are joined into one string per rule.
rule_text_columns = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]

# Missing values become empty strings so every cell is a plain str.
combined_df[rule_text_columns] = combined_df[rule_text_columns].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# One space-separated string per rule, and one text per CVE.
combined_texts = [
    " ".join(parts)
    for parts in zip(*(combined_df[column] for column in rule_text_columns))
]
cve_texts = cve_df['Processed_Text'].tolist()

# Function to compute embeddings using the model
def compute_embeddings(texts, model, tokenizer, max_length=64):
    """Embed each text on its own and stack the pooled outputs.

    Every text is tokenized separately (padded/truncated to ``max_length``),
    run through ``model`` with gradient tracking disabled, and its
    ``pooler_output`` is squeezed and converted to a NumPy array.

    Returns:
        ``np.ndarray`` built from the per-text vectors, in input order.
    """

    def _embed(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding='max_length',
        ).to('cpu')
        with torch.no_grad():
            pooled = model(**encoded).pooler_output
        return pooled.squeeze().cpu().numpy()

    return np.array([_embed(text) for text in texts])

# Embed the rule texts and the CVE texts with the reloaded encoder.
print("Computing embeddings...")
combined_embeddings, cve_embeddings = (
    compute_embeddings(texts, bert_model, tokenizer)
    for texts in (combined_texts, cve_texts)
)
print("Embeddings computed successfully.")

# Fit KMeans on the CVE embeddings for every k in 2..100 and record the
# inertia of each fit; the curve is inspected for an "elbow".
cluster_counts = range(2, 101)
inertias = []

for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(cve_embeddings)
    inertias.append(kmeans.inertia_)
    print(f"k={k}: Inertia={kmeans.inertia_}")

# Inertia as a function of the number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, inertias, color='b', linestyle='-', marker='o')
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()

# Identify the primary elbow point and the 10 steps with the largest inertia drop
from kneed import KneeLocator

kneedle = KneeLocator(cluster_counts, inertias, curve='convex', direction='decreasing')
# KneeLocator reports None when it cannot find an elbow on the curve.
primary_elbow = kneedle.elbow
print(f"Primary elbow point: {primary_elbow}")

# np.diff yields inertia[k+1] - inertia[k], which is negative whenever the
# inertia falls. The largest drops are therefore the MOST negative values,
# i.e. the first entries of an ascending argsort (taking the last ten would
# select the smallest drops instead).
inertia_drops = np.diff(inertias)
largest_drop_positions = np.argsort(inertia_drops)[:10]
# Position p describes the step from cluster_counts[p] to cluster_counts[p + 1];
# as before, the step is reported by the k at which it starts.
top_10_drop_indices = np.asarray(cluster_counts)[largest_drop_positions]
top_10_elbow_ks = sorted(int(k_value) for k_value in top_10_drop_indices)

print(f"Top 10 elbow points based on the largest inertia drops: {top_10_elbow_ks}")

# Highlight primary and top elbow points on the plot
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, inertias, marker='o', linestyle='-', color='b')
if primary_elbow is not None:
    plt.axvline(x=primary_elbow, color='r', linestyle='--', label=f'Primary Elbow at k={primary_elbow}')
for k in top_10_elbow_ks:
    plt.axvline(x=k, color='g', linestyle='--', alpha=0.5)
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method with Highlighted Elbow Points")
plt.grid(True)
if primary_elbow is not None:
    # Only the primary elbow line carries a label, so a legend is only
    # meaningful when that line was drawn.
    plt.legend()
plt.show()

# TRAINING (mapping rule to cve)

In [None]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Locate and load the fine-tuned encoder and its tokenizer.
model_dir = './20C'  # Path to your pre-trained model directory

print(f"Loading model from {model_dir}...")
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_model = BertModel.from_pretrained(model_dir)  # encoder only; used for embedding extraction

# 2. Load the rule dataset (training split) and the processed CVE corpus.
combined_df = pd.read_csv('combined_dataset_75training.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')

# Rule text columns, in the order they are joined into `all_text`.
rule_text_columns = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]

# Missing values become empty strings so every cell is a plain str.
combined_df[rule_text_columns] = combined_df[rule_text_columns].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# Space-separated concatenation of the rule text columns.
combined_df['all_text'] = combined_df[rule_text_columns[0]].str.cat(
    combined_df[rule_text_columns[1:]], sep=" "
)

# 3. Define function to compute embeddings
def compute_embeddings(texts, model, tokenizer, max_length=64):
    """Return the model's pooled output for every text as one NumPy array.

    Texts are processed one at a time: each is tokenized with padding and
    truncation to ``max_length``, passed through ``model`` without gradient
    tracking, and the squeezed ``pooler_output`` is collected in input order.
    """
    tokenizer_options = dict(
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    vectors = []
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(text, **tokenizer_options).to('cpu')
            pooled = model(**encoded).pooler_output
            vectors.append(pooled.squeeze().cpu().numpy())
    return np.array(vectors)

# 4. Embed the rule texts and the CVE texts.
combined_texts = combined_df['all_text'].fillna('').tolist()
cve_texts = cve_df['Processed_Text'].fillna('').tolist()

combined_embeddings, cve_embeddings = (
    compute_embeddings(texts, bert_model, tokenizer)
    for texts in (combined_texts, cve_texts)
)

# 5. Fit KMeans on the CVE embeddings and keep each CVE's cluster label.
num_clusters = 20
kmeans_model_path = './kmeans_model_FRI.pkl'

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cve_clusters = kmeans.fit_predict(cve_embeddings)

# Persist the fitted model so it can be loaded again from disk later.
joblib.dump(kmeans, kmeans_model_path)
print(f"KMeans model saved to {kmeans_model_path}")


# 6. Step 3: Assign every rule embedding to the cluster whose centre has the
# highest cosine similarity to it.
combined_clusters = [
    np.argmax(cosine_similarity([rule_embedding], kmeans.cluster_centers_))
    for rule_embedding in combined_embeddings
]



# Minimum cosine similarity for a rule-to-CVE match to count as correct.
similarity_threshold = 0.7

# 7. Step 4: Check cluster match and similarity, record results
correct_matches = 0
predicted_labels = []
true_labels = []
output_data = {
    'combined_text': [],
    'title': [],
    'desc': [],
    'cve_name': [],
    'cve_text': [],
    'cve_full_text': [],
    'assigned_cluster': [],
    'similarity_score': [],
    'correctly_mapped_cluster': []
}

# For each rule text, find its most similar CVE and compare clusters.
for i, (combined_cluster, combined_text) in enumerate(zip(combined_clusters, combined_texts)):
    combined_embedding = combined_embeddings[i].reshape(1, -1)
    similarities = cosine_similarity(combined_embedding, cve_embeddings).flatten()
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]

    # The cluster of the best matching CVE serves as the reference label.
    true_cve_cluster = cve_clusters[best_match_idx]
    best_cve = cve_df.iloc[best_match_idx]
    cve_name = best_cve['Name'] if 'Name' in cve_df.columns else 'N/A'
    cve_text = best_cve['Processed_Text']
    cve_full_text = best_cve['Text']  # Full text column from CVE dataset

    # Recompute the flag for EVERY row. Assigning it only inside the `if`
    # left it undefined when the first row failed the check and stuck at 1
    # for every row after the first success.
    is_correct = int(
        combined_cluster == true_cve_cluster and best_similarity >= similarity_threshold
    )
    correct_matches += is_correct

    predicted_labels.append(combined_cluster)
    true_labels.append(true_cve_cluster)

    # Collect data for review
    rule_row = combined_df.iloc[i]
    output_data['combined_text'].append(combined_text)
    output_data['title'].append(rule_row['title'])
    output_data['desc'].append(rule_row['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['cve_full_text'].append(cve_full_text)
    output_data['assigned_cluster'].append(combined_cluster)
    output_data['similarity_score'].append(best_similarity)
    output_data['correctly_mapped_cluster'].append(is_correct)

# 9. Share of rules counted as correct matches, as a percentage.
cluster_accuracy = correct_matches / len(combined_texts) * 100
print(f"Cluster-Based Accuracy: {cluster_accuracy:.2f}%")

# 10. Macro-averaged precision, recall and F1 of the assigned cluster against
# the cluster of the best matching CVE.
precision, recall, f1 = (
    metric(true_labels, predicted_labels, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# 11. Write the per-rule records to CSV for later inspection.
output_file = 'PRE_FRI_TRAIN_CVE_BERTresults.csv'
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file, index=False)
print(f"Cluster-based evaluation results saved to '{output_file}'")

Loading model from ./20C...
KMeans model saved to ./kmeans_model_FRI.pkl
Cluster-Based Accuracy: 82.87%
Precision: 0.67
Recall: 0.66
F1-Score: 0.63
Cluster-based evaluation results saved to 'PRE_FRI_TRAIN_CVE_BERTresults.csv'


# TESTING (Cluster)

In [None]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1. Paths of the fine-tuned encoder and of the saved KMeans model.
model_dir = './20C'
kmeans_model_path = './kmeans_model_FRI.pkl'

print(f"Loading model from {model_dir}...")
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_model = BertModel.from_pretrained(model_dir)  # encoder only; used for embedding extraction

# Rule dataset (testing split) and the processed CVE corpus.
combined_df = pd.read_csv('combined_dataset_25testing.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')

# Rule text columns, in the order they are joined into `all_text`.
rule_text_columns = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]

# Missing values become empty strings so every cell is a plain str.
combined_df[rule_text_columns] = combined_df[rule_text_columns].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# 2. Space-separated concatenation of the rule text columns.
combined_df['all_text'] = combined_df[rule_text_columns[0]].str.cat(
    combined_df[rule_text_columns[1:]], sep=" "
)

# 3. compute embeddings
def compute_embeddings(texts, model, tokenizer, max_length=64):
    """Map each text to its squeezed ``pooler_output`` and stack the results.

    One forward pass is made per text, with gradient tracking disabled. The
    tokenizer pads and truncates every text to ``max_length`` tokens.
    """

    def _pooled_vector(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding='max_length',
        ).to('cpu')
        with torch.no_grad():
            output = model(**encoded)
        return output.pooler_output.squeeze().cpu().numpy()

    return np.array(list(map(_pooled_vector, texts)))

# Compute embeddings for the rule texts and the CVE texts
combined_texts = combined_df['all_text'].fillna('').tolist()
cve_texts = cve_df['Processed_Text'].fillna('').tolist()

combined_embeddings = compute_embeddings(combined_texts, bert_model, tokenizer)
cve_embeddings = compute_embeddings(cve_texts, bert_model, tokenizer)

# 5. Load the pre-trained KMeans model BEFORE it is used. Calling it ahead of
# the load raised NameError in a fresh kernel, or silently refitted whatever
# `kmeans` object an earlier cell had left behind.
# NOTE: joblib.load unpickles the file; only load model files you created.
kmeans = joblib.load(kmeans_model_path)
print(f"KMeans model loaded from {kmeans_model_path}")

# Label the CVE embeddings with the loaded clusters. predict() assigns points
# to the existing centres; fit_predict() would re-run the clustering instead
# of evaluating the saved model.
cve_clusters = kmeans.predict(cve_embeddings)

# 6. Assign every rule embedding to the cluster whose centre has the highest
# cosine similarity to it.
combined_clusters = [
    np.argmax(cosine_similarity([rule_embedding], kmeans.cluster_centers_))
    for rule_embedding in combined_embeddings
]


# Minimum cosine similarity for a rule-to-CVE match to count as correct.
similarity_threshold = 0.7

# 7. Check cluster match and similarity, record results
correct_matches = 0
predicted_labels = []
true_labels = []
output_data = {
    'combined_text': [],
    'title': [],
    'desc': [],
    'cve_name': [],
    'cve_text': [],
    'cve_full_text': [],
    'assigned_cluster': [],
    'similarity_score': [],
    'correctly_mapped_cluster': []
}

# For each rule text, find its most similar CVE and compare clusters.
for i, (combined_cluster, combined_text) in enumerate(zip(combined_clusters, combined_texts)):
    combined_embedding = combined_embeddings[i].reshape(1, -1)
    similarities = cosine_similarity(combined_embedding, cve_embeddings).flatten()
    best_match_idx = np.argmax(similarities)
    best_similarity = similarities[best_match_idx]

    # The cluster of the best matching CVE serves as the reference label.
    true_cve_cluster = cve_clusters[best_match_idx]
    best_cve = cve_df.iloc[best_match_idx]
    cve_name = best_cve['Name'] if 'Name' in cve_df.columns else 'N/A'
    cve_text = best_cve['Processed_Text']
    cve_full_text = best_cve['Text']

    # Recompute the flag for EVERY row. Assigning it only inside the `if`
    # left it undefined when the first row failed the check and stuck at 1
    # for every row after the first success.
    is_correct = int(
        combined_cluster == true_cve_cluster and best_similarity >= similarity_threshold
    )
    correct_matches += is_correct

    predicted_labels.append(combined_cluster)
    true_labels.append(true_cve_cluster)

    # Collect data for review
    rule_row = combined_df.iloc[i]
    output_data['combined_text'].append(combined_text)
    output_data['title'].append(rule_row['title'])
    output_data['desc'].append(rule_row['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['cve_full_text'].append(cve_full_text)
    output_data['assigned_cluster'].append(combined_cluster)
    output_data['similarity_score'].append(best_similarity)
    output_data['correctly_mapped_cluster'].append(is_correct)

# 9. Share of rules counted as correct matches, as a percentage.
cluster_accuracy = correct_matches / len(combined_texts) * 100
print(f"Cluster-Based Accuracy: {cluster_accuracy:.2f}%")

# 10. Weighted-average precision, recall and F1 of the assigned cluster
# against the cluster of the best matching CVE.
# NOTE(review): the training-split cell reports macro-averaged scores while
# this cell uses weighted averaging, so the two sets of numbers are not
# directly comparable. Confirm which averaging is intended.
precision, recall, f1 = (
    metric(true_labels, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# 11. Write the per-rule records to CSV for later inspection.
output_file = 'PRE_FRI_TEST_CVE_BERTresults2.csv'
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file, index=False)
print(f"Cluster-based evaluation results saved to '{output_file}'")

Loading model from ./20C...
KMeans model loaded from ./kmeans_model_FRI.pkl
Cluster-Based Accuracy: 81.85%
Precision: 0.84
Recall: 0.82
F1-Score: 0.82
Cluster-based evaluation results saved to 'PRE_FRI_TEST_CVE_BERTresults2.csv'


In [None]:
# Fraction of rules whose assigned cluster equals the cluster of their best
# matching CVE. Relies on `true_labels` and `predicted_labels` left in the
# kernel by the evaluation cell that ran before this one.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"accuracy: {accuracy:.2f}")

accuracy: 0.82


# EXPERT (without Cluster)

In [None]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score


# 1. Load the fine-tuned encoder and its tokenizer.
model_dir = './20C'

print(f"Loading model from {model_dir}...")
tokenizer = BertTokenizer.from_pretrained(model_dir)
bert_model = BertModel.from_pretrained(model_dir)

# Rule dataset (testing split), processed CVE corpus, and the expert
# rule-to-CVE mapping table.
combined_df = pd.read_csv('combined_dataset_25testing.csv')
cve_df = pd.read_csv('Processed_CVE_withSpace.csv')
rule_to_cve_mapping_df = pd.read_csv('Complete_Rule_to_CVE_Mapping_with_Full_Details.csv')

# Rule text columns, in the order they are joined into `all_text`.
rule_text_columns = [
    'Best Matched Keywords',
    'Processed Title + Description',
    'actionChannelTitle',
    'triggerChannelTitle',
]

# Missing values become empty strings so every cell is a plain str.
combined_df[rule_text_columns] = combined_df[rule_text_columns].fillna("").astype(str)
cve_df['Processed_Text'] = cve_df['Processed_Text'].fillna("").astype(str)

# 2. Space-separated concatenation of the rule text columns.
combined_df['all_text'] = combined_df[rule_text_columns[0]].str.cat(
    combined_df[rule_text_columns[1:]], sep=" "
)

# 3.  compute embeddings
def compute_embeddings(texts, model, tokenizer, max_length=64):
    """Encode texts one by one and return their pooled outputs as an array.

    Each text is padded/truncated to ``max_length`` tokens, passed through
    ``model`` with gradients disabled, and represented by the squeezed
    ``pooler_output`` converted to NumPy. Output order follows ``texts``.
    """

    @torch.no_grad()
    def _encode(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding='max_length',
        ).to('cpu')
        return model(**encoded).pooler_output.squeeze().cpu().numpy()

    return np.array([_encode(text) for text in texts])

# 4. Embed the rule texts and the CVE texts.
combined_texts = combined_df['all_text'].fillna('').tolist()
cve_texts = cve_df['Processed_Text'].fillna('').tolist()

combined_embeddings, cve_embeddings = (
    compute_embeddings(texts, bert_model, tokenizer)
    for texts in (combined_texts, cve_texts)
)

# 7. Scores at or above this value are treated as a match further down.
similarity_threshold = 0.7

# 8. Compute keyword similarity (based on keyword overlap)
def keyword_similarity(row_keywords, cve_keywords):
    """Return the Jaccard overlap of two whitespace-tokenized strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared tokens divided by the number of distinct tokens overall.
    Returns 0 when neither string contains a token.
    """
    rule_tokens = set(row_keywords.lower().split())
    cve_tokens = set(cve_keywords.lower().split())
    all_tokens = rule_tokens | cve_tokens
    if not all_tokens:
        return 0
    return len(rule_tokens & cve_tokens) / len(all_tokens)

# Calculate keyword similarities
def compute_keyword_similarities(test_data, cve_data):
    """Return the matrix of keyword overlaps between rules and CVEs.

    Entry ``[i, j]`` is the Jaccard overlap (shared tokens / distinct tokens)
    between the lower-cased, whitespace-split ``'Best Matched Keywords'`` of
    rule ``i`` and the ``'Processed_Text'`` of CVE ``j`` -- the same score
    that ``keyword_similarity`` computes for one pair.

    Args:
        test_data: table with a ``'Best Matched Keywords'`` column of strings.
        cve_data: table with a ``'Processed_Text'`` column of strings.

    Returns:
        ``np.ndarray`` of shape ``(len(test_data), len(cve_data))``.
    """
    # Tokenize every text exactly once. Tokenizing inside the pair loop
    # re-split each CVE text for every rule row.
    rule_token_sets = [
        set(text.lower().split()) for text in test_data['Best Matched Keywords']
    ]
    cve_token_sets = [
        set(text.lower().split()) for text in cve_data['Processed_Text']
    ]

    keyword_similarities = np.zeros((len(rule_token_sets), len(cve_token_sets)))
    for i, rule_tokens in enumerate(rule_token_sets):
        if not rule_tokens:
            # No tokens means no overlap with anything; the row stays zero.
            continue
        for j, cve_tokens in enumerate(cve_token_sets):
            shared = len(rule_tokens & cve_tokens)
            if shared:
                # |A union B| = |A| + |B| - |A intersect B|
                distinct = len(rule_tokens) + len(cve_tokens) - shared
                keyword_similarities[i, j] = shared / distinct
    return keyword_similarities

# Keyword overlap between every rule and every CVE.
keyword_similarities = compute_keyword_similarities(combined_df, cve_df)

# 9. Blend embedding similarity with keyword overlap; the keyword score
# carries `keyword_weight` of the total.
keyword_weight = 0.4
embedding_similarity = cosine_similarity(combined_embeddings, cve_embeddings)
combined_similarity = (1 - keyword_weight) * embedding_similarity + keyword_weight * keyword_similarities

# Min-max rescale over the WHOLE matrix so scores span 0..1.
# NOTE(review): if every entry is equal, the denominator is zero and the
# result is NaN. Confirm whether that case needs a guard.
lowest_score = combined_similarity.min()
highest_score = combined_similarity.max()
combined_similarity = (combined_similarity - lowest_score) / (highest_score - lowest_score)

# 10. For every rule: column of its best CVE and the corresponding score.
best_matches = combined_similarity.argmax(axis=1)
best_scores = combined_similarity.max(axis=1)

# One list per output column; the key order fixes the column order of the
# CSV written later.
output_data = {
    column: []
    for column in (
        'combined_text',
        'title',
        'desc',
        'cve_name',
        'cve_text',
        'cve_full_text',
        'similarity_score',
        'correctly_mapped',
        'logical_match',
    )
}

for i, combined_text in enumerate(combined_texts):
    # Best matching CVE for this rule according to the combined similarity.
    matched_cve = cve_df.iloc[best_matches[i]]
    cve_name = matched_cve['Name'] if 'Name' in cve_df.columns else 'N/A'
    cve_text = matched_cve['Processed_Text']
    cve_full_text = matched_cve['Text']

    # 1 when the best score reaches the threshold, otherwise 0.
    is_correct = 1 if best_scores[i] >= similarity_threshold else 0

    # Expert label taken from the mapping table at the same row position.
    logical_match = rule_to_cve_mapping_df.iloc[i]['Logical Match']

    # Record everything for later review.
    rule_row = combined_df.iloc[i]
    output_data['combined_text'].append(combined_text)
    output_data['title'].append(rule_row['title'])
    output_data['desc'].append(rule_row['desc'])
    output_data['cve_name'].append(cve_name)
    output_data['cve_text'].append(cve_text)
    output_data['cve_full_text'].append(cve_full_text)
    output_data['similarity_score'].append(best_scores[i])
    output_data['correctly_mapped'].append(is_correct)
    output_data['logical_match'].append(logical_match)

# 11. Percentage of rules whose thresholded prediction equals the expert label.
correct_matches = sum(
    1
    for predicted, expected in zip(output_data['correctly_mapped'], output_data['logical_match'])
    if predicted == expected
)
accuracy = correct_matches / len(combined_texts) * 100
print(f"Accuracy based on logical match and similarity: {accuracy:.2f}%")

# 12. Detailed comparison against the expert labels.
print("---------------------------------------------")
print("              --- Calculate ---              ")

expert_labels = output_data['logical_match']
model_labels = output_data['correctly_mapped']

# Per-class precision, recall and F1.
print("Comparison with Expert Labels:")
print(classification_report(expert_labels, model_labels))

# Overall accuracy; from here on `accuracy` holds a fraction, not a percentage.
accuracy = accuracy_score(expert_labels, model_labels)
print(f"Accuracy Compared to Expert Labels: {accuracy * 100:.2f}%")

# Micro-averaged precision, recall and F1.
precision, recall, f1 = (
    metric(expert_labels, model_labels, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision based on labels: {precision:.2f}")
print(f"Recall based on labels: {recall:.2f}")
print(f"F1-Score based on labels: {f1:.2f}")
print(f"Accuracy based on labels: {accuracy:.2f}")

# 13. Write the per-rule records to CSV for later inspection.
output_file = '2Expert_Mapped_CVE_Results.csv'
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file, index=False)
print(f"Evaluation results saved to '{output_file}'")


Loading model from ./20C...
Accuracy based on logical match and similarity: 98.94%
---------------------------------------------
              --- Calculate ---              
Comparison with Expert Labels:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.99      0.99      3978

    accuracy                           0.99      3978
   macro avg       0.50      0.49      0.50      3978
weighted avg       1.00      0.99      0.99      3978

Accuracy Compared to Expert Labels: 98.94%
Precision based on labels: 0.99
Recall based on labels: 0.99
F1-Score based on labels: 0.99
Accuracy based on labels: 0.99
Evaluation results saved to '2Expert_Mapped_CVE_Results.csv'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
