Embed Sentence


In [1]:
import torch
from transformers import BertTokenizer, BertModel
import os
import json
from tqdm import tqdm
import pickle

In [None]:
# Function to compute BERT embeddings
def compute_bert_embedding(text):
    """
    Compute BERT embedding for a given text.
    Returns the mean-pooled embedding as a list.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value for key, value in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        embedding = last_hidden_state.mean(dim=1).squeeze().cpu().tolist()  # Mean pooling
    return embedding


def add_embeddings_to_labels(input_file, output_file):
    """
    Add BERT embeddings
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    with open(input_file, "rb") as file:
        data = json.load(file)

    # note that this is a PoC, but the complete dataset should comprize of all the CVEs and not just the first 1000
    for entry in tqdm(data[:1000]):
        if entry["cwe_class"] not in {"None", "CWE-noinfo", "CWE-Other"}:  
            entry["embedding"] = compute_bert_embedding(entry["Description"])

    # Save updated data to a new file
    with open(output_file, "wb") as file:
        pickle.dump(data, file)

    print(f"Embeddings added and saved to: {output_file}")

In [7]:
# Input and Output Paths
input_file = "cve_extracted_data2023.json"  # Path to the existing output JSON
output_file = "cve_extracted_data2023_with_embeddings.pickle"  # Path to save the updated JSON

print("Loading BERT model...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

add_embeddings_to_labels(input_file, output_file)


Loading BERT model...


100%|██████████| 1000/1000 [01:26<00:00, 11.62it/s]

Embeddings added and saved to: cve_extracted_data2023_with_embeddings.pickle





In [8]:
with open(output_file, "rb") as file:
    data = pickle.load(file)

In [9]:
print(data[0])  # Print the first entry to verify

{'CVE ID': 'CVE-2023-0001', 'Description': 'An information exposure vulnerability in the Palo Alto Networks Cortex XDR agent on Windows devices allows a local system administrator to disclose the admin password for the agent in cleartext, which bad actors can then use to execute privileged cytool commands that disable or uninstall the agent.', 'CWE': ['CWE-319'], 'cwe_class': '311', 'embedding': [-0.4415055215358734, 0.01594029739499092, 0.04627702385187149, 0.03500800579786301, 0.5448458790779114, -0.13969023525714874, 0.17242665588855743, 0.39190492033958435, 0.17194750905036926, -0.05938902124762535, -0.2591797709465027, -0.15562838315963745, -0.29512882232666016, 0.3031870424747467, -0.08647973090410233, 0.1043558418750763, 0.17719487845897675, 0.3072386384010315, -0.2042432725429535, -0.02973429299890995, 0.030058149248361588, -0.046129003167152405, -0.2965058386325836, 0.3241639733314514, 0.5221453309059143, -0.11155488342046738, -0.06877804547548294, 0.1445492058992386, -0.53454

Train validation test

In [10]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import Callback
import joblib

In [11]:
with open('cve_extracted_data2023_with_embeddings.pickle', 'rb') as f1:
    input_file = pickle.load(f1)
print(input_file[1])

{'CVE ID': 'CVE-2023-0002', 'Description': 'A problem with a protection mechanism in the Palo Alto Networks Cortex XDR agent on Windows devices allows a local user to execute privileged cytool commands that disable or uninstall the agent.\n', 'CWE': ['NVD-CWE-Other'], 'cwe_class': 'CWE-Other'}


In [12]:
class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1_weighted = 0.0
        self.best_f1_macro = 0.0
        self.best_model = None
        self.f1_scores_weighted = []
        self.f1_scores_macro = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1_weighted = f1_score(self.y_val, y_val_pred, average='weighted')
        f1_macro = f1_score(self.y_val, y_val_pred, average='macro')
        
        self.f1_scores_weighted.append(f1_weighted)
        self.f1_scores_macro.append(f1_macro)
        
        if f1_weighted > self.best_f1_weighted:
            self.best_f1_weighted = f1_weighted
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score (Weighted): {f1_weighted:.4f}")
            print("Saved best model (Weighted)")
        
        if f1_macro > self.best_f1_macro:
            self.best_f1_macro = f1_macro
            print(f"Epoch {epoch + 1} - F1 Score (Macro): {f1_macro:.4f}")
            print("Saved best model (Macro)")
        
        print(f"Epoch {epoch + 1} - Weighted F1: {f1_weighted:.4f}, Macro F1: {f1_macro:.4f}")


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from collections import Counter
# Filtering and creating the datasets
train = np.array([item['embedding'] for item in input_file if 'embedding' in item])
target = np.array([item['CWE'][0] for item in input_file if 'embedding' in item])

counts = Counter(target)
allowed_classes = {cls for cls, count in counts.items() if count >= 10}
print("Allowed classes:", counts)
train_filtered = []
target_filtered = []

for emb, cls in zip(train, target):
    if cls in allowed_classes:
        train_filtered.append(emb)
        target_filtered.append(cls)
print(len(train_filtered), len(target_filtered))
train_filtered = np.array(train_filtered)
target_filtered = np.array(target_filtered)

X_train, X_temp, y_train, y_temp = train_test_split(
    train_filtered, target_filtered, 
    test_size=0.3, random_state=42, stratify=target_filtered
)


# Splitting temp into validation and test sets with stratification
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Packing the data into a dictionary
data_to_pickle = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test
}

# Saving to a pickle file
pickle_file_path ='train_val_test_split.pickle'
with open(pickle_file_path, 'wb') as f:
    pickle.dump(data_to_pickle, f)

print(f"Data has been saved to {pickle_file_path}")

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

Allowed classes: Counter({'CWE-79': 136, 'CWE-89': 78, 'CWE-787': 42, 'CWE-416': 24, 'CWE-22': 20, 'CWE-94': 18, 'CWE-352': 18, 'CWE-77': 16, 'CWE-125': 13, 'CWE-476': 13, 'CWE-287': 12, 'CWE-862': 11, 'CWE-434': 11, 'CWE-404': 11, 'CWE-400': 8, 'CWE-863': 7, 'CWE-295': 7, 'CWE-78': 6, 'CWE-200': 6, 'CWE-20': 6, 'CWE-843': 6, 'CWE-319': 5, 'CWE-306': 5, 'CWE-119': 5, 'CWE-732': 5, 'CWE-427': 5, 'CWE-521': 5, 'CWE-120': 5, 'CWE-74': 4, 'CWE-601': 4, 'CWE-269': 4, 'CWE-190': 4, 'CWE-610': 3, 'CWE-294': 3, 'CWE-1021': 3, 'CWE-665': 3, 'CWE-502': 3, 'CWE-639': 3, 'CWE-312': 2, 'CWE-367': 2, 'CWE-613': 2, 'CWE-401': 2, 'CWE-203': 2, 'CWE-798': 2, 'CWE-428': 2, 'CWE-532': 2, 'CWE-668': 2, 'CWE-369': 2, 'CWE-770': 2, 'CWE-209': 2, 'CWE-311': 2, 'CWE-129': 2, 'CWE-667': 1, 'CWE-276': 1, 'CWE-681': 1, 'CWE-1284': 1, 'CWE-755': 1, 'CWE-327': 1, 'CWE-330': 1, 'CWE-345': 1, 'CWE-834': 1, 'CWE-835': 1, 'CWE-522': 1, 'CWE-763': 1, 'CWE-470': 1, 'CWE-326': 1, 'CWE-916': 1, 'CWE-754': 1, 'CWE-922': 1,

In [14]:
def create_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

Train the model 3 times to ensure consistency of the results

In [15]:
models = []
f1_scores_list = []  # To store scores for all models
for i in range(3):
    model = create_model(input_dim, output_dim)
    f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))
    model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, 
              validation_data=(X_val, label_encoder_train.transform(y_val)), 
              verbose=1, callbacks=[f1_callback])
    models.append(model)
    
    # Save the F1 scores for this model
    f1_scores = pd.DataFrame({
        'Epoch': list(range(1, len(f1_callback.f1_scores_weighted) + 1)),
        'Weighted F1': f1_callback.f1_scores_weighted,
        'Macro F1': f1_callback.f1_scores_macro
    })
    f1_scores.to_csv(f'f1_scores_model_{i+1}.csv', index=False)
    f1_scores_list.append(f1_scores)

# Save the retrained models
for i, model in enumerate(models):
    model.save(f'/home/simonettos/cybersecurity_dataset/pseudo-learning/base_model{i}.h5')
    print(f"Saved base model ")
joblib.dump(label_encoder_train, 'label_encoder_train_descr.joblib')

Epoch 1/40
Epoch 1 - F1 Score (Weighted): 0.2776
Saved best model (Weighted)
Epoch 1 - F1 Score (Macro): 0.0815
Saved best model (Macro)
Epoch 1 - Weighted F1: 0.2776, Macro F1: 0.0815
Epoch 2/40
Epoch 2 - F1 Score (Weighted): 0.3157
Saved best model (Weighted)
Epoch 2 - F1 Score (Macro): 0.1100
Saved best model (Macro)
Epoch 2 - Weighted F1: 0.3157, Macro F1: 0.1100
Epoch 3/40
Epoch 3 - Weighted F1: 0.3155, Macro F1: 0.1035
Epoch 4/40
Epoch 4 - F1 Score (Weighted): 0.3534
Saved best model (Weighted)
Epoch 4 - F1 Score (Macro): 0.1456
Saved best model (Macro)
Epoch 4 - Weighted F1: 0.3534, Macro F1: 0.1456
Epoch 5/40
Epoch 5 - Weighted F1: 0.3447, Macro F1: 0.1394
Epoch 6/40
Epoch 6 - F1 Score (Weighted): 0.3586
Saved best model (Weighted)
Epoch 6 - Weighted F1: 0.3586, Macro F1: 0.1424
Epoch 7/40
Epoch 7 - F1 Score (Weighted): 0.4040
Saved best model (Weighted)
Epoch 7 - F1 Score (Macro): 0.2233
Saved best model (Macro)
Epoch 7 - Weighted F1: 0.4040, Macro F1: 0.2233
Epoch 8/40
Epoch 

['label_encoder_train_descr.joblib']

Average the performanes

In [16]:
from sklearn.metrics import classification_report
import numpy as np

# Assuming `models` contains all trained models
y_val_encoded = label_encoder_train.transform(y_test)

# Initialize an array to store the cumulative probabilities for averaging
cumulative_probs = np.zeros((len(y_test), len(label_encoder_train.classes_)))

for model in models:
    # Predict probabilities with the current model
    y_pred_probs = model.predict(X_test)
    cumulative_probs += y_pred_probs  # Sum the probabilities for averaging

# Average the probabilities across all models
average_probs = cumulative_probs / len(models)

# Get the final predictions by taking the class with the highest probability
y_pred = np.argmax(average_probs, axis=1)

# Decode the predicted labels to their original form
y_pred_decoded = label_encoder_train.inverse_transform(y_pred)

# Print the classification report
print("Classification Report (Averaged Across Models):")
print(classification_report(y_test, y_pred_decoded, digits=4))

Classification Report (Averaged Across Models):
              precision    recall  f1-score   support

     CWE-125     0.0000    0.0000    0.0000         2
      CWE-22     1.0000    0.3333    0.5000         3
     CWE-287     0.0000    0.0000    0.0000         2
     CWE-352     1.0000    1.0000    1.0000         2
     CWE-404     0.5000    1.0000    0.6667         1
     CWE-416     1.0000    1.0000    1.0000         4
     CWE-434     0.0000    0.0000    0.0000         1
     CWE-476     0.5000    0.5000    0.5000         2
      CWE-77     1.0000    0.3333    0.5000         3
     CWE-787     0.7000    1.0000    0.8235         7
      CWE-79     0.8400    1.0000    0.9130        21
     CWE-862     0.0000    0.0000    0.0000         1
      CWE-89     0.9167    0.9167    0.9167        12
      CWE-94     0.6667    0.6667    0.6667         3

    accuracy                         0.7969        64
   macro avg     0.5802    0.5536    0.5348        64
weighted avg     0.7662    0.796

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
