Training for BERT based on CVE descriptions

In [7]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('bert_comparison_train_0.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('bert_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_description_bert_mean'].tolist() for item in balanced if item['cwe'] != 'None'])

test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_description_bert_mean'].tolist() for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_descr.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_descr.joblib')

Epoch 1/40
Epoch 1 - F1 Score: 0.4408
Saved best model
[0.4407841720225724]
Epoch 2/40
Epoch 2 - F1 Score: 0.5075
Saved best model
[0.4407841720225724, 0.5074592719286187]
Epoch 3/40
Epoch 3 - F1 Score: 0.5399
Saved best model
[0.4407841720225724, 0.5074592719286187, 0.5399428219271998]
Epoch 4/40
Epoch 4 - F1 Score: 0.5640
Saved best model
[0.4407841720225724, 0.5074592719286187, 0.5399428219271998, 0.5639548241222992]
Epoch 5/40
Epoch 5 - F1 Score: 0.5780
Saved best model
[0.4407841720225724, 0.5074592719286187, 0.5399428219271998, 0.5639548241222992, 0.5780211731920412]
Epoch 6/40
Epoch 7/40
Epoch 7 - F1 Score: 0.5829
Saved best model
[0.4407841720225724, 0.5074592719286187, 0.5399428219271998, 0.5639548241222992, 0.5780211731920412, 0.5667529985536977, 0.5828855072510987]
Epoch 8/40
Epoch 8 - F1 Score: 0.5914
Saved best model
[0.4407841720225724, 0.5074592719286187, 0.5399428219271998, 0.5639548241222992, 0.5780211731920412, 0.5667529985536977, 0.5828855072510987, 0.591399850241795

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['label_encoder_train_descr.joblib']

Inference for CVE description

In [1]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('bert_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_description_bert_mean'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_descr.joblib')
label_encoder_train=joblib.load('label_encoder_train_descr.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.6478    0.5019    0.5656      1070
         120     0.4245    0.4592    0.4412       196
         125     0.0000    0.0000    0.0000       532
         134     0.2500    0.6316    0.3582        19
         189     0.6364    0.1765    0.2763       119
         190     0.6576    0.6050    0.6302       200
          20     0.3430    0.3519    0.3473       810
         200     0.5386    0.4492    0.4898       590
         203     0.5625    0.3333    0.4186        27
          22     0.4714    0.9228    0.6240       518
         254     0.0476    0.0882    0.0619        34
         255     0.2434    0.5522    0.3379        67
         264     0.5409    0.5785    0.5591       503
         269     0.2424    0.0755    0.1151       106
         276     0.1500    0.4688    0.2273        64
         284     0.3913    0.0732    0.1233       123
         287     0.3730    0.4947    0.4253       285
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training for BERT based on CVE core terms

In [3]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('bert_comparison_train_0.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('bert_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_terms_bert_mean'].tolist() for item in balanced if item['cwe'] != 'None'])

test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_terms_bert_mean'].tolist() for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_terms.joblib')


Epoch 1/40
Epoch 1 - F1 Score: 0.5597
Saved best model
[0.5596929494004437]
Epoch 2/40
Epoch 2 - F1 Score: 0.5834
Saved best model
[0.5596929494004437, 0.5833781887317446]
Epoch 3/40
Epoch 3 - F1 Score: 0.6078
Saved best model
[0.5596929494004437, 0.5833781887317446, 0.6078257810541927]
Epoch 4/40
Epoch 4 - F1 Score: 0.6130
Saved best model
[0.5596929494004437, 0.5833781887317446, 0.6078257810541927, 0.6130070526836545]
Epoch 5/40
Epoch 5 - F1 Score: 0.6190
Saved best model
[0.5596929494004437, 0.5833781887317446, 0.6078257810541927, 0.6130070526836545, 0.6189541120533575]
Epoch 6/40
Epoch 6 - F1 Score: 0.6283
Saved best model
[0.5596929494004437, 0.5833781887317446, 0.6078257810541927, 0.6130070526836545, 0.6189541120533575, 0.6282516929060225]
Epoch 7/40
Epoch 7 - F1 Score: 0.6308
Saved best model
[0.5596929494004437, 0.5833781887317446, 0.6078257810541927, 0.6130070526836545, 0.6189541120533575, 0.6282516929060225, 0.6307582904229094]
Epoch 8/40
Epoch 9/40
Epoch 9 - F1 Score: 0.6360

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['label_encoder_train_terms.joblib']

Inference for core terms

In [4]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('bert_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_terms_bert_mean'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_terms.joblib')
label_encoder_train=joblib.load('label_encoder_train_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.6253    0.5411    0.5802      1070
         120     0.3598    0.3010    0.3278       196
         125     0.0000    0.0000    0.0000       532
         134     0.4000    0.7368    0.5185        19
         189     0.5000    0.1429    0.2222       119
         190     0.5335    0.8350    0.6511       200
          20     0.3557    0.2679    0.3056       810
         200     0.4402    0.5051    0.4704       590
         203     0.1939    0.7037    0.3040        27
          22     0.7524    0.8977    0.8187       518
         254     0.0000    0.0000    0.0000        34
         255     0.1429    0.4478    0.2166        67
         264     0.3842    0.3201    0.3492       503
         269     0.3214    0.1698    0.2222       106
         276     0.1141    0.5938    0.1914        64
         284     0.6154    0.0650    0.1176       123
         287     0.3807    0.5544    0.4514       285
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
