Training for BERT based on CVE descriptions

In [2]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('train_bert_comp.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('bert_comparison_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_description_bert_mean'].tolist() for item in balanced if item['cwe'] != 'None'])

test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_description_bert_mean'].tolist() for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_descr.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_descr.joblib')

Epoch 1/40
Epoch 1 - F1 Score: 0.5050
Saved best model
[0.5049995175544824]
Epoch 2/40
Epoch 2 - F1 Score: 0.5541
Saved best model
[0.5049995175544824, 0.5541061113098998]
Epoch 3/40
Epoch 3 - F1 Score: 0.5922
Saved best model
[0.5049995175544824, 0.5541061113098998, 0.5921520336822977]
Epoch 4/40
Epoch 4 - F1 Score: 0.6042
Saved best model
[0.5049995175544824, 0.5541061113098998, 0.5921520336822977, 0.6042119934816004]
Epoch 5/40
Epoch 6/40
Epoch 6 - F1 Score: 0.6160
Saved best model
[0.5049995175544824, 0.5541061113098998, 0.5921520336822977, 0.6042119934816004, 0.6038051709773709, 0.6159929895304281]
Epoch 7/40
Epoch 7 - F1 Score: 0.6300
Saved best model
[0.5049995175544824, 0.5541061113098998, 0.5921520336822977, 0.6042119934816004, 0.6038051709773709, 0.6159929895304281, 0.6300056787976503]
Epoch 8/40
Epoch 9/40
Epoch 9 - F1 Score: 0.6454
Saved best model
[0.5049995175544824, 0.5541061113098998, 0.5921520336822977, 0.6042119934816004, 0.6038051709773709, 0.6159929895304281, 0.6300

['label_encoder_train_descr.joblib']

Inference for CVE description

In [1]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('bert_comparison_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_description_bert_mean'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_descr.joblib')
label_encoder_train=joblib.load('label_encoder_train_descr.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.8668    0.4318    0.5764      1070
         120     0.4715    0.6327    0.5403       196
         125     0.6442    0.8271    0.7243       532
         134     0.3793    0.5789    0.4583        19
         190     0.6996    0.8150    0.7529       200
          20     0.4914    0.3160    0.3847       810
         200     0.7224    0.4763    0.5741       590
         203     0.4375    0.5185    0.4746        27
          22     0.8185    0.8359    0.8271       518
         269     0.1909    0.4340    0.2651       106
         276     0.2143    0.1875    0.2000        64
         287     0.3859    0.6526    0.4850       285
         295     0.4574    0.7284    0.5619        81
         306     0.1643    0.2447    0.1966        94
         312     0.2273    0.2381    0.2326        42
         319     0.6000    0.2941    0.3947        51
         326     0.1935    0.1935    0.1935        31
   

Training for BERT based on CVE terms

In [5]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('train_bert_comp.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('bert_comparison_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_terms_bert_mean'].tolist() for item in balanced if item['cwe'] != 'None'])

test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_terms_bert_mean'].tolist() for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_terms.joblib')


Epoch 1/40
Epoch 1 - F1 Score: 0.5107
Saved best model
[0.5107091911759324]
Epoch 2/40
Epoch 2 - F1 Score: 0.5734
Saved best model
[0.5107091911759324, 0.573418450490414]
Epoch 3/40
Epoch 3 - F1 Score: 0.5818
Saved best model
[0.5107091911759324, 0.573418450490414, 0.5817562128291929]
Epoch 4/40
Epoch 4 - F1 Score: 0.6093
Saved best model
[0.5107091911759324, 0.573418450490414, 0.5817562128291929, 0.6093243620623386]
Epoch 5/40
Epoch 5 - F1 Score: 0.6216
Saved best model
[0.5107091911759324, 0.573418450490414, 0.5817562128291929, 0.6093243620623386, 0.6215900727922613]
Epoch 6/40
Epoch 6 - F1 Score: 0.6258
Saved best model
[0.5107091911759324, 0.573418450490414, 0.5817562128291929, 0.6093243620623386, 0.6215900727922613, 0.6258220901438384]
Epoch 7/40
Epoch 7 - F1 Score: 0.6349
Saved best model
[0.5107091911759324, 0.573418450490414, 0.5817562128291929, 0.6093243620623386, 0.6215900727922613, 0.6258220901438384, 0.6348785953994661]
Epoch 8/40
Epoch 8 - F1 Score: 0.6463
Saved best model

['label_encoder_train_terms.joblib']

Inference for CVE terms

In [1]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('bert_comparison_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_terms_bert_mean'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_terms.joblib')
label_encoder_train=joblib.load('label_encoder_train_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.8693    0.4290    0.5745      1070
         120     0.3403    0.6633    0.4498       196
         125     0.7273    0.8421    0.7805       532
         134     0.8889    0.4211    0.5714        19
         190     0.7054    0.8500    0.7710       200
          20     0.5226    0.2284    0.3179       810
         200     0.5746    0.6136    0.5934       590
         203     0.3077    0.5926    0.4051        27
          22     0.7831    0.8919    0.8339       518
         269     0.3071    0.3679    0.3348       106
         276     0.2553    0.3750    0.3038        64
         287     0.4712    0.6035    0.5292       285
         295     0.5600    0.6914    0.6188        81
         306     0.2632    0.1064    0.1515        94
         312     0.1774    0.2619    0.2115        42
         319     0.3939    0.5098    0.4444        51
         326     0.1613    0.1613    0.1613        31
   