Train based on CVE terms with Phi-2 model

In [5]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('train_phi_comp.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('phi_terms_comp_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_terms_phi'] for item in balanced if item['cwe'] != 'None'])
test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_terms_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_terms.joblib')

Epoch 1/40
Epoch 1 - F1 Score: 0.5195
Saved best model
[0.5194653477557486]
Epoch 2/40
Epoch 2 - F1 Score: 0.5555
Saved best model
[0.5194653477557486, 0.555486880994522]
Epoch 3/40
Epoch 3 - F1 Score: 0.5916
Saved best model
[0.5194653477557486, 0.555486880994522, 0.5916274211500077]
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 6 - F1 Score: 0.6193
Saved best model
[0.5194653477557486, 0.555486880994522, 0.5916274211500077, 0.5703611451313957, 0.5873786773630832, 0.6192570572984668]
Epoch 7/40
Epoch 7 - F1 Score: 0.6295
Saved best model
[0.5194653477557486, 0.555486880994522, 0.5916274211500077, 0.5703611451313957, 0.5873786773630832, 0.6192570572984668, 0.6295347264863698]
Epoch 8/40
Epoch 9/40
Epoch 9 - F1 Score: 0.6395
Saved best model
[0.5194653477557486, 0.555486880994522, 0.5916274211500077, 0.5703611451313957, 0.5873786773630832, 0.6192570572984668, 0.6295347264863698, 0.6239142516700469, 0.639475934761174]
Epoch 10/40
Epoch 10 - F1 Score: 0.6425
Saved best model
[0.5194653477557486,

['label_encoder_train_terms.joblib']

Inference CVE terms with Phi-2 model

In [2]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('phi_terms_comp_test.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_terms_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_terms.joblib')
label_encoder_train=joblib.load('label_encoder_train_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.7793    0.5477    0.6432      1070
         120     0.3464    0.6786    0.4586       196
         125     0.7542    0.7669    0.7605       532
         134     0.6154    0.8421    0.7111        19
         190     0.8333    0.8000    0.8163       200
          20     0.4380    0.3753    0.4043       810
         200     0.6949    0.5288    0.6006       590
         203     0.6000    0.5556    0.5769        27
          22     0.8457    0.8996    0.8718       518
         269     0.3333    0.4623    0.3874       106
         276     0.2500    0.2656    0.2576        64
         287     0.4183    0.7368    0.5337       285
         295     0.6190    0.6420    0.6303        81
         306     0.2979    0.1489    0.1986        94
         312     0.3846    0.1190    0.1818        42
         319     0.5161    0.3137    0.3902        51
         326     0.2000    0.1613    0.1786        31
   

Train based on CVE descriptions with Phi-2 model

In [4]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('train_phi_descr_comparison.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('test_phi_descr_comparison.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_description_phi'] for item in balanced if item['cwe'] != 'None'])
test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_description_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'CWE_classes.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train.joblib')

Epoch 1/40
Epoch 1 - F1 Score: 0.5419
Saved best model
[0.5418684902723947]
Epoch 2/40
Epoch 2 - F1 Score: 0.5774
Saved best model
[0.5418684902723947, 0.5773995173326363]
Epoch 3/40
Epoch 3 - F1 Score: 0.5990
Saved best model
[0.5418684902723947, 0.5773995173326363, 0.5990375093601406]
Epoch 4/40
Epoch 4 - F1 Score: 0.6228
Saved best model
[0.5418684902723947, 0.5773995173326363, 0.5990375093601406, 0.6228266641472865]
Epoch 5/40
Epoch 5 - F1 Score: 0.6349
Saved best model
[0.5418684902723947, 0.5773995173326363, 0.5990375093601406, 0.6228266641472865, 0.6348700823131541]
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 8 - F1 Score: 0.6497
Saved best model
[0.5418684902723947, 0.5773995173326363, 0.5990375093601406, 0.6228266641472865, 0.6348700823131541, 0.6315542857791329, 0.624950752853186, 0.6496610214955467]
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 11 - F1 Score: 0.6542
Saved best model
[0.5418684902723947, 0.5773995173326363, 0.5990375093601406, 0.6228266641472865, 0.6348700823131541, 0.

['label_encoder_train.joblib']

Inference based on CVE description with Phi-2 model

In [5]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('test_phi_descr_comparison.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_description_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('CWE_classes.joblib')
label_encoder_train=joblib.load('label_encoder_train.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.8339    0.4879    0.6156      1070
         120     0.3011    0.8112    0.4392       196
         125     0.5305    0.7857    0.6333       532
         134     0.3784    0.7368    0.5000        19
         190     0.6611    0.7900    0.7198       200
          20     0.4990    0.2963    0.3718       810
         200     0.7568    0.4695    0.5795       590
         203     0.5667    0.6296    0.5965        27
          22     0.7509    0.8436    0.7945       518
         269     0.2808    0.3868    0.3254       106
         276     0.2642    0.2188    0.2393        64
         287     0.4330    0.5895    0.4993       285
         295     0.5714    0.6420    0.6047        81
         306     0.2442    0.4468    0.3158        94
         312     0.2222    0.0952    0.1333        42
         319     0.2015    0.5294    0.2919        51
         326     0.3529    0.3871    0.3692        31
   