Train based on CVE terms with Phi-2 model

In [1]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('phi_terms_comparison_train.pickle', 'rb') as f1:
    balanced = pickle.load(f1)

with open('phi_terms_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_terms_phi'] for item in balanced if item['cwe'] != 'None'])
test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_terms_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'best_model_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train_terms.joblib')

Epoch 1/40
Epoch 1 - F1 Score: 0.4825
Saved best model
[0.48247620185550677]
Epoch 2/40
Epoch 2 - F1 Score: 0.5293
Saved best model
[0.48247620185550677, 0.5293309841606527]
Epoch 3/40
Epoch 3 - F1 Score: 0.5408
Saved best model
[0.48247620185550677, 0.5293309841606527, 0.540845384508342]
Epoch 4/40
Epoch 4 - F1 Score: 0.5578
Saved best model
[0.48247620185550677, 0.5293309841606527, 0.540845384508342, 0.5578148681835082]
Epoch 5/40
Epoch 5 - F1 Score: 0.5708
Saved best model
[0.48247620185550677, 0.5293309841606527, 0.540845384508342, 0.5578148681835082, 0.570842212693147]
Epoch 6/40
Epoch 6 - F1 Score: 0.5753
Saved best model
[0.48247620185550677, 0.5293309841606527, 0.540845384508342, 0.5578148681835082, 0.570842212693147, 0.5753379137801508]
Epoch 7/40
Epoch 7 - F1 Score: 0.5756
Saved best model
[0.48247620185550677, 0.5293309841606527, 0.540845384508342, 0.5578148681835082, 0.570842212693147, 0.5753379137801508, 0.5756110124947769]
Epoch 8/40
Epoch 9/40
Epoch 9 - F1 Score: 0.5807


['label_encoder_train_terms.joblib']

Inference CVE terms with Phi-2 model

In [2]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('phi_terms_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_terms_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_terms.joblib')
label_encoder_train=joblib.load('label_encoder_train_terms.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.7070    0.5075    0.5909      1070
         120     0.4062    0.5306    0.4602       196
         125     0.8287    0.7820    0.8046       532
         134     0.5714    0.8421    0.6809        19
         189     0.5148    0.7311    0.6042       119
         190     0.6957    0.7200    0.7076       200
          20     0.4424    0.1802    0.2561       810
         200     0.5134    0.5508    0.5315       590
         203     0.4848    0.5926    0.5333        27
          22     0.8339    0.9015    0.8664       518
         254     0.0000    0.0000    0.0000        34
         255     0.5517    0.2388    0.3333        67
         264     0.5017    0.2922    0.3693       503
         269     0.2892    0.4528    0.3529       106
         276     0.2035    0.3594    0.2599        64
         284     0.4000    0.0813    0.1351       123
         287     0.4462    0.6842    0.5402       285
   

Train based on CVE descriptions with Phi-2 model

In [4]:
import pickle
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib
from keras.callbacks import Callback
from sklearn.preprocessing import LabelEncoder

class F1ScoreCallback(Callback):
    def __init__(self, X_val, y_val):
        super(F1ScoreCallback, self).__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.best_f1 = 0.0
        self.best_model = None
        self.f1_scores = []

    def on_epoch_end(self, epoch, logs=None):
        y_val_pred = np.argmax(self.model.predict(self.X_val), axis=1)
        f1 = f1_score(self.y_val, y_val_pred, average='weighted')
        self.f1_scores.append(f1)
        

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_model = self.model
            print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
            print("Saved best model")
            print(self.f1_scores)

with open('phi_descr_comparison_train.pickle', 'rb') as f1:
    balanced = pickle.load(f1)
print(balanced[0])
with open('phi_descr_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

train = np.array([item['cve_description_phi'] for item in balanced if item['cwe'] != 'None'])
test = np.array([item['cwe'] for item in balanced if item['cwe'] != 'None'])
np.random.seed(42)
X_train, X_val, y_train, y_val = train_test_split(train,test,test_size=0.1,random_state=42)

X_test = np.array([item['cve_description_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

label_encoder_train = LabelEncoder()
y_train_encoded = label_encoder_train.fit_transform(y_train)
label_encoder_test = LabelEncoder()
y_test_encoded = label_encoder_test.fit_transform(y_test)


input_dim = X_train.shape[1]
output_dim = len(np.unique(y_train))

model = Sequential()
model.add(Dense(128, input_dim=input_dim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

f1_callback = F1ScoreCallback(X_val, label_encoder_train.transform(y_val))

history = model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_val, label_encoder_train.transform(y_val)), verbose=1, callbacks=[f1_callback])

best_model = f1_callback.best_model


# Save the best model
joblib.dump(best_model, 'CWE_classes.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

joblib.dump(label_encoder_train, 'label_encoder_train.joblib')

{'cve_id': 'CVE-1999-0007', 'cve_description': 'Information from SSL-encrypted sessions via PKCS #1.', 'cve_terms': ['SSL Encryption Vulnerability', 'PKCS #1 Exposure', 'Information Disclosure'], 'cwe': '327', 'cwe_class': '327', 'cve_description_phi': [1.087753176689148, 0.26409363746643066, 0.8230777382850647, -0.3850973844528198, -0.7270230054855347, 0.7219997644424438, 0.2268471121788025, 0.16935373842716217, -1.2603179216384888, 1.5379236936569214, 0.23670989274978638, -0.6104186773300171, -1.275844931602478, -1.0017726421356201, 0.15842241048812866, -0.1283402144908905, -0.19288694858551025, 1.2531225681304932, -0.8090196847915649, -0.10320411622524261, -0.9572002291679382, 1.0826053619384766, 0.5547383427619934, -0.0780240148305893, -0.5427049994468689, 0.13996493816375732, -1.0220403671264648, -2.688271999359131, 1.9150744676589966, 1.6416929960250854, 0.14876103401184082, 0.4220753610134125, 0.7890188694000244, 0.6650851964950562, 1.0920637845993042, -2.4817707538604736, 0.248

['label_encoder_train.joblib']

Inference based on CVE description with Phi-2 model

In [6]:
import pickle
import numpy as np
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
# Save the best model
with open('phi_descr_comparison_test_0.pickle', 'rb') as f2:
    unbalanced = pickle.load(f2)

X_test = np.array([item['cve_description_phi'] for item in unbalanced if item['cwe'] != 'None'])
y_test = np.array([item['cwe'] for item in unbalanced if item['cwe'] != 'None'])

best_model=joblib.load('best_model_descr.joblib')
label_encoder_train=joblib.load('label_encoder_train_descr.joblib')

# Make predictions on the test set
y_pred_probs = best_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

y_pred_original = label_encoder_train.inverse_transform(y_pred)

print("Classification Report:\n", classification_report(y_test, y_pred_original, digits=4))

Classification Report:
               precision    recall  f1-score   support

         119     0.8935    0.3607    0.5140      1070
         120     0.3219    0.8112    0.4609       196
         125     0.5465    0.7293    0.6248       532
         134     0.5000    0.6316    0.5581        19
         189     0.6792    0.6050    0.6400       119
         190     0.6063    0.7700    0.6784       200
          20     0.4309    0.1926    0.2662       810
         200     0.7128    0.3407    0.4610       590
         203     0.4667    0.5185    0.4912        27
          22     0.8785    0.7819    0.8274       518
         254     0.0000    0.0000    0.0000        34
         255     0.2966    0.5224    0.3784        67
         264     0.5368    0.6521    0.5889       503
         269     0.3564    0.3396    0.3478       106
         276     0.2110    0.3594    0.2659        64
         284     0.4318    0.1545    0.2275       123
         287     0.3613    0.4982    0.4189       285
   