In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        # List all keys
        print(f"Keys in {file_path}: {list(hdf.keys())}")

        # Load the datasets into pandas Series
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    # Create a DataFrame with the boolean columns
    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    # Create a new column 'Class' based on the boolean columns
    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1  # In case none of the columns are True

    df['Class'] = df.apply(assign_class, axis=1)

    # Now eliminate rows where Class = -1 and the corresponding functionSource
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    # Combine the filtered Class and functionSource data
    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)

    return df_final

In [2]:
# Paths to your HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_validate.hdf5'

# Process the training dataset
print("Processing Training Dataset:")
df_train_final = load_and_process_hdf5(train_hdf5_file_path)

# Downsample training set to 20,000 samples with the given proportions
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}  # Based on your request
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

print("Final Training Data Class Distribution:")
print(df_train_downsampled['Class'].value_counts())

# Process the validation dataset and downsample to 3,900 samples
print("\nProcessing Validation Dataset:")
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

print("Final Validation Data Class Distribution:")
print(df_val_downsampled['Class'].value_counts())

# Process the test dataset and downsample to 3,900 samples
print("\nProcessing Test Dataset:")
df_test_final = load_and_process_hdf5(test_hdf5_file_path)
test_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_test_downsampled = pd.DataFrame()
for cls, n_samples in test_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

print("Final Test Data Class Distribution:")
print(df_test_downsampled['Class'].value_counts())


Processing Training Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_train.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Training Data Class Distribution:
Class
0    5942
1    5777
4    5582
3    2755
2     249
Name: count, dtype: int64

Processing Validation Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_validate.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Validation Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64

Processing Test Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_test.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Test Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd

# Custom Dataset class to handle encodings and labels
class CodeBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert inputs and labels to tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize the function source code for training, validation, and test datasets
def tokenize_function(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Load the pre-trained CodeBERT tokenizer and model
codebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=5)

# Load the pre-trained GraphCodeBERT tokenizer and model
graphcodebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
graphcodebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=5)

# Apply the tokenization on the datasets using both tokenizers (CodeBERT and GraphCodeBERT)
train_encodings_codebert = tokenize_function(df_train_downsampled, codebert_tokenizer)
train_encodings_graphcodebert = tokenize_function(df_train_downsampled, graphcodebert_tokenizer)

val_encodings_codebert = tokenize_function(df_val_downsampled, codebert_tokenizer)
val_encodings_graphcodebert = tokenize_function(df_val_downsampled, graphcodebert_tokenizer)

test_encodings_codebert = tokenize_function(df_test_downsampled, codebert_tokenizer)
test_encodings_graphcodebert = tokenize_function(df_test_downsampled, graphcodebert_tokenizer)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Create Dataset objects for training, validation, and test datasets
train_dataset_codebert = CodeBERTDataset(train_encodings_codebert, train_labels)
train_dataset_graphcodebert = CodeBERTDataset(train_encodings_graphcodebert, train_labels)

val_dataset_codebert = CodeBERTDataset(val_encodings_codebert, val_labels)
val_dataset_graphcodebert = CodeBERTDataset(val_encodings_graphcodebert, val_labels)

test_dataset_codebert = CodeBERTDataset(test_encodings_codebert, test_labels)
test_dataset_graphcodebert = CodeBERTDataset(test_encodings_graphcodebert, test_labels)

# Define compute_metrics function for both models
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)

    # Precision, Recall, F1
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)

    # Compute ROC-AUC for multi-class classification
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )
    
    # Full classification report
    report_dict = classification_report(p.label_ids, preds, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)
    
    # Convert to DataFrame for easy rounding and formatting
    report_df = pd.DataFrame(report_dict).transpose()
    report_df = report_df.round(4)  # Round to 4 decimal places
    
    print("\nFinal Classification Report (rounded to 4 decimal places):\n", report_df)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none"  # Disable Weights & Biases (wandb) logging
)

# Trainer setup for CodeBERT
trainer_codebert = Trainer(
    model=codebert_model,
    args=training_args,
    train_dataset=train_dataset_codebert,
    eval_dataset=val_dataset_codebert,
    tokenizer=codebert_tokenizer,
    compute_metrics=compute_metrics
)

# Trainer setup for GraphCodeBERT
trainer_graphcodebert = Trainer(
    model=graphcodebert_model,
    args=training_args,
    train_dataset=train_dataset_graphcodebert,
    eval_dataset=val_dataset_graphcodebert,
    tokenizer=graphcodebert_tokenizer,
    compute_metrics=compute_metrics
)

# Train both models
trainer_codebert.train()
trainer_graphcodebert.train()

# Get predictions from both models on the test dataset
codebert_preds = trainer_codebert.predict(test_dataset_codebert).predictions
graphcodebert_preds = trainer_graphcodebert.predict(test_dataset_graphcodebert).predictions

# Apply weighted averaging (e.g., 30% CodeBERT, 70% GraphCodeBERT)
final_probs = 0.3 * torch.softmax(torch.tensor(codebert_preds), dim=1) + 0.7 * torch.softmax(torch.tensor(graphcodebert_preds), dim=1)

# Final predictions
final_predictions = torch.argmax(final_probs, dim=1).numpy()

# Evaluate the ensemble predictions
accuracy = accuracy_score(test_labels, final_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, final_predictions, average='weighted')
classification_report_final = classification_report(test_labels, final_predictions, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'])

# Print results
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(f"Ensemble Model Precision: {precision:.4f}")
print(f"Ensemble Model Recall: {recall:.4f}")
print(f"Ensemble Model F1-Score: {f1:.4f}")
print("\nFinal Classification Report:\n", classification_report_final)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.8105,0.652909,0.771795,0.780145,0.771795,0.769943,0.908326
2,0.6045,0.622644,0.787949,0.794942,0.787949,0.786687,0.924352
3,0.458,0.633464,0.78641,0.783836,0.78641,0.784055,0.921164


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8791  0.8599    0.8694  1142.0000
Class 1          0.8620  0.7898    0.8243  1099.0000
Class 2          0.4000  0.0377    0.0690    53.0000
Class 3          0.7581  0.6093    0.6756   535.0000
Class 4          0.6204  0.7768    0.6899  1071.0000
accuracy         0.7718  0.7718    0.7718     0.7718
macro avg        0.7039  0.6147    0.6256  3900.0000
weighted avg     0.7801  0.7718    0.7699  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8648  0.8792    0.8719  1142.0000
Class 1          0.8419  0.8480    0.8450  1099.0000
Class 2          0.5128  0.3774    0.4348    53.0000
Class 3          0.8564  0.5794    0.6912   535.0000
Class 4          0.6556  0.7535    0.7011  1071.0000
accuracy         0.7879  0.7879    0.7879     0.7879
macro avg        0.7463  0.6875    0.7088  3900.0000
weighted avg     0.7949  0.7879    0.7867  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8737  0.8783    0.8760  1142.0000
Class 1          0.7950  0.8681    0.8299  1099.0000
Class 2          0.5714  0.3774    0.4545    53.0000
Class 3          0.7495  0.6710    0.7081   535.0000
Class 4          0.7042  0.6825    0.6932  1071.0000
accuracy         0.7864  0.7864    0.7864     0.7864
macro avg        0.7388  0.6955    0.7124  3900.0000
weighted avg     0.7838  0.7864    0.7841  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.7369,0.624847,0.778462,0.785542,0.778462,0.778222,0.914016
2,0.5445,0.603935,0.791282,0.79171,0.791282,0.787884,0.922189
3,0.3863,0.603457,0.798462,0.795585,0.798462,0.795643,0.922423


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8561  0.8853    0.8704  1142.0000
Class 1          0.8807  0.7925    0.8343  1099.0000
Class 2          0.5000  0.2830    0.3614    53.0000
Class 3          0.7582  0.6037    0.6722   535.0000
Class 4          0.6405  0.7619    0.6959  1071.0000
accuracy         0.7785  0.7785    0.7785     0.7785
macro avg        0.7271  0.6653    0.6869  3900.0000
weighted avg     0.7855  0.7785    0.7782  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8495  0.8897    0.8691  1142.0000
Class 1          0.8194  0.8672    0.8426  1099.0000
Class 2          0.5000  0.3019    0.3765    53.0000
Class 3          0.8368  0.5944    0.6951   535.0000
Class 4          0.6935  0.7311    0.7118  1071.0000
accuracy         0.7913  0.7913    0.7913     0.7913
macro avg        0.7399  0.6768    0.6990  3900.0000
weighted avg     0.7917  0.7913    0.7879  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8710  0.8870    0.8790  1142.0000
Class 1          0.7998  0.8872    0.8412  1099.0000
Class 2          0.4390  0.3396    0.3830    53.0000
Class 3          0.7747  0.6748    0.7213   535.0000
Class 4          0.7389  0.6975    0.7176  1071.0000
accuracy         0.7985  0.7985    0.7985     0.7985
macro avg        0.7247  0.6972    0.7084  3900.0000
weighted avg     0.7956  0.7985    0.7956  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8862  0.8590    0.8724  1142.0000
Class 1          0.7933  0.8626    0.8265  1099.0000
Class 2          0.4333  0.2453    0.3133    53.0000
Class 3          0.7755  0.6972    0.7343   535.0000
Class 4          0.7038  0.7143    0.7090  1071.0000
accuracy         0.7897  0.7897    0.7897     0.7897
macro avg        0.7184  0.6757    0.6911  3900.0000
weighted avg     0.7886  0.7897    0.7880  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8784  0.8669    0.8726  1142.0000
Class 1          0.8012  0.8690    0.8337  1099.0000
Class 2          0.4062  0.2453    0.3059    53.0000
Class 3          0.8085  0.7103    0.7562   535.0000
Class 4          0.7349  0.7404    0.7377  1071.0000
accuracy         0.8028  0.8028    0.8028     0.8028
macro avg        0.7259  0.6864    0.7012  3900.0000
weighted avg     0.8012  0.8028    0.8009  3900.0000
Ensemble Model Accuracy: 0.8038
Ensemble Model Precision: 0.8019
Ensemble Model Recall: 0.8038
Ensemble Model F1-Score: 0.8019

Final Classification Report:
               precision    recall  f1-score   support

     Class 0       0.88      0.87      0.88      1142
     Class 1       0.81      0.87      0.84      1099
     Class 2       0.45      0.25      0.32        53
     Class 3       0.80      0.72      0.76       535
     Class 4       0.73      0.73

In [4]:
import os

# Plot AUC-ROC curves for each class and save the figure
def plot_roc_auc_curve(test_labels, predictions, num_classes=5):
    test_labels_bin = label_binarize(test_labels, classes=[0, 1, 2, 3, 4])  # Binarize the labels for multi-class ROC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels_bin[:, i], predictions[:, i])
        roc_auc[i] = roc_auc_score(test_labels_bin[:, i], predictions[:, i])

    # Plot ROC curve for each class
    plt.figure(figsize=(10, 8))
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red']
    for i in range(num_classes):
        plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
                 label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) for each class')
    plt.legend(loc="lower right")

    # Save the figure to a file
    output_dir = './roc_auc_plots'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    plt.savefig(os.path.join(output_dir, 'roc_auc_curve.png'))  # Save the figure as PNG
    plt.show()

# Compute metrics function with ROC-AUC curve plotting and saving
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    
    # Compute ROC-AUC for each class using one-vs-rest approach
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )

    # Plot ROC curves for each class and save to a file
    plot_roc_auc_curve(p.label_ids, p.predictions)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }
