In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        # List all keys
        print(f"Keys in {file_path}: {list(hdf.keys())}")

        # Load the datasets into pandas Series
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    # Create a DataFrame with the boolean columns
    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    # Create a new column 'Class' based on the boolean columns
    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1  # In case none of the columns are True

    df['Class'] = df.apply(assign_class, axis=1)

    # Now eliminate rows where Class = -1 and the corresponding functionSource
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    # Combine the filtered Class and functionSource data
    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)

    return df_final

In [2]:
# Paths to your HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnerable-code/VDISC_validate.hdf5'

# Process the training dataset
print("Processing Training Dataset:")
df_train_final = load_and_process_hdf5(train_hdf5_file_path)

# Downsample training set to 20,000 samples with the given proportions
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}  # Based on your request
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

print("Final Training Data Class Distribution:")
print(df_train_downsampled['Class'].value_counts())

# Process the validation dataset and downsample to 3,900 samples
print("\nProcessing Validation Dataset:")
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

print("Final Validation Data Class Distribution:")
print(df_val_downsampled['Class'].value_counts())

# Process the test dataset and downsample to 3,900 samples
print("\nProcessing Test Dataset:")
df_test_final = load_and_process_hdf5(test_hdf5_file_path)
test_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_test_downsampled = pd.DataFrame()
for cls, n_samples in test_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

print("Final Test Data Class Distribution:")
print(df_test_downsampled['Class'].value_counts())


Processing Training Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_train.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Training Data Class Distribution:
Class
0    5942
1    5777
4    5582
3    2755
2     249
Name: count, dtype: int64

Processing Validation Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_validate.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Validation Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64

Processing Test Dataset:
Keys in /kaggle/input/vulnerable-code/VDISC_test.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Test Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd

# Custom Dataset class to handle encodings and labels
class CodeBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert inputs and labels to tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to check and clean NaN values in DataFrames
def clean_data(df):
    df = df.dropna()  # Drop rows with NaN values
    return df

# Function to tokenize the function source code for training, validation, and test datasets
def tokenize_function(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Load the pre-trained GraphCodeBERT tokenizer and model
graphcodebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
graphcodebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=5)

# Load the pre-trained UniXcoder tokenizer and model
unixcoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
unixcoder_model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=5)

# Clean and process the datasets to ensure no NaN values
df_train_downsampled = clean_data(df_train_downsampled)
df_val_downsampled = clean_data(df_val_downsampled)
df_test_downsampled = clean_data(df_test_downsampled)

# Apply the tokenization on the datasets using both tokenizers (GraphCodeBERT and UniXcoder)
train_encodings_graphcodebert = tokenize_function(df_train_downsampled, graphcodebert_tokenizer)
train_encodings_unixcoder = tokenize_function(df_train_downsampled, unixcoder_tokenizer)

val_encodings_graphcodebert = tokenize_function(df_val_downsampled, graphcodebert_tokenizer)
val_encodings_unixcoder = tokenize_function(df_val_downsampled, unixcoder_tokenizer)

test_encodings_graphcodebert = tokenize_function(df_test_downsampled, graphcodebert_tokenizer)
test_encodings_unixcoder = tokenize_function(df_test_downsampled, unixcoder_tokenizer)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Create Dataset objects for training, validation, and test datasets
train_dataset_graphcodebert = CodeBERTDataset(train_encodings_graphcodebert, train_labels)
train_dataset_unixcoder = CodeBERTDataset(train_encodings_unixcoder, train_labels)

val_dataset_graphcodebert = CodeBERTDataset(val_encodings_graphcodebert, val_labels)
val_dataset_unixcoder = CodeBERTDataset(val_encodings_unixcoder, val_labels)

test_dataset_graphcodebert = CodeBERTDataset(test_encodings_graphcodebert, test_labels)
test_dataset_unixcoder = CodeBERTDataset(test_encodings_unixcoder, test_labels)

# Define compute_metrics function for both models
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)

    # Precision, Recall, F1
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)

    # Compute ROC-AUC for multi-class classification
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )
    
    # Full classification report
    report_dict = classification_report(p.label_ids, preds, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)
    
    # Convert to DataFrame for easy rounding and formatting
    report_df = pd.DataFrame(report_dict).transpose()
    report_df = report_df.round(4)  # Round to 4 decimal places
    
    print("\nFinal Classification Report (rounded to 4 decimal places):\n", report_df)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none"  # Disable Weights & Biases (wandb) logging
)

# Trainer setup for GraphCodeBERT
trainer_graphcodebert = Trainer(
    model=graphcodebert_model,
    args=training_args,
    train_dataset=train_dataset_graphcodebert,
    eval_dataset=val_dataset_graphcodebert,
    tokenizer=graphcodebert_tokenizer,
    compute_metrics=compute_metrics
)

# Trainer setup for UniXcoder
trainer_unixcoder = Trainer(
    model=unixcoder_model,
    args=training_args,
    train_dataset=train_dataset_unixcoder,
    eval_dataset=val_dataset_unixcoder,
    tokenizer=unixcoder_tokenizer,
    compute_metrics=compute_metrics
)

# Train both models
trainer_graphcodebert.train()
trainer_unixcoder.train()

# Get predictions from both models on the test dataset
graphcodebert_preds = trainer_graphcodebert.predict(test_dataset_graphcodebert).predictions
unixcoder_preds = trainer_unixcoder.predict(test_dataset_unixcoder).predictions

# Apply weighted averaging (e.g., 30% GraphCodeBERT, 70% UniXcoder)
final_probs = 0.3 * torch.softmax(torch.tensor(graphcodebert_preds), dim=1) + 0.7 * torch.softmax(torch.tensor(unixcoder_preds), dim=1)

# Final predictions
final_predictions = torch.argmax(final_probs, dim=1).numpy()

# Evaluate the ensemble predictions
accuracy = accuracy_score(test_labels, final_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, final_predictions, average='weighted')
classification_report_final = classification_report(test_labels, final_predictions, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'], digits=4)

# Print results
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(f"Ensemble Model Precision: {precision:.4f}")
print(f"Ensemble Model Recall: {recall:.4f}")
print(f"Ensemble Model F1-Score: {f1:.4f}")
print("\nFinal Classification Report:\n", classification_report_final)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.7889,0.630525,0.777949,0.793404,0.777949,0.778248,0.923684
2,0.6183,0.60629,0.791026,0.791977,0.791026,0.788279,0.925691
3,0.4517,0.618764,0.797179,0.79455,0.797179,0.794843,0.925699


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8861  0.8581    0.8719  1142.0000
Class 1          0.8803  0.7898    0.8326  1099.0000
Class 2          0.5625  0.1698    0.2609    53.0000
Class 3          0.7905  0.5925    0.6774   535.0000
Class 4          0.6183  0.8030    0.6986  1071.0000
accuracy         0.7779  0.7779    0.7779     0.7779
macro avg        0.7475  0.6427    0.6683  3900.0000
weighted avg     0.7934  0.7779    0.7782  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score   support
Class 0          0.8740  0.8870    0.8805  1142.000
Class 1          0.8104  0.8672    0.8378  1099.000
Class 2          0.4857  0.3208    0.3864    53.000
Class 3          0.8217  0.5944    0.6898   535.000
Class 4          0.6859  0.7320    0.7082  1071.000
accuracy         0.7910  0.7910    0.7910     0.791
macro avg        0.7355  0.6803    0.7005  3900.000
weighted avg     0.7920  0.7910    0.7883  3900.000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8748  0.8809    0.8778  1142.0000
Class 1          0.8099  0.8799    0.8434  1099.0000
Class 2          0.4737  0.3396    0.3956    53.0000
Class 3          0.7671  0.6710    0.7159   535.0000
Class 4          0.7229  0.7087    0.7157  1071.0000
accuracy         0.7972  0.7972    0.7972     0.7972
macro avg        0.7297  0.6960    0.7097  3900.0000
weighted avg     0.7945  0.7972    0.7948  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.751,0.609715,0.79359,0.804266,0.79359,0.792396,0.923738
2,0.5001,0.561291,0.811026,0.811534,0.811026,0.809092,0.930976
3,0.3359,0.589483,0.815128,0.813114,0.815128,0.813397,0.93177


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8645  0.8827    0.8735  1142.0000
Class 1          0.8824  0.8189    0.8495  1099.0000
Class 2          0.7083  0.3208    0.4416    53.0000
Class 3          0.8231  0.5738    0.6762   535.0000
Class 4          0.6553  0.8058    0.7228  1071.0000
accuracy         0.7936  0.7936    0.7936     0.7936
macro avg        0.7867  0.6804    0.7127  3900.0000
weighted avg     0.8043  0.7936    0.7924  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score   support
Class 0          0.8611  0.9011    0.8806  1142.000
Class 1          0.8513  0.8644    0.8578  1099.000
Class 2          0.4468  0.3962    0.4200    53.000
Class 3          0.8280  0.6299    0.7155   535.000
Class 4          0.7278  0.7712    0.7489  1071.000
accuracy         0.8110  0.8110    0.8110     0.811
macro avg        0.7430  0.7126    0.7246  3900.000
weighted avg     0.8115  0.8110    0.8091  3900.000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8797  0.8897    0.8846  1142.0000
Class 1          0.8250  0.8835    0.8533  1099.0000
Class 2          0.5238  0.4151    0.4632    53.0000
Class 3          0.7845  0.7009    0.7404   535.0000
Class 4          0.7586  0.7423    0.7504  1071.0000
accuracy         0.8151  0.8151    0.8151     0.8151
macro avg        0.7543  0.7263    0.7384  3900.0000
weighted avg     0.8131  0.8151    0.8134  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8911  0.8599    0.8752  1142.0000
Class 1          0.8085  0.8644    0.8355  1099.0000
Class 2          0.4706  0.3019    0.3678    53.0000
Class 3          0.7850  0.7028    0.7416   535.0000
Class 4          0.7090  0.7348    0.7217  1071.0000
accuracy         0.7977  0.7977    0.7977     0.7977
macro avg        0.7328  0.6928    0.7084  3900.0000
weighted avg     0.7976  0.7977    0.7967  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8859  0.8774    0.8817  1142.0000
Class 1          0.8404  0.8817    0.8606  1099.0000
Class 2          0.4706  0.3019    0.3678    53.0000
Class 3          0.8219  0.7421    0.7800   535.0000
Class 4          0.7461  0.7656    0.7558  1071.0000
accuracy         0.8215  0.8215    0.8215     0.8215
macro avg        0.7530  0.7137    0.7292  3900.0000
weighted avg     0.8203  0.8215    0.8202  3900.0000
Ensemble Model Accuracy: 0.8190
Ensemble Model Precision: 0.8179
Ensemble Model Recall: 0.8190
Ensemble Model F1-Score: 0.8176

Final Classification Report:
               precision    recall  f1-score   support

     Class 0     0.8880    0.8748    0.8813      1142
     Class 1     0.8363    0.8835    0.8593      1099
     Class 2     0.4848    0.3019    0.3721        53
     Class 3     0.8191    0.7364    0.7756       535
     Class 4     0.7400    0.7600

In [4]:
import os

# Plot AUC-ROC curves for each class and save the figure
def plot_roc_auc_curve(test_labels, predictions, num_classes=5):
    test_labels_bin = label_binarize(test_labels, classes=[0, 1, 2, 3, 4])  # Binarize the labels for multi-class ROC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels_bin[:, i], predictions[:, i])
        roc_auc[i] = roc_auc_score(test_labels_bin[:, i], predictions[:, i])

    # Plot ROC curve for each class
    plt.figure(figsize=(10, 8))
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red']
    for i in range(num_classes):
        plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
                 label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) for each class')
    plt.legend(loc="lower right")

    # Save the figure to a file
    output_dir = './roc_auc_plots'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    plt.savefig(os.path.join(output_dir, 'roc_auc_curve.png'))  # Save the figure as PNG
    plt.show()

# Compute metrics function with ROC-AUC curve plotting and saving
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    
    # Compute ROC-AUC for each class using one-vs-rest approach
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )

    # Plot ROC curves for each class and save to a file
    plot_roc_auc_curve(p.label_ids, p.predictions)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }
