In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        # List all keys
        print(f"Keys in {file_path}: {list(hdf.keys())}")

        # Load the datasets into pandas Series
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    # Create a DataFrame with the boolean columns
    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    # Create a new column 'Class' based on the boolean columns
    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1  # In case none of the columns are True

    df['Class'] = df.apply(assign_class, axis=1)

    # Now eliminate rows where Class = -1 and the corresponding functionSource
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    # Combine the filtered Class and functionSource data
    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)

    return df_final

In [2]:
# Paths to your HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnerabilitycode/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnerabilitycode/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnerabilitycode/VDISC_validate.hdf5'

# Process the training dataset
print("Processing Training Dataset:")
df_train_final = load_and_process_hdf5(train_hdf5_file_path)

# Downsample training set to 20,000 samples with the given proportions
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}  # Based on your request
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

print("Final Training Data Class Distribution:")
print(df_train_downsampled['Class'].value_counts())

# Process the validation dataset and downsample to 3,900 samples
print("\nProcessing Validation Dataset:")
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

print("Final Validation Data Class Distribution:")
print(df_val_downsampled['Class'].value_counts())

# Process the test dataset and downsample to 3,900 samples
print("\nProcessing Test Dataset:")
df_test_final = load_and_process_hdf5(test_hdf5_file_path)
test_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}  # Recalculated for 3900 samples
df_test_downsampled = pd.DataFrame()
for cls, n_samples in test_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

print("Final Test Data Class Distribution:")
print(df_test_downsampled['Class'].value_counts())


Processing Training Dataset:
Keys in /kaggle/input/vulnerabilitycode/VDISC_train.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Training Data Class Distribution:
Class
0    5942
1    5777
4    5582
3    2755
2     249
Name: count, dtype: int64

Processing Validation Dataset:
Keys in /kaggle/input/vulnerabilitycode/VDISC_validate.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Validation Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64

Processing Test Dataset:
Keys in /kaggle/input/vulnerabilitycode/VDISC_test.hdf5: ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']
Final Test Data Class Distribution:
Class
0    1142
1    1099
4    1071
3     535
2      53
Name: count, dtype: int64


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd

# Custom Dataset class to handle encodings and labels
class UniXcoderDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert inputs and labels to tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize the function source code for training, validation, and test datasets
def tokenize_function(df):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Load the pre-trained UniXcoder tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")

# Apply the tokenization on the datasets
train_encodings = tokenize_function(df_train_downsampled)
val_encodings = tokenize_function(df_val_downsampled)
test_encodings = tokenize_function(df_test_downsampled)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Create Dataset objects for training, validation, and test datasets
train_dataset = UniXcoderDataset(train_encodings, train_labels)
val_dataset = UniXcoderDataset(val_encodings, val_labels)
test_dataset = UniXcoderDataset(test_encodings, test_labels)

# Define compute_metrics function to calculate accuracy, precision, recall, F1, ROC-AUC, and final classification report
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)

    # Precision, Recall, F1
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)

    # Compute ROC-AUC for multi-class classification
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )
    
    # Full classification report (output as a dictionary)
    report_dict = classification_report(p.label_ids, preds, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'], output_dict=True)
    
    # Convert to DataFrame for easy rounding and formatting
    report_df = pd.DataFrame(report_dict).transpose()
    report_df = report_df.round(4)  # Round to 4 decimal places
    
    print("\nFinal Classification Report (rounded to 4 decimal places):\n", report_df)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,  # Adjust batch size based on available GPU
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # Increase to 4-5 epochs if needed
    learning_rate=2e-5,  # Standard learning rate for fine-tuning transformers
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,  # Save only the 2 most recent models
    report_to="none"  # Disable Weights & Biases (wandb) logging
)

# Load the pre-trained UniXcoder model for classification
model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=5)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Custom training dataset
    eval_dataset=val_dataset,  # Custom validation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Metrics function for accuracy, precision, recall, F1, ROC-AUC, classification report
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate(eval_dataset=test_dataset)  # Custom test dataset
print(results)


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.7939,0.613832,0.79641,0.798206,0.79641,0.795769,0.923464
2,0.4997,0.57563,0.806154,0.806527,0.806154,0.804033,0.931404
3,0.348,0.60411,0.81,0.807359,0.81,0.807958,0.933134


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8742  0.8827    0.8784  1142.0000
Class 1          0.8687  0.8371    0.8526  1099.0000
Class 2          0.5312  0.3208    0.4000    53.0000
Class 3          0.7489  0.6467    0.6941   535.0000
Class 4          0.6826  0.7610    0.7196  1071.0000
accuracy         0.7964  0.7964    0.7964     0.7964
macro avg        0.7411  0.6896    0.7090  3900.0000
weighted avg     0.7982  0.7964    0.7958  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8544  0.8940    0.8738  1142.0000
Class 1          0.8604  0.8690    0.8646  1099.0000
Class 2          0.6176  0.3962    0.4828    53.0000
Class 3          0.8116  0.6280    0.7081   535.0000
Class 4          0.7071  0.7572    0.7313  1071.0000
accuracy         0.8062  0.8062    0.8062     0.8062
macro avg        0.7702  0.7089    0.7321  3900.0000
weighted avg     0.8065  0.8062    0.8040  3900.0000


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score  support
Class 0          0.8685  0.8905    0.8794  1142.00
Class 1          0.8417  0.8808    0.8608  1099.00
Class 2          0.6000  0.3962    0.4773    53.00
Class 3          0.7614  0.6860    0.7217   535.00
Class 4          0.7401  0.7339    0.7370  1071.00
accuracy         0.8100  0.8100    0.8100     0.81
macro avg        0.7624  0.7175    0.7352  3900.00
weighted avg     0.8074  0.8100    0.8080  3900.00


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Final Classification Report (rounded to 4 decimal places):
               precision  recall  f1-score    support
Class 0          0.8833  0.8748    0.8790  1142.0000
Class 1          0.8279  0.8799    0.8531  1099.0000
Class 2          0.4828  0.2642    0.3415    53.0000
Class 3          0.7980  0.7308    0.7629   535.0000
Class 4          0.7338  0.7414    0.7376  1071.0000
accuracy         0.8115  0.8115    0.8115     0.8115
macro avg        0.7451  0.6982    0.7148  3900.0000
weighted avg     0.8095  0.8115    0.8096  3900.0000
{'eval_loss': 0.6207788586616516, 'eval_accuracy': 0.8115384615384615, 'eval_precision': 0.8094904117036521, 'eval_recall': 0.8115384615384615, 'eval_f1': 0.8096433435889524, 'eval_roc_auc': 0.9228311086643176, 'eval_runtime': 69.7183, 'eval_samples_per_second': 55.939, 'eval_steps_per_second': 3.5, 'epoch': 3.0}


In [4]:
import os

# Plot AUC-ROC curves for each class and save the figure
def plot_roc_auc_curve(test_labels, predictions, num_classes=5):
    test_labels_bin = label_binarize(test_labels, classes=[0, 1, 2, 3, 4])  # Binarize the labels for multi-class ROC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(test_labels_bin[:, i], predictions[:, i])
        roc_auc[i] = roc_auc_score(test_labels_bin[:, i], predictions[:, i])

    # Plot ROC curve for each class
    plt.figure(figsize=(10, 8))
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red']
    for i in range(num_classes):
        plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
                 label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) for each class')
    plt.legend(loc="lower right")

    # Save the figure to a file
    output_dir = './roc_auc_plots'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    plt.savefig(os.path.join(output_dir, 'roc_auc_curve.png'))  # Save the figure as PNG
    plt.show()

# Compute metrics function with ROC-AUC curve plotting and saving
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    
    # Compute ROC-AUC for each class using one-vs-rest approach
    label_binarized = label_binarize(p.label_ids, classes=[0, 1, 2, 3, 4])
    auc_score = roc_auc_score(
        y_true=label_binarized,
        y_score=p.predictions,
        multi_class="ovr"
    )

    # Plot ROC curves for each class and save to a file
    plot_roc_auc_curve(p.label_ids, p.predictions)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': auc_score
    }
