In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1

    df['Class'] = df.apply(assign_class, axis=1)
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)
    return df_final

# Paths to HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnerabledata/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnerabledata/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnerabledata/VDISC_validate.hdf5'

# Process the datasets
df_train_final = load_and_process_hdf5(train_hdf5_file_path)
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
df_test_final = load_and_process_hdf5(test_hdf5_file_path)

# Downsample datasets
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

# Prepare datasets for validation and test
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

df_test_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

# Custom Dataset class to handle encodings and labels
class CodeBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize data
def tokenize_function(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Tokenize and clean datasets
df_train_downsampled = df_train_downsampled.dropna()
df_val_downsampled = df_val_downsampled.dropna()
df_test_downsampled = df_test_downsampled.dropna()

# Initialize tokenizers and models
graphcodebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
graphcodebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=5)

unixcoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
unixcoder_model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=5)

# Tokenize the data
train_encodings_graphcodebert = tokenize_function(df_train_downsampled, graphcodebert_tokenizer)
train_encodings_unixcoder = tokenize_function(df_train_downsampled, unixcoder_tokenizer)

val_encodings_graphcodebert = tokenize_function(df_val_downsampled, graphcodebert_tokenizer)
val_encodings_unixcoder = tokenize_function(df_val_downsampled, unixcoder_tokenizer)

test_encodings_graphcodebert = tokenize_function(df_test_downsampled, graphcodebert_tokenizer)
test_encodings_unixcoder = tokenize_function(df_test_downsampled, unixcoder_tokenizer)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Dataset objects
train_dataset_graphcodebert = CodeBERTDataset(train_encodings_graphcodebert, train_labels)
val_dataset_graphcodebert = CodeBERTDataset(val_encodings_graphcodebert, val_labels)
test_dataset_graphcodebert = CodeBERTDataset(test_encodings_graphcodebert, test_labels)

train_dataset_unixcoder = CodeBERTDataset(train_encodings_unixcoder, train_labels)
val_dataset_unixcoder = CodeBERTDataset(val_encodings_unixcoder, val_labels)
test_dataset_unixcoder = CodeBERTDataset(test_encodings_unixcoder, test_labels)

# Train models
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=50,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to="none"
)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Train GraphCodeBERT
trainer_graphcodebert = Trainer(
    model=graphcodebert_model,
    args=training_args,
    train_dataset=train_dataset_graphcodebert,
    eval_dataset=val_dataset_graphcodebert,
    tokenizer=graphcodebert_tokenizer,
    compute_metrics=compute_metrics
)

trainer_unixcoder = Trainer(
    model=unixcoder_model,
    args=training_args,
    train_dataset=train_dataset_unixcoder,
    eval_dataset=val_dataset_unixcoder,
    tokenizer=unixcoder_tokenizer,
    compute_metrics=compute_metrics
)

trainer_graphcodebert.train()
trainer_unixcoder.train()

# Get predictions for both models
graphcodebert_val_outputs = trainer_graphcodebert.predict(val_dataset_graphcodebert)
unixcoder_val_outputs = trainer_unixcoder.predict(val_dataset_unixcoder)

graphcodebert_test_outputs = trainer_graphcodebert.predict(test_dataset_graphcodebert)
unixcoder_test_outputs = trainer_unixcoder.predict(test_dataset_unixcoder)

# Convert predictions to probabilities
graphcodebert_val_probs = torch.softmax(torch.tensor(graphcodebert_val_outputs.predictions), dim=1).numpy()
unixcoder_val_probs = torch.softmax(torch.tensor(unixcoder_val_outputs.predictions), dim=1).numpy()

graphcodebert_test_probs = torch.softmax(torch.tensor(graphcodebert_test_outputs.predictions), dim=1).numpy()
unixcoder_test_probs = torch.softmax(torch.tensor(unixcoder_test_outputs.predictions), dim=1).numpy()

# Stack the predictions
X_train_stack = np.concatenate([graphcodebert_val_probs, unixcoder_val_probs], axis=1)
X_test_stack = np.concatenate([graphcodebert_test_probs, unixcoder_test_probs], axis=1)

y_train_stack = val_labels
y_test_stack = test_labels

# List of meta-models
meta_models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=200, solver='liblinear'),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Train and evaluate each meta-model
for name, model in meta_models.items():
    print(f"\nTraining and Evaluating {name}")
    model.fit(X_train_stack, y_train_stack)
    stacked_predictions = model.predict(X_test_stack)

    # Evaluation
    accuracy = accuracy_score(y_test_stack, stacked_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_stack, stacked_predictions, average='weighted')
    classification_report_final = classification_report(y_test_stack, stacked_predictions, digits=4)

    # AUC
    y_test_binarized = label_binarize(y_test_stack, classes=[0, 1, 2, 3, 4])
    final_test_probs = model.predict_proba(X_test_stack)
    auc_score = roc_auc_score(y_test_binarized, final_test_probs, multi_class='ovr')

    # Print evaluation metrics
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-Score: {auc_score:.4f}")
    print(f"\nClassification Report for {name}:\n", classification_report_final)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6499,0.639904,0.770513,0.76987,0.775903,0.770513
2,0.5555,0.591849,0.797436,0.795824,0.796628,0.797436
3,0.4179,0.658826,0.787949,0.782443,0.787847,0.787949
4,0.3661,0.715337,0.795128,0.791204,0.796568,0.795128
5,0.2717,0.807701,0.800769,0.798466,0.799012,0.800769
6,0.2101,0.980438,0.792564,0.789709,0.792971,0.792564
7,0.1564,1.144551,0.798462,0.795734,0.797029,0.798462
8,0.063,1.278588,0.794872,0.793081,0.793823,0.794872
9,0.1107,1.367601,0.796154,0.794618,0.794636,0.796154
10,0.0677,1.418645,0.794359,0.791759,0.792246,0.794359


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5958,0.60979,0.787436,0.789684,0.803241,0.787436
2,0.5087,0.551294,0.817949,0.817,0.820179,0.817949
3,0.3881,0.610818,0.805128,0.80046,0.804744,0.805128
4,0.3082,0.690785,0.818205,0.816329,0.817028,0.818205
5,0.2321,0.845745,0.819231,0.818198,0.818318,0.819231
6,0.1186,1.063138,0.811538,0.809988,0.809649,0.811538
7,0.0808,1.247019,0.814872,0.811807,0.813068,0.814872
8,0.065,1.331175,0.814615,0.813903,0.813975,0.814615
9,0.0278,1.405308,0.811795,0.810987,0.810433,0.811795
10,0.0359,1.420517,0.812051,0.810811,0.810288,0.812051


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training and Evaluating Logistic Regression

Logistic Regression Results:
Accuracy: 0.8236
Precision: 0.8259
Recall: 0.8236
F1-Score: 0.8221
AUC-Score: 0.9285

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0     0.9033    0.8748    0.8888      1142
           1     0.8683    0.8635    0.8659      1099
           2     0.4706    0.1509    0.2286        53
           3     0.8448    0.7327    0.7848       535
           4     0.7082    0.8067    0.7543      1071

    accuracy                         0.8236      3900
   macro avg     0.7590    0.6857    0.7045      3900
weighted avg     0.8259    0.8236    0.8221      3900


Training and Evaluating Random Forest

Random Forest Results:
Accuracy: 0.8228
Precision: 0.8245
Recall: 0.8228
F1-Score: 0.8213
AUC-Score: 0.9245

Classification Report for Random Forest:
               precision    recall  f1-score   support

           0     0.9066    0.8757    0.8909      1142
  