In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1

    df['Class'] = df.apply(assign_class, axis=1)
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)
    return df_final

# Paths to HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_validate.hdf5'

# Process the datasets
df_train_final = load_and_process_hdf5(train_hdf5_file_path)
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
df_test_final = load_and_process_hdf5(test_hdf5_file_path)

# Downsample datasets
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

# Prepare datasets for validation and test
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

df_test_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

# Custom Dataset class to handle encodings and labels
class CodeBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize data
def tokenize_function(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Tokenize and clean datasets
df_train_downsampled = df_train_downsampled.dropna()
df_val_downsampled = df_val_downsampled.dropna()
df_test_downsampled = df_test_downsampled.dropna()

# Initialize tokenizers and models
graphcodebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
graphcodebert_model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=5)

unixcoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
unixcoder_model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=5)

# Tokenize the data
train_encodings_graphcodebert = tokenize_function(df_train_downsampled, graphcodebert_tokenizer)
train_encodings_unixcoder = tokenize_function(df_train_downsampled, unixcoder_tokenizer)

val_encodings_graphcodebert = tokenize_function(df_val_downsampled, graphcodebert_tokenizer)
val_encodings_unixcoder = tokenize_function(df_val_downsampled, unixcoder_tokenizer)

test_encodings_graphcodebert = tokenize_function(df_test_downsampled, graphcodebert_tokenizer)
test_encodings_unixcoder = tokenize_function(df_test_downsampled, unixcoder_tokenizer)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Dataset objects
train_dataset_graphcodebert = CodeBERTDataset(train_encodings_graphcodebert, train_labels)
val_dataset_graphcodebert = CodeBERTDataset(val_encodings_graphcodebert, val_labels)
test_dataset_graphcodebert = CodeBERTDataset(test_encodings_graphcodebert, test_labels)

train_dataset_unixcoder = CodeBERTDataset(train_encodings_unixcoder, train_labels)
val_dataset_unixcoder = CodeBERTDataset(val_encodings_unixcoder, val_labels)
test_dataset_unixcoder = CodeBERTDataset(test_encodings_unixcoder, test_labels)

# Train models
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=50,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to="none"
)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Train GraphCodeBERT
trainer_graphcodebert = Trainer(
    model=graphcodebert_model,
    args=training_args,
    train_dataset=train_dataset_graphcodebert,
    eval_dataset=val_dataset_graphcodebert,
    tokenizer=graphcodebert_tokenizer,
    compute_metrics=compute_metrics
)

trainer_unixcoder = Trainer(
    model=unixcoder_model,
    args=training_args,
    train_dataset=train_dataset_unixcoder,
    eval_dataset=val_dataset_unixcoder,
    tokenizer=unixcoder_tokenizer,
    compute_metrics=compute_metrics
)

trainer_graphcodebert.train()
trainer_unixcoder.train()

# Get predictions for both models
graphcodebert_val_outputs = trainer_graphcodebert.predict(val_dataset_graphcodebert)
unixcoder_val_outputs = trainer_unixcoder.predict(val_dataset_unixcoder)

graphcodebert_test_outputs = trainer_graphcodebert.predict(test_dataset_graphcodebert)
unixcoder_test_outputs = trainer_unixcoder.predict(test_dataset_unixcoder)

# Convert predictions to probabilities
graphcodebert_val_probs = torch.softmax(torch.tensor(graphcodebert_val_outputs.predictions), dim=1).numpy()
unixcoder_val_probs = torch.softmax(torch.tensor(unixcoder_val_outputs.predictions), dim=1).numpy()

graphcodebert_test_probs = torch.softmax(torch.tensor(graphcodebert_test_outputs.predictions), dim=1).numpy()
unixcoder_test_probs = torch.softmax(torch.tensor(unixcoder_test_outputs.predictions), dim=1).numpy()

# Stack the predictions
X_train_stack = np.concatenate([graphcodebert_val_probs, unixcoder_val_probs], axis=1)
X_test_stack = np.concatenate([graphcodebert_test_probs, unixcoder_test_probs], axis=1)

y_train_stack = val_labels
y_test_stack = test_labels

# List of traditional meta-models
meta_models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=200, solver='liblinear'),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Train and evaluate each meta-model
for name, model in meta_models.items():
    print(f"\nTraining and Evaluating {name}")
    model.fit(X_train_stack, y_train_stack)
    stacked_predictions = model.predict(X_test_stack)

    # Evaluation
    accuracy = accuracy_score(y_test_stack, stacked_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_stack, stacked_predictions, average='weighted')
    classification_report_final = classification_report(y_test_stack, stacked_predictions, digits=4)

    # AUC
    y_test_binarized = label_binarize(y_test_stack, classes=[0, 1, 2, 3, 4])
    final_test_probs = model.predict_proba(X_test_stack)
    auc_score = roc_auc_score(y_test_binarized, final_test_probs, multi_class='ovr')

    # Print evaluation metrics
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-Score: {auc_score:.4f}")
    print(f"\nClassification Report for {name}:\n", classification_report_final)

# Neural Network Meta-Learner
class MetaLearnerNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MetaLearnerNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the meta-learner
input_dim = X_train_stack.shape[1]
hidden_dim = 128
output_dim = 5
meta_learner = MetaLearnerNN(input_dim, hidden_dim, output_dim)

# Convert data to tensors
X_train_tensor = torch.tensor(X_train_stack, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_stack, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_stack, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_stack, dtype=torch.long)

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Set up optimizer and loss function
optimizer = optim.Adam(meta_learner.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop for meta-learner
num_epochs = 10
for epoch in range(num_epochs):
    meta_learner.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = meta_learner(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}")

# Evaluation on test set
meta_learner.eval()
with torch.no_grad():
    correct = 0
    total = 0
    all_preds = []
    for X_batch, y_batch in test_loader:
        outputs = meta_learner(X_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        all_preds.extend(predicted.numpy())
    accuracy = correct / total
    print(f"Meta-Learner Accuracy: {accuracy:.4f}")

# Generate classification report for the neural network meta-learner
print(classification_report(y_test_stack, all_preds, digits=4))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6611,0.643033,0.772051,0.772481,0.78934,0.772051
2,0.5777,0.597576,0.79,0.786759,0.7867,0.79
3,0.4242,0.647893,0.780769,0.775224,0.781901,0.780769
4,0.3652,0.674167,0.800513,0.798164,0.798476,0.800513
5,0.2586,0.806368,0.795641,0.793689,0.793192,0.795641
6,0.1817,1.007837,0.78641,0.782607,0.785951,0.78641
7,0.1167,1.182927,0.786154,0.781839,0.784923,0.786154
8,0.0607,1.287357,0.79,0.787198,0.788744,0.79
9,0.0812,1.346089,0.787436,0.785665,0.785268,0.787436
10,0.0584,1.385537,0.786923,0.784281,0.784699,0.786923


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5855,0.61948,0.792564,0.793727,0.802533,0.792564
2,0.5269,0.566216,0.808205,0.805471,0.807483,0.808205
3,0.4101,0.611552,0.805385,0.801007,0.805166,0.805385
4,0.3034,0.724187,0.808718,0.805989,0.807477,0.808718
5,0.2057,0.855648,0.81641,0.814611,0.813869,0.81641
6,0.1572,1.056557,0.805897,0.803888,0.805625,0.805897
7,0.0891,1.246916,0.810769,0.808169,0.809922,0.810769
8,0.0636,1.337039,0.814872,0.813763,0.814374,0.814872
9,0.0554,1.395372,0.815128,0.813686,0.813734,0.815128
10,0.0274,1.437205,0.814359,0.813201,0.813138,0.814359


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Training and Evaluating Logistic Regression

Logistic Regression Results:
Accuracy: 0.8113
Precision: 0.8110
Recall: 0.8113
F1-Score: 0.8090
AUC-Score: 0.9293

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0     0.8935    0.8739    0.8836      1142
           1     0.8474    0.8690    0.8580      1099
           2     0.4375    0.1321    0.2029        53
           3     0.8158    0.7121    0.7605       535
           4     0.7016    0.7684    0.7335      1071

    accuracy                         0.8113      3900
   macro avg     0.7392    0.6711    0.6877      3900
weighted avg     0.8110    0.8113    0.8090      3900


Training and Evaluating Random Forest

Random Forest Results:
Accuracy: 0.8156
Precision: 0.8187
Recall: 0.8156
F1-Score: 0.8144
AUC-Score: 0.9232

Classification Report for Random Forest:
               precision    recall  f1-score   support

           0     0.9067    0.8599    0.8827      1142
  