In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import os

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1

    df['Class'] = df.apply(assign_class, axis=1)
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)
    return df_final

# Paths to HDF5 files
train_hdf5_file_path = '/kaggle/input/vulner-code/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulner-code/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulner-code/VDISC_validate.hdf5'

# Process the datasets
df_train_final = load_and_process_hdf5(train_hdf5_file_path)
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
df_test_final = load_and_process_hdf5(test_hdf5_file_path)

# Downsample datasets
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

# Prepare datasets for validation and test
val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

df_test_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

# Custom Dataset class to handle encodings and labels
class UniXcoderDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize data
def tokenize_function(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

# Tokenize and clean datasets
df_train_downsampled = df_train_downsampled.dropna()
df_val_downsampled = df_val_downsampled.dropna()
df_test_downsampled = df_test_downsampled.dropna()

# Initialize tokenizers and models for UniXcoder
unixcoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base")
unixcoder_model = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base", num_labels=5)

# Tokenize the data
train_encodings_unixcoder = tokenize_function(df_train_downsampled, unixcoder_tokenizer)
val_encodings_unixcoder = tokenize_function(df_val_downsampled, unixcoder_tokenizer)
test_encodings_unixcoder = tokenize_function(df_test_downsampled, unixcoder_tokenizer)

# Prepare labels
train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# Dataset objects
train_dataset_unixcoder = UniXcoderDataset(train_encodings_unixcoder, train_labels)
val_dataset_unixcoder = UniXcoderDataset(val_encodings_unixcoder, val_labels)
test_dataset_unixcoder = UniXcoderDataset(test_encodings_unixcoder, test_labels)

# Train models
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=50,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to="none"
)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Train UniXcoder
trainer_unixcoder = Trainer(
    model=unixcoder_model,
    args=training_args,
    train_dataset=train_dataset_unixcoder,
    eval_dataset=val_dataset_unixcoder,
    tokenizer=unixcoder_tokenizer,
    compute_metrics=compute_metrics
)

trainer_unixcoder.train()

# Get predictions for UniXcoder
unixcoder_val_outputs = trainer_unixcoder.predict(val_dataset_unixcoder)
unixcoder_test_outputs = trainer_unixcoder.predict(test_dataset_unixcoder)

# Convert predictions to probabilities
unixcoder_val_probs = torch.softmax(torch.tensor(unixcoder_val_outputs.predictions), dim=1).numpy()
unixcoder_test_probs = torch.softmax(torch.tensor(unixcoder_test_outputs.predictions), dim=1).numpy()

# Classification Report for UniXcoder (before stacking)
unixcoder_val_preds = np.argmax(unixcoder_val_outputs.predictions, axis=-1)
classification_report_unixcoder = classification_report(val_labels, unixcoder_val_preds, digits=4)
print("\nClassification Report for UniXcoder (Before Stacking):")
print(classification_report_unixcoder)

# Function to plot and save AUC curve
def plot_and_save_auc_curve(y_true, y_pred_probs, title, file_name):
    fpr, tpr, _ = roc_curve(y_true, y_pred_probs, pos_label=1)
    plt.figure()
    plt.plot(fpr, tpr, lw=2, label=f'AUC: {roc_auc_score(y_true, y_pred_probs):.2f}')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig(file_name)
    plt.close()

# Prepare paths for saving AUC curves
output_dir = '/kaggle/working/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Compute AUC and plot curve for UniXcoder (pre-stacking)
y_val_binarized = label_binarize(val_labels, classes=[0, 1, 2, 3, 4])
auc_score = roc_auc_score(y_val_binarized, unixcoder_val_probs, multi_class='ovr')
print(f"UniXcoder AUC Score: {auc_score:.4f}")

# Plot and save AUC curve for UniXcoder
plot_and_save_auc_curve(y_val_binarized.ravel(), unixcoder_val_probs.ravel(), 'UniXcoder AUC Curve (Pre-Stacking)', os.path.join(output_dir, 'unixcoder_auc_curve.png'))

# Stack the predictions
X_train_stack = unixcoder_val_probs
X_test_stack = unixcoder_test_probs

y_train_stack = val_labels
y_test_stack = test_labels

# List of traditional meta-models
meta_models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=200, solver='liblinear'),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Train and evaluate each meta-model
for name, model in meta_models.items():
    print(f"\nTraining and Evaluating {name}")
    model.fit(X_train_stack, y_train_stack)
    stacked_predictions = model.predict(X_test_stack)

    # Evaluation
    accuracy = accuracy_score(y_test_stack, stacked_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_stack, stacked_predictions, average='weighted')
    classification_report_final = classification_report(y_test_stack, stacked_predictions, digits=4)

    # AUC
    y_test_binarized = label_binarize(y_test_stack, classes=[0, 1, 2, 3, 4])
    final_test_probs = model.predict_proba(X_test_stack)
    auc_score_meta = roc_auc_score(y_test_binarized, final_test_probs, multi_class='ovr')
    
    # Print evaluation metrics
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-Score: {auc_score_meta:.4f}")
    print(f"\nClassification Report for {name}:\n", classification_report_final)

    # Plot and save AUC curve for the stacked model
    file_name = os.path.join(output_dir, f'{name}_auc_curve.png')
    plot_and_save_auc_curve(y_test_binarized.ravel(), final_test_probs.ravel(), f'{name} AUC Curve (Post-Stacking)', file_name)

# Neural Network Meta-Learner
class MetaLearnerNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MetaLearnerNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the meta-learner
input_dim = X_train_stack.shape[1]
hidden_dim = 128
output_dim = 5
meta_learner = MetaLearnerNN(input_dim, hidden_dim, output_dim)

# Convert data to tensors
X_train_tensor = torch.tensor(X_train_stack, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_stack, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_stack, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_stack, dtype=torch.long)

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Set up optimizer and loss function
optimizer = optim.Adam(meta_learner.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop for meta-learner
num_epochs = 10
for epoch in range(num_epochs):
    meta_learner.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = meta_learner(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}")

# Evaluation on test set
meta_learner.eval()
with torch.no_grad():
    correct = 0
    total = 0
    all_preds = []
    for X_batch, y_batch in test_loader:
        outputs = meta_learner(X_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        all_preds.extend(predicted.numpy())
    accuracy = correct / total
    print(f"Meta-Learner Accuracy: {accuracy:.4f}")

# Generate classification report for the neural network meta-learner
print("\nClassification Report for Neural Network Meta-Learner:")
print(classification_report(y_test_stack, all_preds, digits=4))


tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/938k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5922,0.602197,0.792821,0.792799,0.798486,0.792821
2,0.4973,0.568101,0.815385,0.814941,0.819553,0.815385
3,0.3746,0.624299,0.813077,0.809817,0.812464,0.813077
4,0.2853,0.709111,0.813846,0.810292,0.813826,0.813846
5,0.2138,0.899696,0.816923,0.815554,0.81516,0.816923
6,0.1416,1.135734,0.806923,0.802715,0.805124,0.806923
7,0.0822,1.206132,0.815128,0.811864,0.812065,0.815128
8,0.0445,1.328753,0.816154,0.813832,0.813721,0.816154
9,0.0348,1.393561,0.818974,0.817691,0.817459,0.818974
10,0.0374,1.41579,0.818205,0.816783,0.816209,0.818205


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Classification Report for UniXcoder (Before Stacking):
              precision    recall  f1-score   support

           0     0.8976    0.8827    0.8901      1142
           1     0.8827    0.8562    0.8693      1099
           2     0.6154    0.3019    0.4051        53
           3     0.7942    0.6710    0.7275       535
           4     0.6942    0.7993    0.7431      1071

    accuracy                         0.8154      3900
   macro avg     0.7768    0.7022    0.7270      3900
weighted avg     0.8196    0.8154    0.8149      3900

UniXcoder AUC Score: 0.9380

Training and Evaluating Logistic Regression

Logistic Regression Results:
Accuracy: 0.8151
Precision: 0.8170
Recall: 0.8151
F1-Score: 0.8136
AUC-Score: 0.9249

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0     0.9024    0.8739    0.8879      1142
           1     0.8641    0.8508    0.8574      1099
           2     0.3529    0.1132    0.1714        53
 