In [1]:
import h5py
import pandas as pd
from sklearn.utils import resample
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
from sklearn.metrics import classification_report, precision_recall_fscore_support, roc_auc_score
import numpy as np

# Function to load and process the HDF5 file
def load_and_process_hdf5(file_path):
    with h5py.File(file_path, 'r') as hdf:
        cwe_119_data = pd.Series(hdf['CWE-119'][:], name='CWE-119')
        cwe_120_data = pd.Series(hdf['CWE-120'][:], name='CWE-120')
        cwe_469_data = pd.Series(hdf['CWE-469'][:], name='CWE-469')
        cwe_476_data = pd.Series(hdf['CWE-476'][:], name='CWE-476')
        cwe_other_data = pd.Series(hdf['CWE-other'][:], name='CWE-other')
        function_source_data = pd.Series(hdf['functionSource'][:], name='functionSource')

    df = pd.concat([cwe_119_data, cwe_120_data, cwe_469_data, cwe_476_data, cwe_other_data], axis=1)

    def assign_class(row):
        if row['CWE-119']:
            return 0
        elif row['CWE-120']:
            return 1
        elif row['CWE-469']:
            return 2
        elif row['CWE-476']:
            return 3
        elif row['CWE-other']:
            return 4
        else:
            return -1

    df['Class'] = df.apply(assign_class, axis=1)
    mask = df['Class'] != -1
    df_filtered = df[mask]
    function_source_filtered = function_source_data[mask]

    df_final = pd.concat([df_filtered['Class'], function_source_filtered], axis=1)
    return df_final

# Paths to HDF5 files
train_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_train.hdf5'
test_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_test.hdf5'
validation_hdf5_file_path = '/kaggle/input/vulnercode/VDISC_validate.hdf5'

# Process the datasets
df_train_final = load_and_process_hdf5(train_hdf5_file_path)
df_val_final = load_and_process_hdf5(validation_hdf5_file_path)
df_test_final = load_and_process_hdf5(test_hdf5_file_path)

# Downsample datasets
train_sample_proportions = {0: 5942, 1: 5777, 4: 5582, 3: 2755, 2: 249}
df_train_downsampled = pd.DataFrame()
for cls, n_samples in train_sample_proportions.items():
    class_data = df_train_final[df_train_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_train_downsampled = pd.concat([df_train_downsampled, class_downsampled])

val_sample_proportions = {0: 1142, 1: 1099, 4: 1071, 3: 535, 2: 53}
df_val_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_val_final[df_val_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_val_downsampled = pd.concat([df_val_downsampled, class_downsampled])

df_test_downsampled = pd.DataFrame()
for cls, n_samples in val_sample_proportions.items():
    class_data = df_test_final[df_test_final['Class'] == cls]
    class_downsampled = resample(class_data, replace=False, n_samples=n_samples, random_state=42)
    df_test_downsampled = pd.concat([df_test_downsampled, class_downsampled])

# Custom Dataset class to handle encodings and labels
class CodeBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {'input_ids': self.encodings[idx]}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Function to tokenize data for LSTM
def tokenize_function_lstm(df, tokenizer):
    return tokenizer(
        df['functionSource'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )['input_ids']

# Tokenize the data using RobertaTokenizer
graphcodebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
train_encodings_lstm = tokenize_function_lstm(df_train_downsampled, graphcodebert_tokenizer)
val_encodings_lstm = tokenize_function_lstm(df_val_downsampled, graphcodebert_tokenizer)
test_encodings_lstm = tokenize_function_lstm(df_test_downsampled, graphcodebert_tokenizer)

train_labels = df_train_downsampled['Class'].tolist()
val_labels = df_val_downsampled['Class'].tolist()
test_labels = df_test_downsampled['Class'].tolist()

# LSTM Model with Attention
class AttentionLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, attention_dim):
        super(AttentionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.attention_layer = nn.Linear(hidden_dim, attention_dim)
        self.context_vector = nn.Linear(attention_dim, 1, bias=False)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def attention(self, lstm_output):
        attention_scores = torch.tanh(self.attention_layer(lstm_output))
        attention_weights = torch.softmax(self.context_vector(attention_scores), dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_output, _ = self.lstm(embedded)
        attention_output = self.attention(lstm_output)
        output = self.fc(attention_output)
        return output

# Ensure vocab_size matches the tokenizer's vocab size
vocab_size = graphcodebert_tokenizer.vocab_size  
embedding_dim = 128
hidden_dim = 256
attention_dim = 64
output_dim = 5  # 5 classes

# Instantiate the model, loss function, and optimizer
model = AttentionLSTM(embedding_dim, hidden_dim, output_dim, vocab_size, attention_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Create Dataloaders
train_dataset = CodeBERTDataset(train_encodings_lstm, train_labels)
val_dataset = CodeBERTDataset(val_encodings_lstm, val_labels)
test_dataset = CodeBERTDataset(test_encodings_lstm, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Training function with AUC, Precision, Recall, F1 score, and Classification Report
def train_model(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            inputs = batch['input_ids']
            labels = batch['labels']
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        val_loss, correct, total = 0, 0, 0
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids']
                labels = batch['labels']
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(predicted.cpu().numpy())

        accuracy = correct / total
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        auc_score = roc_auc_score(np.eye(output_dim)[all_labels], np.eye(output_dim)[all_preds], multi_class="ovr")

        print(f"Validation Loss: {val_loss / len(val_loader):.4f}, Accuracy: {accuracy:.4f}, "
              f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, AUC: {auc_score:.4f}")

        # Classification Report
        print("\nClassification Report (Validation):")
        print(classification_report(all_labels, all_preds, digits=4))

# Test the model with AUC, Precision, Recall, F1 score, and Classification Report
def test_model(model, test_loader):
    model.eval()
    correct, total = 0, 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_ids']
            labels = batch['labels']
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    accuracy = correct / total
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    auc_score = roc_auc_score(np.eye(output_dim)[all_labels], np.eye(output_dim)[all_preds], multi_class="ovr")

    print(f"Test Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
          f"F1 Score: {f1:.4f}, AUC: {auc_score:.4f}")

    # Classification Report
    print("\nClassification Report (Test):")
    print(classification_report(all_labels, all_preds, digits=4))

# Train the model for 10 epochs
train_model(model, train_loader, val_loader, epochs=10)

# Evaluate the model on the test set
test_model(model, test_loader)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]



Epoch 1, Loss: 0.9297
Validation Loss: 0.7487, Accuracy: 0.7272, Precision: 0.7442, Recall: 0.7272, F1 Score: 0.7249, AUC: 0.7481

Classification Report (Validation):
              precision    recall  f1-score   support

           0     0.8790    0.8590    0.8689      1142
           1     0.7980    0.7298    0.7624      1099
           2     1.0000    0.0189    0.0370        53
           3     0.6780    0.5234    0.5907       535
           4     0.5656    0.7208    0.6338      1071

    accuracy                         0.7272      3900
   macro avg     0.7841    0.5704    0.5786      3900
weighted avg     0.7442    0.7272    0.7249      3900

Epoch 2, Loss: 0.6845
Validation Loss: 0.6899, Accuracy: 0.7549, Precision: 0.7629, Recall: 0.7549, F1 Score: 0.7551, AUC: 0.7754

Classification Report (Validation):
              precision    recall  f1-score   support

           0     0.8964    0.8485    0.8718      1142
           1     0.8371    0.7716    0.8030      1099
           2  