In [3]:
!pip install transformers==4.38.2
!pip install torch

Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
     ---------------------------------------- 0.0/130.7 kB ? eta -:--:--
     --- ------------------------------------ 10.2/130.7 kB ? eta -:--:--
     --- ------------------------------------ 10.2/130.7 kB ? eta -:--:--
     -------- ---------------------------- 30.7/130.7 kB 262.6 kB/s eta 0:00:01
     ----------- ------------------------- 41.0/130.7 kB 196.9 kB/s eta 0:00:01
     ----------------- ------------------- 61.4/130.7 kB 252.2 kB/s eta 0:00:01
     -------------------- ---------------- 71.7/130.7 kB 262.6 kB/s eta 0:00:01
     --------------------------------- -- 122.9/130.7 kB 379.3 kB/s eta 0:00:01
     ------------------------------------ 130.7/130.7 kB 386.0 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.38.2)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.

In [6]:
import transformers
import torch

# One line per fact about the runtime: library versions, then GPU visibility
for _caption, _value in (
    ("Transformers version:", transformers.__version__),
    ("Torch version:", torch.__version__),
    ("CUDA Available:", torch.cuda.is_available()),
):
    print(_caption, _value)


Transformers version: 4.38.2
Torch version: 2.7.1+cpu
CUDA Available: False


In [12]:
import pandas as pd

# Read the three tokenized CSV exports, one frame per source corpus
devign_df, bigvul_df, nvd_df = (
    pd.read_csv(csv_name)
    for csv_name in ("devign_tokens.csv", "bigvul_tokens.csv", "nvd_tokens.csv")
)

# Stack the labeled frames, shuffle every row with a fixed seed, renumber the index
combined_df = (
    pd.concat([devign_df, bigvul_df, nvd_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Total combined samples:", len(combined_df))


Total combined samples: 218534


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all rows for testing, then take 10% of what remains for validation
remainder_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(remainder_df, test_size=0.1, random_state=42)
del remainder_df  # intermediate only; the three named splits are what later cells use

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")


Train: 157344 | Val: 17483 | Test: 43707


In [16]:
import torch
from torch.utils.data import Dataset

class VulnDataset(Dataset):
    """Map-style dataset over a DataFrame of already-tokenized samples.

    The frame must provide three columns: ``input_ids`` and
    ``attention_mask``, whose cells are strings that are parsed once up
    front (presumably list literals such as ``"[0, 5, 2]"`` -- confirm
    against the code that wrote the CSVs), and ``label``.

    Args:
        df: DataFrame with ``input_ids``, ``attention_mask`` and ``label``
            columns.

    Raises:
        ValueError or SyntaxError: if a serialized cell is not a plain
            Python literal.
    """

    def __init__(self, df):
        # Function-scope import so the class does not depend on a separate
        # import cell having been run.
        import ast

        # SECURITY: the cells come from files on disk. literal_eval accepts
        # only Python literals, so a crafted cell cannot execute code the way
        # the previous eval() call would have.
        self.input_ids = df["input_ids"].apply(ast.literal_eval).tolist()
        self.attn_mask = df["attention_mask"].apply(ast.literal_eval).tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with "input_ids", "attention_mask" and "labels" tensors."""
        # dtype is pinned to int64: integer inputs already produced int64, but
        # a label column that pandas stored as float would otherwise become a
        # float tensor instead of class indices.
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attn_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Wrap each split in a VulnDataset, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    VulnDataset(split_df) for split_df in (train_df, val_df, test_df)
)


In [29]:
!pip install --upgrade pip
!pip uninstall -y transformers accelerate torch
!pip install accelerate>=0.21.0 --no-deps
!pip install transformers[torch] --upgrade
!pip install torch --upgrade

# Verify installations
!pip show accelerate
!pip show transformers
!pip show torch

# IPython.Application.instance().kernel.do_shutdown(True)

Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\Shivani\anaconda3\python.exe -m pip install --upgrade pip


Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Found existing installation: accelerate 1.7.0
Uninstalling accelerate-1.7.0:
  Successfully uninstalled accelerate-1.7.0
Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Collecting transformers[torch]
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch<2.7,>=2.1 (from transformers[torch])
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch<2.7,>=2.1->transformers[torch])
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)
Installing collected packages: sympy, torch, transformers
  Attempting uninstall: sympy
    Found existing install

In [33]:
pip install --upgrade pip

Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.1.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
!pip install transformers>=4.0.0 --upgrade

In [4]:
import transformers
# Report which transformers release this kernel imported
version_banner = f"Current transformers version: {transformers.__version__}"
print(version_banner)

Current transformers version: 4.52.4


In [6]:
# Diagnostic cell: list the keyword arguments the installed TrainingArguments
# accepts, then check that the intended configuration can be constructed.
import inspect

import transformers
from transformers import RobertaForSequenceClassification, TrainingArguments

# Echo the version so the listing below can be tied to a release
print(f"Transformers version: {transformers.__version__}")

# Walk the constructor signature, leaving out the "self" and "kwargs" entries
sig = inspect.signature(TrainingArguments.__init__)
print("Available parameters for TrainingArguments:")
for param_name in sig.parameters:
    if param_name not in ("self", "kwargs"):
        print(f"- {param_name}")

# Load the pretrained checkpoint with a 2-label classification head
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=2,
)

# First attempt: output_dir only
print("\nTrying to create TrainingArguments with minimal parameters...")
try:
    training_args = TrainingArguments(output_dir="./vuln_model")
    print("Success!")

    # Second attempt: the settings intended for the real run
    print("\nTrying with full parameters...")
    training_args = TrainingArguments(
        output_dir="./vuln_model",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir="./logs",
        logging_steps=100,
        # Still disabled; re-enable one at a time as needed:
        # evaluation_strategy="epoch",
        # save_strategy="epoch",
        # load_best_model_at_end=True,
        # metric_for_best_model="eval_loss",
    )
    print("Full parameters worked!")
except Exception as e:
    # Print the failure instead of propagating it
    print(f"Error: {e}")
    print("Please check the available parameters listed above and adjust accordingly.")

Transformers version: 4.52.4
Available parameters for TrainingArguments:
- output_dir
- overwrite_output_dir
- do_train
- do_eval
- do_predict
- eval_strategy
- prediction_loss_only
- per_device_train_batch_size
- per_device_eval_batch_size
- per_gpu_train_batch_size
- per_gpu_eval_batch_size
- gradient_accumulation_steps
- eval_accumulation_steps
- eval_delay
- torch_empty_cache_steps
- learning_rate
- weight_decay
- adam_beta1
- adam_beta2
- adam_epsilon
- max_grad_norm
- num_train_epochs
- max_steps
- lr_scheduler_type
- lr_scheduler_kwargs
- warmup_ratio
- warmup_steps
- log_level
- log_level_replica
- log_on_each_node
- logging_dir
- logging_strategy
- logging_first_step
- logging_steps
- logging_nan_inf_filter
- save_strategy
- save_steps
- save_total_limit
- save_safetensors
- save_on_each_node
- save_only_model
- restore_callback_states_from_checkpoint
- no_cuda
- use_cpu
- use_mps_device
- seed
- data_seed
- jit_mode_eval
- use_ipex
- bf16
- fp16
- fp16_opt_level
- half_precis

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trying to create TrainingArguments with minimal parameters...
Success!

Trying with full parameters...
Full parameters worked!


In [17]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Fresh copy of the pretrained checkpoint with a 2-label classification head
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=2,
)

# Trainer configuration; note the keyword is eval_strategy (not evaluation_strategy)
training_args = TrainingArguments(
    # output locations and logging cadence
    output_dir="./vuln_model",
    logging_dir="./logs",
    logging_steps=100,
    # epochs and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate and checkpoint every epoch; best checkpoint is judged by eval_loss
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

print("Training arguments created successfully!")



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training arguments created successfully!


In [None]:
# Train on the training split, with the validation split supplied for evaluation
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
trainer.train()



Epoch,Training Loss,Validation Loss
