# SET UP LIBRARIES + CONSTANT

In [1]:
!pip install --upgrade pip
!pip install transformers datasets accelerate evaluate seqeval
!pip install peft bitsandbytes
!pip install git+https://github.com/unslothai/unsloth.git
!pip install --upgrade transformers
!pip install evaluate

Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-fyv_nmhc
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-fyv_nmhc
  Resolved https://github.com/unslothai/unsloth.git to commit 71172a6bd7160cb386d9f3630b2f8675f9338538
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
import torch
print(torch.cuda.is_available())

True


In [5]:
MODEL_NAME = "microsoft/codebert-base"

# LOAD DATASET + PREPROCESSING

In [6]:
# load dataset
from datasets import load_dataset
ds = load_dataset("DaniilOr/SemEval-2026-Task13", 'A')
for split in ds:
    print(split, len(ds[split]))

train 500000
validation 100000
test 1000


In [7]:
import os
import shutil
import zipfile
from datasets import load_from_disk
from transformers import AutoTokenizer

zip_path = "/content/tokenized_dataset.zip"
temp_dir = "/content/temp_tokenized_data"

if os.path.exists(zip_path):
    print(f"Found existing {zip_path}")

    if os.path.exists(temp_dir):
      shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    print(f"Loading dataset from {temp_dir}...")
    tokenized = load_from_disk(temp_dir)
    shutil.rmtree(temp_dir)
else:

    print(f"{zip_path} not found. Starting new tokenization process...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize_batch(ex):
        return tokenizer(ex["code"], truncation=True, padding="max_length", max_length=512)

    tokenized = ds.map(tokenize_batch, batched=True, remove_columns=["code", "generator", "language"])
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch")


    tokenized.save_to_disk(temp_dir)

    shutil.make_archive(base_name=zip_path.replace(".zip", ""), format='zip', root_dir=temp_dir, base_dir=".")
    shutil.rmtree(temp_dir)

print("Tokenization done!")

Found existing /content/tokenized_dataset.zip
Loading dataset from /content/temp_tokenized_data...
Tokenization done!


In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


In [None]:
# training
from transformers import TrainingArguments, Trainer
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return metric.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="/content/sem_eval_task13_lora",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=1,
    fp16=True,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    learning_rate=1e-4,
    weight_decay=0.05,
    logging_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="/content/sem_eval_task13_logs",
    dataloader_num_workers=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

model.save_pretrained("/content/sem_eval_task13_lora_adapter")

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
500,No log,0.068186,0.9794
1000,No log,0.045293,0.98472
1500,No log,0.042054,0.98566


In [None]:
# test/evaluate on test split and compute precision/recall/F1
from sklearn.metrics import classification_report
import torch

model.eval()
dataloader = torch.utils.data.DataLoader(tokenized["test"], batch_size=32)
y_true, y_pred = [], []
for batch in dataloader:
    inputs = {k:v.cuda() if torch.cuda.is_available() else v for k,v in batch.items() if k!="labels"}
    labels = batch["labels"].numpy()
    with torch.no_grad():
        outputs = model(**{k:v for k,v in inputs.items()})
    preds = outputs.logits.argmax(-1).cpu().numpy()
    y_true.extend(labels.tolist())
    y_pred.extend(preds.tolist())

print(classification_report(y_true, y_pred, digits=4))


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil
import os

folders_to_zip = [
    # "/content/sem_eval_task13_logs",
    # "/content/sem_eval_task13_lora",
    # "/content/wandb",
    "/content/tokenized_dataset"
]

for folder_path in folders_to_zip:
    if os.path.exists(folder_path):
        zip_filename = os.path.basename(folder_path) + ".zip"
        shutil.make_archive(zip_filename.replace(".zip", ""), 'zip', folder_path)
        print(f"Created {zip_filename}")
    else:
        print(f"Folder not found: {folder_path}")