In [1]:


!pip install --upgrade transformers==4.45.0 huggingface_hub
%pip install torch numpy pandas scikit-learn evaluate
!pip install "transformers==4.57.1" "accelerate>=1.1.1" "torch>=2.4.0" "datasets>=3.0.0" "peft>=0.13.0"

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"


Collecting transformers==4.45.0
  Using cached transformers-4.45.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.0)
  Using cached tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.0-py3-none-any.whl (9.9 MB)
Using cached tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.22.1
[2K    Uninstalling tokenizers-0.22.1:
[2K      Successfully uninstalled tokenizers-0.22.1
[2K  Attempting uninstall: transformers
[2K    Found existing installation: transformers 4.57.1
[2K    Uninstalling transformers-4.57.1:╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [transformers]
[2K      Successfully unin

In [2]:
import os
#os.environ["WANDB_DISABLED"] = "true"
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import argparse
import warnings
warnings.filterwarnings("ignore")


We suggest using a single class, it will make refinement easier. 

In your implementation, feel free to update the training procedure, change model and do whatever feels right 

In [18]:
from datasets import load_dataset
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


class GraphCodeBERTTrainer:
    def __init__(self, max_length=512, model_name="microsoft/graphcodebert-base"):
        self.max_length = max_length
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.num_labels = 2

        # --- Better reproducibility ---
        set_seed(42)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.benchmark = False

    def load_and_prepare_data(self):
        print("Loading BigCloneBench dataset...")

        train = load_dataset(
            "google/code_x_glue_cc_clone_detection_big_clone_bench", split="train[:10%]"
        )
        val = load_dataset(
            "google/code_x_glue_cc_clone_detection_big_clone_bench", split="validation[:10%]"
        )

        print(f"Train samples: {len(train)}")
        print(f"Val samples:   {len(val)}")

        return train, val

    def initialize_model_and_tokenizer(self):
        print(f"Initializing tokenizer + model: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
        )

        # --- Add dropout to fight overfitting ---
        self.model.config.hidden_dropout_prob = 0.2
        self.model.config.attention_probs_dropout_prob = 0.2

        # Move model to device
        self.model.to("cuda" if torch.cuda.is_available() else "cpu")

    def tokenize_function(self, batch):
        return self.tokenizer(
            batch["func1"],
            batch["func2"],
            truncation=True,
            padding=False,
            max_length=self.max_length,
        )

    def prepare_datasets(self, train_ds, val_ds):
        print("Label normalization + tokenization...")

        def format_labels(example):
            # robust conversion
            if isinstance(example["label"], bool):
                example["labels"] = int(example["label"])
            else:
                example["labels"] = int(example["label"])
            return example

        # convert boolean → int labels
        train_ds = train_ds.map(format_labels)
        val_ds = val_ds.map(format_labels)

        # tokenize
        remove_cols = ['id', 'func1', 'func2', 'label']
        train_ds = train_ds.map(self.tokenize_function, batched=True,
                                remove_columns=remove_cols)
        val_ds = val_ds.map(self.tokenize_function, batched=True,
                            remove_columns=remove_cols)

        train_ds = train_ds.with_format("torch")
        val_ds = val_ds.with_format("torch")

        return train_ds, val_ds

    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average="binary", zero_division=0
        )
        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    def train(self, train_dataset, val_dataset, output_dir="./results",
              num_epochs=3, batch_size=8, learning_rate=2e-5):

        print("Starting training...")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            eval_strategy="steps",
            eval_steps=200,
            save_strategy="steps",
            save_steps=200,
            save_total_limit=3,  # prevent checkpoint bloat
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            learning_rate=learning_rate,
            warmup_ratio=0.1,
            weight_decay=0.03,  # ↑ stronger regularization
            max_grad_norm=1.0,  # gradient clipping
            gradient_accumulation_steps=2,
            fp16=torch.cuda.is_available(),
            bf16=False,
            dataloader_pin_memory=True,
            dataloader_num_workers=4,  # faster loading
            report_to=[],
        )

        data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer, pad_to_multiple_of=8
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        )

        trainer.train()
        trainer.save_model()
        print("Training complete!")
        return trainer

    def run_full_pipeline(self, output_dir="./results_clone",
                          num_epochs=3, batch_size=8, learning_rate=2e-5):

        train_data, val_data = self.load_and_prepare_data()
        self.initialize_model_and_tokenizer()
        train_dataset, val_dataset = self.prepare_datasets(train_data, val_data)

        trainer = self.train(
            train_dataset,
            val_dataset,
            output_dir=output_dir,
            num_epochs=num_epochs,
            batch_size=batch_size,
            learning_rate=learning_rate
        )
        return trainer


# ============================
# RUN TRAINING
# ============================

OUTPUT_DIR = "graphcodebert-clone-detection"

trainer_obj = GraphCodeBERTTrainer(
    max_length=512,
    model_name="microsoft/graphcodebert-base"
)

t = trainer_obj.run_full_pipeline(
    output_dir=OUTPUT_DIR,
    num_epochs=2,
    batch_size=8,
    learning_rate=1e-5
)


Loading BigCloneBench dataset...
Train samples: 90103
Val samples:   41542
Initializing tokenizer + model: microsoft/graphcodebert-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label normalization + tokenization...


Map:   0%|          | 0/90103 [00:00<?, ? examples/s]

Map:   0%|          | 0/41542 [00:00<?, ? examples/s]

Map:   0%|          | 0/90103 [00:00<?, ? examples/s]

Map:   0%|          | 0/41542 [00:00<?, ? examples/s]

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,0.676,0.623014,0.855977,0.452342,0.458394,0.455348
400,0.4396,0.38795,0.819797,0.404426,0.787207,0.534337
600,0.2871,0.222687,0.92918,0.694884,0.821481,0.752898
800,0.2666,0.233882,0.923523,0.656589,0.875733,0.750491
1000,0.2485,0.198879,0.9319,0.688315,0.879949,0.772424
1200,0.2213,0.143681,0.953902,0.807005,0.853006,0.829368
1400,0.219,0.180892,0.949497,0.765749,0.88673,0.821811
1600,0.1579,0.215081,0.945718,0.741148,0.901576,0.813528


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training complete!


In [13]:
import torch
import logging
from itertools import chain
from datasets import load_dataset
from tqdm import tqdm


@torch.no_grad()
def predict_with_trainer(trainer_obj, parquet_path, output_path, max_length=512, batch_size=16, device=None):
    """
    Uses trainer_obj.model and trainer_obj.tokenizer to run streaming inference
    over a parquet file with columns ['ID','code'] and writes 'ID,prediction' CSV.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pull model & tokenizer from your trainer object
    model = trainer_obj.model
    tokenizer = trainer_obj.tokenizer if hasattr(trainer_obj, "tokenizer") else trainer_obj.args._setup_devices and None
    if tokenizer is None and hasattr(trainer_obj, "tokenizer"):
        tokenizer = trainer_obj.tokenizer
    if tokenizer is None:
        raise ValueError("trainer_obj must have a tokenizer (e.g., provided when creating the Trainer).")
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    # Stream parquet (no RAM blowup)
    ds = load_dataset("parquet", data_files=parquet_path, split="train", streaming=True)

    # Validate schema and re-chain the first row back into the stream
    it = iter(ds)
    first = next(it)
    if not {"ID", "code"}.issubset(first.keys()):
        raise ValueError("Parquet file must contain 'ID' and 'code' columns")
    stream = chain([first], it)

    def batcher(iterator, bs):
        buf = []
        for ex in iterator:
            buf.append(ex)
            if len(buf) == bs:
                yield buf
                buf = []
        if buf:
            yield buf

    with open(output_path, "w") as f:
        f.write("ID,prediction\n")

        for batch in tqdm(batcher(stream, batch_size), desc="Predicting"):
            codes = [row["code"] for row in batch]
            ids   = [row["ID"] for row in batch]

            enc = tokenizer(
                codes,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt",
            )
            input_ids = enc["input_ids"].to(device)
            attention_mask = enc["attention_mask"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            pred_labels = logits.argmax(dim=-1).cpu().tolist()

            for ex_id, pred in zip(ids, pred_labels):
                f.write(f"{ex_id},{pred}\n")

    print(f"Predictions saved to {output_path}")


In [14]:
# After training:
# trainer_obj = CodeBERTTrainer(...).run_full_pipeline(output_dir=..., ...)

TEST_PARQUET = "./dataset/test.parquet"  # adjust if needed
OUT_CSV = "submission.csv"

predict_with_trainer(
    trainer_obj=t,          
    parquet_path=TEST_PARQUET,
    output_path=OUT_CSV,
    max_length=256,
    batch_size=32,
    device="cuda"              
)

print("Wrote:", OUT_CSV)


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Predicting: 32it [00:02, 14.38it/s]

Predictions saved to submission.csv
Wrote: submission.csv





In [15]:
!pip install ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [16]:
import os
from transformers import AutoTokenizer, AutoModel

# Directory to save the model
save_dir = "./graphcodebert_11_27"
os.makedirs(save_dir, exist_ok=True)

# Get model + tokenizer
try:
    model = trainer_obj.model
    tokenizer = trainer_obj.tokenizer
except:
    print("Falling back to manually loaded model/tokenizer")

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model + tokenizer saved to {save_dir}")

Model + tokenizer saved to ./graphcodebert_11_27


In [17]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ALWAYS load the base GraphCodeBERT model (NOT the classifier model!)
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = AutoModel.from_pretrained("microsoft/graphcodebert-base")

model.to(device)
model.eval()

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)          # <-- NOW contains last_hidden_state

    cls_emb = outputs.last_hidden_state[:, 0, :]
    return cls_emb.cpu()

def compute_similarity(t1, t2):
    e1 = get_embedding(t1)
    e2 = get_embedding(t2)
    return F.cosine_similarity(e1, e2).item()

# Example
compute_similarity("print('hello')", "print('hello')")


Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1.0