In [5]:


!pip install --upgrade transformers==4.45.0 huggingface_hub
%pip install torch numpy pandas scikit-learn evaluate
!pip install "transformers==4.57.1" "accelerate>=1.1.1" "torch>=2.4.0" "datasets>=3.0.0" "peft>=0.13.0"

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


Collecting transformers==4.45.0
  Using cached transformers-4.45.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-1.1.4-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.0)
  Using cached tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.0-py3-none-any.whl (9.9 MB)
Using cached tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.22.1
[2K    Uninstalling tokenizers-0.22.1:
[2K      Successfully uninstalled tokenizers-0.22.1
[2K  Attempting uninstall: transformers
[2K    Found existing installation: transformers 4.57.1
[2K    Uninstalling transformers-4.57.1:╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [transformers]
[2K      Successfully unin

In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import argparse
import warnings
warnings.filterwarnings("ignore")


We suggest using a single class; it will make refinement easier. 

In your implementation, feel free to update the training procedure, change the model, and do whatever feels right.

In [7]:
class GraphCodeBERTTrainer:
    """Fine-tune a sequence-classification model on code snippets.

    The pipeline reads a train and a validation parquet file (columns 'code'
    and 'label'), tokenizes the code, fine-tunes the model with the Hugging
    Face ``Trainer`` and prints a classification report on the validation set.

    Attributes are filled in as the pipeline progresses:
        tokenizer:  set by ``initialize_model_and_tokenizer``.
        model:      set by ``initialize_model_and_tokenizer``.
        num_labels: set by ``load_and_prepare_data``.
    """

    def __init__(self, max_length=512, model_name="microsoft/graphcodebert-base"):
        """Store the configuration; nothing is loaded until the pipeline runs.

        Args:
            max_length: Maximum number of tokens per example (longer inputs are truncated).
            model_name: Hub identifier or local path of the pretrained model.
        """
        self.max_length = max_length
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.num_labels = None

    @staticmethod
    def _clean_split(frame):
        """Validate one split and return a cleaned copy with integer labels.

        Rows missing 'code' or 'label' are dropped and the index is reset so the
        frame converts to a ``datasets.Dataset`` without an extra index column.

        Raises:
            ValueError: If the 'code' or 'label' column is missing.
        """
        if 'code' not in frame.columns or 'label' not in frame.columns:
            raise ValueError("Dataset must contain 'code' and 'label' columns")

        cleaned = frame.dropna(subset=['code', 'label']).reset_index(drop=True)
        return cleaned.assign(label=cleaned['label'].astype(int))

    def load_and_prepare_data(self, train_path='./dataset/train.parquet',
                              val_path='./dataset/validation.parquet'):
        """Load both splits, clean them identically and derive ``num_labels``.

        Args:
            train_path: Parquet file holding the training split.
            val_path: Parquet file holding the validation split.

        Returns:
            Tuple ``(train_df, val_df)`` of cleaned DataFrames.

        Raises:
            ValueError: If a split lacks the required columns, the training split
                has no usable rows, or it contains negative labels.
        """
        try:
            raw_train = pd.read_parquet(train_path)

            print(f"Dataset columns: {raw_train.columns.tolist()}")
            print(f"Sample data:\n{raw_train.head()}")

            df = self._clean_split(raw_train)
            if df.empty:
                raise ValueError("Training split has no rows with both 'code' and 'label'")

            label_min = int(df['label'].min())
            label_max = int(df['label'].max())
            if label_min < 0:
                raise ValueError(f"Labels must be non-negative class ids, found {label_min}")

            # The classification head needs one output per class id up to the
            # largest id; nunique() alone is too small when ids are not contiguous.
            self.num_labels = max(int(df['label'].nunique()), label_max + 1)

            print(f"Number of unique labels: {self.num_labels}")
            print(f"Label range: {label_min} to {label_max}")
            print(f"Label distribution:\n{df['label'].value_counts().sort_index()}")

            # The validation split goes through the same cleaning as the training split.
            val_df = self._clean_split(pd.read_parquet(val_path))

            print(f"Train samples: {len(df)}, Validation samples: {len(val_df)}")

            return df, val_df

        except Exception as e:
            print(f"Error loading dataset: {e}")
            raise

    def initialize_model_and_tokenizer(self):
        """Load the tokenizer and the classification model for ``self.model_name``.

        Requires ``self.num_labels`` to be set (see ``load_and_prepare_data``).
        The model is moved to CUDA when available, otherwise it stays on the CPU.
        """
        print(f"Initializing {self.model_name} model and tokenizer...")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            problem_type="single_label_classification",
            # NOTE(review): trust_remote_code=True lets a hub repository run its own
            # Python code on load; only point model_name at repositories you trust.
            trust_remote_code=True
        ).to('cuda' if torch.cuda.is_available() else 'cpu')

        print(f"Model initialized with {self.num_labels} labels")

    def tokenize_function(self, examples):
        """Tokenize a batch of examples (dict with a 'code' list) for ``Dataset.map``.

        Sequences are truncated to ``self.max_length`` but deliberately NOT padded:
        padding is applied per training batch by ``DataCollatorWithPadding``, so
        examples are only padded to the longest sequence of their own batch.
        """
        return self.tokenizer(
            examples['code'],
            truncation=True,
            max_length=self.max_length,
        )

    def prepare_datasets(self, train_df, val_df):
        """Convert the cleaned DataFrames into tokenized ``Dataset`` objects.

        Returns:
            Tuple ``(train_dataset, val_dataset)`` whose label column is named 'labels'.
        """
        print("Preparing datasets for training...")

        prepared = []
        for frame in (train_df, val_df):
            # preserve_index=False keeps a non-default pandas index from becoming an
            # '__index_level_0__' column; with remove_unused_columns=False that
            # column would be forwarded to the collator and the model.
            dataset = Dataset.from_pandas(frame[['code', 'label']], preserve_index=False)
            dataset = dataset.map(
                self.tokenize_function,
                batched=True,
                remove_columns=['code']
            )
            prepared.append(dataset.rename_column('label', 'labels'))

        train_dataset, val_dataset = prepared
        return train_dataset, val_dataset

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted precision/recall/F1 from ``(logits, labels)``."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        accuracy = accuracy_score(labels, predictions)
        # zero_division=0 scores a class that is never predicted as 0 instead of
        # emitting an undefined-metric warning.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted', zero_division=0
        )

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def train(self, train_dataset, val_dataset, output_dir="./results", num_epochs=3, batch_size=16, learning_rate=2e-5):
        """Fine-tune ``self.model`` and save the model and tokenizer to ``output_dir``.

        Evaluation and checkpointing run every 500 steps; the checkpoint with the
        best weighted F1 is restored at the end, and training stops early after
        3 evaluations without improvement.

        Returns:
            The ``Trainer`` instance used for training.
        """
        import inspect  # stdlib; only needed to pick the right Trainer keyword below

        print("Starting training...")
        print(self.model)
        print(self.model.device)
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            # warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=5,
            eval_strategy="steps",
            eval_steps=500,
            save_strategy="steps",
            save_steps=500,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            remove_unused_columns=False,
            learning_rate=learning_rate,
            lr_scheduler_type="linear",
            save_total_limit=2,
            report_to=[],
        )

        # Pads each batch to its own longest sequence (dynamic padding).
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        trainer_kwargs = dict(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )
        # Newer transformers releases renamed Trainer's `tokenizer` argument to
        # `processing_class`; use whichever name the installed version accepts.
        if "processing_class" in inspect.signature(Trainer.__init__).parameters:
            trainer_kwargs["processing_class"] = self.tokenizer
        else:
            trainer_kwargs["tokenizer"] = self.tokenizer

        trainer = Trainer(**trainer_kwargs)
        print("Start training")
        trainer.train()

        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        print(f"Training completed. Model saved to {output_dir}")

        return trainer

    def evaluate_model(self, trainer, val_dataset):
        """Predict on ``val_dataset``, print a classification report and return the predictions."""
        print("Evaluating model...")

        predictions = trainer.predict(val_dataset)
        y_pred = np.argmax(predictions.predictions, axis=1)
        y_true = predictions.label_ids

        print("Classification Report:")
        print(classification_report(y_true, y_pred))

        return predictions

    def run_full_pipeline(self, output_dir="./results", num_epochs=3, batch_size=16, learning_rate=2e-5):
        """Run load -> initialize -> tokenize -> train -> evaluate end to end.

        Returns:
            The ``Trainer`` produced by ``train``.

        Any exception is printed and re-raised unchanged.
        """
        try:
            train_df, val_df = self.load_and_prepare_data()

            self.initialize_model_and_tokenizer()

            train_dataset, val_dataset = self.prepare_datasets(train_df, val_df)

            trainer = self.train(
                train_dataset, val_dataset,
                output_dir=output_dir,
                num_epochs=num_epochs,
                batch_size=batch_size,
                learning_rate=learning_rate
            )

            self.evaluate_model(trainer, val_dataset)

            print("Pipeline completed successfully!")
            return trainer

        except Exception as e:
            print(f"Error in pipeline: {e}")
            raise


In [8]:
# Directory that receives the checkpoints, the final model and the tokenizer files.
OUTPUT_DIR = "taskA-model"

# Inputs longer than max_length tokens are truncated during tokenization.
trainer_obj = GraphCodeBERTTrainer(
    model_name="microsoft/graphcodebert-base",
    max_length=256,
)

# `t` is the transformers Trainer returned by the pipeline (not the
# GraphCodeBERTTrainer wrapper above).
t = trainer_obj.run_full_pipeline(
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=10,
    output_dir=OUTPUT_DIR,
)


Dataset columns: ['code', 'generator', 'label', 'language']
Sample data:
                                                code  \
0  (a, b, c, d) = [int(x) for x in input().split(...   
1  valid version for the language; all others can...   
2  python\ndef min_cards_to_flip(s):\n    vowels ...   
3  T = int(input())\nfor t in range(T):\n\tcolor ...   
4  for i in range(int(input())):\n\tinput()\n\ta ...   

                        generator  label language  
0                           human      0   Python  
1         Qwen/Qwen2.5-Coder-1.5B      1   Python  
2  Qwen/Qwen2.5-Coder-7B-Instruct      1   Python  
3                           human      0   Python  
4                           human      0   Python  
Number of unique labels: 2
Label range: 0 to 1
Label distribution:
label
0    238475
1    261525
Name: count, dtype: int64
Train samples: 500000, Validation samples: 100000
Initializing microsoft/graphcodebert-base model and tokenizer...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 2 labels
Preparing datasets for training...


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 500000/500000 [03:24<00:00, 2449.24 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:41<00:00, 2416.98 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Starting training...
RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0485,0.047567,0.98676,0.986759,0.986766,0.98676
1000,0.0711,0.041126,0.98727,0.987272,0.987318,0.98727
1500,0.0157,0.031702,0.98961,0.989609,0.989619,0.98961
2000,0.0217,0.029173,0.99025,0.990252,0.990297,0.99025
2500,0.0155,0.025944,0.99095,0.99095,0.99095,0.99095
3000,0.0213,0.025263,0.99154,0.99154,0.99154,0.99154
3500,0.0387,0.025718,0.99169,0.99169,0.99169,0.99169
4000,0.016,0.026102,0.99088,0.990882,0.990924,0.99088
4500,0.0175,0.023057,0.99219,0.99219,0.99219,0.99219
5000,0.0241,0.021204,0.99217,0.992171,0.992178,0.99217


Training completed. Model saved to taskA-model
Evaluating model...


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     47695
           1       0.99      0.99      0.99     52305

    accuracy                           0.99    100000
   macro avg       0.99      0.99      0.99    100000
weighted avg       0.99      0.99      0.99    100000

Pipeline completed successfully!


In [9]:
import torch
import logging
from itertools import chain
from datasets import load_dataset
from tqdm import tqdm


@torch.no_grad()
def predict_with_trainer(trainer_obj, parquet_path, output_path, max_length=512, batch_size=16, device=None):
    """
    Run streaming inference over a parquet file and write an 'ID,prediction' CSV.

    Args:
        trainer_obj: Object exposing ``.model`` and a tokenizer, either as
            ``.processing_class`` or as ``.tokenizer``.
        parquet_path: Parquet file with at least the columns 'ID' and 'code'.
        output_path: Destination CSV path (overwritten).
        max_length: Maximum number of tokens per example; longer inputs are truncated.
        batch_size: Number of rows per forward pass.
        device: Device to run on. ``None`` selects CUDA when available, else CPU.
            A CUDA device requested on a machine without CUDA falls back to CPU.

    Raises:
        ValueError: If no tokenizer can be found on ``trainer_obj`` or the parquet
            rows lack the 'ID' / 'code' columns.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    elif str(device).startswith("cuda") and not torch.cuda.is_available():
        # Keep the run alive on CPU-only machines instead of failing in model.to().
        print(f"Requested device '{device}' is not available; falling back to CPU")
        device = "cpu"

    # Pull model & tokenizer from the trainer object. `processing_class` is checked
    # first because reading `Trainer.tokenizer` is deprecated in newer releases.
    model = trainer_obj.model
    tokenizer = getattr(trainer_obj, "processing_class", None)
    if tokenizer is None:
        tokenizer = getattr(trainer_obj, "tokenizer", None)
    if tokenizer is None:
        raise ValueError("trainer_obj must have a tokenizer (e.g., provided when creating the Trainer).")

    model.to(device)
    model.eval()

    # Stream parquet (no RAM blowup)
    ds = load_dataset("parquet", data_files=parquet_path, split="train", streaming=True)

    # Validate the schema on the first row, then chain it back into the stream
    rows = iter(ds)
    first = next(rows, None)
    if first is None:
        # Nothing to predict: still emit a header-only CSV below.
        print(f"No rows found in {parquet_path}")
        stream = iter(())
    else:
        if not {"ID", "code"}.issubset(first.keys()):
            raise ValueError("Parquet file must contain 'ID' and 'code' columns")
        stream = chain([first], rows)

    def batcher(iterator, bs):
        """Yield lists of up to ``bs`` consecutive items; the last list may be shorter."""
        buf = []
        for ex in iterator:
            buf.append(ex)
            if len(buf) == bs:
                yield buf
                buf = []
        if buf:
            yield buf

    with open(output_path, "w") as f:
        f.write("ID,prediction\n")

        for batch in tqdm(batcher(stream, batch_size), desc="Predicting"):
            codes = [row["code"] for row in batch]
            ids = [row["ID"] for row in batch]

            enc = tokenizer(
                codes,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt",
            )
            input_ids = enc["input_ids"].to(device)
            attention_mask = enc["attention_mask"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            pred_labels = logits.argmax(dim=-1).cpu().tolist()

            for ex_id, pred in zip(ids, pred_labels):
                f.write(f"{ex_id},{pred}\n")

    print(f"Predictions saved to {output_path}")


In [10]:
# After training, `t` is the transformers Trainer returned by
# GraphCodeBERTTrainer.run_full_pipeline(...) in the training cell.

TEST_PARQUET = "./dataset/test.parquet"  # adjust if needed
OUT_CSV = "submission.csv"

predict_with_trainer(
    trainer_obj=t,
    parquet_path=TEST_PARQUET,
    output_path=OUT_CSV,
    max_length=256,  # same truncation length that was used for training
    batch_size=32,
    # `device` is left at its default (None) so CUDA is used only when available.
)

print("Wrote:", OUT_CSV)


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Predicting: 32it [00:06,  4.74it/s]

Predictions saved to submission.csv
Wrote: submission.csv





In [17]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embeddings come from the base GraphCodeBERT encoder, NOT from the fine-tuned
# classifier trained earlier in this notebook.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

# .to() and .eval() both return the module, so the calls can be chained.
model = AutoModel.from_pretrained("microsoft/graphcodebert-base").to(device).eval()

def get_embedding(text, max_length=512):
    """Return the first-token (CLS position) embedding of ``text`` as a CPU tensor.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Code snippet (or list of snippets) to embed.
        max_length: Maximum number of tokens kept after truncation. Passed
            explicitly so truncation does not depend on the tokenizer's own
            configured limit; the encoder printed earlier in this notebook has
            514 position embeddings, so longer inputs cannot be encoded.

    Returns:
        Tensor of shape (batch, hidden_size) on the CPU.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # The base encoder (not the classifier) exposes last_hidden_state.
        outputs = model(**inputs)

    # Hidden state at sequence position 0 for every item in the batch.
    cls_emb = outputs.last_hidden_state[:, 0, :]
    return cls_emb.cpu()

def compute_similarity(t1, t2):
    """Return the cosine similarity between the embeddings of ``t1`` and ``t2`` as a float."""
    first_vec, second_vec = get_embedding(t1), get_embedding(t2)
    score = F.cosine_similarity(first_vec, second_vec)
    return score.item()

# Example: comparing a snippet with itself should give a cosine similarity of 1.0
compute_similarity("print('hello')", "print('hello')")


Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1.0