In [16]:
# Step 1: Clone the repo and setup environment

import os
import subprocess

repo_dir = "./tamarind-finetune"
repo_url = "https://github.com/smartrics/tamarind-finetune.git"

if os.path.isdir(repo_dir):
    print("Directory 'tamarind-finetune' exists. Pulling latest changes...")
    subprocess.run(["git", "-C", repo_dir, "pull"], check=True)
else:
    print("Directory 'tamarind-finetune' does not exist. Cloning repository...")
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
print("finished!")

Directory 'tamarind-finetune' exists. Pulling latest changes...
finished!


In [17]:
%cd ./tamarind-finetune

/content/tamarind-finetune/tamarind-finetune


In [None]:
# These are the core libraries: Transformers, Datasets, PEFT (for LoRA), TRL (Trainer), BitsAndBytes (4-bit quant)
%pip install -r requirements.txt


In [19]:
# --- 1. Prepare the Data ---

from datasets import load_dataset, DatasetDict

# Load each split from JSONL files
train_dataset = load_dataset("json", data_files="data_codet5/training_data.jsonl", split="train")
eval_dataset = load_dataset("json", data_files="data_codet5/validation_data.jsonl", split="train")
test_dataset = load_dataset("json", data_files="data_codet5/test_data.jsonl", split="train")


# Create a single DatasetDict
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
    "test": test_dataset
})

if raw_datasets["train"] is None or raw_datasets["validation"] is None or raw_datasets["test"] is None:
    print("Error loading datasets. Please check file paths and contents.")
else:
    print(f"training data points: #{len(raw_datasets['train'])}")
    print(f"validation data points: #{len(raw_datasets['validation'])}")
    print(f"test data points: #{len(raw_datasets['test'])}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

training data points: #430
validation data points: #101
test data points: #53


In [20]:
from huggingface_hub import notebook_login

# --- 2. Login to Hugging Face Hub ---
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
# --- 3. Load Tokenizer and Model ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 4092  # Define your desired max input length
max_output_length = 2800 # Define your desired max output length

def preprocess_function(examples):
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_output_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

print("OK")

# --- 4.1. Configure Training Arguments ---
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import os

# Set the WANDB_MODE environment variable to 'disabled'
os.environ["WANDB_MODE"] = "disabled"

output_dir = "./codet5-tamarind"  # Adjust output directory
learning_rate = 1e-5  # Adjusted for small dataset
batch_size = 1      # Adjusted for small dataset
num_epochs = 20     # Set a higher number of epochs as early stopping will handle it
gradient_accumulation_steps = 4
weight_decay = 0.01

training_args = TrainingArguments(
    auto_find_batch_size=True,
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_checkpointing=True,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    fp16=True,
    push_to_hub=True,
    hub_model_id="smartrics/codet5-tamarind",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)
# --- 4.2. Define the Trainer with Early Stopping Callback ---

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print("ok")

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

  trainer = Trainer(


OK
ok


In [44]:
# --- 5. Train the Model ---
print("Starting training with early stopping...")
trainer.train()
print("Training finished!")



Starting training with early stopping...


Epoch,Training Loss,Validation Loss


RuntimeError: No executable batch size found, reached zero.

In [None]:
# --- 6. Push the Model to Hugging Face Hub ---
print("Pushing model to Hugging Face Hub...")
trainer.push_to_hub()
print(f"Model pushed to https://huggingface.co/{training_args.hub_model_id}")

print("Fine-tuning complete.")