In [16]:
# Step 1: Clone the repo and setup environment

import os
import subprocess

repo_dir = "./tamarind-finetune"
repo_url = "https://github.com/smartrics/tamarind-finetune.git"

if os.path.isdir(repo_dir):
    print("Directory 'tamarind-finetune' exists. Pulling latest changes...")
    subprocess.run(["git", "-C", repo_dir, "pull"], check=True)
else:
    print("Directory 'tamarind-finetune' does not exist. Cloning repository...")
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
print("finished!")

Directory 'tamarind-finetune' exists. Pulling latest changes...
finished!


In [17]:
%cd ./tamarind-finetune

/content/tamarind-finetune/tamarind-finetune


In [None]:
# These are the core libraries: Transformers, Datasets, PEFT (for LoRA), TRL (Trainer), BitsAndBytes (4-bit quant)
%pip install -r requirements.txt


In [19]:
# --- 1. Prepare the Data ---

from datasets import load_dataset, DatasetDict

# Load each split from JSONL files
train_dataset = load_dataset("json", data_files="data_codet5/training_data.jsonl", split="train")
eval_dataset = load_dataset("json", data_files="data_codet5/validation_data.jsonl", split="train")
test_dataset = load_dataset("json", data_files="data_codet5/test_data.jsonl", split="train")


# Create a single DatasetDict
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
    "test": test_dataset
})

if raw_datasets["train"] is None or raw_datasets["validation"] is None or raw_datasets["test"] is None:
    print("Error loading datasets. Please check file paths and contents.")
else:
    print(f"training data points: #{len(raw_datasets['train'])}")
    print(f"validation data points: #{len(raw_datasets['validation'])}")
    print(f"test data points: #{len(raw_datasets['test'])}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

training data points: #430
validation data points: #101
test data points: #53


In [20]:
from huggingface_hub import notebook_login

# --- 2. Login to Hugging Face Hub ---
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [51]:
# --- 3. Load Tokenizer and Model ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

max_input_length = 1200  # Define your desired max input length
max_output_length = 851 # Define your desired max output length

def preprocess_function(examples):
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_output_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def count_tokens(example):
    input_ids = tokenizer(example["input"], truncation=False).input_ids
    output_ids = tokenizer(example["output"], truncation=False).input_ids
    return {
        "input_token_count": len(input_ids),
        "output_token_count": len(output_ids),
        "total_token_count": len(input_ids) + len(output_ids),
    }

# Apply it to your dataset
token_counts = raw_datasets["train"].map(count_tokens)
import pandas as pd
df = token_counts.to_pandas()
print(df[["input_token_count", "output_token_count", "total_token_count"]].describe())

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

print("OK")


       input_token_count  output_token_count  total_token_count
count         430.000000          430.000000         430.000000
mean          728.774419          383.641860        1112.416279
std           140.771030          190.817349         247.085805
min           440.000000            7.000000         661.000000
25%           646.250000          231.250000         987.250000
50%           721.000000          389.500000        1091.000000
75%           802.750000          496.000000        1260.000000
max          1144.000000          851.000000        1857.000000


Filter:   0%|          | 0/430 [00:00<?, ? examples/s]

       input_token_count  output_token_count  total_token_count
count                0.0                 0.0                0.0
mean                 NaN                 NaN                NaN
std                  NaN                 NaN                NaN
min                  NaN                 NaN                NaN
25%                  NaN                 NaN                NaN
50%                  NaN                 NaN                NaN
75%                  NaN                 NaN                NaN
max                  NaN                 NaN                NaN


Map:   0%|          | 0/430 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

OK


In [48]:

# --- 4.1. Configure Training Arguments ---
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import os

# Set the WANDB_MODE environment variable to 'disabled'
os.environ["WANDB_MODE"] = "disabled"

output_dir = "./codet5-tamarind"  # Adjust output directory
learning_rate = 1e-5  # Adjusted for small dataset
batch_size = 1      # Adjusted for small dataset
num_epochs = 20     # Set a higher number of epochs as early stopping will handle it
gradient_accumulation_steps = 4
weight_decay = 0.01

training_args = TrainingArguments(
    auto_find_batch_size=True,
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_checkpointing=True,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    use_cache = False,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    fp16=True,
    push_to_hub=True,
    hub_model_id="smartrics/codet5-tamarind",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)
# --- 4.2. Define the Trainer with Early Stopping Callback ---

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print("ok")

Map:   0%|          | 0/430 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (721 > 512). Running this sequence through the model will result in indexing errors


train tokens: Dataset({
    features: ['input', 'output', 'input_token_count', 'output_token_count', 'total_token_count'],
    num_rows: 430
})


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

validation tokens: Dataset({
    features: ['input', 'output', 'input_token_count', 'output_token_count', 'total_token_count'],
    num_rows: 101
})


Map:   0%|          | 0/53 [00:00<?, ? examples/s]

test tokens: Dataset({
    features: ['input', 'output', 'input_token_count', 'output_token_count', 'total_token_count'],
    num_rows: 53
})


Map:   0%|          | 0/430 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

OK


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'use_cache'

In [47]:
# --- 5. Train the Model ---
print("Starting training with early stopping...")
trainer.train()
print("Training finished!")



Starting training with early stopping...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# --- 6. Push the Model to Hugging Face Hub ---
print("Pushing model to Hugging Face Hub...")
trainer.push_to_hub()
print(f"Model pushed to https://huggingface.co/{training_args.hub_model_id}")

print("Fine-tuning complete.")