In [None]:
# Step 1: Clone the repo and setup environment

import os
import subprocess

# Remote repository and the local directory it is checked out into.
repo_url = "https://github.com/smartrics/tamarind-finetune.git"
repo_dir = "./tamarind-finetune"

# Decide on the git invocation first: clone when the directory is absent,
# otherwise pull into the existing checkout. check=True makes a failing git
# command raise CalledProcessError instead of being silently ignored.
if not os.path.isdir(repo_dir):
    status_message = "Directory 'tamarind-finetune' does not exist. Cloning repository..."
    git_command = ["git", "clone", repo_url, repo_dir]
else:
    status_message = "Directory 'tamarind-finetune' exists. Pulling latest changes..."
    git_command = ["git", "-C", repo_dir, "pull"]

print(status_message)
subprocess.run(git_command, check=True)
print("finished!")

Directory 'tamarind-finetune' does not exist. Cloning repository...
finished!


In [None]:
# Make the cloned repository the working directory so that the relative paths
# used by later cells (requirements.txt, data/*.jsonl) resolve inside it.
# NOTE(review): re-running this cell without restarting the kernel changes the
# directory relative to the current one — confirm it is only run once.
%cd ./tamarind-finetune

In [None]:
# These are the core libraries: Transformers, Datasets, PEFT (for LoRA), TRL (Trainer), BitsAndBytes (4-bit quant)
# NOTE(review): the cells visible in this notebook only import datasets,
# huggingface_hub and transformers — confirm the list above still matches
# what requirements.txt actually installs.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt


Collecting gcsfs (from -r requirements.txt (line 11))
  Downloading gcsfs-2025.3.2-py2.py3-none-any.whl.metadata (1.9 kB)
INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.
  Downloading gcsfs-2025.3.1-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading gcsfs-2025.3.0-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading gcsfs-2025.2.0-py2.py3-none-any.whl.metadata (1.9 kB)
  Downloading gcsfs-2024.12.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting google-auth>=1.2 (from gcsfs->-r requirements.txt (line 11))
  Downloading google_auth-2.39.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-oauthlib (from gcsfs->-r requirements.txt (line 11))
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-cloud-storage (from gcsfs->-r requirements.txt (line 11))
  Downloading google_cloud_storage-3.1.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting c


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# --- 1. Prepare the Data ---

from datasets import Dataset, DatasetDict
import json
import os

def load_data(file_paths):
    """Read chat-style JSONL files into one ``Dataset`` of input/output pairs.

    Every non-blank line is expected to hold a JSON object with a
    ``"messages"`` list of ``{"role": ..., "content": ...}`` entries. For each
    record the last non-empty system, user and assistant contents are kept.
    The model input is the system content (if any) followed by a space and
    the user content; the target is the assistant content. Records without
    both a user and an assistant message are skipped with a warning, as are
    lines that are not valid JSON.

    Args:
        file_paths: Iterable of paths to JSONL files, read in the given order.

    Returns:
        A ``Dataset`` with ``"input"`` and ``"output"`` columns, or ``None``
        if one of the files does not exist (an error is printed in that case).
    """
    inputs = []
    outputs = []
    for file_path in file_paths:
        try:
            # JSON text is UTF-8; being explicit avoids depending on the
            # platform default encoding, which is not UTF-8 on every system.
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        # Blank lines (e.g. a trailing newline) carry no record.
                        continue

                    try:
                        json_obj = json.loads(stripped)
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON line in {file_path}: {stripped}")
                        continue

                    # Valid JSON that is not an object, or whose "messages" is
                    # not a list, is treated as a malformed record below
                    # instead of raising AttributeError/TypeError.
                    messages = json_obj.get("messages") if isinstance(json_obj, dict) else None
                    if not isinstance(messages, list):
                        messages = []

                    # Later messages overwrite earlier ones with the same role.
                    contents = {}
                    for message in messages:
                        if not isinstance(message, dict):
                            continue
                        role = message.get("role")
                        content = message.get("content")
                        if role in ("system", "user", "assistant") and isinstance(content, str) and content:
                            contents[role] = content

                    user_content = contents.get("user")
                    assistant_content = contents.get("assistant")
                    if user_content and assistant_content:
                        system_content = contents.get("system")
                        input_text = (system_content + " " if system_content else "") + user_content
                        inputs.append(input_text.strip())
                        outputs.append(assistant_content)
                    else:
                        print(f"Warning: Skipping malformed line in {file_path}: {stripped}")
        except FileNotFoundError as e:
            print(f"Error: File not found: {e}")
            return None
    return Dataset.from_dict({"input": inputs, "output": outputs})

# Load data for each split and type
spec_train_files = ["data/spec_training_data.jsonl"]
spec_eval_files = ["data/spec_validation_data.jsonl"]
spec_test_files = ["data/spec_test_data.jsonl"]

wf_train_files = ["data/wf_training_data.jsonl"]
wf_eval_files = ["data/wf_validation_data.jsonl"]
wf_test_files = ["data/wf_test_data.jsonl"]

spec_train_dataset = load_data(spec_train_files)
spec_eval_dataset = load_data(spec_eval_files)
spec_test_dataset = load_data(spec_test_files)

wf_train_dataset = load_data(wf_train_files)
wf_eval_dataset = load_data(wf_eval_files)
wf_test_dataset = load_data(wf_test_files)

# load_data() returns None when a file is missing. Validate BEFORE merging:
# indexing a None split would otherwise fail with an unrelated TypeError and
# the intended error message would never be shown.
loaded_splits = {
    "spec train": spec_train_dataset,
    "spec validation": spec_eval_dataset,
    "spec test": spec_test_dataset,
    "wf train": wf_train_dataset,
    "wf validation": wf_eval_dataset,
    "wf test": wf_test_dataset,
}
missing_splits = [name for name, split in loaded_splits.items() if split is None]
if missing_splits:
    raise RuntimeError(
        "Error loading datasets. Please check file paths and contents. "
        f"Failed splits: {', '.join(missing_splits)}"
    )


def _merge_splits(first, second):
    """Return a new Dataset holding the rows of ``first`` followed by ``second``."""
    # Columns are materialized as plain lists before concatenation.
    return Dataset.from_dict({
        "input": list(first["input"]) + list(second["input"]),
        "output": list(first["output"]) + list(second["output"]),
    })


# Merge the datasets for each split
train_dataset = _merge_splits(spec_train_dataset, wf_train_dataset)
eval_dataset = _merge_splits(spec_eval_dataset, wf_eval_dataset)
test_dataset = _merge_splits(spec_test_dataset, wf_test_dataset)

# Create a single DatasetDict
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
    "test": test_dataset
})

print(f"training data points: #{len(raw_datasets['train'])}")
print(f"validation data points: #{len(raw_datasets['validation'])}")
print(f"test data points: #{len(raw_datasets['test'])}")

  from .autonotebook import tqdm as notebook_tqdm


training data points: #602
validation data points: #100
test data points: #74


In [None]:
from huggingface_hub import notebook_login

# --- 2. Login to Hugging Face Hub ---
# Authenticate before training: the TrainingArguments cell sets
# push_to_hub=True and the last cell calls trainer.push_to_hub().
# NOTE(review): presumably the token needs write access to the target repo — confirm.
notebook_login()

In [None]:
# --- 3. Load Tokenizer and Model ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for both the tokenizer and the seq2seq model.
model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Token budgets applied in preprocess_function: longer sequences are truncated
# and every sequence is padded up to these lengths (padding="max_length").
# NOTE(review): this means each example occupies 4096 input and 4096 label
# positions regardless of its real length — confirm that the checkpoint and
# the available memory are meant to handle sequences this long.
max_input_length = 4096  # Define your desired max input length
max_output_length = 4096 # Define your desired max output length

def preprocess_function(examples):
    """Tokenize a batch of input/output pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``"input"`` and ``"output"`` lists of
            strings, as passed by ``map(..., batched=True)``.

    Returns:
        The tokenizer encoding of the inputs with an added ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so that they do not contribute to the training/eval loss.
    """
    model_inputs = tokenizer(
        list(examples["input"]),
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # text_target= replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=list(examples["output"]),
        max_length=max_output_length,
        truncation=True,
        padding="max_length",
    )
    # Targets are padded to max_output_length, so most label positions are pad
    # tokens. -100 is the label value the loss ignores; without this masking
    # the model is mainly trained (and early stopping is driven) on predicting
    # padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Apply preprocess_function to every split of raw_datasets, one batch at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# NOTE: these names were bound to the raw (untokenized) merged datasets in the
# data-preparation cell; from here on they refer to the tokenized splits.
# The tokenized "test" split is not bound to a name in this cell.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

print("OK")

Map: 100%|██████████| 602/602 [00:16<00:00, 36.05 examples/s]
Map: 100%|██████████| 100/100 [00:02<00:00, 45.34 examples/s]
Map: 100%|██████████| 74/74 [00:02<00:00, 34.49 examples/s]


In [None]:
# --- 4.1. Configure Training Arguments ---
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import os

# Switch Weights & Biases off through its environment variable
# (report_to="none" below also turns off the Trainer's reporting integrations).
os.environ.update(WANDB_MODE="disabled")

# Tunable hyperparameters, kept as module-level names.
output_dir = "./codet5-tamarind"  # where checkpoints are written
learning_rate = 1e-5              # small, chosen for a small dataset
batch_size = 8                    # per-device, for both train and eval
num_epochs = 20                   # upper bound; early stopping may end sooner
gradient_accumulation_steps = 2
weight_decay = 0.01

# Optimisation settings.
# NOTE(review): fp16 on T5-family models is sometimes reported as numerically
# unstable — confirm the loss stays finite on this setup.
optimization_settings = dict(
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    auto_find_batch_size=True,
    fp16=True,
)

# Evaluate and checkpoint once per epoch and keep track of the checkpoint with
# the lowest eval_loss; EarlyStoppingCallback relies on these settings.
# NOTE(review): newer transformers releases name this argument `eval_strategy`
# — confirm against the version pinned in requirements.txt.
checkpoint_settings = dict(
    output_dir=output_dir,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Hub upload target and reporting.
hub_settings = dict(
    push_to_hub=True,
    hub_model_id="smartrics/codet5-tamarind",
    report_to="none",
)

training_args = TrainingArguments(
    **optimization_settings,
    **checkpoint_settings,
    **hub_settings,
)

# --- 4.2. Define the Trainer with Early Stopping Callback ---
# Stop once eval_loss has failed to improve for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)


In [None]:
# --- 5. Train the Model ---
# Runs for at most num_epochs epochs. The trainer was built with an
# EarlyStoppingCallback (patience 3) and load_best_model_at_end=True, with
# eval_loss as the monitored metric, evaluated once per epoch.
print("Starting training with early stopping...")
trainer.train()
print("Training finished!")



In [None]:
# --- 6. Push the Model to Hugging Face Hub ---
# Uploads through the trainer to the repository named by
# training_args.hub_model_id.
# NOTE(review): push_to_hub=True is also set in TrainingArguments, so
# checkpoints may already have been uploaded during training — confirm this
# final explicit push is still wanted.
print("Pushing model to Hugging Face Hub...")
trainer.push_to_hub()
print(f"Model pushed to https://huggingface.co/{training_args.hub_model_id}")

print("Fine-tuning complete.")