<a href="https://colab.research.google.com/github/sramponi86/AI-rest-service-with-flask/blob/main/Gherkin_FineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# 📘 Gherkin Test Case Generator - Fine-Tune FLAN-T5
!pip install transformers datasets accelerate -q

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

# Training data comes from a local JSON file; upload it to the runtime (or
# mount storage) before running this cell.
dataset = load_dataset(
    "json",
    data_files={"train": "gherkin_data.json"},
    split="train",
)

# Pretrained FLAN-T5 checkpoint, plus the tokenizer that belongs to it.
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess
def tokenize_fn(example):
    """Tokenize a record into encoder inputs and loss-ready labels.

    Args:
        example: A dataset record (or a batch of records) providing the
            fields "input" (source text) and "output" (target text).

    Returns:
        The tokenizer encoding of "input", truncated/padded to 256 tokens,
        with an extra "labels" entry holding the token ids of "output"
        (also truncated/padded to 256) in which every padding position is
        replaced by -100.
    """
    input_enc = tokenizer(example["input"], truncation=True, padding="max_length", max_length=256)
    target_enc = tokenizer(example["output"], truncation=True, padding="max_length", max_length=256)

    # The targets are padded to a fixed length, so without masking the loss
    # would also be computed on the padding positions. -100 is the label
    # value the loss ignores, so padding no longer contributes to training.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        return [tok if tok != pad_id else -100 for tok in ids]

    target_ids = target_enc["input_ids"]
    if target_ids and isinstance(target_ids[0], list):
        # Batched call: one id list per record.
        input_enc["labels"] = [_mask_padding(ids) for ids in target_ids]
    else:
        input_enc["labels"] = _mask_padding(target_ids)
    return input_enc

# Run the tokenizer over the training set (record by record, since the
# call does not request batching).
tokenized_dataset = dataset.map(function=tokenize_fn)

# Training setup
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./gherkin_model",
    logging_dir="./logs",
    # Schedule: three passes over the data, four examples per device per step.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Log every 10 steps; checkpoint every 50 steps, capped at one checkpoint.
    logging_steps=10,
    save_steps=50,
    save_total_limit=1,
    # Do not report to any external experiment tracker.
    report_to="none",
)

# Collator that assembles tokenized records into sequence-to-sequence batches.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model)

# Bundle the model, data, and settings into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
)

# Run the fine-tuning loop.
trainer.train()

# Save final model: write the model first, then the tokenizer, into the
# same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./gherkin_model")
