Install dependencies:

In [1]:
!pip install transformers datasets peft accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0-

Import libraries:

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import torch


In [3]:
#Load FLAN-T5 & tokenizer:

In [4]:
model_name = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Apply LoRA:

In [5]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config)




Load small dataset (or your own):

make my own dataset

Load Dataset Using datasets Library



In [6]:
import json

data = [
  {
    "context": "The Price Control Department regulates essential commodities in Punjab.",
    "question": "Who regulates essential commodities in Punjab?",
    "answers": {
      "text": ["The Price Control Department"],
      "answer_start": [0]
    }
  },
  {
    "context": "Gradio helps build ML web apps easily.",
    "question": "What does Gradio help build?",
    "answers": {
      "text": ["ML web apps"],
      "answer_start": [19]
    }
  }
]

with open("pccmd_qa.json", "w") as f:
    json.dump({
        "data": [{
            "title": "PCCMD QA",
            "paragraphs": [
                {
                    "context": d["context"],
                    "qas": [{
                        "question": d["question"],
                        "id": str(i),
                        "answers": [{
                            "text": d["answers"]["text"][0],
                            "answer_start": d["answers"]["answer_start"][0]
                        }]
                    }]
                } for i, d in enumerate(data)
            ]
        }]
    }, f)


In [7]:
from datasets import load_dataset, Dataset
import json

# Load the JSON data directly
with open("pccmd_qa.json", "r") as f:
    data = json.load(f)

# Extract and flatten the data
flat_data = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            flat_data.append({
                "context": context,
                "question": qa["question"],
                "answers": qa["answers"]
            })

# Convert the flattened list into a Dataset object
qa_dataset = Dataset.from_list(flat_data)

# Now qa_dataset is ready for further processing
print(qa_dataset)
print(qa_dataset[0])

Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 2
})
{'context': 'The Price Control Department regulates essential commodities in Punjab.', 'question': 'Who regulates essential commodities in Punjab?', 'answers': [{'answer_start': 0, 'text': 'The Price Control Department'}]}


Step 3: Tokenize Dataset for FLAN-T5

FLAN-T5 expects inputs in a text-to-text format. We will convert each QA pair into an input prompt like:

In [8]:
"question: {question} context: {context}"


'question: {question} context: {context}'

and use the answer text as the target.

In [9]:
print(qa_dataset[0])


{'context': 'The Price Control Department regulates essential commodities in Punjab.', 'question': 'Who regulates essential commodities in Punjab?', 'answers': [{'answer_start': 0, 'text': 'The Price Control Department'}]}


In [10]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import json

# 1. Load your JSON dataset (replace "pccmd_qa.json" with your file path)
# The data has already been loaded and flattened in the previous cell,
# so we can directly create the Dataset from the flattened data.
# Assuming 'flat_data' is available from the previous cell's execution
# If not, you would need to reload/reprocess the data here.

# For this example, let's assume flat_data is available from the previous cell
# If you were running this cell independently, you would need to uncomment the following lines
# and ensure the file path is correct.
# with open("pccmd_qa.json", "r") as f:
#     data = json.load(f)
#
# flat_data = []
# for item in data["data"]:
#     for para in item["paragraphs"]:
#         context = para["context"]
#         for qa in para["qas"]:
#             flat_data.append({
#                 "context": context,
#                 "question": qa["question"],
#                 "answers": qa["answers"]
#             })

qa_dataset = Dataset.from_list(flat_data)


# 2. Load tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Define preprocessing function with safe text extraction
def preprocess_function(batch):
    inputs = []
    targets = []
    for q, c, a in zip(batch['question'], batch['context'], batch['answers']):
        q_str = str(q) if q is not None else ""
        c_str = str(c) if c is not None else ""
        inputs.append(f"question: {q_str} context: {c_str}")

        if isinstance(a, dict) and 'text' in a and len(a['text']) > 0:
            target_text = str(a['text'][0])
        elif isinstance(a, list) and len(a) > 0:
            if isinstance(a[0], dict) and 'text' in a[0]:
                target_text = str(a[0]['text'][0]) # Corrected index access
            else:
                target_text = str(a[0])
        else:
            target_text = ""

        targets.append(target_text)

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Replace pad token id's by -100 so loss ignores padding tokens
    labels_input_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels_input_ids
    return model_inputs

# 4. Apply preprocessing
tokenized_dataset = qa_dataset.map(preprocess_function, batched=True, remove_columns=qa_dataset.column_names)

# 5. Verify tokenized data (optional)
print(tokenized_dataset[0])

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'input_ids': [822, 10, 2645, 16363, 7, 1832, 27592, 16, 27864, 58, 2625, 10, 37, 5312, 4330, 1775, 16363, 7, 1832, 27592, 16, 27864, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Load the Pre-trained Model

In [11]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Define Training Arguments

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./flan_t5_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none",  # Disable wandb and other reporting tools
)

Define Evaluation Metric (Optional but Recommended)

In [13]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


Now that the `evaluate` library is installed, we can define the evaluation metric.

In [16]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return result

In [15]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9392773b305488a012f1e1d74eeabfe7b717d2be97178b7494cd5ced76e5ed73
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


 Initialize Trainer

In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # You can split train/eval properly for real training
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics  # Uncomment if you implement metrics
)


  trainer = Trainer(


Train the Model

In [18]:
# Re-import and re-initialize accelerate components
from accelerate import Accelerator

accelerator = Accelerator()

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # You can split train/eval properly for real training
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics  # Uncomment if you implement metrics
)

  trainer = Trainer(


Train the Model

In [20]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,4.474472
2,No log,3.329057
3,No log,2.980603


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=3, training_loss=5.43670654296875, metrics={'train_runtime': 457.8181, 'train_samples_per_second': 0.013, 'train_steps_per_second': 0.007, 'total_flos': 4108544114688.0, 'train_loss': 5.43670654296875, 'epoch': 3.0})

 Save Your Fine-tuned Model and Tokenizer

In [21]:
trainer.save_model("./flan_t5_finetuned")
tokenizer.save_pretrained("./flan_t5_finetuned")


('./flan_t5_finetuned/tokenizer_config.json',
 './flan_t5_finetuned/special_tokens_map.json',
 './flan_t5_finetuned/spiece.model',
 './flan_t5_finetuned/added_tokens.json',
 './flan_t5_finetuned/tokenizer.json')

Load and Test Your Fine-tuned Model

In [22]:
from transformers import AutoModelForSeq2SeqLM

fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("./flan_t5_finetuned")

# Example input
test_input = "question: Who regulates essential commodities in Punjab? context: The Price Control Department regulates essential commodities in Punjab."
input_ids = tokenizer(test_input, return_tensors="pt").input_ids

outputs = fine_tuned_model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


The Price Control Department


 Deploy with Gradio (Optional)

In [23]:
import gradio as gr

def answer_question(question, context):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = fine_tuned_model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

iface = gr.Interface(
    fn=answer_question,
    inputs=["text", "text"],
    outputs="text",
    title="PCCMD QA Model"
)

iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f144c8fb8da9a575d4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


