In [1]:
# Install necessary libraries
!pip install transformers datasets accelerate

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import random

# Step 1: Load Pretrained T5 Model and Tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_data():
    # Step 2: Create a Custom Q&A Dataset (100 Samples)
    data = []
    for i in range(100):
        question = f"What is the meaning of question {i}?"
        context = f"This is the context for question {i}."
        answer = f"Answer {i}"
        input_text = f"question: {question} context: {context}"
        data.append({"input_text": input_text, "output_text": answer})
    return data

raw_data = preprocess_data()

# Convert to Hugging Face Dataset Format
dataset = Dataset.from_list(raw_data)

def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples["output_text"], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Step 3: Tokenize the Dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Step 5: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Model
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

# Step 8: Test the Model
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example Test
print(generate_answer("What is the meaning of question 5?", "This is the context for question 5."))

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12=

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprabhsimarsinghdhingra[0m ([33mprabhsimarsinghdhingra-manipal-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,3.370053
2,No log,0.223673
3,No log,0.188839





In [2]:
print(generate_answer("What is the meaning of question 5?", "This is the context for question 5."))




In [3]:
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json')

In [4]:
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [5]:
print(generate_answer("What is the meaning of question 5?", "This is the context for question 5."))




In [6]:
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [7]:
question = "What is the meaning of question 5?"
context = "This is the context for question 5."
answer = generate_answer(question, context)

print("Question:", question)
print("Answer:", answer)


Question: What is the meaning of question 5?
Answer: 


In [8]:
def preprocess_data():
    data = [
        {"input_text": "question: What is machine learning? context: Machine learning is a subset of AI.",
         "output_text": "Machine learning is a branch of AI that enables computers to learn from data."},
        {"input_text": "question: Who invented the light bulb? context: Thomas Edison is credited with its invention.",
         "output_text": "Thomas Edison invented the light bulb in 1879."},
        {"input_text": "question: What is the capital of France? context: France is a country in Europe.",
         "output_text": "The capital of France is Paris."},
    ]
    return data


In [9]:
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids,
                                max_length=128,
                                temperature=0.7,
                                top_p=0.9,
                                top_k=50)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [10]:
question = "What is artificial intelligence?"
context = "Artificial intelligence is the simulation of human intelligence by machines."
print("Question:", question)
print("Answer:", generate_answer(question, context))


Question: What is artificial intelligence?




Answer: simulation of human intelligence by machines
