### Mode Training

In [1]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd






In [3]:
df = pd.read_json("./merged.json")

In [4]:
train_df, eval_df = train_test_split(df, test_size=0.15, random_state=42)

In [5]:
train_df.to_json("train.json", orient="records", lines=True)
eval_df.to_json("eval.json", orient="records", lines=True)

In [6]:
dataset = load_dataset("json", data_files={"train": "train.json", "eval": "eval.json"})

def preprocess_function(examples):
    inputs = ["question: " + q for q in examples["question"]]
    targets = [a for a in examples["answer"]]
    return {"input_text": inputs, "target_text": targets}

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [7]:
dataset = dataset.map(preprocess_function, batched=True)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], padding="max_length", truncation=True, max_length=512
    )
    labels = tokenizer(
        examples["target_text"], padding="max_length", truncation=True, max_length=512
    )

    # Replace padding token ID (0) with -100 for loss masking
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [9]:
print(tokenized_datasets["train"][0])

{'question': 'What are some common pre-packaged system images offered by IaaS providers?', 'answer': 'Common pre-packaged system images include software stacks for web servers, database servers, and LAMP stacks (Linux, Apache, MySQL, PHP).', 'input_text': 'question: What are some common pre-packaged system images offered by IaaS providers?', 'target_text': 'Common pre-packaged system images include software stacks for web servers, database servers, and LAMP stacks (Linux, Apache, MySQL, PHP).', 'input_ids': [822, 10, 363, 33, 128, 1017, 554, 18, 5745, 11438, 358, 1383, 1860, 57, 27, 9, 9, 134, 3580, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [12]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["eval"]

from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    evaluation_strategy="epoch",  
    logging_strategy="steps", 
    logging_steps=10,  
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
)



In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  
)

In [14]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,4.2087,3.272163
2,3.6063,3.202309
3,3.4755,3.164886
4,3.3407,3.146168
5,3.3212,3.139753


TrainOutput(global_step=50, training_loss=3.590485916137695, metrics={'train_runtime': 1768.4335, 'train_samples_per_second': 0.215, 'train_steps_per_second': 0.028, 'total_flos': 51429884559360.0, 'train_loss': 3.590485916137695, 'epoch': 5.0})

In [15]:
model.save_pretrained("t5_qa_model")
tokenizer.save_pretrained("t5_qa_model")

('t5_qa_model\\tokenizer_config.json',
 't5_qa_model\\special_tokens_map.json',
 't5_qa_model\\spiece.model',
 't5_qa_model\\added_tokens.json')

### Check weather model is correctly save or not

In [16]:
model = T5ForConditionalGeneration.from_pretrained("./t5_qa_model/")
tokenizer = T5Tokenizer.from_pretrained("./t5_qa_model/")

In [17]:
print("Model Path:", model.config._name_or_path)

Model Path: ./t5_qa_model/


In [18]:
input_text = "question: What is the intended format for representing the questions and answers related to this page?"
tokens = tokenizer(input_text, return_tensors="pt")
print("Tokenized Input:", tokens)

Tokenized Input: {'input_ids': tensor([[ 822,   10,  363,   19,    8, 3855, 1910,   21, 9085,    8,  746,   11,
         4269, 1341,   12,   48,  543,   58,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [19]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop