<a href="https://colab.research.google.com/github/pra-dyumna/GENAI_STAN/blob/main/Untitled106.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets accelerate torch peft


In [None]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType

# Load the dataset
def load_custom_dataset(json_file):
    """Load question/answer pairs from a JSON file into a ``Dataset``.

    Args:
        json_file: Path to a JSON file containing a list of objects, each
            with a "question" key and an "answer" key.

    Returns:
        A ``Dataset`` with two columns, "question" and "answer", in the same
        order as the entries in the file.

    Raises:
        KeyError: If an entry is missing "question" or "answer".
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which differs between operating systems.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    questions = [entry["question"] for entry in data]
    answers = [entry["answer"] for entry in data]
    return Dataset.from_dict({"question": questions, "answer": answers})

# Preprocess the dataset
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with "question" and "answer" entries, each a
            list of strings.
        tokenizer: Callable tokenizer; called with ``max_length``,
            ``truncation`` and ``padding`` and expected to return a mapping
            with an "input_ids" entry. Its ``pad_token_id`` attribute, when
            present, identifies padding in the labels.
        max_length: Length every question and answer is truncated/padded to.

    Returns:
        The tokenized questions, with a "labels" entry holding the tokenized
        answers. Padding positions in the labels are set to -100 so that the
        loss ignores them.
    """
    inputs = examples['question']
    targets = examples['answer']
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    # Answers are padded to max_length; without masking, the model would be
    # trained to predict pad tokens. -100 is the index the loss skips.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Load dataset
# The file is read by load_custom_dataset, which expects a list of
# {"question": ..., "answer": ...} objects.
dataset_path = "/content/dataset.json"  # Path to your JSON dataset file
dataset = load_custom_dataset(dataset_path)

# Split the dataset into train and validation
# 10% of the rows are held out; the fixed seed keeps the split identical
# between runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Load tokenizer and model
model_name = "t5-small"  # You can use t5-base or other variants for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Apply LoRA
# `model` is rebound to the PEFT-wrapped model below, so every later use of
# `model` in this cell refers to the LoRA-wrapped version.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, lora_config)

# Tokenize dataset
# The lambda binds the tokenizer loaded above; preprocess_function's default
# max_length is used for both splits.
train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
val_dataset = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

# Define training arguments
import torch  # imported here because this cell does not otherwise import it

training_args = TrainingArguments(
    output_dir="./t5_lora_finetuned",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases, and this notebook installs transformers unpinned.
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    # Checkpoints are written once per epoch. A step-based `save_steps` value
    # has no effect under this strategy, so none is passed.
    save_strategy="epoch",
    report_to="none",
    # Needs matching eval/save strategies (both "epoch" here).
    load_best_model_at_end=True,
    # Mixed precision speeds up training but requires a GPU; fall back to
    # full precision when the runtime has no CUDA device.
    fp16=torch.cuda.is_available(),
)

# Define the Trainer
# Uses the LoRA-wrapped model and the tokenized train/validation splits
# prepared earlier in this cell.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Save the LoRA fine-tuned model
# The target directory is the same path used as `output_dir` for the
# training checkpoints.
# NOTE(review): `model` is the PEFT wrapper, so this presumably writes the
# adapter weights rather than a full T5 checkpoint — confirm before loading
# it elsewhere.
model.save_pretrained("./t5_lora_finetuned")
tokenizer.save_pretrained("./t5_lora_finetuned")


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the tokenizer and model from the directory the training cell saved to.
model_name = "./t5_lora_finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_answer(question):
    """Return the decoded model output for a single question string.

    The question is tokenized as-is (truncated to 128 tokens), decoded with
    4-beam search up to 256 tokens, and special tokens are stripped from the
    returned text.
    """
    encoded = tokenizer(
        question,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    )
    generation_kwargs = {"max_length": 256, "num_beams": 4, "early_stopping": True}
    first_sequence = model.generate(encoded["input_ids"], **generation_kwargs)[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# Try the model on one sample question.
question = "Provide detailed information about Infosys as an IT company."
answer = generate_answer(question)
print("Answer:", answer)


In [None]:
!zip -r /content/./t5_lora_finetuned.zip /content/./t5_lora_finetuned

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset

# Load Dataset from JSON File
# Replace the path below with the location of your single dataset file.
dataset = load_dataset("json", data_files={"full": "/content/dataset.json"})["full"]

# Split Dataset into Training and Validation
train_size = 0.9  # 90% for training, 10% for validation

# Materialize the rows as a list of dictionaries before splitting.
dataset_list = list(dataset)
# A fixed random_state makes the shuffle, and therefore the train/validation
# split, identical on every run so results stay comparable.
train_dataset_list, val_dataset_list = train_test_split(
    dataset_list,
    test_size=1 - train_size,
    shuffle=True,
    random_state=42,
)

# Convert back to Dataset objects, keyed by split name.
dataset = {
    "train": Dataset.from_list(train_dataset_list),
    "validation": Dataset.from_list(val_dataset_list),
}

# Load T5-Base Model and Tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization Function
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Uses the module-level ``tokenizer`` loaded above.

    Args:
        examples: Batch mapping with "question" and "answer" entries, each a
            list of strings.

    Returns:
        The tokenized prompts ("Question: ... Answer:"), truncated/padded to
        128 tokens, with a "labels" entry holding the tokenized answers.
        Padding positions in the labels are set to -100 so that the loss
        ignores them.
    """
    inputs = [f"Question: {q} Answer:" for q in examples["question"]]
    targets = list(examples["answer"])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    label_ids = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    # Answers are padded to a fixed length; without masking, the model would
    # be trained to predict pad tokens. -100 is the index the loss skips.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Preprocess Dataset
train_dataset = dataset["train"].map(preprocess_function, batched=True)
val_dataset = dataset["validation"].map(preprocess_function, batched=True)

# LoRA Configuration
# Same settings as the t5-small run earlier in this notebook, except that
# r is 16 here instead of 8.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

# Apply LoRA to the model
# `model` is rebound to the PEFT-wrapped model, so the trainer below trains
# the wrapped version.
model = get_peft_model(model, lora_config)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_finetuned",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases.
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    logging_dir="./logs",
    # Best checkpoint is chosen by evaluation loss; needs matching eval/save
    # strategies (both "epoch" here).
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_strategy="epoch",
    logging_steps=100,
    max_grad_norm=1.0,
    # Mixed precision requires a GPU; fall back to full precision when the
    # runtime has no CUDA device.
    fp16=torch.cuda.is_available(),
)

# Trainer
# Uses the LoRA-wrapped model and the tokenized train/validation splits
# prepared earlier in this cell.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-Tuning
trainer.train()

# Save Model
# The final model and tokenizer go to "./t5_lora_finetuned_base", which is a
# different directory from the `output_dir` used for training checkpoints
# ("./t5_lora_finetuned").
trainer.save_model("./t5_lora_finetuned_base")
tokenizer.save_pretrained("./t5_lora_finetuned_base")

In [None]:
# Load Fine-Tuned Model
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory the t5-base training cell saved its final model and tokenizer to.
model_name = "./t5_lora_finetuned_base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_answer(question, max_length=256, num_beams=6):
    """Return the decoded model output for one question.

    Args:
        question: Question text; it is wrapped in the same
            "Question: ... Answer:" template used during training.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = f"Question: {question} Answer:"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)
    generation_kwargs = {
        "max_length": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **generation_kwargs)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# Run a single sample through the model and show the result.
question = "To calculate yearly earnings, multiply the monthly earnings by 12."
answer = generate_answer(question)
print(f"Q: {question}\nA: {answer}")


In [None]:
!zip -r /content/./t5_lora_finetuned_base.zip /content/./t5_lora_finetuned_base

In [None]:
# prompt: no — provide both the context and the prompt

# Reload the saved t5-base model and its tokenizer.
model_name = "./t5_lora_finetuned_base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_answer(question, max_length=256, num_beams=6):
    """Generate and decode an answer for ``question``.

    The question is placed in the "Question: ... Answer:" template, truncated
    to 128 tokens, and decoded with beam search using the given ``max_length``
    and ``num_beams``. Special tokens are stripped from the returned text.
    """
    token_ids = tokenizer(
        f"Question: {question} Answer:",
        return_tensors="pt",
        max_length=128,
        truncation=True,
    )["input_ids"]
    sequences = model.generate(
        token_ids, max_length=max_length, num_beams=num_beams, early_stopping=True
    )
    return tokenizer.decode(sequences[0], skip_special_tokens=True)


# The question is left empty here; fill it in before running.
question = ""
answer = generate_answer(question)
print(f"Q: {question}\nA: {answer}")

In [None]:
# prompt: this model does not give good results; can we use prompt tuning on the model instead of fine-tuning?

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# No prompt tuning happens in this cell: it only reloads the saved model and
# generates an answer with it.
model_name = "./t5_lora_finetuned_base"  # assumed to be the best-performing saved model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_answer(question, max_length=256, num_beams=6):
    """Answer ``question`` with the loaded model.

    Args:
        question: Question text, inserted into "Question: ... Answer:".
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    templated = f"Question: {question} Answer:"
    batch = tokenizer(templated, return_tensors="pt", max_length=128, truncation=True)
    result = model.generate(
        batch.input_ids,
        early_stopping=True,
        num_beams=num_beams,
        max_length=max_length,
    )
    decoded = tokenizer.decode(result[0], skip_special_tokens=True)
    return decoded


# Sample run; replace the question with any other text.
question = "What are the benefits of using a TPU?"
answer = generate_answer(question)
print(f"Q: {question}\nA: {answer}")