<a href="https://colab.research.google.com/github/priyanshi-nigam123/Paraphrase_generation_using_t5/blob/main/Paraphrase_generation_using_t5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# !pip install datasets

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
import os
from google.colab import drive

# Mount Google Drive at /content/drive; the results and saved-model paths used
# later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Load the "labeled_final" configuration of the PAWS dataset. The returned object
# is indexed by split name below ('train', 'test', 'validation').
dataset = load_dataset("paws", "labeled_final")


def preprocess_paws(dataset, label=1, prefix="paraphrase: "):
  """Turn one PAWS split into (input_text, target_text) pairs for T5.

  Args:
    dataset: Anything `pd.DataFrame(...)` accepts that yields the columns
      'sentence1', 'sentence2' and 'label' (e.g. a PAWS split).
    label: Only rows whose 'label' equals this value are kept. The default 1
      selects the pairs PAWS marks as paraphrases.
    prefix: Task prefix prepended to 'sentence1'. It must be the same string
      that is prepended at inference time (`preprocess_input` uses
      "paraphrase: "), otherwise the model is prompted with a prefix it never
      saw during fine-tuning.

  Returns:
    A DataFrame with exactly the columns 'input_text' and 'target_text'. The
    row index of the surviving rows is left untouched.
  """
  df = pd.DataFrame(dataset)

  # Work on an explicit copy of the filtered rows so the column assignments
  # below never write into a view of `df` (avoids SettingWithCopyWarning).
  selected = df.loc[df['label'] == label].copy()

  selected['input_text'] = prefix + selected['sentence1']
  selected['target_text'] = selected['sentence2']

  return selected[['input_text', 'target_text']]

def _sample_split(split_name, n_rows):
  """Preprocess one PAWS split and draw `n_rows` rows with a fixed seed of 42."""
  prepared = preprocess_paws(dataset[split_name])
  return prepared.sample(n_rows, random_state=42)

# Fixed-size, reproducible subsets of each split
train_df = _sample_split('train', 3000)
test_df = _sample_split('test', 300)
validation_df = _sample_split('validation', 300)

In [None]:
# Wrap each sampled pandas frame in a Hugging Face `Dataset` (train, validation, test)
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(split_df)
    for split_df in (train_df, validation_df, test_df)
)

# Pretrained checkpoint used for both the tokenizer and the seq2seq model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples, max_length=512):
    """Tokenize source and target texts and build loss-ready labels.

    Args:
        examples: Mapping with 'input_text' and 'target_text'. With
            `Dataset.map(..., batched=True)` each value is a list of strings;
            a single string per key is handled as well.
        max_length: Length every sequence is truncated/padded to. Defaults to
            512, the value this notebook used originally.

    Returns:
        The tokenizer output for 'input_text' (input_ids, attention_mask, ...)
        with an added 'labels' entry: the target token ids, where every padding
        position is replaced by -100.
    """
    inputs = tokenizer(examples['input_text'], max_length=max_length, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=max_length, truncation=True, padding="max_length")

    # -100 is the index the cross-entropy loss ignores. Without this masking the
    # loss is computed on every pad position of the max_length-padded target,
    # so training mostly optimises "predict <pad>".
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        return [tok if tok != pad_id else -100 for tok in token_ids]

    target_ids = targets['input_ids']
    if len(target_ids) > 0 and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        inputs['labels'] = [_mask_padding(ids) for ids in target_ids]
    else:
        # Single-example call: one flat id list.
        inputs['labels'] = _mask_padding(target_ids)
    return inputs

# Run the tokenizer over every split in batches (train, validation, test)
train_dataset, validation_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Expose only the model inputs, as PyTorch tensors
for tokenized_split in (train_dataset, validation_dataset, test_dataset):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Report how many examples each split holds after tokenization
print(f"✓ Train dataset tokenized: {len(train_dataset)} examples")
print(f"✓ Validation dataset tokenized: {len(validation_dataset)} examples")
print(f"✓ Test dataset tokenized: {len(test_dataset)} examples")

In [11]:
# Google Drive locations for training checkpoints and the final saved model
results_dir = "/content/drive/MyDrive/results"
model_dir = "/content/drive/MyDrive/saved_t5_model"

# Create both directories up front; already-existing ones are left alone
for output_path in (results_dir, model_dir):
    os.makedirs(output_path, exist_ok=True)

In [None]:
# Fine-tuning configuration (small per-device batches, gradient accumulation of 2)
training_args = TrainingArguments(
    # Checkpointing: write to Drive once per epoch, keep at most two checkpoints
    output_dir=results_dir,
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluation: once per epoch; reload the checkpoint chosen by eval_loss at the end
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Optimisation
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    # Logging: every 100 steps, no external reporting integration
    logging_steps=100,
    report_to="none",
)

# Wire the model, the arguments and the train/validation splits together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=validation_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()

# Persist the resulting model together with its tokenizer
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"✅ Model saved to: {model_dir}")

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Directory the fine-tuned model and tokenizer were saved to
model_dir = "/content/drive/MyDrive/saved_t5_model"

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore tokenizer and model from disk, placing the model on `device`
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir).to(device)

print("✅ Model loaded successfully!")

In [None]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Re-create the tokenizer and model from `model_dir`.
# Note: this rebinds `model` to a fresh instance that is not moved to any
# device in this cell.
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

In [4]:
# Define model_dir: the Google Drive directory the fine-tuned T5 model and its
# tokenizer are loaded from in the following cell.
model_dir = "/content/drive/MyDrive/saved_t5_model"

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Choose the compute device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the model from the saved directory; the model is
# placed on `device` right away
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir).to(device)

# Token limit used at inference time
max_length = 512  # Same as training

# Preprocessing function for inference
def preprocess_input(sentence):
    """Return `sentence` with the "paraphrase: " task prefix prepended."""
    task_prefix = "paraphrase: "
    return task_prefix + sentence

# Generate paraphrases with corrected num_beams and num_return_sequences
def generate_paraphrase(input_text, model, tokenizer, max_length=512, num_beams=5, num_return_sequences=4, top_k=100, top_p=0.9, temperature=1.0):
    """Generate paraphrase candidates for a single sentence.

    Args:
        input_text: The raw sentence; the task prefix is added here via
            `preprocess_input`.
        model: Seq2seq model exposing `.generate(...)` and `.device`
            (e.g. the fine-tuned `T5ForConditionalGeneration`).
        tokenizer: Tokenizer matching `model`.
        max_length: Truncation limit for the input, in tokens. Generation is
            allowed `max_length + 20` tokens.
        num_beams: Beam count passed to `generate`.
        num_return_sequences: How many candidates to return.
        top_k, top_p, temperature: Sampling controls passed to `generate`
            (sampling is always enabled).

    Returns:
        A list of decoded strings, one per sequence returned by `generate`,
        with special tokens stripped.
    """
    # Preprocess input
    input_text = preprocess_input(input_text)

    # Tokenize the single sentence. No padding is requested: with only one
    # sequence there is nothing to align it with, and padding to `max_length`
    # would just make the encoder process masked positions.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length)

    # Move the tensors to wherever *this* model lives, rather than relying on
    # a notebook-level `device` variable that may be undefined or may not match
    # the model that was passed in.
    target_device = model.device
    inputs = {key: value.to(target_device) for key, value in inputs.items()}

    # Generate paraphrases
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length + 20,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        do_sample=True,
        early_stopping=True
    )

    # Decode generated outputs
    paraphrased_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return paraphrased_texts

# Sentence to paraphrase
input_sentence = "The quick brown fox jumps over the lazy dog."

# Ask for four candidates, truncating the input at 512 tokens
paraphrased_sentences = generate_paraphrase(input_sentence, model, tokenizer, max_length=512, num_return_sequences=4)

# Show the source sentence followed by the numbered candidates
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, start=1):
    print(f"Paraphrase {i}: {paraphrase}")

In [15]:
# Demo: four paraphrase candidates for one sentence
input_sentence = "She enjoys reading books on rainy afternoons."
paraphrased_sentences = generate_paraphrase(input_sentence, model, tokenizer, num_return_sequences=4)

# Source sentence first, then the numbered candidates
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, start=1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: She enjoys reading books on rainy afternoons.
Paraphrase 1: She enjoys reading books on rainy afternoons.
Paraphrase 2: She enjoys reading books on rainy afternoons .
Paraphrase 3: On rainy afternoons, she enjoys reading books.
Paraphrase 4: She loves reading books on rainy afternoons.


In [16]:
# Demo: four paraphrase candidates for one sentence
input_sentence = "The dog barked loudly at the stranger outside the house."
paraphrased_sentences = generate_paraphrase(input_sentence, model, tokenizer, num_return_sequences=4)

# Source sentence first, then the numbered candidates
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, start=1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: The dog barked loudly at the stranger outside the house.
Paraphrase 1: The dog barked loudly at the stranger outside the house.
Paraphrase 2: The dog barked loudly at the stranger outside the house .
Paraphrase 3: The dog barked loudly at a stranger outside the house.
Paraphrase 4: The dog loudly barked at the stranger outside the house.


In [17]:
# Demo: four paraphrase candidates for one sentence
input_sentence = "Climate change is one of the most pressing issues of our time."
paraphrased_sentences = generate_paraphrase(input_sentence, model, tokenizer, num_return_sequences=4)

# Source sentence first, then the numbered candidates
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, start=1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: Climate change is one of the most pressing issues of our time.
Paraphrase 1: Climate change is one of the most pressing issues of our time.
Paraphrase 2: Climate change is one of the most pressing issues of our time .
Paraphrase 3: The climate change is one of the most pressing issues of our time.
Paraphrase 4: Climate Change is one of the most pressing issues of our time.
