## Importing Libraries

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from datasets import load_dataset
from transformers import get_scheduler
from tqdm.auto import tqdm


In [2]:
!tmux display-message -p '#S'

jupyter_session


## Load and Process the Dataset

#### Old

In [None]:
dataset = load_dataset("quora")

dataset

In [None]:
dataset = dataset.filter(lambda x: x['is_duplicate'] == 1)

In [None]:
# Define a function to flatten and prepare the data
def prepare_data(examples):
    # Create lists to store processed examples
    input_texts = []
    target_texts = []
    
    # Process each entry
    for question_pair in examples['questions']:
        # Assuming each entry in 'questions' has two questions
        if len(question_pair['text']) == 2:
            input_texts.append("paraphrase: " + question_pair['text'][0])
            target_texts.append(question_pair['text'][1])
    
    # Return a dictionary of processed examples
    return {'input_text': input_texts, 'target_text': target_texts}

# Apply the function to each entry in the dataset
processed_datasets = dataset.map(prepare_data, batched=True, remove_columns=['questions', 'is_duplicate'])

In [None]:
processed_datasets

#### New

In [39]:
import pandas as pd
from datasets import load_dataset
import re

def load_and_process_quora():
    dataset = load_dataset("quora", trust_remote_code=True)
    all_data = []
    
    for split in dataset.keys():
        df = pd.DataFrame(dataset[split])
        df = df[df['is_duplicate'] == 1]
        
        original = []
        paraphrased = []
        
        for row in df['questions']:
            if len(row['text']) == 2:
                original.append(row['text'][0])
                paraphrased.append(row['text'][1])
        
        all_data.append(pd.DataFrame({'original': original, 'paraphrased': paraphrased}))
    
    return pd.concat(all_data, ignore_index=True)

def load_and_process_paws():
    dataset = load_dataset("paws", "labeled_final")
    all_data = []
    
    for split in dataset.keys():
        df = pd.DataFrame(dataset[split])
        df = df[df['label'] == 1]
        all_data.append(df[['sentence1', 'sentence2']].rename(columns={'sentence1': 'original', 'sentence2': 'paraphrased'}))
    
    return pd.concat(all_data, ignore_index=True)

def load_and_process_mrpc():
    dataset = load_dataset("glue", "mrpc")
    all_data = []
    
    for split in dataset.keys():
        df = pd.DataFrame(dataset[split])
        df = df[df['label'] == 1]
        all_data.append(df[['sentence1', 'sentence2']].rename(columns={'sentence1': 'original', 'sentence2': 'paraphrased'}))
    
    return pd.concat(all_data, ignore_index=True)

def load_and_process_paranmt(file_path):
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    # Process each line
    original = []
    paraphrased = []
    scores = []
    
    for line in lines:
        parts = line.strip().split('\t')
        if len(parts) == 3:
            orig, para, score = parts
            score = float(score)
            if (0.76 <= score <= 0.78 and
                30 <= len(orig) <= 40 and
                30 <= len(para) <= 40 and
                not re.search(r'\d', orig) and
                not re.search(r'\d', para)):
                original.append(orig)
                paraphrased.append(para)
                scores.append(score)
    
    # Create DataFrame
    df = pd.DataFrame({
        'original': original,
        'paraphrased': paraphrased
        #'score': scores
    })
    
    return df


In [30]:
# Load and process each dataset
quora_df = load_and_process_quora()

In [31]:
quora_df

Unnamed: 0,original,paraphrased
0,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
1,How can I be a good geologist?,What should I do to be a great geologist?
2,How do I read and find my YouTube comments?,How can I see all my Youtube comments?
3,What can make Physics easy to learn?,How can you make physics easy to learn?
4,What was your first sexual experience like?,What was your first sexual experience?
...,...,...
149258,What are some outfit ideas to wear to a frat p...,What are some outfit ideas wear to a frat them...
149259,Why is Manaphy childish in Pokémon Ranger and ...,Why is Manaphy annoying in Pokemon ranger and ...
149260,How does a long distance relationship work?,How are long distance relationships maintained?
149261,What does Jainism say about homosexuality?,What does Jainism say about Gays and Homosexua...


In [32]:
paws_df = load_and_process_paws()

In [33]:
paws_df

Unnamed: 0,original,paraphrased
0,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...
1,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...
2,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...
3,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ..."
4,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...
...,...,...
28899,"In Advent , the traditional `` Tauberbischofsh...","During Advent , the traditional `` Tauberbisch..."
28900,"In 2002 , the song was released by British pro...","In 2002 , the song was published by the Britis..."
28901,"Tommy Connolly , who plays Rory Jennings , pla...","Tommy Connolly , who plays Rory Jennings , pla..."
28902,"Monroe Meadows , in Yosemite valley near Brida...","Monroe Meadows , in Yosemite Valley near Brida..."


In [34]:
mrpc_df = load_and_process_mrpc()

In [35]:
mrpc_df

Unnamed: 0,original,paraphrased
0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr..."
1,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ..."
2,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...
3,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart 's compa...
4,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....
...,...,...
3895,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...
3896,""" I am advised that certain allegations of cri...",""" I am advised that certain allegations of cri..."
3897,"The deal , approved by both companies ' board ...",The acquisition has been approved by both comp...
3898,"Last week the power station ’ s US owners , AE...","The news comes after Drax 's American owner , ..."


In [40]:
paranmt_df = load_and_process_paranmt('./para-nmt-50m.txt')

In [41]:
paranmt_df

Unnamed: 0,original,paraphrased
0,She must not have recognized you.,she probably didn't recognize you.
1,"It just - I was just... tired, you know.","I'm just... I'm really tired, you know?"
2,"David, we're not here to drink wine.","David, we're not here for wine, okay?"
3,Want to join us in the shelter?,would you like to join us in the shed?
4,"Shit, this one I can't even pronounce.","gosh, I can't even pronounce this."
...,...,...
118515,Raylene. Hi. I need to talk to you.,"Raylan, I need to talk to you."
118516,What happened to the money I gave you?,what happened to the money from me?
118517,Sometimes it's like there's two of him.,sometimes it's like being like two.
118518,I tried to remain as numb as possible.,I tried to be as insensitive as I could.


In [42]:
# Combine all datasets
combined_df = pd.concat([quora_df, paws_df, mrpc_df, paranmt_df], ignore_index=True)

In [43]:
# Remove duplicates
combined_df = combined_df.drop_duplicates(subset=['original', 'paraphrased'])

In [44]:
# Shuffle the dataset
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [46]:
print(f"Combined dataset shape: {combined_df.shape}")

Combined dataset shape: (298102, 2)


In [48]:
combined_df

Unnamed: 0,original,paraphrased
0,How do I add more details to Quora like this s...,How can I add more details when I am submittin...
1,What is the disadvantage of option subject ant...,What are disadvantages of anthropology?
2,Clark has some explaining to do.,Clark has something to explain.
3,"""You have great skill with your hands""","""you're good with your hands."""
4,How could I improve my English pronunciation?,How can I increase my English fluency?
...,...,...
298097,What are some ways people make money without a...,How can I make my money make money?
298098,"I will speak with you alone, Kirk.","I'd like to be alone with you, Kirk."
298099,Why do I see questions with basic grammatical ...,Why do I always get a notification from Quora ...
298100,Who is better Trump or Clinton?,Who is better Donald Trump or Hillary Clinton ...


In [49]:
# Save the combined dataset
combined_df.to_csv('combined_paraphrase_dataset.csv', index=False)

In [50]:
# Perform train-test split
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(combined_df, test_size=0.05, random_state=42)

In [51]:
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

Train set shape: (283196, 2)
Test set shape: (14906, 2)


In [52]:
# Save train and test sets
train_df.to_csv('paraphrase_train.csv', index=False)
test_df.to_csv('paraphrase_test.csv', index=False)

# Newest Training

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import get_scheduler
from tqdm.auto import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import wandb
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW

# Custom dataset class
class ParaphraseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        input_text = f"paraphrase: {row['original']}"
        target_text = row['paraphrased']

        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        labels = target_encoding.input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encoding.input_ids.flatten(),
            'attention_mask': input_encoding.attention_mask.flatten(),
            'labels': labels.flatten()
        }


In [None]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('t5-large').to('cuda')

In [None]:
# Load the dataset
df = pd.read_csv('combined_paraphrase_dataset.csv')
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Create datasets and dataloaders
train_dataset = ParaphraseDataset(train_df, tokenizer)
val_dataset = ParaphraseDataset(val_df, tokenizer)

batch_size = 16  # Increased batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Initialize wandb
wandb.init(project="t5-paraphrase", name="training-run-2")

# Log model architecture
wandb.watch(model, log="all")

In [None]:
# Training setup
optimizer = AdamW(model.parameters(), lr=1e-5)  # Reduced learning rate
num_epochs = 5  # Increased number of epochs
num_training_steps = num_epochs * len(train_dataloader)
num_warmup_steps = int(0.1 * num_training_steps)  # 10% of total steps for warmup

lr_scheduler = get_scheduler(
    "cosine",  # Changed to cosine schedule with warmup
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Log hyperparameters
wandb.config.update({
    "learning_rate": 1e-5,
    "epochs": num_epochs,
    "batch_size": batch_size,
    "model_name": "t5-large",
    "optimizer": "AdamW",
    "scheduler": "cosine with warmup",
    "warmup_steps": num_warmup_steps,
    "train_size": len(train_df),
    "val_size": len(val_df)
})



In [None]:
# Training loop
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to('cuda') for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)  # Added gradient clipping
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

        # Log training metrics
        wandb.log({
            "train_loss": loss.item(),
            "learning_rate": optimizer.param_groups[0]['lr'],
            "epoch": epoch,
            "step": epoch * len(train_dataloader) + step
        })

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to('cuda') for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    val_loss /= len(val_dataloader)

    # Log validation metrics
    wandb.log({
        "epoch": epoch,
        "train_loss": avg_train_loss,
        "val_loss": val_loss
    })

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    model.train()

In [None]:
# Save the model
model.save_pretrained("./t5_paraphrase_model_improved")
tokenizer.save_pretrained("./t5_paraphrase_model_improved")

print("Training completed and model saved!")

# Finish wandb run
wandb.finish()

# New Training

In [60]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import get_scheduler
from tqdm.auto import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import wandb
from torch.optim import AdamW

# Custom dataset class
class ParaphraseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        input_text = f"paraphrase: {row['original']}"
        target_text = row['paraphrased']

        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

        labels = target_encoding.input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encoding.input_ids.flatten(),
            'attention_mask': input_encoding.attention_mask.flatten(),
            'labels': labels.flatten()
        }


In [54]:
# Load the dataset
df = pd.read_csv('combined_paraphrase_dataset.csv')
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [55]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('t5-large').to('cuda')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [56]:
# Create datasets and dataloaders
train_dataset = ParaphraseDataset(train_df, tokenizer)
val_dataset = ParaphraseDataset(val_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [57]:
# Initialize wandb
wandb.init(project="t5-paraphrase", name="training-run-1")

[34m[1mwandb[0m: Currently logged in as: [33msrhnylmz14[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [58]:
# Log model architecture
wandb.watch(model, log="all")

[]

In [61]:
# Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Log hyperparameters
wandb.config.update({
    "learning_rate": 5e-5,
    "epochs": num_epochs,
    "batch_size": 8,
    "model_name": "t5-large",
    "optimizer": "AdamW",
    "scheduler": "linear",
    "train_size": len(train_df),
    "val_size": len(val_df)
})

In [62]:
# Training loop
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to('cuda') for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

        # Log training metrics
        wandb.log({
            "train_loss": loss.item(),
            "learning_rate": optimizer.param_groups[0]['lr'],
            "epoch": epoch,
            "step": epoch * len(train_dataloader) + step
        })

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to('cuda') for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    val_loss /= len(val_dataloader)

    # Log validation metrics
    wandb.log({
        "epoch": epoch,
        "train_loss": avg_train_loss,
        "val_loss": val_loss
    })

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    model.train()

  0%|          | 0/100611 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Save the model
model.save_pretrained("./t5_paraphrase_model")
tokenizer.save_pretrained("./t5_paraphrase_model")

print("Training completed and model saved!")

In [None]:
# Finish wandb run
wandb.finish()

# Old Training

## Tokenize the Data

In [None]:
from transformers import T5Tokenizer

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Define the function to tokenize the data
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], max_length=128, truncation=True, padding="max_length")
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply tokenization to all sets in the dataset
tokenized_datasets = processed_datasets.map(tokenize_function, batched=True)

## Prepare the Dataloaders

In [None]:
from torch.utils.data import DataLoader

# Define a helper function to create the DataLoader
def create_dataloader(tokenized_data, batch_size=8):
    # Convert list of dictionaries into a format DataLoader can handle
    dataset = tokenized_data.remove_columns(['input_text', 'target_text'])  # Remove text columns not needed for training
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    
    # Create the DataLoader
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create DataLoaders for training (and optionally validation)
train_dataloader = create_dataloader(tokenized_datasets['train'])


## Load Model / Set Up Training

In [11]:
model = T5ForConditionalGeneration.from_pretrained('t5-large').cuda()

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of training epochs
num_epochs = 3

# Set up the learning rate scheduler
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)




## Train the Model

In [None]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/55974 [00:00<?, ?it/s]

## Save the Model

In [15]:
model.save_pretrained("./t5_paraphrase_model")
tokenizer.save_pretrained("./t5_paraphrase_model")

('./t5_paraphrase_model/tokenizer_config.json',
 './t5_paraphrase_model/special_tokens_map.json',
 './t5_paraphrase_model/spiece.model',
 './t5_paraphrase_model/added_tokens.json')

# Inference

## Load model and tokenizer

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model_path = "./t5_paraphrase_model"
model = T5ForConditionalGeneration.from_pretrained(model_path).cuda()
tokenizer = T5Tokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Function to Generate Paraphrases

In [63]:
def generate_paraphrases(input_text, num_returns=3):
    # Encode the input text
    input_ids = tokenizer.encode("paraphrase: " + input_text, return_tensors="pt").to(model.device)
    
    # Generate paraphrases
    paraphrases = model.generate(
        input_ids,
        max_length=50,
        num_beams=num_returns,
        num_return_sequences=num_returns,
        no_repeat_ngram_size=1,
        early_stopping=True
    )
    
    # Decode and print each paraphrase
    return [tokenizer.decode(paraphrase, skip_special_tokens=True) for paraphrase in paraphrases]


## Generate

In [64]:
# Example usage
input_sentence = "What is the best way to learn artificial intelligence?"
paraphrase_outputs = generate_paraphrases(input_sentence, num_returns=5)
for i, paraphrase in enumerate(paraphrase_outputs, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Paraphrase 1: What is the best way to learn artificial intelligence?
Paraphrase 2: How do I learn artificial intelligence?
Paraphrase 3: What is the best way to learn Artificial Intelligence?
Paraphrase 4: What are the best ways to learn artificial intelligence?
Paraphrase 5: How can I learn artificial intelligence?


In [65]:
# Example usage
input_sentence = "What occupation did Albert Einstein have?"
paraphrase_outputs = generate_paraphrases(input_sentence, num_returns=5)
for i, paraphrase in enumerate(paraphrase_outputs, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Paraphrase 1: What occupation did Albert Einstein have?
Paraphrase 2: What was Albert Einstein's occupation?
Paraphrase 3: What occupation did Albert Einstein hold?
Paraphrase 4: What occupations did Albert Einstein have?
Paraphrase 5: What job did Albert Einstein have?


In [66]:
# Example usage
input_sentence = "What nationality did the physicist Albert Einstein have?"
paraphrase_outputs = generate_paraphrases(input_sentence, num_returns=5)
for i, paraphrase in enumerate(paraphrase_outputs, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Paraphrase 1: What nationality did the physicist Albert Einstein have?
Paraphrase 2: What nationality did physicist Albert Einstein have?
Paraphrase 3: What nationality did Albert Einstein have?
Paraphrase 4: What nationality did physicist Albert Einstein belong to?
Paraphrase 5: What nationality did physicist Albert Einstein come from?


In [67]:
# Example usage
input_sentence = "The restaurant is a carved-off space up a couple of stairs to one side, dominated by faux bare-brick columns, faux-wood floors and an air of foetid despondency"
paraphrase_outputs = generate_paraphrases(input_sentence, num_returns=5)
for i, paraphrase in enumerate(paraphrase_outputs, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Paraphrase 1: The restaurant is a cut-off space on one side of the stairs, with fake brick columns and faux wood floors dominating an air foetid despondency
Paraphrase 2: The restaurant is a cut-off space on one side of the stairs, with fake brick columns and faux wood floors dominating an air foetid despair.
Paraphrase 3: The restaurant is a cut-off space on one side of the stairs, with fake brick columns and faux wood floors dominating an air foetid despairance.
Paraphrase 4: The restaurant is a cut-off space on one side of the stairs, with fake brick columns and faux wood floors dominating.
Paraphrase 5: The restaurant is a cut-off space on one side of the stairs, with fake brick columns and faux wood floors.
