In [5]:
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset


def paraphrase_sentence(sentences, model, tokenizer, num_return_sequences=1, batch_size=32):
    """Run a seq2seq model over one or more sentences and decode its outputs.

    Args:
        sentences: A single string or a sequence of strings. A single string
            is treated as ONE sentence (it is wrapped in a list).
        model: Object exposing ``generate(input_ids, attention_mask=...,
            num_return_sequences=...)``.
        tokenizer: Callable that tokenizes a list of strings and returns a
            mapping with ``'input_ids'`` and ``'attention_mask'``; must also
            provide ``batch_decode``.
        num_return_sequences: Forwarded unchanged to ``model.generate``.
            NOTE(review): values > 1 presumably require beam search or
            sampling to be enabled in the model's generation config - confirm.
        batch_size: Maximum number of sentences tokenized and generated
            together.

    Returns:
        A flat list of decoded strings, batch after batch, in the order
        ``model.generate`` returns them.
    """
    # A bare string must be wrapped: slicing it below would otherwise cut the
    # sentence into `batch_size`-character chunks and paraphrase each chunk.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    paraphrased_sentences = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(
            batch, padding=True, return_tensors='pt', truncation=True)
        # The batch is padded, so the attention mask has to accompany the ids;
        # without it the model would attend to the padding positions.
        outputs = model.generate(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            num_return_sequences=num_return_sequences)
        paraphrased_sentences.extend(
            tokenizer.batch_decode(outputs, skip_special_tokens=True))

    return paraphrased_sentences


def augment_data(data, model, tokenizer, probability=0.5, num_return_sequences=1):
    """Return a copy of an NLI dataset with sentences randomly paraphrased.

    For every example the premise and the hypothesis are each, independently
    and with chance ``probability``, replaced by the first paraphrase returned
    by ``paraphrase_sentence``. Labels are copied unchanged.

    Args:
        data: Iterable of mappings with ``'premise'``, ``'hypothesis'`` and
            ``'label'`` keys.
        model: Forwarded to ``paraphrase_sentence``.
        tokenizer: Forwarded to ``paraphrase_sentence``.
        probability: Threshold compared against ``random.random()`` for each
            sentence; ``0.0`` never paraphrases, ``1.0`` always does.
        num_return_sequences: Forwarded to ``paraphrase_sentence``; only the
            first returned paraphrase is kept.

    Returns:
        A new list of ``{'premise', 'hypothesis', 'label'}`` dicts, one per
        input example, in input order.
    """
    def _maybe_paraphrase(sentence):
        # Swap in the first paraphrase with chance `probability`, else keep the text.
        if random.random() >= probability:
            return sentence
        # Pass a one-element list: a bare string would be sliced into
        # character chunks by the batching loop in paraphrase_sentence.
        candidates = paraphrase_sentence(
            [sentence], model, tokenizer, num_return_sequences)
        # Keep the original text if nothing was generated.
        return candidates[0] if candidates else sentence

    augmented_data = []
    for example in data:
        # Premise is drawn before hypothesis so the sequence of random draws
        # is stable for a given seed.
        premise = _maybe_paraphrase(example['premise'])
        hypothesis = _maybe_paraphrase(example['hypothesis'])
        augmented_data.append(
            {'premise': premise, 'hypothesis': hypothesis, 'label': example['label']})

    return augmented_data


In [6]:
# Tunable settings for this cell.
# NOTE: 10% of the SNLI train split is still tens of thousands of pairs, and
# augment_data calls the model once per paraphrased sentence - shrink the
# split (e.g. "train[:1000]") for a quick demonstration run.
SNLI_SPLIT = "train[:10%]"
# NOTE(review): "t5-small" is the generic T5 checkpoint; it is presumably not
# fine-tuned for paraphrasing - confirm or swap in a paraphrase checkpoint.
PARAPHRASE_MODEL_NAME = "t5-small"
NUM_PREVIEW_EXAMPLES = 5
SEED = 42

# Seed the RNG so the choice of which sentences get paraphrased is reproducible.
random.seed(SEED)

# Load the SNLI dataset
snli_dataset = load_dataset("snli", split=SNLI_SPLIT)

# Load the pre-trained paraphrasing model and tokenizer
paraphrasing_model = AutoModelForSeq2SeqLM.from_pretrained(
    PARAPHRASE_MODEL_NAME)
paraphrasing_tokenizer = AutoTokenizer.from_pretrained(
    PARAPHRASE_MODEL_NAME)

# Augment the data
probability = 0.5  # Probability to paraphrase each sentence
augmented_data = augment_data(
    snli_dataset, paraphrasing_model, paraphrasing_tokenizer, probability=probability)

# Print some augmented examples.
# Rows are fetched by integer index: slicing a `datasets.Dataset` returns a
# dict of columns, so zipping over `snli_dataset[:5]` would iterate over the
# column names instead of the examples.
for idx in range(min(NUM_PREVIEW_EXAMPLES, len(augmented_data))):
    original = snli_dataset[idx]
    augmented = augmented_data[idx]
    print(f"Original premise: {original['premise']}")
    print(f"Original hypothesis: {original['hypothesis']}")
    print(f"Augmented premise: {augmented['premise']}")
    print(f"Augmented hypothesis: {augmented['hypothesis']}")
    print()


Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 248kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 242M/242M [00:07<00:00, 31.8MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 23.7kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.92MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 3.05MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


KeyboardInterrupt: 

In [None]:
# Show only the first few augmented examples; displaying the whole list
# would flood the cell output.
augmented_data[:5]


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


class NLIDataset(Dataset):
    """Torch ``Dataset`` that tokenizes premise/hypothesis pairs on access.

    Args:
        data: Indexable collection of mappings with ``'premise'``,
            ``'hypothesis'`` and ``'label'`` keys.
        tokenizer: Callable accepting ``(premise, hypothesis, padding=...,
            truncation=..., max_length=..., return_tensors=...)`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch dimension of size 1.
        max_length: Length every pair is padded/truncated to (default 128).
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model inputs and label.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (batch
            dimension removed) and ``'labels'`` (the example's ``'label'``
            value, passed through unchanged).
        """
        example = self.data[idx]
        encoding = self.tokenizer(
            example['premise'], example['hypothesis'],
            padding='max_length', truncation=True,
            max_length=self.max_length, return_tensors="pt")
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse any other dimension of size 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': example['label']
        }


# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')

# SNLI uses label -1 for pairs without a gold label (see the dataset card).
# Those rows are not valid targets for the classification head, so drop them
# before training.
train_examples = [
    example for example in augmented_data if example['label'] != -1]

# NOTE(review): the checkpoint's own label order (model.config.id2label) may
# differ from SNLI's integer labels - confirm and remap before fine-tuning.

# Convert the combined data into the required format
dataset = NLIDataset(train_examples, tokenizer)

# Define the DataLoader
# (Trainer below is given `dataset` directly; this loader is not passed to it.)
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'cross-encoder/nli-roberta-base')

# Fine-tune the model using the Trainer and TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
