In [4]:
# Small paraphrasing model to augment dataset for classification task
%pip install "protobuf<5" tiktoken transformers torch tqdm sentencepiece

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json, random
from tqdm import tqdm

MODEL_NAME = "eugenesiow/bart-paraphrase"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def paraphrase(text, num_return_sequences=2, num_beams=5):
    prompt = f"paraphrase: {text} </s>"
    encoding = tokenizer.encode_plus(prompt, padding='longest', return_tensors="pt")
    outputs = model.generate(
        **encoding,
        max_length=64,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5
    )
    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

# Load the dataset I want to augment
data = []
with open("dataset.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

augmented = []

for entry in tqdm(data, desc="Augmenting"):
    if random.random() < 0.5:  # augment 50% of samples
        new_texts = paraphrase(entry["text"], num_return_sequences=2)
        for t in new_texts:
            augmented.append({"text": t, "label": entry["label"]})

# Save augmented + original in a new file called data_augmented.jsonl
all_data = data + augmented
with open("data_augmented.jsonl", "w", encoding="utf-8") as f:
    for ex in all_data:
        json.dump(ex, f)
        f.write("\n")

print(f"Augmented dataset saved: {len(data)} → {len(all_data)} samples")


Note: you may need to restart the kernel to use updated packages.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Augmenting:   0%|          | 0/211 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Augmenting: 100%|██████████| 211/211 [02:14<00:00,  1.57it/s]

Augmented dataset saved: 211 → 417 samples



