In [None]:
import pandas as pd
import random
import nltk
import time
import torch
from nltk.corpus import wordnet
from transformers import MarianMTModel, MarianTokenizer
from multiprocessing import Pool, cpu_count

# Fetch the NLTK corpora; `wordnet.synsets` is called by synonym_replacement below.
# NOTE(review): 'omw-1.4' is presumably needed by the WordNet reader in recent NLTK releases - confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load MarianMT model for back translation (English <-> Spanish)
# Each direction has its own tokenizer/model pair. Both models are moved to the
# 'cuda' device, so this cell requires a CUDA-capable GPU runtime.
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer_en_es = MarianTokenizer.from_pretrained(model_name)
model_en_es = MarianMTModel.from_pretrained(model_name).to('cuda')

# `model_name` is rebound here; after this cell it holds the es->en checkpoint name.
model_name = 'Helsinki-NLP/opus-mt-es-en'
tokenizer_es_en = MarianTokenizer.from_pretrained(model_name)
model_es_en = MarianMTModel.from_pretrained(model_name).to('cuda')

def translate_batch(sentences, tokenizer, model):
    """Translate a batch of sentences with a seq2seq tokenizer/model pair.

    Args:
        sentences: List of input strings.
        tokenizer: Tokenizer matching ``model``; called with padding and
            truncation enabled and must provide ``batch_decode``.
        model: Generation model exposing ``device`` and ``generate``.

    Returns:
        List of decoded output strings, one per input sentence. An empty
        input yields an empty list without touching the tokenizer or model.
    """
    if not sentences:
        return []

    # Follow the model's own device instead of assuming 'cuda', so the same
    # function works wherever the model was placed.
    inputs = tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    ).to(model.device)
    with torch.no_grad():  # inference only; no gradients needed
        translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

def synonym_replacement(sentence, n=1):
    """Replace up to ``n`` distinct words of ``sentence`` with a WordNet synonym.

    Words are visited in random order. For each word the first lemma of its
    first synset is used; if that lemma differs from the word
    (case-insensitively), every occurrence of the word is replaced.

    Args:
        sentence: Whitespace-separated input text.
        n: Maximum number of distinct words to replace. ``n <= 0`` leaves the
            sentence unchanged.

    Returns:
        The sentence with replacements applied, re-joined with single spaces.
    """
    words = sentence.split()
    new_words = words.copy()

    # Visit each distinct word once: a repeated word is fully replaced on its
    # first visit, so visiting it again would only waste the budget `n`.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    replaced = 0
    for word in candidates:
        if replaced >= n:
            break
        synsets = wordnet.synsets(word)
        if not synsets:
            continue
        # WordNet joins multi-word lemmas with underscores ("ice_cream");
        # turn them back into spaces so the output stays natural text.
        synonym = synsets[0].lemmas()[0].name().replace('_', ' ')
        if synonym.lower() != word.lower():
            new_words = [synonym if w == word else w for w in new_words]
            replaced += 1
    return ' '.join(new_words)

def back_translate(sentences):
    """Paraphrase sentences by translating English -> Spanish -> English.

    This is best-effort: if either translation step raises, the original
    sentences are returned unchanged and a ``RuntimeWarning`` describing the
    failure is emitted, so a failed run is not mistaken for a successful one.

    Args:
        sentences: List of English strings.

    Returns:
        List of back-translated strings, or ``sentences`` itself on failure.
    """
    import warnings  # local import keeps this block self-contained

    try:
        spanish = translate_batch(sentences, tokenizer_en_es, model_en_es)
        return translate_batch(spanish, tokenizer_es_en, model_es_en)
    except Exception as e:
        # Keep the fallback, but surface the reason (e.g. CUDA errors)
        # instead of silently producing "augmentations" identical to the input.
        warnings.warn(
            f'Back translation failed ({type(e).__name__}: {e}); '
            'returning the original sentences unchanged.',
            RuntimeWarning,
            stacklevel=2,
        )
        return sentences  # Return original if translation fails

def augment_text(sentences):
    """Build both augmentations for a batch of sentences.

    Returns a list of ``(synonym_variant, back_translated_variant)`` tuples,
    aligned with ``sentences``; pairing stops at the shorter of the two lists.
    """
    # Synonym replacement runs first, one sentence at a time.
    synonym_variants = list(map(synonym_replacement, sentences))
    # Back translation handles the whole batch in a single call.
    round_tripped = back_translate(sentences)
    return [(syn, back) for syn, back in zip(synonym_variants, round_tripped)]

def process_batch(batch, start_index):
    """Augment one batch and print a progress line with the time it took.

    Args:
        batch: List of sentences to augment.
        start_index: Offset of the batch's first sentence in the full dataset;
            used only for the progress message.

    Returns:
        Whatever ``augment_text`` returns for ``batch``.
    """
    began = time.time()
    augmented = augment_text(batch)
    duration = time.time() - began
    processed_so_far = start_index + len(batch)
    print(f'Processed {processed_so_far} records... Time elapsed: {duration:.2f} seconds')
    return augmented

def augment_dataset(df, column, batch_size=1000):
    """Add synonym-replacement and back-translation columns to ``df``.

    Batches are processed sequentially in the current process. The
    translation models live on the GPU of this process; worker processes
    created by ``multiprocessing.Pool`` (fork start method) cannot reuse that
    CUDA context, and any failure inside ``back_translate`` falls back to the
    untranslated text, so a process pool would yield "augmentations" that are
    just copies of the input. The GPU already parallelises within each batch.

    Args:
        df: DataFrame holding the text to augment. It is modified in place.
        column: Name of the text column.
        batch_size: Number of sentences handed to ``process_batch`` at once.
            Lower it if the GPU runs out of memory.

    Returns:
        The same ``df`` object with two new columns, ``'synonym_aug'`` and
        ``'back_trans_aug'``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    sentences = df[column].tolist()
    batches = [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]

    start_time = time.time()
    results = [process_batch(batch, i * batch_size) for i, batch in enumerate(batches)]
    total_time = time.time() - start_time

    print(f'Total time for processing {len(sentences)} records: {total_time:.2f} seconds')
    if sentences:  # avoid dividing by zero on an empty dataset
        estimated_time = (total_time / len(sentences)) * 30000
        print(f'Estimated time for 30,000 records: {estimated_time:.2f} seconds')

    # Flatten the per-batch lists of (synonym, back_translation) pairs.
    augmented_data = [item for sublist in results for item in sublist]
    df['synonym_aug'] = [x[0] for x in augmented_data]
    df['back_trans_aug'] = [x[1] for x in augmented_data]
    return df

# Load dataset
df = pd.read_csv('news_dataset.csv')  # Change to actual file path
# Augment the 'headline' column. augment_dataset adds the 'synonym_aug' and
# 'back_trans_aug' columns to `df` itself and returns that same object, so
# `augmented_df` and `df` refer to one DataFrame.
augmented_df = augment_dataset(df, 'headline')
# Written relative to the current working directory; a later cell reads
# "/content/augmented_headlines.csv", which matches only when the working
# directory is /content.
augmented_df.to_csv('augmented_headlines.csv', index=False)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Processed 1000 records... Time elapsed: 7.85 seconds
Processed 5000 records... Time elapsed: 7.86 seconds
Processed 2000 records... Time elapsed: 1.14 seconds
Processed 6000 records... Time elapsed: 1.19 seconds
Processed 3000 records... Time elapsed: 0.44 seconds
Processed 7000 records... Time elapsed: 0.47 seconds
Processed 4000 records... Time elapsed: 0.41 seconds
Processed 8000 records... Time elapsed: 0.41 seconds
Processed 9000 records... Time elapsed: 0.39 seconds
Processed 13000 records... Time elapsed: 0.38 seconds
Processed 10000 records... Time elapsed: 0.38 seconds
Processed 14000 records... Time elapsed: 0.35 seconds
Processed 11000 records... Time elapsed: 0.36 seconds
Processed 15000 records... Time elapsed: 0.33 seconds
Processed 12000 records... Time elapsed: 0.36 seconds
Processed 16000 records... Time elapsed: 0.35 seconds
Processed 17000 records... Time elapsed: 0.35 seconds
Processed 21000 records... Time elapsed: 0.33 seconds
Processed 18000 records... Time elaps

In [None]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [None]:
import pandas as pd

# Sanity check: reload the augmented CSV from disk and report how many rows it holds.
df = pd.read_csv("/content/augmented_headlines.csv")
print(f"Total records: {df.shape[0]}")


Total records: 26709


In [None]:
import pandas as pd

# Load Dataset
data_path = "/content/augmented_headlines.csv"
df = pd.read_csv(data_path)

# Ensure no missing values
df = df.dropna(subset=['headline', 'synonym_aug', 'back_trans_aug'])


def expand_augmentations(frame):
    """Turn one row per headline into one row per distinct text variant.

    For every source row the output holds the original headline, then the
    synonym-replaced text, then the back-translated text, each carrying the
    row's 'is_sarcastic' label. An augmented text is skipped when it is
    identical to the original headline (or, for the back translation, to the
    synonym variant): such rows add no new information and would put exact
    duplicates on both sides of a later train/test split.

    Args:
        frame: DataFrame with 'headline', 'synonym_aug', 'back_trans_aug'
            and 'is_sarcastic' columns, a unique index and no missing text.

    Returns:
        DataFrame with columns ['headline', 'is_sarcastic'] and a fresh
        0..n-1 index.
    """
    original = frame['headline']
    synonym = frame['synonym_aug']
    back_translated = frame['back_trans_aug']

    keep_synonym = synonym != original
    keep_back = (back_translated != original) & (back_translated != synonym)

    parts = [
        frame[['headline', 'is_sarcastic']],
        frame.loc[keep_synonym, ['synonym_aug', 'is_sarcastic']]
             .rename(columns={'synonym_aug': 'headline'}),
        frame.loc[keep_back, ['back_trans_aug', 'is_sarcastic']]
             .rename(columns={'back_trans_aug': 'headline'}),
    ]
    # A stable sort on the source-row index groups each row's variants
    # together while keeping the original -> synonym -> back-translation order.
    return pd.concat(parts).sort_index(kind='mergesort').reset_index(drop=True)


# Create separate rows for each augmentation
df_augmented = expand_augmentations(df)

# Save the updated dataset
df_augmented.to_csv("/content/final_augmented_headlines.csv", index=False)

# Check final count: at most 3 x len(df); lower when augmentations left the text unchanged
print(f"Total records after augmentation: {len(df_augmented)}")


Total records after augmentation: 80127
