In [None]:
import pandas as pd
import re
import random
import nltk
import time
import torch
import emoji
import contractions
from nltk.corpus import wordnet
from transformers import MarianMTModel, MarianTokenizer
from multiprocessing import Pool, cpu_count

# Download WordNet for synonym replacement
nltk.download('wordnet')
nltk.download('omw-1.4')

# Pick the device once: use the GPU when CUDA is available, otherwise fall
# back to CPU so the models can still be loaded on a machine without a GPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load MarianMT models for back translation (English -> Spanish -> English)
model_name_en_es = 'Helsinki-NLP/opus-mt-en-es'
tokenizer_en_es = MarianTokenizer.from_pretrained(model_name_en_es)
model_en_es = MarianMTModel.from_pretrained(model_name_en_es).to(DEVICE)

model_name_es_en = 'Helsinki-NLP/opus-mt-es-en'
tokenizer_es_en = MarianTokenizer.from_pretrained(model_name_es_en)
model_es_en = MarianMTModel.from_pretrained(model_name_es_en).to(DEVICE)

# --- Translation Utility ---
def translate_batch(sentences, tokenizer, model):
    """Translate a batch of sentences with a tokenizer/model pair.

    Args:
        sentences: Sequence of strings to translate.
        tokenizer: Callable that encodes ``sentences`` into model inputs and
            offers ``batch_decode`` for the generated ids.
        model: Model exposing ``device`` and ``generate``.

    Returns:
        A list of decoded translations, one per input sentence. An empty
        input sequence yields an empty list without calling the tokenizer.
    """
    # Nothing to translate: skip the tokenizer/model round trip entirely.
    if not isinstance(sentences, str) and len(sentences) == 0:
        return []

    # Move the inputs to wherever the model actually lives instead of
    # assuming 'cuda', so the helper works for CPU or any GPU index.
    inputs = tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    ).to(model.device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# --- Synonym Replacement ---
def synonym_replacement(sentence, n=1):
    """Replace up to ``n`` distinct words of ``sentence`` with a WordNet synonym.

    Words are visited in random order. For each word the first lemma of its
    first WordNet synset is used as the synonym; words without a synset, or
    whose synonym equals the word itself (case-insensitively), are skipped.
    Every occurrence of a chosen word is replaced.

    Args:
        sentence: Whitespace-separated text.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the words untouched.

    Returns:
        The sentence re-joined with single spaces.
    """
    words = sentence.split()
    # Honour n <= 0 (the limit used to be checked only after a replacement,
    # so n=0 still replaced one word) and avoid any lookup for empty input.
    if n <= 0 or not words:
        return ' '.join(words)

    new_words = words.copy()
    # De-duplicate while keeping order so a repeated word is considered once
    # and cannot be counted against ``n`` twice.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    replaced = 0
    for word in candidates:
        synsets = wordnet.synsets(word)
        if not synsets:
            continue
        lemmas = synsets[0].lemmas()
        if not lemmas:
            continue
        # WordNet lemma names join multi-word expressions with underscores;
        # turn them back into spaces so they read as normal text.
        synonym = lemmas[0].name().replace('_', ' ')
        if synonym.lower() == word.lower():
            continue
        new_words = [synonym if w == word else w for w in new_words]
        replaced += 1
        if replaced >= n:
            break
    return ' '.join(new_words)

# --- Back Translation ---
def back_translate(sentences, batch_size=32):
    """Paraphrase sentences by translating English -> Spanish -> English.

    The input is processed in chunks of ``batch_size`` so that a large input
    is not pushed through the model in a single ``generate`` call. Translation
    is best-effort: if a chunk fails, the error is printed and that chunk's
    original sentences are kept, while the other chunks are still translated.

    Args:
        sentences: Iterable of strings to back-translate.
        batch_size: Number of sentences sent to the models per call.

    Returns:
        A list with one string per input sentence, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(sentences)
    results = []
    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        try:
            spanish = translate_batch(chunk, tokenizer_en_es, model_en_es)
            english = translate_batch(spanish, tokenizer_es_en, model_es_en)
        except Exception as e:
            # Deliberate best-effort boundary: report and keep the originals
            # for this chunk only, instead of discarding every translation.
            print("Back translation error:", e)
            english = chunk
        results.extend(english)
    return results

# --- Preprocessing ---
def advanced_preprocess(text):
    """Normalise a raw tweet-like value into a cleaned, lowercase string.

    Steps, in order: cast to ``str`` and lowercase, expand contractions,
    spell out emojis (space-delimited), then apply the regex substitutions
    listed below and trim surrounding whitespace.
    """
    cleaned = contractions.fix(str(text).lower())
    cleaned = emoji.demojize(cleaned, delimiters=(" ", " "))

    # Applied strictly in this order; later patterns see earlier results.
    substitutions = (
        (r'@\w+', '<USER>'),            # user mentions
        (r'http\S+|www\S+', '<URL>'),   # links
        (r'#(\w+)', r'\1'),             # keep the hashtag word, drop '#'
        (r'(.)\1{2,}', r'\1'),          # 3+ repeats of a character -> one
        (r'\s+', ' '),                  # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# --- Augment Text ---
def augment_text(sentences):
    """Build both augmentations for every sentence.

    Returns a list of ``(synonym_version, back_translated_version)`` tuples,
    paired position by position with ``sentences``.
    """
    # Synonym replacement runs first, sentence by sentence.
    synonym_versions = []
    for text in sentences:
        synonym_versions.append(synonym_replacement(text))

    # Back translation is done on the whole list in one call.
    translated_versions = back_translate(sentences)

    pairs = zip(synonym_versions, translated_versions)
    return [pair for pair in pairs]

# --- Batch Process ---
def process_batch(batch, start_index):
    """Augment one batch and print a progress line with its wall-clock time.

    ``start_index`` is the position of the batch's first record, so the
    printed count is the running total of records handled so far.
    Returns whatever ``augment_text`` produces for ``batch``.
    """
    began_at = time.time()
    augmented = augment_text(batch)
    duration = time.time() - began_at

    records_done = start_index + len(batch)
    print(f'Processed {records_done} records... Time: {duration:.2f} sec')
    return augmented

# --- Augmentation Wrapper ---
def augment_dataset(df, column, batch_size=64):
    """Preprocess ``df[column]`` and add synonym / back-translation columns.

    Adds three columns to ``df`` in place and returns it:
    ``preprocessed``, ``synonym_aug`` and ``back_trans_aug``.

    Batches are processed sequentially in the current process. Dispatching
    them to a ``multiprocessing.Pool`` made every back translation fail with
    "Cannot re-initialize CUDA in forked subprocess", which silently left
    ``back_trans_aug`` equal to the preprocessed text.

    Args:
        df: DataFrame holding the text to augment.
        column: Name of the text column.
        batch_size: Number of sentences handed to ``process_batch`` at once.
            Kept small because each batch goes through the translation models.

    Returns:
        The same DataFrame with the added columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df['preprocessed'] = df[column].apply(advanced_preprocess)
    sentences = df['preprocessed'].tolist()

    start_time = time.time()
    augmented_data = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # ``start`` is the index of the batch's first record, which is what
        # process_batch uses for its running progress count.
        augmented_data.extend(process_batch(batch, start))

    total_time = time.time() - start_time
    print(f'Total time for {len(sentences)} records: {total_time:.2f} seconds')

    df['synonym_aug'] = [synonym for synonym, _ in augmented_data]
    df['back_trans_aug'] = [back_translated for _, back_translated in augmented_data]
    return df

# --- Load, Augment, and Save ---
data_path = "/content/Sarcasm.csv"
df = pd.read_csv(data_path)

# Normalise the `sarcastic` column to 0/1 integers. Known textual labels are
# translated; anything the table does not cover keeps its original value
# before the integer cast.
label_lookup = {
    'sarcastic': 1, 'not_sarcastic': 0,
    'true': 1, 'false': 0,
    'True': 1, 'False': 0,
    'yes': 1, 'no': 0,
    'Yes': 1, 'No': 0
}
mapped_labels = df['sarcastic'].map(label_lookup)
df['sarcastic'] = mapped_labels.fillna(df['sarcastic'])
df['sarcastic'] = df['sarcastic'].astype(int)

# Run preprocessing and both augmentations
df = augment_dataset(df, 'tweet')

# Discard rows missing the original text or either augmentation
df = df.dropna(subset=['tweet', 'synonym_aug', 'back_trans_aug'])

# Emit three rows per record, in order: original, synonym, back translation
augmented_rows = []
text_and_label_columns = zip(
    df['tweet'], df['synonym_aug'], df['back_trans_aug'], df['sarcastic']
)
for original, synonym, back_translated, label in text_and_label_columns:
    augmented_rows.extend(
        [text, label] for text in (original, synonym, back_translated)
    )

# Assemble the final two-column frame
df_final = pd.DataFrame(augmented_rows, columns=['tweet', 'sarcastic'])

# Write the augmented dataset to disk
df_final.to_csv("tweets.csv", index=False)

# Done!
print(f"✅ Final dataset saved. Total rows: {len(df_final)}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Back translation error: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Processed 1000 records... Time: 4.76 sec
Back translation error: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Processed 2000 records... Time: 0.34 sec
Back translation error: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Processed 3000 records... Time: 0.29 sec
Back translation error: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
Processed 3468 records... Time: 0.14 sec
Total time for 3468 records: 5.67 seconds
✅ Final dataset saved. Total rows: 10401
