In [2]:
pip install nltk transformers torch tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
import wandb

wandb.login(key="0c5f368f1f51fd942ec7bb3a1c74efb7bdc832d6")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmshoaibvohra[0m ([33mmshoaibvohra-habib-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/content/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/content/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/content/EXIST2025_training_task1_3_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"  # Represents non-sexist tweets (previously "-")
]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

English train set size: 9731 (with selective augmentation)
Spanish train set size: 9731 (with selective augmentation)
