### Data Generation

In [1]:
import os
import json
import random
import glob
import nltk
from nltk.corpus import wordnet
from copy import deepcopy

# Fetch the NLTK resources used below: 'punkt' backs nltk.word_tokenize and
# 'wordnet' backs the synonym lookups.
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)

def synonym_replacement(sentence, n=2):
    """Replace up to ``n`` distinct words of ``sentence`` with a WordNet synonym.

    The sentence is tokenized with ``nltk.word_tokenize``. Purely alphabetic
    tokens are visited in random order; for each one, every occurrence is
    swapped for the first WordNet lemma that is a genuinely different word
    (compared case-insensitively, underscores turned into spaces).

    Args:
        sentence: Text to augment.
        n: Maximum number of distinct words to replace. Values <= 0 leave the
            tokens untouched.

    Returns:
        The tokens joined with single spaces, so punctuation ends up
        space-separated (e.g. ``"Hi , there"``).
    """
    words = nltk.word_tokenize(sentence)
    if n <= 0:
        return ' '.join(words)

    new_words = words[:]
    # Sort before shuffling: set iteration order depends on string hashing, so
    # without it the result is not reproducible even under random.seed().
    candidates = sorted({word for word in words if word.isalpha()})
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        # Scan every lemma of every synset; the first lemma of the first synset
        # is frequently the word itself (possibly in a different case).
        synonym = next(
            (
                lemma.name().replace('_', ' ')
                for synset in wordnet.synsets(candidate)
                for lemma in synset.lemmas()
                if lemma.name().replace('_', ' ').lower() != candidate.lower()
            ),
            None,
        )
        if synonym is None:
            continue
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

def augment_article(article):
    """Return an augmented deep copy of ``article``; the input is left untouched.

    Non-empty string values under ``"text"`` (up to 3 replacements),
    ``"summary"`` (up to 2) and ``article["analysis"]["reasoning"]`` (up to 2)
    are passed through ``synonym_replacement``. Missing, empty or non-string
    values are copied unchanged, and ``"analysis"`` is only descended into when
    it is a dict.

    Args:
        article: Mapping describing one article.

    Returns:
        The copy, with ``"augmented"`` set to ``True``.
    """
    aug_article = deepcopy(article)

    for field, max_replacements in (("text", 3), ("summary", 2)):
        value = article.get(field)
        if isinstance(value, str) and value:
            aug_article[field] = synonym_replacement(value, n=max_replacements)

    # `"reasoning" in analysis` would be a substring test on a string and then
    # fail on indexing, so require a dict before looking inside.
    analysis = article.get("analysis")
    if isinstance(analysis, dict):
        reasoning = analysis.get("reasoning")
        if isinstance(reasoning, str) and reasoning:
            aug_article["analysis"]["reasoning"] = synonym_replacement(reasoning, n=2)

    aug_article["augmented"] = True
    return aug_article

def process_file(input_path):
    """Augment every article in ``input_path`` and save originals plus copies.

    The input must hold a JSON array of articles. The output is written next to
    it: a trailing ``_analyzed.json`` becomes ``_augmented.json``; any other
    name gets ``_augmented`` inserted before its extension, so the input file
    is never overwritten.

    Args:
        input_path: Path of the JSON file to read.

    Raises:
        TypeError: If the file's top-level JSON value is not a list.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError(
            f"{input_path}: expected a JSON array of articles, got {type(data).__name__}"
        )

    augmented_data = [augment_article(item) for item in data]
    final_data = data + augmented_data

    # Only rewrite the trailing suffix; str.replace would also touch directory
    # names and would return the input path unchanged when the suffix is absent.
    suffix = "_analyzed.json"
    if input_path.endswith(suffix):
        output_path = input_path[:-len(suffix)] + "_augmented.json"
    else:
        stem, ext = os.path.splitext(input_path)
        output_path = f"{stem}_augmented{ext or '.json'}"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_data, f, indent=2, ensure_ascii=False)
    print(f"Saved: {output_path}")

def main():
    """Run ``process_file`` on every ``*_analyzed.json`` under ./analyzed_articles/llama3_8b."""
    pattern = "./analyzed_articles/llama3_8b/*_analyzed.json"
    for analyzed_path in glob.glob(pattern):
        process_file(analyzed_path)

# Entry point: run the augmentation over all analyzed files when this code is
# executed as the main module.
if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /Users/pranavi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pranavi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved: ./analyzed_articles/llama3_8b/the_indian_express_augmented.json
Saved: ./analyzed_articles/llama3_8b/ndtv_augmented.json
Saved: ./analyzed_articles/llama3_8b/the_hindu_augmented.json
Saved: ./analyzed_articles/llama3_8b/news18_augmented.json
Saved: ./analyzed_articles/llama3_8b/times_of_india_augmented.json
Saved: ./analyzed_articles/llama3_8b/zee_news_augmented.json
Saved: ./analyzed_articles/llama3_8b/india_today_augmented.json


### Alternative method: multiple randomized synonym-replacement versions per article

In [16]:
import os
import json
import random
import nltk
import glob
from nltk.corpus import wordnet
from copy import deepcopy

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

# Configuration
# Paths are relative to the notebook's working directory and follow the same
# ./analyzed_articles/llama3_8b layout as the Data Generation cell. Do not
# locate the input by walking the filesystem from "/": that scans every
# directory on the machine.
# INPUT_FILE must be a single path string (not a glob result list) because it
# is passed to os.path.exists() and open().
INPUT_FILE = "./analyzed_articles/llama3_8b/india_today_analyzed.json"
# OUTPUT_PATH is opened for writing, so it must name a file, not a directory.
# A dedicated file name keeps this run from overwriting india_today_augmented.json.
OUTPUT_PATH = "./analyzed_articles/llama3_8b/india_today_multi_augmented.json"
AUGMENT_COUNT = 20  # Number of augmented versions per article

def get_synonym(word):
    """Fetch a random single-word synonym for ``word`` using WordNet.

    Candidates are the lemma names of every synset of ``word`` with
    underscores turned into spaces. Names that equal ``word``
    (case-insensitively) or are not purely alphabetic are discarded, which
    also removes multi-word lemmas.

    Args:
        word: The word to look up.

    Returns:
        A randomly chosen candidate, or ``word`` itself when none is left.
    """
    lemma_names = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    # Sorted so random.choice is reproducible under random.seed(); iterating a
    # set of strings has no stable order between interpreter runs.
    candidates = sorted(
        name for name in lemma_names
        if name.lower() != word.lower() and name.isalpha()
    )
    return random.choice(candidates) if candidates else word

def synonym_replacement(text, replace_prob=0.3):
    """Swap alphabetic tokens of ``text`` for synonyms with probability ``replace_prob``.

    The text is split with ``nltk.word_tokenize``; each purely alphabetic token
    is independently replaced by ``get_synonym(token)`` when a fresh
    ``random.random()`` draw falls below ``replace_prob``. The resulting tokens
    are joined with single spaces.
    """
    def maybe_swap(token):
        # Non-alphabetic tokens short-circuit and never consume a random draw.
        if token.isalpha() and random.random() < replace_prob:
            return get_synonym(token)
        return token

    return ' '.join(maybe_swap(token) for token in nltk.word_tokenize(text))

def augment_article(article, version_num):
    """Create an augmented version of an article.

    Works on a deep copy, so ``article`` itself is not modified.

    * ``title``, ``text``, ``keywords``, ``summary``: string values are
      rewritten with ``replace_prob = 0.3 + version_num * 0.01``; for list
      values each string entry is rewritten with ``replace_prob=0.4`` and
      non-string entries are kept as they are.
    * ``bias_category``: rewritten with ``replace_prob=0.5`` when it is a string.
    * ``analysis["reasoning"]``: rewritten with ``replace_prob=0.3`` when
      ``analysis`` is a dict and the value is a string.

    Args:
        article: Mapping describing one article.
        version_num: Number stored under ``"augmented_version"``; also raises
            the replacement probability of the main text fields.

    Returns:
        The augmented copy.
    """
    aug = deepcopy(article)
    text_prob = 0.3 + version_num * 0.01

    for field in ('title', 'text', 'keywords', 'summary'):
        value = aug.get(field)
        if isinstance(value, str):
            aug[field] = synonym_replacement(value, replace_prob=text_prob)
        elif isinstance(value, list):
            aug[field] = [
                synonym_replacement(entry, replace_prob=0.4) if isinstance(entry, str) else entry
                for entry in value
            ]

    # NOTE(review): if bias_category is a classification label rather than free
    # text, rewriting it with synonyms changes the label -- confirm this is intended.
    if isinstance(aug.get("bias_category"), str):
        aug["bias_category"] = synonym_replacement(aug["bias_category"], replace_prob=0.5)

    # Require a dict: on a string, `"reasoning" in analysis` is a substring test
    # and the following index assignment would raise TypeError.
    analysis = aug.get("analysis")
    if isinstance(analysis, dict) and isinstance(analysis.get("reasoning"), str):
        analysis["reasoning"] = synonym_replacement(analysis["reasoning"], replace_prob=0.3)

    aug["augmented_version"] = version_num
    return aug

def load_articles_from_file(path=None):
    """Load articles from one or more JSON files.

    Args:
        path: A single path, or an iterable of paths such as the list returned
            by ``glob.glob``. Defaults to the module-level ``INPUT_FILE``.

    Returns:
        A list with the articles of every readable file, in order. A file whose
        top-level JSON value is a list contributes its items; any other value is
        added as a single article. Missing files and files with invalid JSON are
        reported with ``print`` and skipped, so the result may be empty.
    """
    source = INPUT_FILE if path is None else path
    # os.path.exists() and open() need one path each, so normalize a single
    # path and a collection of paths to the same shape.
    if isinstance(source, (str, bytes, os.PathLike)):
        candidates = [source]
    else:
        candidates = list(source)

    if not candidates:
        print(f"File not found: {source}")
        return []

    articles = []
    for candidate in candidates:
        if not os.path.exists(candidate):
            print(f"File not found: {candidate}")
            continue
        with open(candidate, "r", encoding="utf-8") as f:
            try:
                loaded = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding {candidate}: {e}")
                continue
        if isinstance(loaded, list):
            articles.extend(loaded)
        else:
            articles.append(loaded)
    return articles

def main():
    """Write each loaded article followed by ``AUGMENT_COUNT`` augmented versions.

    Articles come from ``load_articles_from_file()``. For every article the
    original is kept and versions 1..AUGMENT_COUNT from ``augment_article`` are
    appended after it. The combined list is saved as JSON to ``OUTPUT_PATH``;
    nothing is written when no articles were loaded.
    """
    articles = load_articles_from_file()
    if not articles:
        print("No input data found.")
        return

    output_articles = []
    total = len(articles)

    # Count from 1 so the progress line reports how many articles are done,
    # and always report the final article.
    for idx, article in enumerate(articles, start=1):
        output_articles.append(article)  # include original
        output_articles.extend(
            augment_article(article, version_num=v) for v in range(1, AUGMENT_COUNT + 1)
        )
        if idx % 10 == 0 or idx == total:
            print(f"Processed {idx}/{total} articles")

    # Create the target directory first so the finished work is not lost to a
    # missing-directory error at write time.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(output_articles, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(output_articles)} articles to {OUTPUT_PATH}")

# Entry point: build and save the multi-version augmented dataset when this
# code is executed as the main module.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /Users/pranavi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pranavi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyboardInterrupt: 

### TextAttack-based augmentation

In [17]:
import json
from tqdm import tqdm
from textattack.attack_recipes import (
    PWWSRen2019,
    TextFoolerJin2019,
    DeepWordBugGao2018,
    BAEGarg2019,
    CheckList2020
)
from textattack.datasets import Dataset
from textattack.models.wrappers import ModelWrapper
from textattack.attack_results import SuccessfulAttackResult


# Stand-in model wrapper so the attack recipes can be built without a real classifier
class DummyWrapper(ModelWrapper):
    """ModelWrapper whose "model" returns the same two scores for every input."""

    def __init__(self):
        def constant_scores(batch):
            # Mock prediction: one [0.5, 0.5] row per input text.
            return [[0.5, 0.5]] * len(batch)

        self.model = constant_scores

    def __call__(self, text_input_list):
        """Forward the list of input texts to the mock model."""
        return self.model(text_input_list)

    def get_grad(self, text_input_list, labels):
        """The mock model has no gradients to offer; always returns None."""
        return None


# Load input file. The path is relative to the notebook's working directory and
# uses the same ./analyzed_articles/llama3_8b layout as the Data Generation cell;
# the file is read as UTF-8 to match how the output is written.
INPUT_JSON = "./analyzed_articles/llama3_8b/india_today_analyzed.json"
with open(INPUT_JSON, encoding="utf-8") as f:
    data = json.load(f)

# Select top N entries to attack for speed
top_n = 10  # you can change this
data = data[:top_n]

# Initialize model
model_wrapper = DummyWrapper()

# Define attacks: short label -> attack built from the matching recipe class,
# all sharing the same model wrapper.
recipe_classes = (
    ("PWWS", PWWSRen2019),
    ("TextFooler", TextFoolerJin2019),
    ("DeepWordBug", DeepWordBugGao2018),
    ("BAE", BAEGarg2019),
    ("CheckList", CheckList2020),
)
attack_recipes = {
    label: recipe_class.build(model_wrapper) for label, recipe_class in recipe_classes
}

# Prepare dataset for attack
augmented_data = []

print(f"Starting attacks on {len(data)} articles...")


def _perturb(attack, text):
    """Run ``attack`` on ``text``; return the perturbed text or an "Error: ..." string."""
    try:
        # NOTE(review): the ground-truth output is passed as an empty string;
        # confirm the recipes' goal functions accept that (a classification goal
        # presumably expects an integer label). With DummyWrapper's constant
        # scores, also verify the perturbed text actually differs from the input.
        return attack.attack(text, "").perturbed_text()
    except Exception as e:
        return f"Error: {str(e)}"


# Pass 1: attack several fields of each article; augmented_versions maps
# attack name -> {field: perturbed text}.
fields_to_augment = ["source", "title", "text", "keywords"]
for item in tqdm(data):
    versions_by_attack = {}
    for name, attack in attack_recipes.items():
        field_augmentations = {}
        for field in fields_to_augment:
            field_value = item.get(field, "")
            # If keywords is a list, join to a string
            if isinstance(field_value, list):
                field_value = ", ".join(field_value)
            field_augmentations[field] = _perturb(attack, field_value)
        versions_by_attack[name] = field_augmentations

    augmented_data.append({"original": item, "augmented_versions": versions_by_attack})


# Pass 2: attack only the "text" field; augmented_versions maps attack name ->
# perturbed text.
# NOTE(review): "text" is already covered by pass 1 and these entries have a
# different shape in the same output list -- confirm both passes are wanted.
for item in tqdm(data):
    original_text = item.get("text", "")
    augmented_data.append({
        "original": item,
        "augmented_versions": {
            name: _perturb(attack, original_text)
            for name, attack in attack_recipes.items()
        },
    })

# Save output. The path is kept in one variable so the confirmation message
# always names the file that was actually written.
OUTPUT_JSON = "augmented_india_today.json"
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(augmented_data, f, ensure_ascii=False, indent=2)

print(f"Done! Saved to '{OUTPUT_JSON}'")

2025-04-19 10:40:13.898825: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'lru'

In [16]:
!pip install jieba lemminflect




In [14]:
#!pip install transformers
!pip install flair

Collecting flair
  Using cached flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Using cached boto3-1.37.37-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Using cached conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting deprecated>=1.2.13 (from flair)
  Using cached Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting ftfy>=6.1.0 (from flair)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting gdown>=4.4.0 (from flair)
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect>=1.0.9 (from flair)
  Using cached langdetect-1.0.9-py3-none-any.whl
Collecting mpld3>=0.3 (from flair)
  Using cached mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Using cached pptree-3.1-py3-none-any.whl
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Using cached pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting 