In [None]:
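# One-time setup: uncomment and run this cell once to download the NLTK
# resources (tokenizer models, POS tagger, WordNet) used by the cells below.
# import nltk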
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/salveendutt/nltk_data...


True

In [1]:
import nltk
import pandas as pd
import random
from topic_modeling import *
from nltk.tokenize import word_tokenize, sent_tokenize
random.seed(42)
from nltk.corpus import wordnet


In [2]:
def get_synonyms(word):
    """Return the single-word WordNet synonyms of ``word``.

    The word is POS-tagged on its own with ``nltk.pos_tag`` and only synsets
    of the matching WordNet part of speech are searched.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of unique lemma names that contain no underscore
        (WordNet joins multi-word lemmas with ``_``) and that differ from
        ``word`` when case is ignored. The list is empty when the POS tag
        does not map to a WordNet part of speech or nothing suitable exists.
    """
    # Only the first two characters of the Penn Treebank tag are used for the
    # lookup, so 'NNS', 'VBD', 'VBZ', ... all collapse onto these four keys.
    tag_map = {
        'JJ': wordnet.ADJ,
        'NN': wordnet.NOUN,
        'RB': wordnet.ADV,
        'VB': wordnet.VERB,
    }

    pos_tag = nltk.pos_tag([word])[0][1]
    wordnet_pos = tag_map.get(pos_tag[:2])
    if not wordnet_pos:
        return []

    # Compare case-insensitively so that "Good" does not get "good" back as a
    # "synonym" of itself.
    word_folded = word.lower()
    synonyms = {
        lemma.name()
        for syn in wordnet.synsets(word, pos=wordnet_pos)
        for lemma in syn.lemmas()
        if '_' not in lemma.name() and lemma.name().lower() != word_folded
    }

    # Sorted so the result does not depend on set iteration order, which for
    # strings varies between interpreter runs; callers pick from this list
    # with random.choice(), so a stable order is needed for seeded runs to
    # be reproducible.
    return sorted(synonyms)

def get_antonyms(word):
    """Return the single-word WordNet antonyms of ``word``.

    The word is POS-tagged on its own with ``nltk.pos_tag``; antonyms are then
    collected from every lemma of every synset of the matching WordNet part
    of speech.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of unique antonym lemma names that contain no
        underscore (WordNet joins multi-word lemmas with ``_``) and that
        differ from ``word`` when case is ignored. The list is empty when the
        POS tag does not map to a WordNet part of speech or no antonym exists.
    """
    # Only the first two characters of the Penn Treebank tag are used for the
    # lookup, so 'NNS', 'VBD', 'VBZ', ... all collapse onto these four keys.
    tag_map = {
        'JJ': wordnet.ADJ,
        'NN': wordnet.NOUN,
        'RB': wordnet.ADV,
        'VB': wordnet.VERB,
    }

    pos_tag = nltk.pos_tag([word])[0][1]
    wordnet_pos = tag_map.get(pos_tag[:2])
    if not wordnet_pos:
        return []

    word_folded = word.lower()
    antonyms = {
        antonym.name()
        for syn in wordnet.synsets(word, pos=wordnet_pos)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
        if '_' not in antonym.name() and antonym.name().lower() != word_folded
    }

    # Sorted so the result does not depend on set iteration order, which for
    # strings varies between interpreter runs; callers pick from this list
    # with random.choice(), so a stable order is needed for seeded runs to
    # be reproducible.
    return sorted(antonyms)

In [3]:


# Lazily-filled list of every WordNet synset. Building it iterates over all of
# WordNet, so it is done once per kernel instead of once per call.
_WORDNET_SYNSET_POOL = None


def _get_wordnet_synset_pool():
    """Return the cached list of all WordNet synsets, loading it on first use."""
    global _WORDNET_SYNSET_POOL
    if _WORDNET_SYNSET_POOL is None:
        _WORDNET_SYNSET_POOL = list(wordnet.all_synsets())
    return _WORDNET_SYNSET_POOL


def _random_single_word(synsets):
    """Draw random synsets until one whose first lemma has no underscore is found."""
    while True:
        word = random.choice(synsets).lemmas()[0].name()
        if '_' not in word:  # WordNet joins multi-word lemmas with '_'
            return word


def add_random_word(text, probability=0.1):
    """Insert random WordNet words after words of ``text``.

    NOTE: a later cell in this notebook defines another ``add_random_word``;
    whichever definition is executed last is the one that gets called.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance, evaluated independently after each
            whitespace-separated word, of inserting one random word.

    Returns:
        The words of ``text`` joined by single spaces, with the inserted
        words in place. An inserted word is the first lemma of a uniformly
        chosen synset (any part of speech), redrawn until it is a single word.
    """
    if not isinstance(text, str):
        return text

    synsets = _get_wordnet_synset_pool()

    modified_words = []
    for word in text.split():
        modified_words.append(word)
        if random.random() < probability:
            modified_words.append(_random_single_word(synsets))

    return ' '.join(modified_words)


def add_random_character(text, probability=0.1):
    """Insert a random lowercase letter into some words of ``text``.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance, evaluated independently for each
            whitespace-separated word longer than two characters, that one
            letter is inserted into it.

    Returns:
        The (possibly modified) words joined by single spaces. The letter is
        inserted at an index between 1 and ``len(word) - 1``, so a word's
        first and last characters are never displaced.
    """
    if not isinstance(text, str):
        return text

    def perturb(token):
        # Short tokens are skipped before any random number is drawn.
        if len(token) > 2 and random.random() < probability:
            cut = random.randint(1, len(token) - 1)
            letter = random.choice('abcdefghijklmnopqrstuvwxyz')
            return ''.join((token[:cut], letter, token[cut:]))
        return token

    return ' '.join(perturb(token) for token in text.split())

def random_word_deletion(text, probability=0.1):
    """Drop each word of ``text`` independently with the given probability.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance that any single whitespace-separated word is
            removed.

    Returns:
        ``text`` itself when it has three words or fewer; otherwise the
        surviving words joined by single spaces. If every word was dropped,
        one randomly chosen original word is returned instead.
    """
    if not isinstance(text, str):
        return text

    tokens = text.split()
    # Very short texts are left exactly as they are.
    if len(tokens) <= 3:
        return text

    survivors = [token for token in tokens if random.random() >= probability]

    # Never return an empty document: fall back to a single original word.
    if not survivors:
        survivors = [random.choice(tokens)]

    return ' '.join(survivors)

def shuffle_sentences(text, probability=0.5):
    """Randomly reorder the sentences of ``text`` with the given probability.

    Args:
        text: The input text. Non-string values and strings shorter than ten
            characters are returned unchanged.
        probability: Chance that the document is split into sentences and
            re-joined at all.

    Returns:
        ``text`` itself when the random draw does not trigger; otherwise the
        sentences from ``sent_tokenize`` joined by single spaces, shuffled
        when there is more than one of them.
    """
    skip = not isinstance(text, str) or len(text) < 10
    if skip:
        return text

    triggered = random.random() < probability
    if not triggered:
        return text

    parts = sent_tokenize(text)
    if len(parts) > 1:
        random.shuffle(parts)
    return ' '.join(parts)

def replace_with_synonym(text, probability=0.1):
    """Replace some words of ``text`` with a WordNet synonym.

    Words are looked up in lower case and the replacement is capitalized when
    the original word started with an upper-case letter, mirroring what
    ``create_adversarial_examples`` does for antonyms.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance, evaluated independently for each alphabetic
            token longer than three characters, that a replacement is tried.

    Returns:
        The tokens from ``nltk.word_tokenize`` joined by single spaces, with
        the selected words replaced. A selected word is kept as-is when it
        has no synonym other than itself.
    """
    if not isinstance(text, str):
        return text

    modified_words = []
    for word in nltk.word_tokenize(text):
        if word.isalpha() and len(word) > 3 and random.random() < probability:
            lowered = word.lower()
            # Drop candidates that are just a case variant of the word itself;
            # swapping "NASA" for "nasa" is not a synonym replacement.
            synonyms = [s for s in get_synonyms(lowered) if s.lower() != lowered]
            if synonyms:
                replacement = random.choice(synonyms)
                # Keep the original capitalization pattern
                if word[0].isupper():
                    replacement = replacement.capitalize()
                word = replacement
        modified_words.append(word)

    return ' '.join(modified_words)

def create_adversarial_examples(text, probability=0.1, fallback_to_predefined=True):
    """Swap some words of ``text`` for an antonym.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance, evaluated independently for each alphabetic
            token longer than two characters, that a swap is attempted.
        fallback_to_predefined: When true, a small built-in antonym table is
            consulted for words that ``get_antonyms`` returns nothing for.

    Returns:
        The tokens from ``nltk.word_tokenize`` joined by single spaces, with
        the selected words swapped. A replacement is capitalized when the
        original token started with an upper-case letter; tokens without any
        antonym are kept as they are.
    """
    if not isinstance(text, str):
        return text

    # Fallback table, built symmetrically from pairs.
    # NOTE(review): 'up' has two characters, so it never passes the
    # ``len(token) > 2`` filter below and its entry cannot be reached.
    antonym_pairs = (
        ('good', 'bad'), ('high', 'low'), ('increase', 'decrease'),
        ('positive', 'negative'), ('success', 'failure'), ('true', 'false'),
        ('right', 'wrong'), ('happy', 'sad'), ('up', 'down'),
        ('big', 'small'), ('fast', 'slow'), ('strong', 'weak'),
        ('rich', 'poor'), ('win', 'lose'), ('best', 'worst'),
        ('approve', 'reject'), ('agree', 'disagree'), ('accept', 'deny'),
        ('buy', 'sell'),
    )
    predefined_antonyms = {}
    for left, right in antonym_pairs:
        predefined_antonyms[left] = right
        predefined_antonyms[right] = left

    result = []
    for token in nltk.word_tokenize(text):
        selected = token.isalpha() and len(token) > 2 and random.random() < probability
        if not selected:
            result.append(token)
            continue

        lowered = token.lower()
        candidates = get_antonyms(lowered)

        if candidates:
            # WordNet knows an antonym: pick one at random.
            swap = random.choice(candidates)
        elif fallback_to_predefined and lowered in predefined_antonyms:
            # WordNet has nothing: use the built-in table.
            swap = predefined_antonyms[lowered]
        else:
            result.append(token)
            continue

        # Keep the original capitalization pattern
        result.append(swap.capitalize() if token[0].isupper() else swap)

    return ' '.join(result)

In [4]:
# Cache of single-word WordNet lemma names; None until add_random_word
# builds it on first use.
_CACHED_WORDS = None

def _build_word_cache():
    """Collect every single-word lemma name found in WordNet.

    Returns:
        A sorted list of the unique lemma names, across all synsets, that
        contain no underscore (WordNet joins multi-word lemmas with ``_``).
    """
    unique_words = {
        lemma.name()
        for synset in wordnet.all_synsets()
        for lemma in synset.lemmas()
        if '_' not in lemma.name()  # only simple words
    }
    # Sorted so that the list order, and therefore what a seeded
    # random.choice() returns, does not depend on set iteration order, which
    # for strings varies between interpreter runs.
    return sorted(unique_words)

def add_random_word(text, probability=0.1):
    """Insert random WordNet words after words of ``text``.

    Args:
        text: The input text. Non-string values are returned unchanged.
        probability: Chance, evaluated independently after each
            whitespace-separated word, of inserting one random word.

    Returns:
        The words of ``text`` joined by single spaces, with the inserted
        words in place. Inserted words are drawn uniformly from the cached
        list of single-word lemma names.
    """
    global _CACHED_WORDS
    if not isinstance(text, str):
        return text

    # Build cache on first use
    if _CACHED_WORDS is None:
        _CACHED_WORDS = _build_word_cache()

    modified_words = []
    for word in text.split():
        modified_words.append(word)
        if random.random() < probability:
            modified_words.append(random.choice(_CACHED_WORDS))

    return ' '.join(modified_words)

## AG News Dataset

In [6]:
# Load the AG News test and train splits (paths are relative to the
# notebook's working directory).
# NOTE(review): ag_train is loaded and labelled below but not used anywhere
# else in the cells visible here.
ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train.csv')

# Human-readable names for the values of the 'Class Index' column
class_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}

# Add a 'Class' column holding the mapped class name for each row
ag_test['Class'] = ag_test['Class Index'].replace(class_mapping)
ag_train['Class'] = ag_train['Class Index'].replace(class_mapping)

# One copy of the test descriptions per perturbation, each helper running with
# its default probability. The helpers draw from the global `random`
# generator, so the order of these statements determines the generated text.
ag_news_baseline = ag_test['Description']
ag_news_char_insertion = ag_test['Description'].apply(add_random_character)
ag_news_word_deletion = ag_test['Description'].apply(random_word_deletion)
ag_news_shuffle_sent = ag_test['Description'].apply(shuffle_sentences)
ag_news_adversarial = ag_test['Description'].apply(create_adversarial_examples)
ag_news_synonym = ag_test['Description'].apply(replace_with_synonym)
ag_news_word_insertion = ag_test['Description'].apply(add_random_word)
# All six perturbations chained, each applied to the previous one's output
ag_news_combined = ag_test['Description'].apply(replace_with_synonym) \
    .apply(create_adversarial_examples) \
    .apply(add_random_word) \
    .apply(shuffle_sentences) \
    .apply(random_word_deletion) \
    .apply(add_random_character)

# Every variant is paired with the same numeric labels ('Class Index', not the
# mapped 'Class' names); the perturbations only change the text.
ag_news_true_labels = ag_test['Class Index']

# Dataset name -> (texts, labels), as passed to orchestrator.evaluate().
# NOTE(review): "Sentances" is a misspelling of "Sentences"; the key is left
# untouched here because it is the dataset label that appears in the results.
ag_news = {
    "AG News":(ag_news_baseline, ag_news_true_labels),
    "AG News Added Random Chars":(ag_news_char_insertion, ag_news_true_labels),
    "AG News Random Word Deletion":(ag_news_word_deletion, ag_news_true_labels),
    "AG News Shuffled Sentances":(ag_news_shuffle_sent, ag_news_true_labels),
    "AG News Adversarial":(ag_news_adversarial, ag_news_true_labels),
    "AG News Synonym":(ag_news_synonym, ag_news_true_labels),
    "AG News Added Random Word":(ag_news_word_insertion, ag_news_true_labels),
    "AG News Noisy":(ag_news_combined, ag_news_true_labels),
}

In [7]:
# TopicModelOrchestrator is not imported by name; presumably it comes from the
# `from topic_modeling import *` at the top of the notebook — TODO confirm.
orchestrator = TopicModelOrchestrator()

# Register one model per type with n_topics=4 (presumably chosen to match the
# four AG News classes). As the last expression of the cell, the call's return
# value is what the notebook displays.
orchestrator.add_models_grid(
    model_types=['LDA', 'LSI', 'NMF'],
    param_grid={'n_topics': [4]}
)

['LDA_4', 'LSI_4', 'NMF_4']

In [8]:
# Evaluate the registered models on every dataset variant in `ag_news`; the
# recorded output logs each dataset name followed by each evaluated model.
results = orchestrator.evaluate(ag_news)

Evaluating models on dataset: AG News
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Added Random Chars
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Random Word Deletion
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Shuffled Sentances
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Adversarial
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Synonym
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Added Random Word
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model: NMF_4
Evaluating models on dataset: AG News Noisy
  Evaluated model: LDA_4
  Evaluated model: LSI_4
  Evaluated model:

In [9]:
# Display the orchestrator's `results` attribute as the cell output.
# NOTE(review): the `results` variable returned by evaluate() is not used
# here; presumably it holds the same data — confirm.
orchestrator.results

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI Score,Topic Coherence,Cosine Similarity,Reconstruction Error,Model,Dataset
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AG News,LDA_4,0.154719,0.533687,0.389324,,LDA_4,AG News
AG News,LSI_4,0.111348,0.631264,0.56431,,LSI_4,AG News
AG News,NMF_4,0.40505,0.722596,0.473902,86.28456,NMF_4,AG News
AG News Added Random Chars,LDA_4,0.035902,0.378694,0.335707,,LDA_4,AG News Added Random Chars
AG News Added Random Chars,LSI_4,0.00519,0.602564,0.627627,,LSI_4,AG News Added Random Chars
AG News Added Random Chars,NMF_4,0.323788,0.710161,0.514478,86.497064,NMF_4,AG News Added Random Chars
AG News Random Word Deletion,LDA_4,0.146251,0.459802,0.383968,,LDA_4,AG News Random Word Deletion
AG News Random Word Deletion,LSI_4,0.106572,0.531868,0.557175,,LSI_4,AG News Random Word Deletion
AG News Random Word Deletion,NMF_4,0.391456,0.712067,0.474443,86.339798,NMF_4,AG News Random Word Deletion
AG News Shuffled Sentances,LDA_4,0.154719,0.533687,0.389324,,LDA_4,AG News Shuffled Sentances
