In [1]:
# One-time setup (kept disabled): uncomment and run once to download the NLTK
# resources listed below. Presumably they are required by the project
# pipelines imported later — TODO confirm which module needs them.
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

In [1]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seed Python's built-in `random` generator for reproducibility. This call
# does not seed NumPy or any other library's random number generator.
random.seed(42)

## AG News Dataset

In [None]:
# Load the raw AG News splits (paths are relative to the notebook's folder).
ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train.csv')

# Human-readable names for the integer codes stored in the 'Class Index' column.
class_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}

# Add a readable 'Class' column to both splits; codes that are not in the
# mapping are left unchanged by `replace`.
for split_frame in (ag_test, ag_train):
    split_frame['Class'] = split_frame['Class Index'].replace(class_mapping)

# Training subset: draw a reproducible random sample of descriptions, then
# pick the labels that sit at the same row index.
# NOTE(review): the outputs recorded further down in this notebook (label
# counts summing to 100, "Processing 100 documents") do not match a sample of
# 10 — confirm the intended sample size and re-run.
sampled = ag_train['Description'].sample(10, random_state=42)
ag_news_train_baseline = sampled
ag_news_train_true_labels = ag_train.loc[sampled.index, 'Class Index']

# Evaluation data: the complete test split (texts and integer labels).
ag_news_baseline, ag_news_true_labels = ag_test['Description'], ag_test['Class Index']

# Both dictionaries map a dataset name to a (texts, labels) pair; they are
# what the orchestrator calls later in the notebook receive.
ag_news_train = {"AG News": (ag_news_train_baseline, ag_news_train_true_labels)}
ag_news = {"AG News": (ag_news_baseline, ag_news_true_labels)}

In [3]:
# Show how many sampled training rows fall into each 'Class Index' value.
# NOTE(review): the recorded output below totals 100 rows, whereas the loading
# cell samples 10 — the saved output looks stale; re-run to confirm.
ag_news_train_true_labels.value_counts()

Class Index
2    30
3    27
1    24
4    19
Name: count, dtype: int64

In [4]:
# Create the topic-modelling orchestrator and register one configuration per
# model type, each with n_topics=4 (AG News has four classes, see
# `class_mapping`). The call's return value is displayed as the cell output;
# the recorded output lists the registered names 'LDA_4', 'LSI_4', 'NMF_4'.
topic_modelling_orchestrator = TopicModelingPipelineOrchestrator()

topic_modelling_orchestrator.add_models_grid(
    model_types=['LDA', 'LSI', 'NMF'],
    param_grid={'n_topics': [4]}
)

['LDA_4', 'LSI_4', 'NMF_4']

In [6]:
# Create the classification orchestrator and register the two classifiers.
# No `param_grid` is passed, so each model is registered without explicit
# parameters (presumably the library defaults — TODO confirm). The call's
# return value is displayed as the cell output.
class_orchestrator = ClassificationPipelineOrchestrator()

class_orchestrator.add_models_grid(
    model_types=[
        'SVMRoberta', 
        'SVM'],
    # param_grid={'SVMRoberta': [{}], 'SVM': [{}]}  # Empty dictionary means default parameters

)

['SVMRoberta', 'SVM']

In [7]:
from noise_strategy import *
# Noise strategies applied during evaluation. Only the no-noise baseline and
# sentence shuffling are active; the commented-out entries are other
# strategies that are currently disabled.
# NOTE(review): both recorded result tables in this notebook show identical
# metrics for NoNoise and ShuffleSentencesNoise — confirm that the shuffle
# actually changes what the models receive.
noise_strategies = [
    NoNoise(),
    # AddRandomCharsNoise(), 
    # AddRandomWordsNoise(), 
    # DeleteRandomWordsNoise(), 
    ShuffleSentencesNoise(noise_level=0.7), 
    # ReplaceWithSynonymsNoise(), 
    # ReplaceWithAntonymsNoise()
]

In [8]:
# Unpack the (texts, labels) pair stored under the 'AG News' key.
# NOTE(review): `x` and `y` are not used by any later cell visible in this
# notebook — this looks like leftover inspection code.
x,y = ag_news_train['AG News']

In [9]:
# Run the classification experiment over every registered model and every
# noise strategy. Presumably the first mapping is used for training and the
# second for evaluation — TODO confirm against `evaluate_with_training`.
# The returned results table is displayed as the cell output.
# NOTE(review): the recorded SVMRoberta rows (accuracy 0.25, precision 1.0,
# plus `_warn_prf` warnings) look degenerate — verify the predictions and the
# argument order of the metric calls inside the pipeline.
class_orchestrator.evaluate_with_training(ag_news_train, ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/2 [00:00<?, ?it/s]

2025-05-18 12:08:43,096 [INFO] classification_models: Processing 100 documents for RoBERTa embeddings.
  _warn_prf(average, modifier, msg_start, len(result))
2025-05-18 12:11:15,074 [INFO] classification_models: Processing 100 documents for RoBERTa embeddings.
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
2,AG News,NoNoise,SVM,0.353289,0.440713,0.90706,0.353289
0,AG News,NoNoise,SVMRoberta,0.25,0.4,1.0,0.25
3,AG News,ShuffleSentencesNoise,SVM,0.353289,0.440713,0.90706,0.353289
1,AG News,ShuffleSentencesNoise,SVMRoberta,0.25,0.4,1.0,0.25


In [None]:
# Run every registered topic model on the test-split mapping under each noise
# strategy. This emits verbose gensim INFO logs (see the recorded output).
topic_modelling_orchestrator.evaluate(ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/2 [00:00<?, ?it/s]

2025-05-17 20:23:34,939 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-17 20:23:34,990 [INFO] gensim.corpora.dictionary: built Dictionary<15168 unique tokens: ['disappoint', 'feder', 'firm', 'mogul', 'newal']...> from 7600 documents (total 139244 corpus positions)
2025-05-17 20:23:34,990 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<15168 unique tokens: ['disappoint', 'feder', 'firm', 'mogul', 'newal']...> from 7600 documents (total 139244 corpus positions)", 'datetime': '2025-05-17T20:23:34.990659', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-17 20:23:34,996 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-17 20:23:40,956 [INFO] gensim.topic_coherence.text_analysis: 10 ac

In [6]:
# Display the topic-modelling results table (ARI, coherence, cosine
# similarity, reconstruction error per model and noise strategy).
# The name `orchestrator` is never defined in this notebook, so the object
# created earlier as `topic_modelling_orchestrator` is referenced instead;
# this keeps the cell working after Restart Kernel -> Run All.
# Assumes the orchestrator exposes its collected metrics as `.results`,
# as the recorded output of this cell indicates.
topic_modelling_orchestrator.results

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News,NoNoise,LDA_4,0.154719,0.533687,0.389324,
2,AG News,NoNoise,LSI_4,0.111348,0.631264,0.56431,
4,AG News,NoNoise,NMF_4,0.40505,0.722596,0.473902,86.28456
1,AG News,ShuffleSentencesNoise,LDA_4,0.154719,0.533687,0.389324,
3,AG News,ShuffleSentencesNoise,LSI_4,0.111348,0.631264,0.56431,
5,AG News,ShuffleSentencesNoise,NMF_4,0.40505,0.722596,0.473902,86.28456
