In [1]:
# One-time setup: uncomment and run this cell to download the NLTK resources listed below.
# NOTE(review): presumably these are needed by the text-processing / noise code used later — confirm,
# and consider moving this step to project setup instructions instead of a commented-out cell.
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

In [1]:
import pandas as pd
import random
# NOTE(review): wildcard imports hide where names come from. TopicModelingPipelineOrchestrator and
# ClassificationPipelineOrchestrator, used further down, are presumably provided by these two
# project modules — confirm, and prefer explicit imports.
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seeds only Python's built-in `random` module. The pandas `.sample(...)` calls in this notebook
# pass their own `random_state`; any other source of randomness is not seeded here.
random.seed(42)

## AG News Dataset

In [2]:
# --- AG News: load the CSV files and draw fixed-size, seeded samples ---------------------------

# Tunable settings kept in one place instead of being scattered through the cell as magic numbers.
AG_NEWS_DIR = '../data/AG News'
AG_NEWS_TRAIN_SAMPLE_SIZE = 1000
AG_NEWS_TEST_SAMPLE_SIZE = 3600
AG_NEWS_SAMPLE_SEED = 42

ag_test = pd.read_csv(f'{AG_NEWS_DIR}/test.csv')
ag_train = pd.read_csv(f'{AG_NEWS_DIR}/train.csv')

# Readable names for the codes stored in the 'Class Index' column.
# The mapping is NOT applied here: the label series below keep the raw 'Class Index' values.
class_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}

# Training subset: sample the descriptions, then select the labels of exactly the same rows.
sampled = ag_train['Description'].sample(
    AG_NEWS_TRAIN_SAMPLE_SIZE, random_state=AG_NEWS_SAMPLE_SEED
)
ag_news_train_baseline = sampled
ag_news_train_true_labels = ag_train.loc[sampled.index, 'Class Index']

# Evaluation subset: sample row labels once and use them for both texts and class labels.
sampled_indices = ag_test.sample(
    AG_NEWS_TEST_SAMPLE_SIZE, random_state=AG_NEWS_SAMPLE_SEED
).index
ag_news_baseline = ag_test.loc[sampled_indices, 'Description']
ag_news_true_labels = ag_test.loc[sampled_indices, 'Class Index']

# Guard against silent misalignment: texts and labels must describe the same rows, in the same order.
assert ag_news_train_baseline.index.equals(ag_news_train_true_labels.index)
assert ag_news_baseline.index.equals(ag_news_true_labels.index)

# Dataset name -> (texts, labels); these dictionaries are what the orchestrators receive.
ag_news_train = {
    "AG News": (ag_news_train_baseline, ag_news_train_true_labels),
}

ag_news = {
    "AG News": (ag_news_baseline, ag_news_true_labels),
}

In [4]:
# Topic-modelling side: one grid entry per model type, each with n_topics fixed to 4.
topic_model_types = ['LDA', 'LSI', 'NMF']
topic_param_grid = {'n_topics': [4]}

topic_modelling_orchestrator = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator.add_models_grid(
    model_types=topic_model_types,
    param_grid=topic_param_grid,
)

# Classification side: no param_grid is passed for these model types.
# The Roberta-based variants ('SVMRoberta', 'XGBoostRoberta', 'LightGBMRoberta') are currently disabled.
classifier_types = ['SVM', 'XGBoost', 'RandomForest', 'LightGBM']

class_orchestrator = ClassificationPipelineOrchestrator()
# Kept as the final expression so the cell still displays the value returned by add_models_grid.
class_orchestrator.add_models_grid(model_types=classifier_types)

['SVM', 'XGBoost', 'RandomForest', 'LightGBM']

In [5]:
# NOTE(review): mid-notebook wildcard import; left in place because other cells may rely on the
# names it brings in. Prefer an explicit import in the top import cell.
from noise_strategy import *

# Strategies are listed in the order they are handed to the orchestrators.
# Only ShuffleSentencesNoise overrides a constructor argument; the others use their defaults.
noise_strategies = [
    NoNoise(),
    AddRandomCharsNoise(),
    AddRandomWordsNoise(),
    DeleteRandomWordsNoise(),
    ShuffleSentencesNoise(noise_level=0.7),
    ReplaceWithSynonymsNoise(),
    ReplaceWithAntonymsNoise(),
]

In [6]:
# Run the classification orchestrator with the AG News train/test dictionaries and the list of
# noise strategies. As the last expression of the cell, its return value is rendered as the output.
# NOTE(review): presumably models are fitted on `ag_news_train` and scored on `ag_news` — confirm
# against ClassificationPipelineOrchestrator.evaluate_with_training.
class_orchestrator.evaluate_with_training(ag_news_train, ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/4 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
22,AG News,AddRandomCharsNoise,LightGBM,0.618889,0.619569,0.623418,0.618889
15,AG News,AddRandomCharsNoise,RandomForest,0.744444,0.74371,0.754973,0.744444
1,AG News,AddRandomCharsNoise,SVM,0.810833,0.813637,0.830119,0.810833
8,AG News,AddRandomCharsNoise,XGBoost,0.711389,0.712402,0.723162,0.711389
23,AG News,AddRandomWordsNoise,LightGBM,0.628611,0.6289,0.631267,0.628611
16,AG News,AddRandomWordsNoise,RandomForest,0.76,0.760479,0.770731,0.76
2,AG News,AddRandomWordsNoise,SVM,0.82,0.822526,0.836986,0.82
9,AG News,AddRandomWordsNoise,XGBoost,0.726389,0.727427,0.735543,0.726389
24,AG News,DeleteRandomWordsNoise,LightGBM,0.611944,0.612355,0.615544,0.611944
17,AG News,DeleteRandomWordsNoise,RandomForest,0.7475,0.747074,0.758981,0.7475


In [6]:
# Evaluate the configured topic models on the `ag_news` dictionary, passing the same noise strategy list.
# NOTE(review): the saved output of this cell reports "Noise Strategies: 0/1" and the results table
# shown afterwards contains only NoNoise rows, although `noise_strategies` holds seven entries; the
# execution counts in this notebook also repeat. The stored outputs look stale — re-run with
# Restart Kernel -> Run All before relying on them.
# NOTE(review): the gensim INFO log lines dominate the output; consider raising the log level.
topic_modelling_orchestrator.evaluate(ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:02,538 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:02,563 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:02,563 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:02.563681', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:02,568 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:08,953 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:09,928 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:10,033 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:10,042 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:10.042831', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:10,050 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:16,056 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:17,179 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:17,247 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:17,256 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:17.256802', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:17,264 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:23,166 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

In [7]:
# Display the topic-modelling orchestrator's `results` attribute as the cell output.
topic_modelling_orchestrator.results

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News,NoNoise,LDA_4,0.106245,0.468141,0.366147,
1,AG News,NoNoise,LSI_4,0.101955,0.55697,0.543418,
2,AG News,NoNoise,NMF_4,0.405814,0.697631,0.465874,59.355705


In [7]:
# Bind the classification orchestrator's `results` to a short name for inspection.
# NOTE(review): `df` is a generic name; something like `classification_results` would be clearer.
df = class_orchestrator.results

In [13]:
# Show the rows ordered by ascending 'F1 Score' (lowest-scoring combinations first).
# sort_values returns a new object, so `df` itself keeps its original order.
df.sort_values('F1 Score')

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
24,AG News,DeleteRandomWordsNoise,LightGBM,0.611944,0.612355,0.615544,0.611944
22,AG News,AddRandomCharsNoise,LightGBM,0.618889,0.619569,0.623418,0.618889
26,AG News,ReplaceWithSynonymsNoise,LightGBM,0.626389,0.626331,0.627929,0.626389
23,AG News,AddRandomWordsNoise,LightGBM,0.628611,0.6289,0.631267,0.628611
27,AG News,ReplaceWithAntonymsNoise,LightGBM,0.6325,0.632513,0.634365,0.6325
25,AG News,ShuffleSentencesNoise,LightGBM,0.634167,0.634334,0.636044,0.634167
21,AG News,NoNoise,LightGBM,0.634167,0.634334,0.636044,0.634167
10,AG News,DeleteRandomWordsNoise,XGBoost,0.710556,0.711175,0.721657,0.710556
8,AG News,AddRandomCharsNoise,XGBoost,0.711389,0.712402,0.723162,0.711389
12,AG News,ReplaceWithSynonymsNoise,XGBoost,0.722778,0.723637,0.731897,0.722778
