In [1]:
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

In [1]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# This call does not seed NumPy; the pandas `.sample(...)` calls in this notebook
# pass their own `random_state=42` instead.
random.seed(42)

## AG News Dataset

In [2]:
# Read both AG News splits from the local data directory into DataFrames.
ag_train, ag_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/AG News/train.csv', '../data/AG News/test.csv')
)

# Integer class id -> category name (ids start at 1).
class_to_text_mapping = dict(
    enumerate(("World", "Sports", "Business", "Science"), start=1)
)
# Inverse lookup (category name -> integer id), derived from the table above
# so the two mappings cannot drift apart.
text_to_class_mapping = {
    label: class_id for class_id, label in class_to_text_mapping.items()
}

# Apply the mapping to the class column (currently disabled)
# ag_test['Class'] = ag_test['Class Index'].replace(class_to_text_mapping)
# ag_train['Class'] = ag_train['Class Index'].replace(class_to_text_mapping)

# Training split: a fixed 100-row sample of descriptions (random_state pins the draw).
sampled = ag_train['Description'].sample(100, random_state=42)
ag_news_train_baseline = sampled
# Look the labels up by the sampled row index so texts and labels stay aligned.
ag_news_train_true_labels = ag_train.loc[sampled.index, 'Class Index']

# Test split: sample the row labels once, then pull both columns for those rows.
sampled_indices = ag_test.sample(100, random_state=42).index
ag_news_baseline, ag_news_true_labels = (
    ag_test.loc[sampled_indices, column]
    for column in ('Description', 'Class Index')
)

# Each dataset is packaged as {dataset name: (texts, labels)}.
ag_news_train = {"AG News": (ag_news_train_baseline, ag_news_train_true_labels)}
ag_news = {"AG News": (ag_news_baseline, ag_news_true_labels)}

In [3]:
# Topic-model families to register, each with a single n_topics value of 4.
topic_model_types = ['LDA', 'LSI', 'NMF']
topic_param_grid = {'n_topics': [4]}

topic_modelling_orchestrator = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator.add_models_grid(
    model_types=topic_model_types,
    param_grid=topic_param_grid,
)

# Classifier types to register. Only uncommented entries are passed on; the
# commented names are kept as quick toggles for other model types.
classification_model_types = [
    'TARSZeroShot',
    # 'SVM',
    # 'XGBoost',
    # 'RandomForest',
    # 'LightGBM',
    # 'SVMRoberta',
    # 'XGBoostRoberta',
    # 'LightGBMRoberta',
]

class_orchestrator = ClassificationPipelineOrchestrator()
# No param_grid is passed here. Disabled example of one:
# param_grid={'SVMRoberta': [{}], 'SVM': [{}]}  # Empty dictionary means default parameters
# The call stays the last expression so its return value is shown as the cell output.
class_orchestrator.add_models_grid(model_types=classification_model_types)

2025-05-21 23:15:05,707 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model


['TARSZeroShot']

In [4]:
from noise_strategy import *
# Noise strategies handed to the orchestrators' evaluation calls, in this order.
# Every strategy is built with its default arguments except ShuffleSentencesNoise,
# which is given noise_level=0.7.
# NOTE(review): NoNoise is presumably the unperturbed baseline — confirm in noise_strategy.
noise_strategies = [
    NoNoise(),
    AddRandomCharsNoise(), 
    AddRandomWordsNoise(), 
    DeleteRandomWordsNoise(), 
    ShuffleSentencesNoise(noise_level=0.7), 
    ReplaceWithSynonymsNoise(), 
    ReplaceWithAntonymsNoise()
]

In [5]:
# Load the LLM-assigned labels and translate their text form into integer class ids.
ag_news_llm_train_df = pd.read_csv('../outputs/llm_to_label/news_assigned_final.csv')
label_ids = ag_news_llm_train_df['predicted_label_text'].map(text_to_class_mapping)
# Label texts missing from the mapping come back as NaN; tag them with -1 so the
# column can be cast to int.
ag_news_llm_train_df['predicted_label'] = label_ids.fillna(-1).astype(int)

# Keep only the rows whose label text was found in the mapping.
has_known_label = ag_news_llm_train_df['predicted_label'].ne(-1)
ag_news_llm_train_df = ag_news_llm_train_df.loc[has_known_label]
ag_news_llm_train = {
    'AG News': (
        ag_news_llm_train_df['text'],
        ag_news_llm_train_df['predicted_label'],
    ),
}

In [6]:
# Run the classification orchestrator with the LLM-labelled texts as the training
# set and the sampled AG News test split as the evaluation set, once per noise
# strategy. The call is the last expression, so its return value is the cell output.
class_orchestrator.evaluate_with_training(ag_news_llm_train, ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
1,AG News,AddRandomCharsNoise,TARSZeroShot,0.74,0.711073,0.708966,0.74
2,AG News,AddRandomWordsNoise,TARSZeroShot,0.7,0.657159,0.640499,0.7
3,AG News,DeleteRandomWordsNoise,TARSZeroShot,0.72,0.686649,0.677779,0.72
0,AG News,NoNoise,TARSZeroShot,0.73,0.693013,0.683448,0.73
6,AG News,ReplaceWithAntonymsNoise,TARSZeroShot,0.73,0.693013,0.683448,0.73
5,AG News,ReplaceWithSynonymsNoise,TARSZeroShot,0.74,0.706667,0.7,0.74
4,AG News,ShuffleSentencesNoise,TARSZeroShot,0.73,0.693013,0.683448,0.73


In [7]:
# Same evaluation as for the LLM-labelled set, but trained on the 100-row sample of
# the original AG News training split with its gold 'Class Index' labels.
# NOTE(review): the output saved with this cell shows every run logging
# "Error evaluating model TARSZeroShot: list index out of range" and the cell
# ending in a KeyError for the result columns. The cause lies in the pipeline
# code, which is not visible here — investigate before relying on this cell.
class_orchestrator.evaluate_with_training(ag_news_train, ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

2025-05-21 22:40:23,321 `ZeroShot` is the current task. Switch to some other task before dropping this.
2025-05-21 22:40:23,367 `ZeroShot` is the current task. Switch to some other task before dropping this.
Error evaluating model TARSZeroShot: list index out of range
2025-05-21 22:40:24,153 `ZeroShot` is the current task. Switch to some other task before dropping this.
2025-05-21 22:40:24,202 `ZeroShot` is the current task. Switch to some other task before dropping this.
Error evaluating model TARSZeroShot: list index out of range
2025-05-21 22:40:25,039 `ZeroShot` is the current task. Switch to some other task before dropping this.
2025-05-21 22:40:25,089 `ZeroShot` is the current task. Switch to some other task before dropping this.
Error evaluating model TARSZeroShot: list index out of range
2025-05-21 22:40:25,780 `ZeroShot` is the current task. Switch to some other task before dropping this.
2025-05-21 22:40:25,825 `ZeroShot` is the current task. Switch to some other task before 

KeyError: "None of [Index(['Dataset', 'Noise', 'Model', 'Accuracy', 'F1 Score', 'Precision',\n       'Recall'],\n      dtype='object')] are in the [columns]"

In [6]:
# Evaluate the registered topic models on the sampled AG News test split under
# each noise strategy.
# NOTE(review): the saved output does not match the current inputs — it shows a
# single noise strategy and a dictionary built from 3600 documents, while
# `noise_strategies` holds 7 entries and `ag_news` is a 100-row sample. The
# output appears to come from an earlier run; restart the kernel and re-run all.
topic_modelling_orchestrator.evaluate(ag_news, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:02,538 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:02,563 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:02,563 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:02.563681', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:02,568 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:08,953 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:09,928 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:10,033 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:10,042 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:10.042831', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:10,050 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:16,056 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-18 17:12:17,179 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-05-18 17:12:17,247 [INFO] gensim.corpora.dictionary: built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)
2025-05-18 17:12:17,256 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<10608 unique tokens: ['citi', 'cup', 'entertain', 'epic', 'fa']...> from 3600 documents (total 65950 corpus positions)", 'datetime': '2025-05-18T17:12:17.256802', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.4.1-arm64-arm-64bit', 'event': 'created'}
2025-05-18 17:12:17,264 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-05-18 17:12:23,166 [INFO] gensim.topic_coherence.text_analysis: 10 accumulators retri

In [7]:
# Display the topic-modelling orchestrator's `results` attribute as the cell output.
topic_modelling_orchestrator.results

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News,NoNoise,LDA_4,0.106245,0.468141,0.366147,
1,AG News,NoNoise,LSI_4,0.101955,0.55697,0.543418,
2,AG News,NoNoise,NMF_4,0.405814,0.697631,0.465874,59.355705


In [7]:
# Keep a handle on the classification orchestrator's `results` for inspection.
df = class_orchestrator.results

In [13]:
# Show the classification results ordered by 'F1 Score', ascending (the pandas default).
# NOTE(review): the saved output lists LightGBM and XGBoost rows, but only
# 'TARSZeroShot' is enabled in the model grid, so this output comes from an
# earlier configuration — re-run to refresh it.
df.sort_values('F1 Score')

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
24,AG News,DeleteRandomWordsNoise,LightGBM,0.611944,0.612355,0.615544,0.611944
22,AG News,AddRandomCharsNoise,LightGBM,0.618889,0.619569,0.623418,0.618889
26,AG News,ReplaceWithSynonymsNoise,LightGBM,0.626389,0.626331,0.627929,0.626389
23,AG News,AddRandomWordsNoise,LightGBM,0.628611,0.6289,0.631267,0.628611
27,AG News,ReplaceWithAntonymsNoise,LightGBM,0.6325,0.632513,0.634365,0.6325
25,AG News,ShuffleSentencesNoise,LightGBM,0.634167,0.634334,0.636044,0.634167
21,AG News,NoNoise,LightGBM,0.634167,0.634334,0.636044,0.634167
10,AG News,DeleteRandomWordsNoise,XGBoost,0.710556,0.711175,0.721657,0.710556
8,AG News,AddRandomCharsNoise,XGBoost,0.711389,0.712402,0.723162,0.711389
12,AG News,ReplaceWithSynonymsNoise,XGBoost,0.722778,0.723637,0.731897,0.722778
