In [1]:
# # RUN BELOW WHEN RUNNING FOR THE FIRST TIME
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

### Core Assumptions:
* The business wants to classify incoming textual data.
* Minimal labeling is needed: zero to 20 labels per classification.

In [1]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seed Python's built-in `random` module so anything drawing from it repeats across runs.
# NOTE(review): this seeds only the stdlib RNG, not NumPy or other library generators —
# confirm whether the imported pipelines need their own seeding.
random.seed(42)

In [2]:
# Number of rows drawn from each test split when sampling is switched on.
test_sample_size = 30
# Switch: when truthy, the loader cells below replace each test split with a random sample.
taking_sample = True

## AG News Dataset

In [3]:
# Load the AG News test split and the training split that carries LLM-predicted topics.
ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train_from_llm.csv')

# Keep only training rows that have an LLM prediction. Reassigning the result
# (instead of inplace=True) avoids mutating the freshly loaded frame in place.
ag_train = ag_train.dropna(subset=['Predicted Topic Index'])
# NOTE(review): a column that contained NaN is typically parsed as float, so the LLM
# labels may be floats while 'Class Index' is integer — confirm downstream handling.

# Optionally shrink the test set to a fixed-seed random sample for quick runs.
if taking_sample:
    ag_test = ag_test.sample(test_sample_size, random_state=42)

# Test inputs and ground-truth labels.
ag_test_desc = ag_test['Description']
ag_test_labels = ag_test['Class Index']

# Training view labelled with the original class index.
ag_train_desc = ag_train['Description']
ag_train_labels = ag_train['Class Index']

# Training view labelled with the LLM-predicted topic index (same texts).
ag_train_llm_desc = ag_train['Description']
ag_train_llm_labels = ag_train['Predicted Topic Index']

### BBC News Dataset

In [4]:
# Load the BBC News test split and the training split that carries LLM-predicted topics.
bbc_test = pd.read_csv('../data/BBC News/test.csv')
bbc_train = pd.read_csv('../data/BBC News/train_from_llm.csv')

# Optionally shrink the test set to a fixed-seed random sample for quick runs.
if taking_sample:
    bbc_test = bbc_test.sample(test_sample_size, random_state=42)

# Test inputs and ground-truth labels.
bbc_test_desc = bbc_test['Description']
bbc_test_labels = bbc_test['Class Index']

# Training view labelled with the original class index.
bbc_train_desc = bbc_train['Description']
bbc_train_labels = bbc_train['Class Index']

# Training view labelled with the LLM-predicted topic index.
# Fix: these were previously read from `ag_train`, so the "BBC News LLM labels"
# experiment was trained on AG News texts and labels.
# NOTE(review): unlike the AG News and 20 Newsgroups cells, no rows with a missing
# or -1 prediction are filtered out here — confirm whether BBC needs the same step.
bbc_train_llm_desc = bbc_train['Description']
bbc_train_llm_labels = bbc_train['Predicted Topic Index']


### 20 Newsgroups

In [5]:
# Load the 20 Newsgroups test split and the training split that carries LLM-predicted topics.
newsgroups_test = pd.read_csv('../data/20 Newsgroups/test.csv')
newsgroups_train = pd.read_csv('../data/20 Newsgroups/train_from_llm.csv')
# Discard training rows whose predicted topic index is the -1 sentinel.
newsgroups_train = newsgroups_train[newsgroups_train['Predicted Topic Index'] != -1]

# Optionally shrink the test set to a fixed-seed random sample for quick runs.
if taking_sample:
    newsgroups_test = newsgroups_test.sample(test_sample_size, random_state=42)

# Test inputs and ground-truth labels.
newsgroups_test_desc = newsgroups_test['Description']
newsgroups_test_labels = newsgroups_test['Class Index']

# Training view labelled with the original class index.
newsgroups_train_desc = newsgroups_train['Description']
newsgroups_train_labels = newsgroups_train['Class Index']

# Training view labelled with the LLM-predicted topic index (same texts).
newsgroups_train_llm_desc = newsgroups_train['Description']
newsgroups_train_llm_labels = newsgroups_train['Predicted Topic Index']

### Combining Datasets

In [6]:
# len(bbc_test_desc), len(bbc_train_desc), len(ag_test_desc), len(ag_train_desc)

In [7]:
# Lookup tables handed to the orchestrators. Each value is a (descriptions, labels) pair.
# For every corpus the "Full" and "LLM labels" test entries point at the same test split;
# only the training entries differ in which label column they use.

# --- AG News only ---
datasets_test_ag_news = {
    name: (ag_test_desc, ag_test_labels)
    for name in ("AG News Full", "AG News LLM labels")
}
datasets_train_ag_news = {
    "AG News Full": (ag_train_desc, ag_train_labels),
    "AG News LLM labels": (ag_train_llm_desc, ag_train_llm_labels),
}

# --- BBC News only ---
datasets_test_bbc_news = {
    name: (bbc_test_desc, bbc_test_labels)
    for name in ("BBC News Full", "BBC News LLM labels")
}
datasets_train_bbc_news = {
    "BBC News Full": (bbc_train_desc, bbc_train_labels),
    "BBC News LLM labels": (bbc_train_llm_desc, bbc_train_llm_labels),
}

# --- 20 Newsgroups only ---
datasets_newsgroups = {
    name: (newsgroups_test_desc, newsgroups_test_labels)
    for name in ("20 Newsgroups Full", "20 Newsgroups LLM labels")
}
datasets_train_newsgroups = {
    "20 Newsgroups Full": (newsgroups_train_desc, newsgroups_train_labels),
    "20 Newsgroups LLM labels": (newsgroups_train_llm_desc, newsgroups_train_llm_labels),
}

# --- Combined tables for pipelines that work with all datasets ---
datasets_test = {
    **datasets_test_ag_news,
    **datasets_test_bbc_news,
    **datasets_newsgroups,
}
# NOTE(review): only three training sets are registered here while datasets_test
# has six entries — confirm the omission of the remaining ones is intentional.
datasets_train = {
    **datasets_train_ag_news,
    "BBC News Full": datasets_train_bbc_news["BBC News Full"],
}

In [8]:
# One topic-modelling orchestrator per corpus, so each can be registered with its own n_topics.
topic_modelling_orchestrator_ag_news = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator_bbc_news = TopicModelingPipelineOrchestrator()

# AG News grid: LDA / LSI / NMF with n_topics = 4.
topic_modelling_orchestrator_ag_news.add_models_grid(
    model_types=['LDA', 'LSI', 'NMF'],
    param_grid=dict(n_topics=[4]),
)

# BBC News grid: same model families with n_topics = 5.
# Kept as the last expression so its return value is what the cell displays.
topic_modelling_orchestrator_bbc_news.add_models_grid(
    model_types=['LDA', 'LSI', 'NMF'],
    param_grid=dict(n_topics=[5]),
)

['LDA_5', 'LSI_5', 'NMF_5']

In [9]:
# Two separate orchestrators: one for the zero-shot classifiers, one for the supervised ones.
class_orchestrator_zero_shot = ClassificationPipelineOrchestrator()
class_orchestrator_sup = ClassificationPipelineOrchestrator()

# Zero-shot grid.
class_orchestrator_zero_shot.add_models_grid(
    model_types=['LLMClassifier', 'TARSZeroShot']
)

# Supervised grid: four model families, each in a plain and a "...Roberta" variant.
# Kept as the last expression so its return value is what the cell displays.
class_orchestrator_sup.add_models_grid(
    model_types=[
        'SVM', 'XGBoost', 'LightGBM', 'RandomForest',
        'SVMRoberta', 'XGBoostRoberta', 'LightGBMRoberta', 'RandomForestRoberta',
    ]
    # No param_grid is passed. Example of an explicit one, where an empty dictionary
    # means default parameters:
    # param_grid={'SVMRoberta': [{}], 'SVM': [{}]}
)

2025-06-05 22:27:18,167 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model


['SVM',
 'XGBoost',
 'LightGBM',
 'RandomForest',
 'SVMRoberta',
 'XGBoostRoberta',
 'LightGBMRoberta',
 'RandomForestRoberta']

In [10]:
from noise_strategy import *
# Noise strategies passed to every evaluate call below; only NoNoise() is enabled.
noise_strategies = [NoNoise()]
# Disabled alternatives (presumably provided by noise_strategy) — add them to the list to enable:
#   AddRandomCharsNoise()
#   AddRandomWordsNoise()
#   DeleteRandomWordsNoise()
#   ShuffleSentencesNoise(noise_level=0.7)
#   ReplaceWithSynonymsNoise()
#   ReplaceWithAntonymsNoise()

#### Running pipelines for classification

In [None]:
# Run the supervised model grid: evaluate_with_training receives the training and test
# dictionaries plus the noise strategies; the returned table is displayed rounded to 2 decimals.
class_orchestrator_sup.evaluate_with_training(datasets_train, datasets_test, noise_strategies).round(2)

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

2025-06-03 21:12:54,204 [INFO] root: Processing dataset: AG News Full


Models:   0%|          | 0/8 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-03 21:13:04,600 [INFO] root: Processing dataset: AG News LLM labels


Models:   0%|          | 0/8 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-03 21:13:13,482 [INFO] root: Processing dataset: BBC News Full


Models:   0%|          | 0/8 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
2,AG News Full,NoNoise,LightGBM,0.2,0.33,1.0,0.2
6,AG News Full,NoNoise,LightGBMRoberta,0.6,0.6,0.6,0.6
3,AG News Full,NoNoise,RandomForest,0.8,0.89,1.0,0.8
7,AG News Full,NoNoise,RandomForestRoberta,0.6,0.6,0.6,0.6
0,AG News Full,NoNoise,SVM,0.6,0.59,0.67,0.6
4,AG News Full,NoNoise,SVMRoberta,0.4,0.36,0.33,0.4
1,AG News Full,NoNoise,XGBoost,0.8,0.87,1.0,0.8
5,AG News Full,NoNoise,XGBoostRoberta,0.4,0.36,0.33,0.4
10,AG News LLM labels,NoNoise,LightGBM,0.0,0.0,0.0,0.0
14,AG News LLM labels,NoNoise,LightGBMRoberta,0.6,0.5,0.47,0.6


In [15]:
# Persist the supervised results (rounded to 2 decimals) as CSV, without the index column.
# NOTE(review): assumes ../outputs/final/ already exists — to_csv does not create directories.
class_orchestrator_sup.results.round(2).to_csv('../outputs/final/results_classification_supervised.csv', index=False)

In [16]:
# Run the zero-shot model grid through the same evaluate_with_training entry point and
# the same dictionaries; the returned table is displayed rounded to 2 decimals.
class_orchestrator_zero_shot.evaluate_with_training(datasets_train, datasets_test, noise_strategies).round(2)

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

2025-06-03 21:14:15,513 [INFO] root: Processing dataset: AG News Full


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-03 21:14:20,537 [INFO] root: Processing dataset: AG News LLM labels


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-03 21:14:22,313 [INFO] root: Processing dataset: BBC News Full


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
0,AG News Full,NoNoise,LLMClassifier,0.6,0.5,0.47,0.6
1,AG News Full,NoNoise,TARSZeroShot,0.4,0.3,0.27,0.4
2,AG News LLM labels,NoNoise,LLMClassifier,0.6,0.5,0.47,0.6
3,AG News LLM labels,NoNoise,TARSZeroShot,0.4,0.3,0.27,0.4
4,BBC News Full,NoNoise,LLMClassifier,1.0,1.0,1.0,1.0
5,BBC News Full,NoNoise,TARSZeroShot,0.8,0.73,0.7,0.8


In [17]:
# Persist the zero-shot results (rounded to 2 decimals) as CSV, without the index column.
# NOTE(review): assumes ../outputs/final/ already exists — to_csv does not create directories.
class_orchestrator_zero_shot.results.round(2).to_csv('../outputs/final/results_classification_zero_shot.csv', index=False)

# UP TO HERE WORKS WELL, CAN RUN ON ALL DATASETS

#### Running pipelines for Topic Modeling

In [18]:
# Evaluate the AG News topic models on the AG News test dictionary under each noise strategy.
# Both keys of datasets_test_ag_news map to the same (ag_test_desc, ag_test_labels) pair,
# so the two dataset names are evaluated on identical input.
topic_modelling_orchestrator_ag_news.evaluate(datasets_test_ag_news, noise_strategies)
# Display the results table held on the orchestrator (unrounded here).
topic_modelling_orchestrator_ag_news.results

Datasets:   0%|          | 0/2 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News Full,NoNoise,LDA_4,0.411765,0.944541,0.364855,
1,AG News Full,NoNoise,LSI_4,0.411765,0.821863,0.28,
2,AG News Full,NoNoise,NMF_4,0.411765,0.901785,0.28,0.988373
3,AG News LLM labels,NoNoise,LDA_4,0.411765,0.944541,0.364855,
4,AG News LLM labels,NoNoise,LSI_4,0.411765,0.821863,0.28,
5,AG News LLM labels,NoNoise,NMF_4,0.411765,0.901785,0.28,0.988373


In [20]:
# Persist the AG News topic-modelling results (rounded to 2 decimals) as CSV, without the index column.
# NOTE(review): assumes ../outputs/final/ already exists — to_csv does not create directories.
topic_modelling_orchestrator_ag_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_ag_news.csv', index=False)

In [21]:
# Evaluate the BBC News topic models on the BBC News test dictionary under each noise strategy.
# Both keys of datasets_test_bbc_news map to the same (bbc_test_desc, bbc_test_labels) pair,
# so the two dataset names are evaluated on identical input.
topic_modelling_orchestrator_bbc_news.evaluate(datasets_test_bbc_news, noise_strategies)
# Display the results table held on the orchestrator (unrounded here).
topic_modelling_orchestrator_bbc_news.results

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,BBC News Full,NoNoise,LDA_5,-0.25,0.479341,0.393866,
1,BBC News Full,NoNoise,LSI_5,0.0,0.734039,0.219998,
2,BBC News Full,NoNoise,NMF_5,0.0,0.918731,0.200008,0.000122


In [22]:
# Persist the BBC News topic-modelling results (rounded to 2 decimals) as CSV, without the index column.
# NOTE(review): assumes ../outputs/final/ already exists — to_csv does not create directories.
topic_modelling_orchestrator_bbc_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_bbc_news.csv', index=False)
