In [None]:
# RUN THE LINES BELOW (UNCOMMENTED) WHEN RUNNING THE NOTEBOOK FOR THE FIRST TIME
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

### Core Assumptions:
* Business wants to classify incoming textual data
* Minimal labeling is needed: zero to 20 labeled examples per class

In [1]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seed Python's built-in `random` module for repeatability.
# The pandas `.sample(...)` calls in this notebook pass random_state=42 explicitly.
# NOTE(review): whether the imported pipelines rely on `random` is not visible here.
random.seed(42)

## AG News Dataset

In [2]:
# When True, each test set is down-sampled to `test_sample_size` rows for a quick run.
taking_sample = True

In [3]:
# Number of test rows drawn per dataset when `taking_sample` is True.
test_sample_size = 5
# Number of training rows drawn for each class when building the balanced training sets.
train_sample_size_per_class = 20

In [4]:
# Load the AG News test split and the training split that carries LLM predictions.
ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train_from_llm.csv')

# Keep only the training rows that have an LLM prediction. The result is
# reassigned instead of using inplace=True so no frame is mutated in place.
# Both training sets built below are drawn from this filtered frame.
ag_train = ag_train.dropna(subset=['Predicted Topic Index'])

# CREATING A SAMPLE TEST SET
if taking_sample:
    ag_test = ag_test.sample(test_sample_size, random_state=42)

ag_test_sample_desc = ag_test['Description']
ag_test_sample_labels = ag_test['Class Index']

# CREATING A BALANCED TRAINING SET
# One fixed-seed draw of `train_sample_size_per_class` rows for each class index.
ag_train_per_class = {
    class_index: ag_train[ag_train['Class Index'] == class_index].sample(
        train_sample_size_per_class, random_state=42
    )
    for class_index in (1, 2, 3, 4)
}

# The `_10` / `_40` suffixes in the names below are historical: the actual
# sample size is whatever `train_sample_size_per_class` is set to.
ag_train_world_10 = ag_train_per_class[1]
ag_train_sports_10 = ag_train_per_class[2]
ag_train_business_10 = ag_train_per_class[3]
ag_train_science_10 = ag_train_per_class[4]

# Combine the four per-class frames, shuffle the rows, and reset the index.
ag_news_baseline_combined = (
    pd.concat([
        ag_train_world_10,
        ag_train_sports_10,
        ag_train_business_10,
        ag_train_science_10,
    ])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Balanced training set with the dataset's own labels.
ag_train_40_desc = ag_news_baseline_combined['Description']
ag_train_40_labels = ag_news_baseline_combined['Class Index']

# Full (filtered) training set labelled with the LLM predictions.
# NOTE(review): a column that contained NaN is typically read as float, so these
# labels may be floats while 'Class Index' is integer - confirm the pipelines accept that.
ag_train_llm_desc = ag_train['Description']
ag_train_llm_labels = ag_train['Predicted Topic Index']

## BBC News Dataset

In [5]:
# Load the BBC News data and rename its columns to the names used for AG News
# ('Class' / 'Description').
bbc_full = pd.read_csv('../data/BBC News/BBC News Train.csv').rename(
    columns={'Category': 'Class', 'Text': 'Description'}
)

# Category name -> integer class index.
bbc_class_to_index = {
    'business': 1,
    'tech': 2,
    'entertainment': 3,
    'politics': 4,
    'sport': 5
}

# Inverse lookup, derived from the mapping above so the two cannot drift apart.
bbc_index_to_class = {index: name for name, index in bbc_class_to_index.items()}

bbc_full['Class Index'] = bbc_full['Class'].map(bbc_class_to_index)

# Series.map yields NaN for any category missing from the mapping; fail loudly
# here rather than letting NaN labels flow into training and evaluation.
bbc_unmapped_classes = bbc_full.loc[bbc_full['Class Index'].isna(), 'Class'].unique()
if len(bbc_unmapped_classes) > 0:
    raise ValueError(
        f"BBC categories missing from bbc_class_to_index: {list(bbc_unmapped_classes)}"
    )

# Hold out 400 rows as the training pool; every remaining row is test data.
bbc_train = bbc_full.sample(n=400, random_state=42)
bbc_test = bbc_full.drop(bbc_train.index)

if taking_sample:
    bbc_test = bbc_test.sample(test_sample_size, random_state=42)

bbc_test_sample_desc = bbc_test['Description']
bbc_test_sample_labels = bbc_test['Class Index']

# One fixed-seed draw of `train_sample_size_per_class` rows for each class index.
bbc_train_per_class = {
    class_index: bbc_train[bbc_train['Class Index'] == class_index].sample(
        train_sample_size_per_class, random_state=42
    )
    for class_index in bbc_class_to_index.values()
}

# The `_40` suffix in the names below is historical: the actual sample size is
# whatever `train_sample_size_per_class` is set to.
bbc_train_business_40 = bbc_train_per_class[1]
bbc_train_tech_40 = bbc_train_per_class[2]
bbc_train_entertainment_40 = bbc_train_per_class[3]
bbc_train_politics_40 = bbc_train_per_class[4]
bbc_train_sport_40 = bbc_train_per_class[5]

# Combine the five per-class frames, shuffle the rows, and reset the index.
bbc_news_baseline_combined = (
    pd.concat([
        bbc_train_business_40,
        bbc_train_tech_40,
        bbc_train_entertainment_40,
        bbc_train_politics_40,
        bbc_train_sport_40,
    ])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

bbc_train_40_desc = bbc_news_baseline_combined['Description']
bbc_train_40_labels = bbc_news_baseline_combined['Class Index']


In [6]:
# Sanity check: descriptions and labels should have matching lengths for each test set.
len(bbc_test_sample_desc), len(bbc_test_sample_labels), len(ag_test_sample_desc), len(ag_test_sample_labels)

(5, 5, 5, 5)

In [7]:
# Each dictionary maps a dataset name to a (descriptions, labels) pair.

# For pipelines that work with AG News only.
# Both AG News entries share the same test data; they differ only in the
# training data paired with them.
datasets_test_ag_news = {
    "AG News Full": (ag_test_sample_desc, ag_test_sample_labels),
    "AG News LLM labels": (ag_test_sample_desc, ag_test_sample_labels),
}
datasets_train_ag_news = {
    "AG News Full": (ag_train_40_desc, ag_train_40_labels),
    "AG News LLM labels": (ag_train_llm_desc, ag_train_llm_labels),
}

# For pipelines that work with BBC News only.
datasets_test_bbc_news = {
    "BBC News Full": (bbc_test_sample_desc, bbc_test_sample_labels),
    # "BBC News LLM labels": (bbc_test_sample_desc, bbc_test_sample_labels),
}
datasets_train_bbc_news = {
    "BBC News Full": (bbc_train_40_desc, bbc_train_40_labels),
    # "BBC News LLM labels": (bbc_train_40_desc, bbc_train_40_labels),
}

# For pipelines that work with all datasets: AG News entries first, then BBC News.
datasets_test = {**datasets_test_ag_news, **datasets_test_bbc_news}
datasets_train = {**datasets_train_ag_news, **datasets_train_bbc_news}

In [8]:
# One topic-modelling orchestrator per dataset. `n_topics` is 4 for AG News and
# 5 for BBC News, matching the number of class indices used when building each
# dataset above.
topic_modelling_orchestrator_ag_news = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator_bbc_news = TopicModelingPipelineOrchestrator()

topic_model_types = ['LDA', 'LSI', 'NMF']

topic_modelling_orchestrator_ag_news.add_models_grid(
    model_types=list(topic_model_types),
    param_grid={'n_topics': [4]},
)
# Last expression of the cell: its return value is what the cell displays.
topic_modelling_orchestrator_bbc_news.add_models_grid(
    model_types=list(topic_model_types),
    param_grid={'n_topics': [5]},
)

['LDA_5', 'LSI_5', 'NMF_5']

In [9]:
# Two classification orchestrators: one for the zero-shot models and one for
# the supervised models.
class_orchestrator_zero_shot = ClassificationPipelineOrchestrator()
class_orchestrator_sup = ClassificationPipelineOrchestrator()

zero_shot_model_types = ['LLMClassifier', 'TARSZeroShot']
supervised_model_types = [
    'SVM', 'XGBoost', 'LightGBM', 'RandomForest',
    'SVMRoberta', 'XGBoostRoberta', 'LightGBMRoberta', 'RandomForestRoberta',
]

class_orchestrator_zero_shot.add_models_grid(model_types=zero_shot_model_types)
# Last expression of the cell: its return value is what the cell displays.
class_orchestrator_sup.add_models_grid(
    model_types=supervised_model_types
    # param_grid={'SVMRoberta': [{}], 'SVM': [{}]}  # Empty dictionary means default parameters
)

2025-06-02 22:27:36,395 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model


['SVM',
 'XGBoost',
 'LightGBM',
 'RandomForest',
 'SVMRoberta',
 'XGBoostRoberta',
 'LightGBMRoberta',
 'RandomForestRoberta']

In [10]:
from noise_strategy import *
# Noise strategies passed to every orchestrator evaluation call below.
# Only `NoNoise()` is enabled; uncomment entries to evaluate under additional strategies.
noise_strategies = [
    NoNoise(),
    # AddRandomCharsNoise(), 
    # AddRandomWordsNoise(), 
    # DeleteRandomWordsNoise(), 
    # ShuffleSentencesNoise(noise_level=0.7), 
    # ReplaceWithSynonymsNoise(), 
    # ReplaceWithAntonymsNoise()
]

#### Running pipelines for classification

In [None]:
# Run the supervised orchestrator on every dataset in datasets_train / datasets_test
# and display the returned results rounded to 2 decimals.
class_orchestrator_sup.evaluate_with_training(datasets_train, datasets_test, noise_strategies).round(2)

In [None]:
# Save the supervised results (rounded to 2 decimals, index column omitted) as CSV.
class_orchestrator_sup.results.round(2).to_csv('../outputs/final/results_classification_supervised.csv', index=False)

In [11]:
# Run the zero-shot orchestrator on every dataset in datasets_train / datasets_test
# and display the returned results rounded to 2 decimals.
class_orchestrator_zero_shot.evaluate_with_training(datasets_train, datasets_test, noise_strategies).round(2)

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

2025-06-02 22:27:39,112 [INFO] root: Processing dataset: AG News Full


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-02 22:27:44,194 [INFO] root: Processing dataset: AG News LLM labels


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-06-02 22:27:45,954 [INFO] root: Processing dataset: BBC News Full


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Classifying with LLM:   0%|          | 0/5 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
0,AG News Full,NoNoise,LLMClassifier,0.6,0.5,0.47,0.6
1,AG News Full,NoNoise,TARSZeroShot,0.4,0.3,0.27,0.4
2,AG News LLM labels,NoNoise,LLMClassifier,0.6,0.5,0.47,0.6
3,AG News LLM labels,NoNoise,TARSZeroShot,0.4,0.3,0.27,0.4
4,BBC News Full,NoNoise,LLMClassifier,1.0,1.0,1.0,1.0
5,BBC News Full,NoNoise,TARSZeroShot,1.0,1.0,1.0,1.0


In [12]:
# Save the zero-shot results (rounded to 2 decimals, index column omitted) as CSV.
class_orchestrator_zero_shot.results.round(2).to_csv('../outputs/final/results_classification_zero_shot.csv', index=False)

#### Running pipelines for Topic Modeling

In [21]:
# Evaluate the AG News topic models, then display the accumulated results.
# Both entries in datasets_test_ag_news hold the same test data, so each model
# is evaluated on identical input under two dataset names.
topic_modelling_orchestrator_ag_news.evaluate(datasets_test_ag_news, noise_strategies)
topic_modelling_orchestrator_ag_news.results

Datasets:   0%|          | 0/2 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News Full,NoNoise,LDA_4,0.001984,0.297714,0.349004,
1,AG News Full,NoNoise,LSI_4,0.00625,0.515767,0.52551,
2,AG News Full,NoNoise,NMF_4,0.150541,0.64656,0.529178,21.989044
3,AG News LLM labels,NoNoise,LDA_4,0.001984,0.297714,0.349004,
4,AG News LLM labels,NoNoise,LSI_4,0.00625,0.515767,0.52551,
5,AG News LLM labels,NoNoise,NMF_4,0.150541,0.64656,0.529178,21.989044


In [None]:
# Save the AG News topic-modelling results (rounded to 2 decimals, index column omitted) as CSV.
topic_modelling_orchestrator_ag_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_ag_news.csv', index=False)

In [22]:
# Evaluate the BBC News topic models, then display the accumulated results.
topic_modelling_orchestrator_bbc_news.evaluate(datasets_test_bbc_news, noise_strategies)
topic_modelling_orchestrator_bbc_news.results

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,BBC News Full,NoNoise,LDA_5,0.0,0.42542,0.985345,
1,BBC News Full,NoNoise,LSI_5,0.113801,0.599215,0.502251,
2,BBC News Full,NoNoise,NMF_5,0.754366,0.75377,0.357812,21.425575


In [None]:
# Save the BBC News topic-modelling results (rounded to 2 decimals, index column omitted) as CSV.
topic_modelling_orchestrator_bbc_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_bbc_news.csv', index=False)
