In [1]:
# One-time setup, intentionally left commented out: uncomment and run once to
# download the NLTK resources listed below. Presumably these are required by the
# project pipeline modules imported in the next cell — TODO confirm.
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

### Core Assumptions:
* Business wants to classify incoming textual data
* Minimal labeling needed: zero to 20 labeled examples per class

In [1]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
# Seed Python's built-in `random` module so that any code relying on it is repeatable.
# Note: the pandas `.sample(...)` calls in this notebook do not depend on this seed;
# they pass `random_state=42` explicitly.
random.seed(42)

## AG News Dataset

In [2]:
# Master switch for test-set size: when True, the AG News and BBC test sets are
# down-sampled to `test_sample_size` rows in the data-loading cells; when False
# they are used as loaded / split.
taking_sample = False

In [3]:
# Number of test rows kept per dataset when `taking_sample` is True.
test_sample_size = 500
# Number of rows drawn per class when building the small labeled baseline training sets.
# Original note: "train up to 100" — presumably the largest per-class size explored; TODO confirm.
train_sample_size_per_class = 20

In [4]:
# --- Load AG News ------------------------------------------------------------
ag_test = pd.read_csv('../data/AG News/test.csv')

# Training rows come from the file that also carries a 'Predicted Topic Index'
# column; rows without a prediction are dropped. The cleaned frame is assigned
# (no `inplace=True`) so the cell stays idempotent and never mutates a frame
# that another cell may still hold.
# Previously used source: '../data/AG News/train.csv'
ag_train = pd.read_csv('../data/AG News/train_from_llm.csv').dropna(
    subset=['Predicted Topic Index']
)

# --- Test set ------------------------------------------------------------------
# Optional down-sampling, controlled by `taking_sample` / `test_sample_size`.
if taking_sample:
    ag_test = ag_test.sample(test_sample_size, random_state=42)

ag_test_sample_desc = ag_test['Description']
ag_test_sample_labels = ag_test['Class Index']

# --- Balanced baseline training set --------------------------------------------
# Draw `train_sample_size_per_class` rows for each 'Class Index' value. A single
# comprehension replaces four copy-pasted statements so the per-class sampling
# cannot drift apart. Insertion order (world, sports, business, science) matters:
# it fixes the concat order and therefore the outcome of the seeded shuffle below.
ag_class_indices = {'world': 1, 'sports': 2, 'business': 3, 'science': 4}
ag_class_samples = {
    class_name: ag_train[ag_train['Class Index'] == class_index].sample(
        train_sample_size_per_class, random_state=42
    )
    for class_name, class_index in ag_class_indices.items()
}

# Per-class names kept for any cell that still refers to them. The `_10` suffix
# is NOT the sample size; the size is `train_sample_size_per_class`.
ag_train_world_10 = ag_class_samples['world']
ag_train_sports_10 = ag_class_samples['sports']
ag_train_business_10 = ag_class_samples['business']
ag_train_science_10 = ag_class_samples['science']

# Combine the per-class samples, shuffle reproducibly, and reset the index.
ag_news_baseline_combined = (
    pd.concat(list(ag_class_samples.values()))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Baseline training inputs. The `_40` in the names is not the row count; the
# frame holds 4 * `train_sample_size_per_class` rows.
ag_train_40_desc = ag_news_baseline_combined['Description']
ag_train_40_labels = ag_news_baseline_combined['Class Index']

# Full training set using the 'Predicted Topic Index' column as labels.
ag_train_llm_desc = ag_train['Description']
ag_train_llm_labels = ag_train['Predicted Topic Index']



## BBC News Dataset

In [5]:
# --- Load BBC News -------------------------------------------------------------
# Columns are renamed to match the AG News frames ('Class', 'Description'), then
# a fixed, seeded 900-row subset is taken (an 800-row subset was used previously).
bbc_full = (
    pd.read_csv('../data/BBC News/BBC News Train.csv')
    .rename(columns={'Category': 'Class', 'Text': 'Description'})
    .sample(900, random_state=42)
)

# Category name -> integer class index.
bbc_class_to_index = {
    'business': 1,
    'tech': 2,
    'entertainment': 3,
    'politics': 4,
    'sport': 5
}

# Inverse lookup, derived from the forward mapping so the two can never disagree.
bbc_index_to_class = {
    class_index: class_name for class_name, class_index in bbc_class_to_index.items()
}

bbc_full['Class Index'] = bbc_full['Class'].map(bbc_class_to_index)

# `.map` turns unknown categories into NaN, which would silently leave unlabeled
# rows in the test set. Fail loudly instead.
unmapped_categories = bbc_full.loc[bbc_full['Class Index'].isna(), 'Class'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        f"BBC categories missing from bbc_class_to_index: {list(unmapped_categories)}"
    )

# --- Train / test split --------------------------------------------------------
# 400 of the 900 rows form the training pool; the remaining 500 are the test set.
bbc_train = bbc_full.sample(n=400, random_state=42)
bbc_test = bbc_full.drop(bbc_train.index)

# Optional down-sampling, controlled by `taking_sample` / `test_sample_size`.
if taking_sample:
    bbc_test = bbc_test.sample(test_sample_size, random_state=42)

bbc_test_sample_desc = bbc_test['Description']
bbc_test_sample_labels = bbc_test['Class Index']

# --- Balanced baseline training set --------------------------------------------
# Draw `train_sample_size_per_class` rows per class from the training pool.
# Dict insertion order (business, tech, entertainment, politics, sport) fixes
# the concat order and therefore the outcome of the seeded shuffle below.
bbc_class_samples = {
    class_name: bbc_train[bbc_train['Class Index'] == class_index].sample(
        train_sample_size_per_class, random_state=42
    )
    for class_name, class_index in bbc_class_to_index.items()
}

# Per-class names kept for any cell that still refers to them. The `_40` suffix
# is NOT the sample size; the size is `train_sample_size_per_class`.
bbc_train_business_40 = bbc_class_samples['business']
bbc_train_tech_40 = bbc_class_samples['tech']
bbc_train_entertainment_40 = bbc_class_samples['entertainment']
bbc_train_politics_40 = bbc_class_samples['politics']
bbc_train_sport_40 = bbc_class_samples['sport']

# Combine the five per-class samples, shuffle reproducibly, and reset the index.
bbc_news_baseline_combined = (
    pd.concat(list(bbc_class_samples.values()))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Baseline training inputs. The `_40` in the names is not the row count; the
# frame holds 5 * `train_sample_size_per_class` rows.
bbc_train_40_desc = bbc_news_baseline_combined['Description']
bbc_train_40_labels = bbc_news_baseline_combined['Class Index']


In [6]:
# Sanity check: row counts of the BBC test texts/labels followed by the AG News
# test texts/labels. Each texts/labels pair should have equal length.
tuple(map(len, (
    bbc_test_sample_desc,
    bbc_test_sample_labels,
    ag_test_sample_desc,
    ag_test_sample_labels,
)))

(500, 500, 7600, 7600)

In [7]:
# Evaluation inputs, keyed by dataset name: name -> (texts, labels).
# Uncomment an entry to include that dataset in the run.
datasets_test = dict([
    ("AG News Full", (ag_test_sample_desc, ag_test_sample_labels)),
    # ("AG News LLM labels", (ag_test_sample_desc, ag_test_sample_labels)),
    # ("BBC News Full", (bbc_test_sample_desc, bbc_test_sample_labels)),
])

# Training inputs, keyed by the same dataset names: name -> (texts, labels).
datasets_train = dict([
    ("AG News Full", (ag_train_40_desc, ag_train_40_labels)),
    # ("AG News LLM labels", (ag_train_llm_desc, ag_train_llm_labels)),
    # ("BBC News Full", (bbc_train_40_desc, bbc_train_40_labels)),
])

In [11]:
# --- Topic-modelling orchestrator ----------------------------------------------
# Model families to register; uncomment an entry to include it.
topic_model_types = [
    'LDA',
    # 'LSI',
    # 'NMF',
]
# Number of topics to try per model.
topic_param_grid = {'n_topics': [4]}

topic_modelling_orchestrator = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator.add_models_grid(
    model_types=topic_model_types,
    param_grid=topic_param_grid,
)

# --- Classification orchestrator -----------------------------------------------
# All classifiers are currently disabled; uncomment entries to register them.
classifier_model_types = [
    # 'LLMClassifier',
    # 'TARSZeroShot',

    # 'SVM',
    # 'XGBoost',
    # 'RandomForest',
    # 'RandomForestRoberta',
    # 'LightGBM',
    # 'SVMRoberta',
    # 'XGBoostRoberta',
    # 'LightGBMRoberta',
]
# Optional per-model parameters (an empty dictionary means default parameters):
# param_grid={'SVMRoberta': [{}], 'SVM': [{}]}

class_orchestrator = ClassificationPipelineOrchestrator()
# Kept as the cell's final expression so its return value is still displayed.
class_orchestrator.add_models_grid(model_types=classifier_model_types)

[]

In [12]:
from noise_strategy import *
# Noise strategies passed to the orchestrators' evaluate calls.
# Only `NoNoise()` is active; uncomment a line below to add that strategy.
noise_strategies = [NoNoise()]
# noise_strategies.append(AddRandomCharsNoise())
# noise_strategies.append(AddRandomWordsNoise())
# noise_strategies.append(DeleteRandomWordsNoise())
# noise_strategies.append(ShuffleSentencesNoise(noise_level=0.7))
# noise_strategies.append(ReplaceWithSynonymsNoise())
# noise_strategies.append(ReplaceWithAntonymsNoise())

In [14]:
# class_orchestrator.evaluate_with_training(datasets_train, datasets_test, noise_strategies).round(2)

In [15]:
# Run the registered topic models over `datasets_test` with each entry of
# `noise_strategies`. The outcome is inspected via
# `topic_modelling_orchestrator.results` in the following cell.
topic_modelling_orchestrator.evaluate(datasets_test, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-02 20:09:12,246 [INFO] gensim.corpora.dictionary: adding document #0 to Dictionary<0 unique tokens: []>
2025-06-02 20:09:12,295 [INFO] gensim.corpora.dictionary: built Dictionary<15168 unique tokens: ['disappoint', 'feder', 'firm', 'mogul', 'newal']...> from 7600 documents (total 139244 corpus positions)
2025-06-02 20:09:12,295 [INFO] gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary<15168 unique tokens: ['disappoint', 'feder', 'firm', 'mogul', 'newal']...> from 7600 documents (total 139244 corpus positions)", 'datetime': '2025-06-02T20:09:12.295781', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]', 'platform': 'macOS-15.5-arm64-arm-64bit', 'event': 'created'}
2025-06-02 20:09:12,301 [INFO] gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator<processes=10, batch_size=64> to estimate probabilities from sliding windows
2025-06-02 20:09:18,142 [INFO] gensim.topic_coherence.text_analysis: 10 accu

In [16]:
# Display the topic-modelling results table (one row per dataset / noise / model).
topic_modelling_orchestrator.results

Unnamed: 0,Dataset,Noise,Model,ARI Score,Topics Coherence,Cosine Similarity,Reconstruction Error
0,AG News Full,NoNoise,LDA_4,0.103527,0.550159,0.433253,


In [12]:
# topic_modelling_orchestrator.results.to_csv('../outputs/results_tm_ag.csv', index=False)
# topic_modelling_orchestrator.results.to_csv('../outputs/results_tm_bbc.csv', index=False)
# 

In [15]:
# Persist the classification results, rounded to 2 decimals, without the index column.
# NOTE(review): this overwrites the CSV with whatever `class_orchestrator.results`
# currently holds. The `evaluate_with_training` call earlier in the notebook is
# commented out, so confirm the results come from the intended run (the file name
# suggests SVMRoberta on the AG News LLM labels) before re-executing.
class_orchestrator.results.round(2).to_csv('../outputs/results_svm_roberta_ag_llm_labels.csv', index=False)

In [16]:
# Display the classification results rounded to 2 decimals.
class_orchestrator.results.round(2)

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
1,AG News LLM labels,AddRandomCharsNoise,SVMRoberta,0.72,0.75,0.87,0.72
2,AG News LLM labels,AddRandomWordsNoise,SVMRoberta,0.72,0.75,0.86,0.72
3,AG News LLM labels,DeleteRandomWordsNoise,SVMRoberta,0.72,0.75,0.87,0.72
0,AG News LLM labels,NoNoise,SVMRoberta,0.73,0.77,0.88,0.73
6,AG News LLM labels,ReplaceWithAntonymsNoise,SVMRoberta,0.73,0.77,0.88,0.73
5,AG News LLM labels,ReplaceWithSynonymsNoise,SVMRoberta,0.73,0.77,0.88,0.73
4,AG News LLM labels,ShuffleSentencesNoise,SVMRoberta,0.73,0.77,0.88,0.73


In [None]:
# topic_modelling_orchestrator.evaluate(datasets_test, noise_strategies)

In [38]:
# Persist the classification results (not rounded) without the index column.
# NOTE(review): the file name suggests an SVMRoberta run on BBC — confirm that
# `class_orchestrator.results` holds that run before executing. This cell's
# execution count is out of sequence with its neighbours, so the saved state
# may not be reproducible with Restart & Run All.
class_orchestrator.results.to_csv('../outputs/results_svm_roberta_bbc.csv', index=False)

In [16]:
# Lift the row limit so displayed DataFrames are never truncated.
pd.options.display.max_rows = None

In [17]:
# Display the full, unrounded classification results table.
class_orchestrator.results

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
22,AG News Full,AddRandomCharsNoise,LightGBM,0.2,0.333333,1.0,0.2
43,AG News Full,AddRandomCharsNoise,LightGBMRoberta,0.3,0.361905,0.466667,0.3
15,AG News Full,AddRandomCharsNoise,RandomForest,0.8,0.804444,0.883333,0.8
1,AG News Full,AddRandomCharsNoise,SVM,0.7,0.777778,0.9,0.7
29,AG News Full,AddRandomCharsNoise,SVMRoberta,0.5,0.521429,0.583333,0.5
8,AG News Full,AddRandomCharsNoise,XGBoost,0.4,0.388889,0.4,0.4
36,AG News Full,AddRandomCharsNoise,XGBoostRoberta,0.4,0.44,0.5,0.4
23,AG News Full,AddRandomWordsNoise,LightGBM,0.2,0.333333,1.0,0.2
44,AG News Full,AddRandomWordsNoise,LightGBMRoberta,0.3,0.328571,0.366667,0.3
16,AG News Full,AddRandomWordsNoise,RandomForest,0.7,0.676667,0.783333,0.7
