In [None]:
# # RUN BELOW WHEN RUNNING FOR THE FIRST TIME
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

### Core Assumptions:
* Business wants to classify incoming textual data
* Minimal labeling needed. Zero to 20 labels per classification 

In [45]:
import pandas as pd
import random
from topic_modeling_pipeline import *
from classification_pipeline import *
random.seed(42)

In [46]:
taking_sample = False
test_sample_size = 50

## AG News Dataset

In [47]:
ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train_from_llm.csv')

ag_train.dropna(subset=['Predicted Topic Index'], inplace=True)

# CREATING A SAMPLE TEST SET
if taking_sample == True:
    ag_test = ag_test.sample(test_sample_size, random_state=42)

ag_test_desc = ag_test['Description']
ag_test_labels = ag_test['Class Index']

ag_train_desc = ag_train['Description']
ag_train_labels = ag_train['Class Index']

ag_train_llm_desc = ag_train['Description']
ag_train_llm_labels = ag_train['Predicted Topic Index']

### BBC

In [48]:
bbc_test = pd.read_csv('../data/BBC News/test.csv')
bbc_train = pd.read_csv('../data/BBC News/train_from_llm.csv')

if taking_sample == True:
    bbc_test = bbc_test.sample(test_sample_size, random_state=42)

bbc_test_desc = bbc_test['Description']
bbc_test_labels = bbc_test['Class Index']

bbc_train_desc = bbc_train['Description']
bbc_train_labels = bbc_train['Class Index']

bbc_train_llm_desc = ag_train['Description']
bbc_train_llm_labels = ag_train['Predicted Topic Index']


### 20NewsGroup

In [49]:
newsgroups_test = pd.read_csv('../data/20 Newsgroups/test.csv')
newsgroups_test.dropna(inplace=True)
newsgroups_test = newsgroups_test.sample(4000, random_state=42)
newsgroups_train = pd.read_csv('../data/20 Newsgroups/train_from_llm.csv')
newsgroups_train = newsgroups_train[newsgroups_train['Predicted Topic Index'] != -1]

if taking_sample == True:
    newsgroups_test = newsgroups_test.sample(test_sample_size, random_state=42)

newsgroups_test_desc = newsgroups_test['Description']
newsgroups_test_labels = newsgroups_test['Class Index']

newsgroups_train_desc = newsgroups_train['Description']
newsgroups_train_labels = newsgroups_train['Class Index']

newsgroups_train_llm_desc = newsgroups_train['Description']
newsgroups_train_llm_labels = newsgroups_train['Predicted Topic Index']

In [50]:
# Check for nulls in newsgroups dataset
print(f"Nulls in newsgroups_test_desc: {newsgroups_test_desc.isna().sum()}")
print(f"Nulls in newsgroups_test_labels: {newsgroups_test_labels.isna().sum()}")
print(f"Nulls in newsgroups_train_desc: {newsgroups_train_desc.isna().sum()}")
print(f"Nulls in newsgroups_train_labels: {newsgroups_train_labels.isna().sum()}")
print(f"Nulls in newsgroups_train_llm_labels: {newsgroups_train_llm_labels.isna().sum()}")

Nulls in newsgroups_test_desc: 0
Nulls in newsgroups_test_labels: 0
Nulls in newsgroups_train_desc: 0
Nulls in newsgroups_train_labels: 0
Nulls in newsgroups_train_llm_labels: 0


### Combining Datasets

In [51]:
# len(bbc_test_desc), len(bbc_train_desc), len(ag_test_desc), len(ag_train_desc)

In [52]:
# For pipelines that works with all datasets
datasets_test = {
    "AG News Full": (ag_test_desc, ag_test_labels),
    "AG News LLM labels": (ag_test_desc, ag_test_labels),
    "BBC News Full": (bbc_test_desc, bbc_test_labels),
    "BBC News LLM labels": (bbc_test_desc, bbc_test_labels),
    "20 Newsgroups Full": (newsgroups_test_desc, newsgroups_test_labels),
    "20 Newsgroups LLM labels": (newsgroups_test_desc, newsgroups_test_labels),
}

datasets_train = {
    "AG News Full":(ag_train_desc, ag_train_labels),
    "AG News LLM labels":(ag_train_llm_desc, ag_train_llm_labels),
    "BBC News Full": (bbc_train_desc, bbc_train_labels),
    "BBC News LLM labels": (bbc_train_llm_desc, bbc_train_llm_labels),
    "20 Newsgroups Full": (newsgroups_train_desc, newsgroups_train_labels),
    "20 Newsgroups LLM labels": (newsgroups_train_llm_desc, newsgroups_train_llm_labels),
}
# For zero shot pipelines. Since the training is only different for LLM labeled datasets.
datasets_test_full = {
    "AG News Full": (ag_test_desc, ag_test_labels),
    "BBC News Full": (bbc_test_desc, bbc_test_labels),
    "20 Newsgroups Full": (newsgroups_test_desc, newsgroups_test_labels),
}

datasets_train_full = {
    "AG News Full":(ag_train_desc, ag_train_labels),
    "BBC News Full": (bbc_train_desc, bbc_train_labels),
    "20 Newsgroups Full": (newsgroups_train_desc, newsgroups_train_labels),
}

# For pipelines that works with AG News only
datasets_test_ag_news_full = {
    "AG News Full": (ag_test_desc, ag_test_labels),
    # "AG News LLM labels": (ag_test_desc, ag_test_labels),
}

datasets_train_ag_news_full = {
    "AG News Full":(ag_train_desc, ag_train_labels),
    # "AG News LLM labels":(ag_train_llm_desc, ag_train_llm_labels),
}

# For pipelines that works with BBC News only
datasets_test_bbc_news_full = {
    "BBC News Full": (bbc_test_desc, bbc_test_labels),
    # "BBC News LLM labels": (bbc_test_desc, bbc_test_labels),
}

datasets_train_bbc_news_full = {
    "BBC News Full": (bbc_train_desc, bbc_train_labels),
    # "BBC News LLM labels": (bbc_train_llm_desc, bbc_train_llm_labels),
}

datasets_test_newsgroups = {
    "20 Newsgroups Full": (newsgroups_test_desc, newsgroups_test_labels),
    # "20 Newsgroups LLM labels": (newsgroups_test_desc, newsgroups_test_labels),
}

datasets_train_newsgroups = {
    "20 Newsgroups Full": (newsgroups_train_desc, newsgroups_train_labels),
    # "20 Newsgroups LLM labels": (newsgroups_train_llm_desc, newsgroups_train_llm_labels),
}

In [53]:
for key in datasets_test.keys():
    print(f"{key}: {len(datasets_test[key][0])} samples, {len(datasets_test[key][1])} labels")
print('==========================================')
for key in datasets_train.keys():
    print(f"{key}: {len(datasets_train[key][0])} samples, {len(datasets_train[key][1])} labels")

AG News Full: 7600 samples, 7600 labels
AG News LLM labels: 7600 samples, 7600 labels
BBC News Full: 1390 samples, 1390 labels
BBC News LLM labels: 1390 samples, 1390 labels
20 Newsgroups Full: 4000 samples, 4000 labels
20 Newsgroups LLM labels: 4000 samples, 4000 labels
AG News Full: 80 samples, 80 labels
AG News LLM labels: 80 samples, 80 labels
BBC News Full: 100 samples, 100 labels
BBC News LLM labels: 80 samples, 80 labels
20 Newsgroups Full: 395 samples, 395 labels
20 Newsgroups LLM labels: 395 samples, 395 labels


In [54]:
topic_modelling_orchestrator_ag_news = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator_bbc_news = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator_20news = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator_ag_news.add_models_grid(
    model_types=[
        'LDA', 
        'LSI',
        'NMF'
        ],
    param_grid={'n_topics': [4]}
)
topic_modelling_orchestrator_bbc_news.add_models_grid(
    model_types=[
        'LDA', 
        'LSI', 
        'NMF'
        ],
    param_grid={'n_topics': [5]}
)
topic_modelling_orchestrator_20news.add_models_grid(
    model_types=[
        'LDA', 
        'LSI', 
        'NMF'
        ],
    param_grid={'n_topics': [20]}
)

['LDA_20', 'LSI_20', 'NMF_20']

In [55]:
# class_orchestrator_zero_shot = ClassificationPipelineOrchestrator()
# class_orchestrator_sup = ClassificationPipelineOrchestrator()
# class_orchestrator_zero_shot.add_models_grid(
#     model_types=[
#         'LLMClassifier',
#         'TARSZeroShot',
#     ]
# )
# class_orchestrator_sup.add_models_grid(
#     model_types=[
#         'SVM',
#         'XGBoost',
#         'LightGBM',
#         'RandomForest',
#         'SVMRoberta', 
#         'XGBoostRoberta',
#         'LightGBMRoberta',
#         'RandomForestRoberta'
#     ]
#     # param_grid={'SVMRoberta': [{}], 'SVM': [{}]}  # Empty dictionary means default parameters
# )

In [56]:
from noise_strategy import *
noise_strategies = [
    NoNoise(),
    AddRandomCharsNoise(), 
    AddRandomWordsNoise(), 
    DeleteRandomWordsNoise(), 
    ShuffleSentencesNoise(noise_level=0.7), 
    ReplaceWithSynonymsNoise(), 
    ReplaceWithAntonymsNoise()
]

#### Running pipelines for classification

In [57]:
# class_orchestrator_sup.evaluate_with_training(datasets_train_newsgroups, datasets_test_newsgroups, noise_strategies).round(2)

In [58]:
# class_orchestrator_sup.results.round(2).to_csv('../outputs/final/results_classification_supervised_20_newsgroup.csv', index=False)

In [59]:
# class_orchestrator_zero_shot.evaluate_with_training(datasets_train_newsgroups, datasets_test_newsgroups, noise_strategies).round(2)

In [60]:
# class_orchestrator_zero_shot.results.round(2).to_csv('../outputs/final/results_classification_zero_shot_llm_pt1_20_newsgroups.csv', index=False)

# UP TO HERE WORKS WELL, CAN RUN ON ALL DATASETS

#### Running pipelines for Topic Modeling

In [61]:
topic_modelling_orchestrator_ag_news.evaluate(datasets_test_ag_news_full, noise_strategies)
topic_modelling_orchestrator_ag_news.results
topic_modelling_orchestrator_ag_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_ag_news.csv', index=False)


Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

In [62]:
topic_modelling_orchestrator_ag_news.evaluate(datasets_test_bbc_news_full, noise_strategies)
topic_modelling_orchestrator_ag_news.results
topic_modelling_orchestrator_ag_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_bbc_news.csv', index=False)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

In [63]:
topic_modelling_orchestrator_ag_news.evaluate(datasets_test_newsgroups, noise_strategies)
topic_modelling_orchestrator_ag_news.results
topic_modelling_orchestrator_ag_news.results.round(2).to_csv('../outputs/final/results_topic_modelling_20newsgroups.csv', index=False)


Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/3 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]