In [1]:
# One-time setup: uncomment and run this cell to download the NLTK resources
# listed below if they are not already installed locally.
# NOTE(review): presumably required by the imported pipeline / noise-strategy
# modules — confirm which module actually needs each resource.
# import nltk
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

### Core Assumptions:
* The business wants to classify incoming textual data.
* Only minimal labeling is required: between zero and 20 labeled examples per class.

In [1]:
import random

import pandas as pd

from topic_modeling_pipeline import *
from classification_pipeline import *
from noise_strategy import *
# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# The pandas `.sample(...)` calls in the data-loading cell pass their own
# `random_state` explicitly and do not depend on this seed.
random.seed(42)

## AG News Dataset

In [2]:
# When True, evaluation uses a balanced sample of the AG News test split
# (100 rows per class); when False, the full test split is used.
taking_sample = True

In [3]:
# --- AG News: build the evaluation set and the few-shot training set ----------
# `Class Index` values used by the AG News CSVs, in the order the per-class
# samples are concatenated (1 = world, 2 = sports, 3 = business, 4 = science).
AG_NEWS_CLASS_IDS = (1, 2, 3, 4)
SAMPLE_SEED = 42            # shared by every per-class draw and by the final shuffle
TEST_ROWS_PER_CLASS = 100   # rows per class in the balanced evaluation sample
TRAIN_ROWS_PER_CLASS = 10   # labelled rows per class in the few-shot training set


def _sample_balanced(frame, rows_per_class, class_ids=AG_NEWS_CLASS_IDS,
                     label_col='Class Index', seed=SAMPLE_SEED):
    """Return a shuffled frame holding ``rows_per_class`` rows of every class.

    Args:
        frame: DataFrame containing a ``label_col`` column.
        rows_per_class: Number of rows to draw for each value in ``class_ids``.
        class_ids: Label values to draw from, in concatenation order.
        label_col: Name of the column that stores the class label.
        seed: ``random_state`` used for each per-class draw and for the shuffle.

    Returns:
        A DataFrame with ``len(class_ids) * rows_per_class`` rows, shuffled and
        re-indexed from 0.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a class has fewer
            than ``rows_per_class`` rows.
    """
    # Each class is drawn by its own `.sample(..., random_state=seed)` call so the
    # selected rows are exactly those obtained by sampling class by class.
    per_class = [
        frame[frame[label_col] == class_id].sample(rows_per_class, random_state=seed)
        for class_id in class_ids
    ]
    return (
        pd.concat(per_class)
        .sample(frac=1, random_state=seed)   # shuffle so classes are interleaved
        .reset_index(drop=True)
    )


ag_test = pd.read_csv('../data/AG News/test.csv')
ag_train = pd.read_csv('../data/AG News/train.csv')

# CREATING A SAMPLE TEST SET
if taking_sample:
    # Balanced, shuffled subset of the test split.
    ag_test_sample = _sample_balanced(ag_test, TEST_ROWS_PER_CLASS)
    ag_test_sample_desc = ag_test_sample['Description']
    ag_test_sample_labels = ag_test_sample['Class Index']
else:
    # Full test split, in file order.
    ag_test_sample_desc = ag_test['Description']
    ag_test_sample_labels = ag_test['Class Index']

# Dataset name -> (texts, labels) for evaluation.
ag_news_sample = {
    "AG News": (ag_test_sample_desc, ag_test_sample_labels),
}

# CREATING A BALANCED TRAINING SET
# 10 rows per class -> 40 labelled examples in total.
ag_news_baseline_combined = _sample_balanced(ag_train, TRAIN_ROWS_PER_CLASS)

ag_train_40_desc = ag_news_baseline_combined['Description']
ag_train_40_labels = ag_news_baseline_combined['Class Index']

# Dataset name -> (texts, labels) for training.
ag_news_train = {
    "AG News": (ag_train_40_desc, ag_train_40_labels),
}

In [4]:
# Disabled alternative training set built from LLM-assigned labels: rows whose text
# label is missing from the mapping become -1 and are dropped.
# NOTE(review): `text_to_class_mapping` is not defined in any cell visible in this
# notebook — define or import it before re-enabling this cell.
# ag_news_llm_train_df = pd.read_csv('../outputs/llm_to_label/news_assigned_final.csv')
# ag_news_llm_train_df['predicted_label'] = ag_news_llm_train_df['predicted_label_text'].map(text_to_class_mapping).fillna(-1).astype(int)

# ag_news_llm_train_df = ag_news_llm_train_df[ag_news_llm_train_df['predicted_label'] != -1]  # Remove rows with -1 labels
# ag_news_llm_train = {'AG News': (ag_news_llm_train_df['text'], ag_news_llm_train_df['predicted_label'])}

In [5]:
# Topic-modelling sweep: request the LDA, LSI and NMF model types, each with
# n_topics fixed at 4.
topic_model_types = ['LDA', 'LSI', 'NMF']
topic_param_grid = {'n_topics': [4]}

topic_modelling_orchestrator = TopicModelingPipelineOrchestrator()
topic_modelling_orchestrator.add_models_grid(
    model_types=topic_model_types,
    param_grid=topic_param_grid,
)

# Classifier sweep: uncomment an entry to include that model type in the run.
classifier_types = [
    # 'TARSZeroShot',
    'TARSFewShot',
    # 'SVM',
    # 'XGBoost',
    # 'RandomForest',
    # 'LightGBM',
    # 'SVMRoberta',
    # 'XGBoostRoberta',
    # 'LightGBMRoberta',
]

class_orchestrator = ClassificationPipelineOrchestrator()
# Optional per-model overrides (an empty dictionary means default parameters):
#   param_grid={'SVMRoberta': [{}], 'SVM': [{}]}
# Kept as the last expression so the notebook displays the call's return value.
class_orchestrator.add_models_grid(model_types=classifier_types)

2025-05-24 18:02:41,063 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model


['TARSFewShot']

In [6]:
# Noise strategies handed to the orchestrator's evaluation calls. Only NoNoise is
# active; uncomment further entries to widen the sweep.
# The strategy classes come from `noise_strategy`, imported in the notebook's
# import cell alongside the other project modules.
noise_strategies = [
    NoNoise(),
    # AddRandomCharsNoise(),
    # AddRandomWordsNoise(),
    # DeleteRandomWordsNoise(),
    # ShuffleSentencesNoise(noise_level=0.7),
    # ReplaceWithSynonymsNoise(),
    # ReplaceWithAntonymsNoise()
]

In [7]:
# Disabled run that trains on the LLM-labelled set instead of the 40-example set.
# NOTE(review): `ag_news_llm_train` is only built in a commented-out cell, so that
# cell must be re-enabled before this one.
# class_orchestrator.evaluate_with_training(ag_news_llm_train, ag_news_sample, noise_strategies)

In [33]:
# Runs the classification orchestrator with `ag_news_train` and `ag_news_sample`
# under every entry of `noise_strategies` (argument roles inferred from the names —
# confirm against classification_pipeline).
# NOTE(review): the saved output below is stale. It reports SVM and TARSZeroShot
# across several noise strategies, while the configuration cells now register only
# TARSFewShot with NoNoise, and this cell's execution count is out of sequence.
# Restart the kernel and run all cells to refresh the results.
class_orchestrator.evaluate_with_training(ag_news_train, ag_news_sample, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noise Strategies:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,Dataset,Noise,Model,Accuracy,F1 Score,Precision,Recall
8,AG News,AddRandomCharsNoise,SVM,0.55,0.548776,0.552725,0.55
1,AG News,AddRandomCharsNoise,TARSZeroShot,0.735,0.695938,0.6854,0.735
9,AG News,AddRandomWordsNoise,SVM,0.5625,0.562756,0.56825,0.5625
2,AG News,AddRandomWordsNoise,TARSZeroShot,0.7325,0.703401,0.701125,0.7325
10,AG News,DeleteRandomWordsNoise,SVM,0.5425,0.542542,0.54745,0.5425
3,AG News,DeleteRandomWordsNoise,TARSZeroShot,0.7375,0.706675,0.704,0.7375
7,AG News,NoNoise,SVM,0.5575,0.556953,0.5614,0.5575
0,AG News,NoNoise,TARSZeroShot,0.7525,0.722987,0.7222,0.7525
13,AG News,ReplaceWithAntonymsNoise,SVM,0.5525,0.552548,0.557275,0.5525
6,AG News,ReplaceWithAntonymsNoise,TARSZeroShot,0.745,0.712959,0.708975,0.745


In [8]:
# NOTE(review): this repeats the evaluate_with_training call of the previous
# evaluation cell with the same arguments. The saved run below ended in
# KeyboardInterrupt, so this cell holds no results; keep a single evaluation cell
# and run it to completion.
class_orchestrator.evaluate_with_training(ag_news_train, ag_news_sample, noise_strategies)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Noise Strategies:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
# NOTE(review): `ag_news` is not defined in any cell visible in this notebook; the
# dataset dicts built earlier are `ag_news_sample` and `ag_news_train`. Point this
# call at the intended one (TODO confirm which) before uncommenting.
# topic_modelling_orchestrator.evaluate(ag_news, noise_strategies)