In [1]:
import pandas as pd
import random
from langchain_community.llms import Ollama
random.seed(42)
from tqdm.auto import tqdm

## AG News Dataset

In [2]:
ag_test = pd.read_csv('../../data/AG News/test.csv')
ag_train = pd.read_csv('../../data/AG News/train.csv')

# Define the mapping
class_to_text_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}
text_to_class_mapping = {'World': 1, 'Sports': 2, 'Business': 3, 'Science': 4}

# Apply the mapping to the class column
# ag_test['Class'] = ag_test['Class Index'].replace(class_mapping)
# ag_train['Class'] = ag_train['Class Index'].replace(class_mapping)

ag_news_train_baseline = ag_train['Description']
ag_news_train_true_labels = ag_train['Class Index']

sampled = ag_news_train_baseline.sample(1000, random_state=42)
ag_news_train_baseline = sampled
ag_news_train_true_labels = ag_news_train_true_labels.loc[sampled.index]

sampled_indices = ag_test.sample(3600, random_state=42).index
ag_news_baseline = ag_test.loc[sampled_indices, 'Description']
ag_news_true_labels = ag_test.loc[sampled_indices, 'Class Index']

ag_news_train = {
    "AG News":(ag_news_train_baseline, ag_news_train_true_labels),
}

ag_news = {
    "AG News":(ag_news_baseline, ag_news_true_labels),
}

In [3]:
# Convert Series to DataFrame
df = pd.DataFrame({'text': ag_news_baseline})
df['true_labels'] = ag_news_true_labels
df['true_labels_text'] = df['true_labels'].map(class_to_text_mapping)
df = df.sample(100, random_state=42)
df

Unnamed: 0,text,true_labels,true_labels_text
1615,Schering-Plough Corporation has announced that...,3,Business
1049,The UN tribunal in The Hague says it will impo...,1,World
4626,AP - Don't question Pedro Martinez anymore. Fa...,2,Sports
2019,CARDIFF -- Championship leader Sebastien Loeb ...,2,Sports
3221,"In New York, San Francisco, and Washington, D....",4,Science
...,...,...,...
3217,A company board member testifies in trial that...,4,Science
2995,Joe Nemechek wasn #39;t surprised to be back a...,2,Sports
2022,AP - Turkey's parliament adjourned Saturday wi...,1,World
6757,We've got two more entries this week in the ca...,4,Science


In [15]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import RobertaModel, RobertaTokenizer
import torch
from flair.models import TARSClassifier
from flair.data import Sentence
from typing import List, Optional, Union
class IClasificationModel(ABC):
    @abstractmethod
    def __init__(self):
        self.assigned_topics = None
        self.topic_distributions = None

    @abstractmethod
    def predict_classes(self, documents): 
        pass

    @abstractmethod    
    def fit_model(self, train_data_X, train_data_y):
        pass

    def evaluate(self, documents, true_labels: pd.DataFrame=None):
        predictions = self.predict_classes(documents)
        accuracy = accuracy_score(predictions, true_labels)
        f1 = f1_score(predictions, true_labels, average='weighted')
        precision = precision_score(predictions, true_labels, average='weighted')
        recall = recall_score(predictions, true_labels, average='weighted')

        return {
            'Accuracy': accuracy,
            'F1 Score': f1,
            'Precision': precision,
            'Recall': recall
        }


class TARSZeroShotModel(IClasificationModel):
    def __init__(self, model_name: str = 'tars-base'):
        super().__init__()
        self.model = TARSClassifier.load(model_name)
        self.model.add_and_switch_to_new_task("ZeroShot", label_dictionary=['World', 'Sports', 'Business', 'Science'], label_type="classification")
        
    def fit_model(self, train_data_X, train_data_y):
        pass
        
    def predict_classes(self, documents):
            
        predictions = []
        
        for doc in documents.tolist():
            sentence = Sentence(doc)
            self.model.predict(sentence)
            if sentence.labels:
                prediction = sentence.labels[0].value
            else:
                prediction = "Unknown"
                
            predictions.append(prediction)
        return np.array(predictions)
    

In [None]:
m = TARSZeroShotModel()
# Predict topics using TARS zero-shot model
print("Predicting topics with TARS zero-shot model...")
candidate_labels = list(class_to_text_mapping.values())  # ['World', 'Sports', 'Business', 'Science']
predictions = m.predict_classes(df['text'])

# Add predictions to dataframe
df['tars_predicted'] = predictions

# Convert text predictions to class indices for evaluation
predicted_indices = [text_to_class_mapping.get(label, -1) for label in predictions]

# # Calculate metrics


2025-05-21 23:10:52,952 TARS initialized without a task. You need to call .add_and_switch_to_new_task() before training this model
Predicting topics with TARS zero-shot model...


In [17]:
accuracy = accuracy_score(df['true_labels'], predicted_indices)
f1 = f1_score(df['true_labels'], predicted_indices, average='weighted')
precision = precision_score(df['true_labels'], predicted_indices, average='weighted')
recall = recall_score(df['true_labels'], predicted_indices, average='weighted')

# Print metrics
print(f"TARS Zero-Shot Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

TARS Zero-Shot Model Performance:
Accuracy: 0.7500
F1 Score: 0.7636
Precision: 0.9373
Recall: 0.7500


  _warn_prf(average, modifier, msg_start, len(result))


### LLM

In [31]:
llm = Ollama(model="gemma2")

In [32]:
topics = df['true_labels_text'].unique()
topics

array(['Business', 'World', 'Sports', 'Science'], dtype=object)

In [None]:
def assign_topics(input_df, topics, llm=None, checkpoint_interval=300):
    if llm is None:
        raise ValueError("LLM instance must be provided")
    
    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()
    
    # Initialize Topic column with 'Unknown'
    df['Predicted Topic'] = 'Unknown'
    
    # Process each news item individually with progress bar
    for idx in tqdm(range(len(df)), desc="Assigning topics"):
        news_item = df.iloc[idx]['text']
        
        # Generate prompt for current item
        prompt_assigning_prompt = f'''You are provided with news and helping to classify them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective news.
News can be found in tripletick block: ```{news_item}```
Topics to choose from: {topics}
Please return ONLY the topic name. DO NOT OUTPUT any additional text, quotes, or formatting.'''
        
        try:
            # Get assignment for current item
            result = llm.invoke(prompt_assigning_prompt, temperature=0.0)
            
            # Clean the result
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()
            
            # Update the DataFrame with the assigned topic
            df.iloc[idx, df.columns.get_loc('Predicted Topic')] = result
            
            # Save checkpoint every checkpoint_interval rows
            if (idx + 1) % checkpoint_interval == 0:
                checkpoint_filename = f'outputs/news_assigned_checkpoint_{idx + 1}.csv'
                df.to_csv(checkpoint_filename, index=False)
                print(f"\nCheckpoint saved: {checkpoint_filename}")
                
        except Exception as e:
            print(f"\nError processing item {idx}: {str(e)}")
            print(f"Result received: {result}")
            continue
    
    # Save final results
    final_filename = '../../outputs/llm_full/news_assigned_final.csv'
    df.to_csv(final_filename, index=False)
    print(f"\nFinal results saved: {final_filename}")
    
    return df

In [34]:
news_assigned = assign_topics(
    df,
    topics=str(topics),
    llm=llm
)

Assigning topics:   0%|          | 0/100 [00:00<?, ?it/s]


Final results saved: checkpoints/news_assigned_final.csv


In [None]:
news_assigned[[news_assigned['Predicted Topic'] == news_assigned['true_labels_text']]]

In [36]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Convert predicted topics to numeric labels using the mapping
predicted_labels = news_assigned['Predicted Topic'].map(text_to_class_mapping).fillna(-1).astype(int)
true_labels = news_assigned['true_labels']

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.8300
F1 Score: 0.8117
Precision: 0.8562
Recall: 0.8300


  _warn_prf(average, modifier, msg_start, len(result))
