In [28]:
import pandas as pd
import random
from langchain_community.llms import Ollama
random.seed(42)
from tqdm.auto import tqdm

## AG News Dataset

In [22]:
# Load both AG News splits from the repository's data folder.
ag_test = pd.read_csv('../../data/AG News/test.csv')
ag_train = pd.read_csv('../../data/AG News/train.csv')

# Numeric class index -> readable topic name, plus the inverse lookup
# (derived so the two dictionaries cannot drift apart).
class_to_text_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}
text_to_class_mapping = {name: index for index, name in class_to_text_mapping.items()}

# Training subset: 1000 seeded-random descriptions with their matching labels.
sampled = ag_train['Description'].sample(1000, random_state=42)
ag_news_train_baseline = sampled
ag_news_train_true_labels = ag_train.loc[sampled.index, 'Class Index']

# Test subset: 3600 seeded-random rows; texts and labels share the same index.
sampled_indices = ag_test.sample(3600, random_state=42).index
ag_news_baseline = ag_test['Description'].loc[sampled_indices]
ag_news_true_labels = ag_test['Class Index'].loc[sampled_indices]

# Dataset registries keyed by dataset name -> (texts, labels).
ag_news_train = {"AG News": (ag_news_train_baseline, ag_news_train_true_labels)}

ag_news = {"AG News": (ag_news_baseline, ag_news_true_labels)}

In [24]:
# Assemble the sampled training texts and labels into one frame, attach the
# readable topic name for each numeric label, then shuffle reproducibly.
df = (
    pd.DataFrame({
        'text': ag_news_train_baseline,
        'true_labels': ag_news_train_true_labels,
    })
    .assign(true_labels_text=lambda frame: frame['true_labels'].map(class_to_text_mapping))
    .sample(1000, random_state=42)
)
df

Unnamed: 0,text,true_labels,true_labels_text
63621,She rode into office three years ago on a wave...,1,World
110878,AMSTERDAM - Dutch diary company Campina announ...,3,Business
95593,The armed Basque separatist group ETA showed i...,1,World
60020,"She may be in a West Virginia prison, but West...",3,Business
57501,LONDON - Defence Minister Bill Graham was tigh...,1,World
...,...,...,...
92177,Wireless services will lead the next growth ph...,4,Science
28275,Cardiffs Millennium Stadium will host this sea...,2,Sports
7682,A Greek soldier guarding an Olympic facility i...,2,Sports
1517,The experts claim hackers have compromised the...,4,Science


### LLM

In [25]:
# LangChain Ollama wrapper configured for the "gemma2" model; passed to
# assign_topics below.
# NOTE(review): presumably needs a reachable Ollama server with this model
# already pulled — confirm before a fresh run.
llm = Ollama(model="gemma2")

In [26]:
# Distinct topic names present in the sample, in order of first appearance;
# these are the candidate labels offered to the LLM.
topics = df.loc[:, 'true_labels_text'].unique()
topics

array(['World', 'Business', 'Science', 'Sports'], dtype=object)

In [27]:
def assign_topics(input_df, topics, llm=None, checkpoint_interval=300,
                  checkpoint_dir='outputs',
                  final_filename='../../outputs/llm_to_label/news_assigned_final.csv'):
    """Label each row of ``input_df`` with a topic chosen by an LLM.

    For every row, the ``text`` column is embedded in a classification prompt
    and sent to ``llm.invoke``; the stripped answer is stored in a new
    ``predicted_label_text`` column. Rows whose LLM call fails keep the
    placeholder ``'Unknown'``.

    Args:
        input_df: DataFrame with a ``text`` column. It is not modified.
        topics: Candidate topics, interpolated verbatim into the prompt.
        llm: Object exposing ``invoke(prompt, temperature=...)`` that returns
            a string. Required.
        checkpoint_interval: Write a checkpoint CSV after every this many
            rows. A falsy value (``0`` / ``None``) disables checkpoints.
        checkpoint_dir: Directory for checkpoint CSVs; created if missing.
        final_filename: Path of the final CSV; its parent directory is
            created if missing.

    Returns:
        A copy of ``input_df`` with the added ``predicted_label_text`` column.

    Raises:
        ValueError: If ``llm`` is ``None``.
    """
    import os  # local import so this cell does not depend on an edit elsewhere

    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize the prediction column with 'Unknown'
    df['predicted_label_text'] = 'Unknown'
    label_col = df.columns.get_loc('predicted_label_text')

    # Process each news item individually with progress bar
    for idx in tqdm(range(len(df)), desc="Assigning topics"):
        news_item = df['text'].iloc[idx]

        # Generate prompt for current item
        prompt = f'''You are provided with news and helping to classify them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective news.
News can be found in tripletick block: ```{news_item}```
Topics to choose from: {topics}
Please return ONLY the topic name. DO NOT OUTPUT any additional text, quotes, or formatting.'''

        # Reset per row so the error handler never reads an unbound name or a
        # stale answer left over from the previous row.
        result = None
        try:
            result = llm.invoke(prompt, temperature=0.0)

            # Clean the result: trim whitespace and an optional ``` fence
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()

            df.iloc[idx, label_col] = result
        except Exception as e:
            # Best effort: report the failure and leave this row as 'Unknown'.
            print(f"\nError processing item {idx}: {str(e)}")
            print(f"Result received: {result}")

        # Save a checkpoint every checkpoint_interval rows, even if this row
        # failed, so progress made so far is not lost.
        if checkpoint_interval and (idx + 1) % checkpoint_interval == 0:
            checkpoint_filename = os.path.join(
                checkpoint_dir, f'news_assigned_checkpoint_{idx + 1}.csv'
            )
            try:
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                df.to_csv(checkpoint_filename, index=False)
                print(f"\nCheckpoint saved: {checkpoint_filename}")
            except OSError as e:
                # A failed checkpoint must not abort the labeling run.
                print(f"\nCould not save checkpoint {checkpoint_filename}: {str(e)}")

    # Make sure the target directory exists so the labels collected above are
    # not lost to a missing-folder error at the very end.
    final_dir = os.path.dirname(final_filename)
    if final_dir:
        os.makedirs(final_dir, exist_ok=True)
    df.to_csv(final_filename, index=False)
    print(f"\nFinal results saved: {final_filename}")

    return df

In [20]:
# Label every sampled article; the topic array is stringified because the
# value is interpolated directly into the prompt text.
news_assigned = assign_topics(df, topics=str(topics), llm=llm)

Assigning topics:   0%|          | 0/100 [00:00<?, ?it/s]


Final results saved: ../../outputs/llm_to_label/news_assigned_final.csv


In [18]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Convert predicted topics to numeric labels using the mapping.
# assign_topics writes its answers to 'predicted_label_text'; any answer that
# is not an exact topic name (including the 'Unknown' placeholder) becomes -1,
# which never equals a true label and therefore counts as a miss.
predicted_labels = (
    news_assigned['predicted_label_text']
    .map(text_to_class_mapping)
    .fillna(-1)
    .astype(int)
)
true_labels = news_assigned['true_labels']

# Calculate metrics.
# zero_division=0 makes the handling of the -1 fallback label explicit: it has
# no true samples, so its per-class scores are undefined and are scored as 0
# instead of triggering an UndefinedMetricWarning.
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.8300
F1 Score: 0.8117
Precision: 0.8562
Recall: 0.8300


  _warn_prf(average, modifier, msg_start, len(result))
