In [1]:
import pandas as pd
import random
from langchain_ollama import OllamaLLM
random.seed(42)
from tqdm.auto import tqdm

## AG News Dataset

In [2]:
# Number of articles drawn from each news class when the evaluation sample is built.
sample_size_per_class = 50

In [3]:
# Read the AG News training CSV.
ag_train = pd.read_csv('../../data/AG News/train.csv')

# Readable names for the numeric class ids, plus the reverse lookup.
class_to_text_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}
text_to_class_mapping = {label: class_id for class_id, label in class_to_text_mapping.items()}
ag_train['Class'] = ag_train['Class Index'].map(class_to_text_mapping)

# Draw an equally sized sample from every class. Each draw gets its own
# random_state=42 so every class is sampled from a freshly seeded generator.
samples_by_class = {
    class_id: ag_train.loc[ag_train['Class Index'] == class_id].sample(
        n=sample_size_per_class, random_state=42
    )
    for class_id in class_to_text_mapping
}
ag_train_world_sample = samples_by_class[1]
ag_train_sports_sample = samples_by_class[2]
ag_train_business_sample = samples_by_class[3]
ag_train_science_sample = samples_by_class[4]

# Stack the per-class samples in class-id order, shuffle the rows with a fixed
# seed, and renumber them from zero.
ag_train_sample = (
    pd.concat(list(samples_by_class.values()), ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# !ollama pull gemma3
# !ollama pull gemma3:12b-it-qat

In [5]:
# LLM handle passed to assign_topics below. The commented-out line selects a
# different model tag; swap the two lines to switch models.
# llm = OllamaLLM(model="gemma3:12b-it-qat")
llm = OllamaLLM(model="gemma3")

In [6]:
# Distinct topic names present in the sample; offered to the LLM as the candidate labels.
topics = ag_train_sample['Class'].unique()

def assign_topics(input_df, topics, llm=None, checkpoint_interval=300, noise_strategies=None):
    """Label every news description in ``input_df`` with a topic chosen by an LLM.

    Each row's ``Description`` is embedded in a classification prompt and sent
    to ``llm.invoke``. The stripped response is stored verbatim in a new
    ``Predicted Topic`` column; it is not validated against ``topics``. Rows
    whose LLM call (or response clean-up) raises keep the placeholder
    ``'Unknown'`` and processing continues with the next row.

    After the loop a ``Predicted Topic Index`` column is derived through the
    module-level ``text_to_class_mapping``; responses that are not keys of that
    mapping become NaN. The resulting frame is written to
    ``../../data/AG News/train_from_llm.csv``.

    Args:
        input_df: DataFrame with a ``Description`` column. It is copied, not
            modified.
        topics: Candidate topics; interpolated as-is into the prompt text.
        llm: Object exposing ``invoke(prompt)`` that returns a string. Required.
        checkpoint_interval: Currently unused (periodic checkpointing is
            disabled); kept so existing callers keep working.
        noise_strategies: Currently unused; kept so existing callers keep working.

    Returns:
        A copy of ``input_df`` with the ``Predicted Topic`` and
        ``Predicted Topic Index`` columns added.

    Raises:
        ValueError: If ``llm`` is None.
    """
    output_filename = '../../data/AG News/train_from_llm.csv'

    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Work on a copy so the caller's DataFrame is left untouched.
    df = input_df.copy()

    # Placeholder that survives for any row the LLM fails on.
    df['Predicted Topic'] = 'Unknown'
    # Positional column index is loop-invariant, so look it up once.
    topic_col = df.columns.get_loc('Predicted Topic')

    # Process each news item individually with a progress bar.
    for idx in tqdm(range(len(df)), desc="Assigning topics"):
        news_item = df.iloc[idx]['Description']

        # Generate prompt for current item
        prompt_assigning_prompt = f'''You are provided with news and helping to classify them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective news.
News can be found in tripletick block: ```{news_item}```
Topics to choose from: {topics}
Please return ONLY the topic name. DO NOT OUTPUT any additional text, quotes, or formatting.'''

        # Reset per row so the error handler below never reports the previous
        # row's response (or hits an unbound name when the first call fails).
        result = None
        try:
            result = llm.invoke(prompt_assigning_prompt)
            # Clean the result: trim whitespace and unwrap a ``` fenced answer.
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()

            # Update the DataFrame with the assigned topic
            df.iloc[idx, topic_col] = result

            # NOTE: periodic checkpointing (every ``checkpoint_interval`` rows)
            # is not implemented; only the final CSV below is written.

        except Exception as e:
            # Best effort: log and move on, leaving 'Unknown' for this row.
            print(f"\nError processing item {idx}: {str(e)}")
            print(f"Result received: {result}")
            continue

    # Relies on the module-level text_to_class_mapping; unmapped answers -> NaN.
    df['Predicted Topic Index'] = df['Predicted Topic'].map(text_to_class_mapping)
    df.to_csv(output_filename, index=False)
    print(f"\nFinal results saved: {output_filename}")

    return df

In [7]:
# Classify the sampled articles. The topics array is passed in its string form
# because assign_topics interpolates it directly into the prompt text.
result = assign_topics(
    ag_train_sample,
    topics=str(topics),
    llm=llm,
)

Assigning topics:   0%|          | 0/200 [00:00<?, ?it/s]


Final results saved: ../../data/AG News/train_from_llm.csv


In [8]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Compare topic names directly: both columns hold text labels, so no numeric
# conversion is needed here.
predicted_labels = result['Predicted Topic']
true_labels = result['Class']

# Calculate support-weighted metrics. zero_division=0 spells out how
# scikit-learn already scores a label whose precision/recall is undefined
# (as 0), which keeps the numbers unchanged while silencing the
# UndefinedMetricWarning.
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.7550
F1 Score: 0.7344
Precision: 0.8082
Recall: 0.7550


  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# NOTE(review): this cell repeats the preceding evaluation cell verbatim;
# presumably it was re-run after `result` was regenerated — confirm, and
# consider wrapping the evaluation in a function that is called once per run.

# Compare topic names directly: both columns hold text labels, so no numeric
# conversion is needed here.
predicted_labels = result['Predicted Topic']
true_labels = result['Class']

# Calculate support-weighted metrics. zero_division=0 spells out how
# scikit-learn already scores a label whose precision/recall is undefined
# (as 0), which keeps the numbers unchanged while silencing the
# UndefinedMetricWarning.
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.7750
F1 Score: 0.7509
Precision: 0.8212
Recall: 0.7750


  _warn_prf(average, modifier, msg_start, len(result))
