In [12]:
# Run once, before the first execution, to download the model (uncomment the next line):
# !ollama pull gemma3

In [27]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from langchain_ollama import OllamaLLM
from tqdm.auto import tqdm
import pandas as pd
import random
random.seed(42)


In [28]:

def calculate_metrics(df, predicted_col='Predicted Topic Index', true_col='Class Index', unmapped_label=-1):
    """
    Calculate classification metrics for news topic prediction.

    Missing predictions (NaN in ``predicted_col``, e.g. a reply that matched no
    known class) are replaced by ``unmapped_label`` before scoring, so they are
    counted as misclassifications instead of being handed to scikit-learn as
    NaN labels, which its metric functions reject.

    Args:
        df: DataFrame containing the predictions and true labels
        predicted_col: Column name for predicted labels (numeric class index,
            NaN where no class could be assigned)
        true_col: Column name for true labels (numeric class index)
        unmapped_label: Placeholder substituted for missing predictions; it
            must not be one of the true labels, otherwise those rows could be
            scored as correct

    Returns:
        Dictionary with the keys 'accuracy', 'f1_score', 'precision' and
        'recall'; the last three are weighted averages over the classes
    """
    true_labels = df[true_col].tolist()

    predicted = df[predicted_col]
    n_unmapped = int(predicted.isna().sum())
    if n_unmapped:
        print(f"Warning: {n_unmapped} of {len(predicted)} predictions are missing and counted as errors")
        # fillna returns a new Series, so the caller's DataFrame is left untouched.
        predicted = predicted.fillna(unmapped_label)
    predicted_labels = predicted.tolist()

    # zero_division=0 keeps the value scikit-learn would use anyway for classes
    # without predicted/true samples (such as the placeholder), minus the warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)

    metrics = {
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall
    }

    # Print the metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    return metrics

In [29]:
def _match_topic_reply(reply, topic_names):
    """Clean one raw LLM reply and, when possible, snap it to a known topic name."""
    cleaned = reply.strip()
    if cleaned.startswith('```') and cleaned.endswith('```'):
        cleaned = cleaned[3:-3].strip()
    if cleaned in topic_names:
        return cleaned
    # Tolerate quotes, markdown emphasis, a trailing period and different casing.
    wanted = cleaned.strip('\'"`*. \t\r\n').casefold()
    for name in topic_names:
        if name.casefold() == wanted:
            return name
    # No known topic matched: keep the cleaned reply so it stays visible in the output.
    return cleaned


def assign_topics(input_df, topics, mapping, llm=None, output_path=None):
    """
    Assign one topic to every row of ``input_df`` by prompting an LLM.

    The 'Description' text of each row is sent to ``llm.invoke`` in its own
    prompt. The cleaned reply is stored in 'Predicted Topic', translated with
    ``mapping`` into 'Predicted Topic Index', and the resulting frame is
    written to ``output_path`` as CSV.

    Args:
        input_df: DataFrame with a 'Description' column; it is not modified
        topics: Topics to choose from, interpolated into the prompt as given
        mapping: Topic name -> class index, applied with ``Series.map``. When
            it is a dict, replies that differ from one of its string keys only
            by case or surrounding quotes/punctuation are replaced by that key
        llm: Object whose ``invoke(prompt)`` returns the reply text
        output_path: Destination of the CSV file

    Returns:
        Copy of ``input_df`` with the columns 'Predicted Topic' ('Unknown' for
        rows whose LLM call failed) and 'Predicted Topic Index' (NaN where the
        topic is not covered by ``mapping``)

    Raises:
        ValueError: If ``llm`` or ``output_path`` is None
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")
    if output_path is None:
        raise ValueError("Output path must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic column with 'Unknown'
    df['Predicted Topic'] = 'Unknown'
    topic_col = df.columns.get_loc('Predicted Topic')

    # Series.map also accepts callables and Series; only a dict exposes the topic names.
    if isinstance(mapping, dict):
        topic_names = [key for key in mapping if isinstance(key, str)]
    else:
        topic_names = []

    # Process each news item individually with progress bar
    descriptions = df['Description'].tolist()
    for idx, news_item in enumerate(tqdm(descriptions, desc="Assigning topics")):
        # Generate prompt for current item
        prompt = f'''You are provided with news and helping to classify them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective news.
News can be found in tripletick block: ```{news_item}```
Topics to choose from: {topics}
Please return ONLY the topic name. DO NOT OUTPUT any additional text, quotes, or formatting. Only 1 topic  should be returned.'''

        # Reset per row: otherwise the handler below would hit an unbound name when
        # the very first call fails, or report the previous row's reply.
        result = None
        try:
            result = llm.invoke(prompt)
            result = _match_topic_reply(result, topic_names)

            # Update the DataFrame with the assigned topic
            df.iloc[idx, topic_col] = result

        except Exception as e:
            # Best effort: the row keeps 'Unknown' and processing continues.
            print(f"\nError processing item {idx}: {str(e)}")
            print(f"Result received: {result}")
            continue

    df['Predicted Topic Index'] = df['Predicted Topic'].map(mapping)
    n_unmapped = int(df['Predicted Topic Index'].isna().sum())
    if n_unmapped:
        print(f"\nWarning: {n_unmapped} of {len(df)} predicted topics have no class index")
    df.to_csv(output_path, index=False)
    print(f"\nFinal results saved: {output_path}")

    return df

In [30]:
# Ollama client for the gemma3 model; passed as `llm` to the assign_topics calls below.
# NOTE(review): no sampling parameters are set here, so random.seed(42) in the import
# cell presumably does not make the model's replies reproducible — confirm.
llm = OllamaLLM(model="gemma3")

## AG News Dataset

In [31]:
# Number of rows drawn from each class when the per-dataset samples are built below.
sample_size_per_class = 20

In [None]:
# Label lookups for AG News: numeric class index <-> readable topic name.
class_to_text_mapping = {1: "World", 2: "Sports", 3: "Business", 4: "Science"}
text_to_class_mapping = {name: index for index, name in class_to_text_mapping.items()}

# Load the training split and attach the readable topic name to every row.
ag_train = pd.read_csv('../../data/AG News/train.csv')
ag_train['Class'] = ag_train['Class Index'].map(class_to_text_mapping)

# Draw the same number of rows from every class, each with the same fixed seed.
ag_train_world, ag_train_sports, ag_train_business, ag_train_science = [
    ag_train.loc[ag_train['Class Index'] == class_index].sample(sample_size_per_class, random_state=42)
    for class_index in (1, 2, 3, 4)
]

# Stack the per-class samples, shuffle the rows and renumber them from 0.
ag_train = (
    pd.concat(
        [ag_train_world, ag_train_sports, ag_train_business, ag_train_science],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Distinct topic names present in the shuffled sample.
topics_ag_news = ag_train['Class'].unique()

In [None]:
# Classify the sampled AG News rows with the LLM; the result is also written to CSV.
# The topic list reaches the prompt as the string form of the unique class names.
ag_news_output = assign_topics(
    ag_train,
    topics=str(topics_ag_news),
    mapping=text_to_class_mapping,
    llm=llm,
    output_path='../../data/AG News/train_from_llm.csv'
)

Assigning topics:   0%|          | 0/80 [00:00<?, ?it/s]


Final results saved: ../../data/AG News/train_from_llm.csv


In [34]:
# Score the LLM predictions against the AG News labels using the default column names.
ag_metrics = calculate_metrics(ag_news_output)

Accuracy: 0.8125
F1 Score: 0.7970
Precision: 0.8658
Recall: 0.8125


## BBC News Dataset

In [None]:
# Load BBC News; the text column becomes 'Description' (the column assign_topics
# reads) and the label column becomes 'Class'.
bbc_full = pd.read_csv('../../data/BBC News/BBC News Train.csv').rename(
    columns={'Category': 'Class', 'Text': 'Description'}
)

# Label lookups for BBC News: topic name <-> numeric class index.
bbc_class_to_index = {
    'business': 1,
    'tech': 2,
    'entertainment': 3,
    'politics': 4,
    'sport': 5
}
bbc_index_to_class = {index: name for name, index in bbc_class_to_index.items()}

bbc_full['Class Index'] = bbc_full['Class'].map(bbc_class_to_index)

# Draw the same number of rows from every class, each with the same fixed seed.
(
    bbc_train_business,
    bbc_train_tech,
    bbc_train_entertainment,
    bbc_train_politics,
    bbc_train_sport,
) = [
    bbc_full.loc[bbc_full['Class Index'] == class_index].sample(sample_size_per_class, random_state=42)
    for class_index in (1, 2, 3, 4, 5)
]

# Stack the five per-class samples; at this point the rows still carry their
# bbc_full row labels, which identify what has to be left out of the test set.
bbc_train = pd.concat([
    bbc_train_business,
    bbc_train_tech,
    bbc_train_entertainment,
    bbc_train_politics,
    bbc_train_sport
])
bbc_train_indices = bbc_train.index

# Everything that was not sampled becomes the test set, saved next to the raw data.
bbc_test = bbc_full.drop(bbc_train_indices).reset_index(drop=True)
bbc_test.to_csv('../../data/BBC News/test.csv', index=False)

# Shuffle the combined sample and renumber the rows from 0.
bbc_train = bbc_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Distinct topic names present in the shuffled sample.
topics_bbc_news = bbc_train['Class'].unique()

In [24]:
# Classify the sampled BBC News rows with the LLM; the result is also written to CSV.
# The topic list reaches the prompt as the string form of the unique class names.
bbc_news_output = assign_topics(
    bbc_train,
    topics=str(topics_bbc_news),
    llm=llm,
    mapping=bbc_class_to_index,
    output_path='../../data/BBC News/train_from_llm.csv'
)

Assigning topics:   0%|          | 0/100 [00:00<?, ?it/s]


Final results saved: ../../data/BBC News/train_from_llm.csv


In [26]:
# Score the LLM predictions against the BBC News labels using the default column names.
bbc_metrics = calculate_metrics(bbc_news_output)

Accuracy: 0.8800
F1 Score: 0.8777
Precision: 0.8846
Recall: 0.8800


## 20 Newsgroups Dataset