# Metrics Calculation Script

This code evaluates a multi-label, multi-class classification model by comparing predicted and ground truth data from CSV files. It calculates key metrics such as precision, recall, F1 score, and Exact Match Ratio (EMR) for both main roles and subclasses. The results include detailed performance metrics for each role and subclass, along with average F1 scores and EMR.

**Important Notes:**
- Our test split already contained role and subrole classifications. During test prompt generation, the responses containing these roles and subroles were simply discarded (see "***DP_Prompt_Generation.ipynb***" for more details), which is why test.csv can be used as the ground truth here.
- The "generated_predictions.csv" file is generated in "***Inferencing.ipynb***".  

In [None]:
from collections import defaultdict
from sklearn.metrics import f1_score, precision_score, recall_score
from collections import defaultdict
import csv
import ast

# Mount Google Drive at /content/drive so that the files under BASE_DIR can be read.
# NOTE(review): this depends on the `google.colab` package, so it presumably only
# runs inside a Colab session — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

# Root folder on the mounted drive; every project file is addressed relative to it.
BASE_DIR = "/content/drive/MyDrive/Llama_3B_Instruct_with_Pre-constructed_Prompts"


def path_builder(relative_path):
    """Resolve *relative_path* against BASE_DIR and return the result as a string.

    Args:
        relative_path: Path of a file or folder relative to BASE_DIR.

    Returns:
        The joined path as a plain ``str``.
    """
    from pathlib import Path

    base = Path(BASE_DIR)
    return str(base.joinpath(relative_path))

Mounted at /content/drive


In [None]:
# Label taxonomy: each main role mapped to the fine-grained subroles that belong to it.
# Insertion order is the order in which the metrics are reported.
_MAIN_CLASS_TO_SUBCLASSES = {
    'Antagonist': ['Instigator', 'Conspirator', 'Tyrant', 'Foreign Adversary',
                   'Traitor', 'Spy', 'Saboteur', 'Corrupt', 'Incompetent',
                   'Terrorist', 'Deceiver', 'Bigot'],
    'Protagonist': ['Guardian', 'Martyr', 'Peacemaker', 'Rebel', 'Underdog', 'Virtuous'],
    'Innocent': ['Forgotten', 'Exploited', 'Victim', 'Scapegoat'],
}


def _parse_label_list(raw, strip_quotes=False):
    """Turn the textual form of a subclass list into a list of label strings.

    Args:
        raw: Cell content, expected to be a Python list literal such as
            ``"['Spy', 'Tyrant']"``.
        strip_quotes: When true, leading/trailing ``"`` characters are removed
            before parsing.

    Returns:
        A list of label strings. A bare string literal is treated as a
        one-element list; non-string items are dropped; anything that cannot be
        parsed yields an empty list (a message is printed).
    """
    text = raw.strip('"') if strip_quotes else raw
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the errors ast.literal_eval raises on malformed input.
        print(f"Error parsing subclasses_str: {raw}. Error: {e}")
        return []

    if isinstance(parsed, str):
        # Without this, set('Guardian') downstream would split the label into characters.
        return [parsed]
    if isinstance(parsed, (list, tuple, set, frozenset)):
        # Only strings can match a known subclass; this also removes unhashable
        # items (nested lists/dicts) that would make set() raise later on.
        return [label for label in parsed if isinstance(label, str)]

    print(f"Error parsing subclasses_str: {raw}. Error: expected a list of labels, "
          f"got {type(parsed).__name__}")
    return []


def _load_labels(csv_path, expected_columns, strip_quotes=False):
    """Read one sample per CSV data row, taking the labels from the last two columns.

    Args:
        csv_path: Path of the CSV file; the first row is treated as a header.
        expected_columns: Exact number of columns every data row must have.
        strip_quotes: Forwarded to ``_parse_label_list``.

    Returns:
        A list of ``{'main_class': str, 'subclasses': list}`` dicts in file order.

    Raises:
        ValueError: If a non-blank data row does not have ``expected_columns`` columns.
    """
    samples = []
    # newline='' lets the csv module handle quoted fields with embedded newlines.
    with open(csv_path, 'r', newline='', encoding='utf-8') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # Skip the header row; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they are not samples.
                continue
            if len(row) != expected_columns:
                raise ValueError(
                    f"{csv_path}, line {reader.line_num}: expected "
                    f"{expected_columns} columns, got {len(row)}"
                )
            main_role, subclasses_str = row[-2:]
            samples.append({
                'main_class': main_role,
                'subclasses': _parse_label_list(subclasses_str, strip_quotes),
            })
    return samples


def _update_confusion(counts, is_true, is_predicted):
    """Increment the TP/FN/FP/TN cell of *counts* that matches one sample."""
    if is_true and is_predicted:
        counts['TP'] += 1
    elif is_true:
        counts['FN'] += 1
    elif is_predicted:
        counts['FP'] += 1
    else:
        counts['TN'] += 1


def _precision_recall_f1(counts):
    """Derive precision, recall and F1 from TP/FP/FN counts (0.0 when undefined)."""
    tp, fp, fn = counts['TP'], counts['FP'], counts['FN']
    f1_denominator = 2 * tp + fp + fn
    return {
        'precision': tp / (tp + fp) if tp + fp > 0 else 0.0,
        'recall': tp / (tp + fn) if tp + fn > 0 else 0.0,
        'f1_score': 2 * tp / f1_denominator if f1_denominator > 0 else 0.0,
    }


def evaluate_predictionss(pred_file, ground_truth_file):
    """Score predicted roles/subroles against the ground truth.

    Samples are matched by row position: the n-th data row of ``pred_file`` is
    compared with the n-th data row of ``ground_truth_file``. Ground-truth rows
    without a corresponding prediction count as "nothing predicted"; surplus
    prediction rows are ignored.

    Args:
        pred_file: CSV with a header and two columns: main role, subclass list.
        ground_truth_file: CSV with a header and six columns; the last two are
            main role and subclass list.

    Returns:
        A dict with
        ``'main_class'``: ``{role: {'precision', 'recall', 'f1_score'}}``,
        ``'subclasses'``: ``{subrole: {'precision', 'recall', 'f1_score'}}``,
        ``'EMR'``: fraction of samples whose main role and subclass set both
        match exactly (0.0 when there are no ground-truth samples).

    Raises:
        ValueError: If a data row has an unexpected number of columns.
    """
    ground_truth = _load_labels(ground_truth_file, expected_columns=6)
    predictions = _load_labels(pred_file, expected_columns=2, strip_quotes=True)

    all_main_classes = list(_MAIN_CLASS_TO_SUBCLASSES)
    all_subclasses = [
        subclass
        for subclasses in _MAIN_CLASS_TO_SUBCLASSES.values()
        for subclass in subclasses
    ]

    main_counts = {cls: {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0} for cls in all_main_classes}
    subclass_counts = {sub: {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0} for sub in all_subclasses}
    exact_matches = 0

    # Stand-in for ground-truth rows that have no prediction row.
    no_prediction = {'main_class': None, 'subclasses': []}

    for idx, gt_data in enumerate(ground_truth):
        pred_data = predictions[idx] if idx < len(predictions) else no_prediction

        gt_main_class = gt_data['main_class']
        pred_main_class = pred_data['main_class']
        gt_subclasses = set(gt_data['subclasses'])
        pred_subclasses = set(pred_data['subclasses'])

        # One-vs-rest confusion counts for every main role.
        for main_class in all_main_classes:
            _update_confusion(
                main_counts[main_class],
                gt_main_class == main_class,
                pred_main_class == main_class,
            )

        # Per-label confusion counts for the multi-label subclasses.
        for subclass in all_subclasses:
            _update_confusion(
                subclass_counts[subclass],
                subclass in gt_subclasses,
                subclass in pred_subclasses,
            )

        # Exact match: main role and the full subclass set must both agree.
        if gt_main_class == pred_main_class and gt_subclasses == pred_subclasses:
            exact_matches += 1

    total_samples = len(ground_truth)
    return {
        'main_class': {cls: _precision_recall_f1(c) for cls, c in main_counts.items()},
        'subclasses': {sub: _precision_recall_f1(c) for sub, c in subclass_counts.items()},
        # Avoid ZeroDivisionError when the ground-truth file has no data rows.
        'EMR': exact_matches / total_samples if total_samples else 0.0,
    }


def main():
    """Evaluate the generated predictions against the test split and print a report.

    Prints precision, recall and F1 for every main class and subclass, followed
    by the Exact Match Ratio and the average F1 scores.
    """
    pred_file = path_builder("Dataset_EN_PT/generated_predictions.csv")
    ground_truth_file = path_builder("Dataset_EN_PT/test_data/test.csv")

    results = evaluate_predictionss(pred_file, ground_truth_file)

    def average(scores):
        # An empty list (e.g. every subclass F1 is zero) must not raise ZeroDivisionError.
        return sum(scores) / len(scores) if scores else 0.0

    # Print metrics
    print("Main Class Metrics:")
    main_class_f1 = []
    for main_class, stats in results['main_class'].items():
        print(f"{main_class}: Precision={stats['precision']:.2f}, Recall={stats['recall']:.2f}, F1 Score={stats['f1_score']:.2f}")
        main_class_f1.append(stats['f1_score'])

    print("\nSubclass Metrics:")
    subclass_f1 = []
    for subclass, stats in results['subclasses'].items():
        print(f"{subclass}: Precision={stats['precision']:.2f}, Recall={stats['recall']:.2f}, F1 Score={stats['f1_score']:.2f}")
        # Only subclasses with a non-zero F1 enter the average.
        # NOTE(review): this makes the figure higher than a standard macro-F1
        # over all subclasses — confirm that the exclusion is intended.
        if stats['f1_score'] > 0:
            subclass_f1.append(stats['f1_score'])

    print(f"\nExact Match Ratio (EMR): {results['EMR']:.2f}")
    print(f"Average Subclass F1 Score: {average(subclass_f1):.2f}")
    print(f"Average Main Class F1 Score: {average(main_class_f1):.2f}")


# Run the evaluation only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()


Main Class Metrics:
Antagonist: Precision=0.71, Recall=0.87, F1 Score=0.78
Protagonist: Precision=0.74, Recall=0.59, F1 Score=0.66
Innocent: Precision=0.83, Recall=0.67, F1 Score=0.74

Subclass Metrics:
Instigator: Precision=0.00, Recall=0.00, F1 Score=0.00
Conspirator: Precision=0.50, Recall=0.25, F1 Score=0.33
Tyrant: Precision=0.23, Recall=0.20, F1 Score=0.21
Foreign Adversary: Precision=0.33, Recall=0.19, F1 Score=0.24
Traitor: Precision=0.00, Recall=0.00, F1 Score=0.00
Spy: Precision=0.00, Recall=0.00, F1 Score=0.00
Saboteur: Precision=0.00, Recall=0.00, F1 Score=0.00
Corrupt: Precision=0.00, Recall=0.00, F1 Score=0.00
Incompetent: Precision=0.14, Recall=0.18, F1 Score=0.16
Terrorist: Precision=0.00, Recall=0.00, F1 Score=0.00
Deceiver: Precision=0.00, Recall=0.00, F1 Score=0.00
Bigot: Precision=0.50, Recall=0.33, F1 Score=0.40
Guardian: Precision=0.43, Recall=0.68, F1 Score=0.53
Martyr: Precision=1.00, Recall=0.33, F1 Score=0.50
Peacemaker: Precision=0.00, Recall=0.00, F1 Score=0