In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
from collections import Counter

In [2]:
def load_data_from_csv(manual_file_path, gpt_file_path):
    """Read the two annotation CSV files and return them as DataFrames.

    Both files must provide a ``full_text`` column. The two columns are
    compared with ``Series.equals``; if they differ, a warning is printed
    and the frames are still returned as loaded.

    Args:
        manual_file_path: Path of the first CSV (the manual annotations).
        gpt_file_path: Path of the second CSV (the GPT annotations).

    Returns:
        A ``(manual_df, gpt_df)`` tuple of ``pandas.DataFrame`` objects.
    """
    manual_frame, gpt_frame = (
        pd.read_csv(csv_path) for csv_path in (manual_file_path, gpt_file_path)
    )

    # Only report a mismatch; callers decide whether to continue.
    texts_identical = manual_frame['full_text'].equals(gpt_frame['full_text'])
    if not texts_identical:
        print("Warning: The texts in the two CSV files don't match exactly.")

    return manual_frame, gpt_frame

In [3]:
# Locations of the two annotation exports that are compared below
manual_file_path = "./labeled_data/labeled_sentiment_manual_annotation.csv"
gpt_file_path = "./labeled_data/labeled_sentiment_gpt_for_cohens_kappa.csv"

# Read both annotation sets into DataFrames
manual_df, gpt_df = load_data_from_csv(
    manual_file_path=manual_file_path,
    gpt_file_path=gpt_file_path,
)

In [4]:
def calculate_cohen_kappa(manual_df, gpt_df):
    """Compute Cohen's kappa between the ``label`` columns of two DataFrames.

    Args:
        manual_df: DataFrame whose ``label`` column holds the first rater's labels.
        gpt_df: DataFrame whose ``label`` column holds the second rater's labels.

    Returns:
        A ``(kappa, manual_annotations, gpt_annotations)`` tuple: the value
        returned by ``cohen_kappa_score`` followed by the two ``label``
        columns converted to plain Python lists.
    """
    # Pull both label columns out as lists, in the same order as the arguments.
    manual_annotations, gpt_annotations = (
        frame['label'].tolist() for frame in (manual_df, gpt_df)
    )

    return (
        cohen_kappa_score(manual_annotations, gpt_annotations),
        manual_annotations,
        gpt_annotations,
    )

In [5]:
def interpret_kappa(kappa):
    """Translate a Cohen's kappa value into a qualitative agreement label.

    Bands (upper bounds are exclusive):
        kappa < 0          -> "Poor agreement (less than chance)"
        0   <= kappa < 0.2 -> "Slight agreement"
        0.2 <= kappa < 0.4 -> "Fair agreement"
        0.4 <= kappa < 0.6 -> "Moderate agreement"
        0.6 <= kappa < 0.8 -> "Substantial agreement"
        kappa >= 0.8       -> "Almost perfect agreement"

    Args:
        kappa: The kappa statistic as a real number (Python or NumPy float).

    Returns:
        The label for the band containing ``kappa``, or
        "Undefined agreement (kappa is NaN)" when ``kappa`` is NaN.
    """
    import math  # local import so the function does not rely on another cell

    # NaN compares False against every threshold, so without this guard it
    # would fall through to the final "Almost perfect agreement" result.
    if math.isnan(kappa):
        return "Undefined agreement (kappa is NaN)"

    if kappa < 0:
        return "Poor agreement (less than chance)"

    # (exclusive upper bound, label), checked in ascending order
    bands = (
        (0.2, "Slight agreement"),
        (0.4, "Fair agreement"),
        (0.6, "Moderate agreement"),
        (0.8, "Substantial agreement"),
    )
    for upper_bound, description in bands:
        if kappa < upper_bound:
            return description

    return "Almost perfect agreement"

In [6]:
# Score the two annotation sets; the label lists are kept for the later cells
kappa, manual_annotations, gpt_annotations = calculate_cohen_kappa(
    manual_df=manual_df,
    gpt_df=gpt_df,
)

# sep="\n" prints the score and its interpretation on separate lines
print(
    f"Cohen's Kappa: {kappa:.4f}",
    f"Kappa Interpretation: {interpret_kappa(kappa)}",
    sep="\n",
)

Cohen's Kappa: 0.8154
Kappa Interpretation: Almost perfect agreement


In [7]:
def create_confusion_matrix(y_true, y_pred):
    """Build a confusion matrix from two equally long label sequences.

    Rows are indexed by the label in ``y_true`` and columns by the label in
    ``y_pred``; both axes use the sorted union of all labels that occur.

    Args:
        y_true: Iterable of reference labels (list, tuple, array, Series, ...).
        y_pred: Iterable of labels to compare against ``y_true``.

    Returns:
        A ``(cm, labels)`` tuple where ``cm`` is a square integer
        ``numpy.ndarray`` of pair counts and ``labels`` is the sorted list of
        distinct labels giving the row/column order.

    Raises:
        ValueError: If the two inputs do not have the same number of items.
    """
    # Materialize both inputs so any iterable is accepted and lengths are known.
    y_true = list(y_true)
    y_pred = list(y_pred)

    # zip() would silently drop the unmatched tail, so reject unequal lengths.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )

    # Set union works for any input types, unlike concatenating with `+`.
    labels = sorted(set(y_true) | set(y_pred))
    label_to_idx = {label: idx for idx, label in enumerate(labels)}

    cm = np.zeros((len(labels), len(labels)), dtype=int)
    for true_label, pred_label in zip(y_true, y_pred):
        cm[label_to_idx[true_label], label_to_idx[pred_label]] += 1

    return cm, labels

In [8]:
# Tabulate manual labels (rows) against GPT labels (columns)
cm, labels = create_confusion_matrix(
    y_true=manual_annotations,
    y_pred=gpt_annotations,
)

# Print the label order first so the matrix rows and columns can be read
print("\nConfusion Matrix:")
print("Labels:", labels)
print(cm)


Confusion Matrix:
Labels: ['negatif', 'netral', 'positif']
[[53  1  2]
 [ 3  5  0]
 [ 3  1 32]]


In [9]:
def per_class_metrics(cm, labels):
    """Compute one-vs-rest agreement counts for every class of a confusion matrix.

    For the class at index ``i`` the matrix is collapsed to a 2x2 view:
    the diagonal cell is the true-positive count, the rest of column ``i``
    the false positives, the rest of row ``i`` the false negatives, and
    everything else the true negatives.

    Args:
        cm: Square confusion matrix (``numpy.ndarray`` or nested sequence).
        labels: Class labels in the same order as the rows/columns of ``cm``.

    Returns:
        A dict mapping each label to a dict with the keys ``agreement``
        (``(TP + TN) / total``, NaN when the matrix sums to zero),
        ``true_positive``, ``false_positive``, ``false_negative`` and
        ``true_negative``.
    """
    # Accept plain nested lists as well as arrays.
    cm = np.asarray(cm)

    # The grand total does not depend on the class, so compute it once.
    total = cm.sum()

    metrics = {}
    for i, label in enumerate(labels):
        true_pos = cm[i, i]
        false_pos = cm[:, i].sum() - true_pos   # rest of column i
        false_neg = cm[i, :].sum() - true_pos   # rest of row i

        # Items that neither side assigned to this class
        true_neg = total - true_pos - false_pos - false_neg

        # Avoid a 0/0 division (and its RuntimeWarning) on an all-zero matrix.
        agreement = (true_pos + true_neg) / total if total else float("nan")

        metrics[label] = {
            "agreement": agreement,
            "true_positive": true_pos,
            "false_positive": false_pos,
            "false_negative": false_neg,
            "true_negative": true_neg
        }

    return metrics

In [10]:
 # Calculate and display per-class metrics
class_metrics = per_class_metrics(cm=cm, labels=labels)
print("\nPer-Class Metrics:")
for label, metrics in class_metrics.items():
    # One print call per class; sep="\n" keeps every field on its own line
    print(
        f"\nClass: {label}",
        f"Agreement: {metrics['agreement']:.4f}",
        f"True Positives: {metrics['true_positive']}",
        f"False Positives: {metrics['false_positive']}",
        f"False Negatives: {metrics['false_negative']}",
        f"True Negatives: {metrics['true_negative']}",
        sep="\n",
    )

# Raw agreement: share of items on which both label lists hold the same value
total = len(manual_annotations)
correct = sum(
    1
    for m, g in zip(manual_annotations, gpt_annotations)
    if m == g
)
percentage_agreement = correct / total * 100
print(f"\nOverall Percentage Agreement: {percentage_agreement:.2f}%")


Per-Class Metrics:

Class: negatif
Agreement: 0.9100
True Positives: 53
False Positives: 6
False Negatives: 3
True Negatives: 38

Class: netral
Agreement: 0.9500
True Positives: 5
False Positives: 2
False Negatives: 3
True Negatives: 90

Class: positif
Agreement: 0.9400
True Positives: 32
False Positives: 2
False Negatives: 4
True Negatives: 62

Overall Percentage Agreement: 90.00%


In [11]:
# Label frequencies for each annotation list; sep="\n" places every Counter
# on the line below its heading
print(
    "\nDistribution of Manual Annotations:",
    Counter(manual_annotations),
    sep="\n",
)
print(
    "\nDistribution of GPT-4o-mini Annotations:",
    Counter(gpt_annotations),
    sep="\n",
)


Distribution of Manual Annotations:
Counter({'negatif': 56, 'positif': 36, 'netral': 8})

Distribution of GPT-4o-mini Annotations:
Counter({'negatif': 59, 'positif': 34, 'netral': 7})
