# In [None]:
"""
Semantic Feature Confidence Analysis

This script analyzes confidence scores from GPT-4o's semantic feature extraction results
on semantically chunked stream-of-consciousness text data. It examines the model's confidence in classifying
five key psycholinguistic features:

1. Cognitive Flexibility
2. Narrative and Discourse Coherence
3. Emotional Tone
4. Self-Reflection Depth
5. Analytical Thinking

Process Overview:
- Loads structured feature extraction results from a JSON Lines file
- Processes and aggregates confidence scores across all features
- Generates statistical summaries and visualizations
- Identifies notable examples of high/low confidence patterns (for report)

The analysis helps understand:
- Distribution of confidence scores across different features
- Patterns in model uncertainty
- Feature-specific confidence variations

"""

import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any

###############################################################################
# CONSTANTS AND CONFIGURATIONS
###############################################################################
# Psycholinguistic features whose confidence scores are analysed. This stays a
# list (not a tuple) because it is used directly as a pandas column selector,
# e.g. confidence_df[CONFIDENCE_FEATURES].
CONFIDENCE_FEATURES = [
    'Cognitive Flexibility',
    'Narrative and Discourse Coherence',
    'Emotional Tone',
    'Self-Reflection Depth',
    'Analytical Thinking',
]

# Data path for feature extraction results
DATA_PATH = '/content/drive/MyDrive/MSC thesis/final_datasets/semantic_feature_extraction/cot-structured_feature_extraction_openai_gpt-4o_temp_1.0.json'

###############################################################################
# DATA LOADING AND PROCESSING
###############################################################################
# CONFIDENCE_FEATURES is defined exactly once, in the constants section above,
# so there is a single source of truth for the analysed feature names.
# load_feature_data() is defined near the end of the file, next to the
# __main__ entry point.

###############################################################################
# FEATURE DATA EXTRACTION
###############################################################################
def extract_feature_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract feature data from model outputs into a structured format.

    Args:
        df: DataFrame with one row per chunk. The columns 'author_id',
            'chunk_number' and 'model_output' are read. Rows whose
            'model_output' is not a dict containing a 'features' key are
            skipped without a message.

    Returns:
        Long-format DataFrame with one row per extracted feature and the
        columns 'author_id', 'chunk_number', 'feature', 'result' and
        'confidence_score' (converted to float). The columns are present even
        when no feature could be extracted.
    """
    columns = ['author_id', 'chunk_number', 'feature', 'result', 'confidence_score']
    feature_data = []

    for index, row in df.iterrows():
        try:
            model_output = row['model_output']
            if not (isinstance(model_output, dict) and 'features' in model_output):
                continue
            for feature in model_output['features']:
                feature_data.append({
                    'author_id': row['author_id'],
                    'chunk_number': row['chunk_number'],
                    'feature': feature['feature'],
                    'result': feature['result'],
                    'confidence_score': float(feature['confidence_score'])
                })
        except (KeyError, TypeError, ValueError) as e:
            # Best effort: report the malformed row and carry on. Features
            # appended for this row before the failure are kept.
            print(f"Error processing row {index}: {e}")

    # Passing the column list keeps the schema stable for empty results, so
    # downstream column lookups do not fail on a column-less frame.
    return pd.DataFrame(feature_data, columns=columns)

###############################################################################
# CONFIDENCE SCORE ANALYSIS
###############################################################################
def create_confidence_pivot(features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create pivot table for confidence scores analysis.

    Args:
        features_df: Long-format DataFrame with the columns 'author_id',
            'chunk_number', 'feature' and 'confidence_score'.

    Returns:
        Wide-format DataFrame with one row per (author_id, chunk_number) pair,
        those two keys as regular columns, and one confidence-score column per
        distinct value of 'feature'.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when the same
            (author_id, chunk_number, feature) combination occurs more than
            once.
    """
    return features_df.pivot(
        index=['author_id', 'chunk_number'],
        columns='feature',
        values='confidence_score'
    ).reset_index()

def print_confidence_statistics(confidence_df: pd.DataFrame) -> None:
    """
    Print summary statistics for confidence scores.

    First prints the ``describe()`` table for the CONFIDENCE_FEATURES columns
    with an additional 'median' row, then one mean/median/std line per feature.

    Args:
        confidence_df: DataFrame holding one column per entry of
            CONFIDENCE_FEATURES.
    """
    scores = confidence_df[CONFIDENCE_FEATURES]

    overview = scores.describe()
    overview.loc['median'] = scores.median()
    print("\nSummary statistics for confidence scores:")
    print(overview)

    for name in CONFIDENCE_FEATURES:
        column = confidence_df[name]
        # Compute all three values before printing anything for this feature.
        avg, spread, middle = column.mean(), column.std(), column.median()
        print(f"\nFeature: {name}")
        print(f"Mean: {avg:.2f}, Median: {middle:.2f}, Std: {spread:.2f}")

###############################################################################
# VISUALIZATION FUNCTIONS
###############################################################################
def plot_confidence_boxplots(confidence_df: pd.DataFrame) -> None:
    """
    Generate boxplot visualization of confidence scores.

    Draws one box per entry of CONFIDENCE_FEATURES on a single figure,
    annotates the figure with the row count of ``confidence_df`` and shows it.

    Args:
        confidence_df: DataFrame holding one column per entry of
            CONFIDENCE_FEATURES.
    """
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=confidence_df[CONFIDENCE_FEATURES], width=0.6, palette="Set2")
    plt.title("Confidence Scores Distribution by Feature", fontsize=14)
    plt.ylabel("Confidence Score", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Sample-size label in the top-right corner, positioned in axes
    # coordinates so it stays put regardless of the data range.
    n_size = len(confidence_df)
    plt.text(0.95, 0.95, f'n = {n_size}',
             transform=plt.gca().transAxes,
             ha='right', va='top',
             bbox=dict(facecolor='white', edgecolor='none', alpha=0.7))

    plt.tight_layout()
    plt.show()

def plot_confidence_distributions(confidence_df: pd.DataFrame) -> None:
    """
    Generate distribution plots for confidence scores.

    Draws one histogram with a KDE overlay per entry of CONFIDENCE_FEATURES on
    a two-column grid, annotates every panel with the row count of
    ``confidence_df`` and shows the figure.

    Args:
        confidence_df: DataFrame holding one column per entry of
            CONFIDENCE_FEATURES.
    """
    colors = sns.color_palette("Set2")
    n_size = len(confidence_df)

    # Derive the grid from the number of features instead of hard-coding 3x2;
    # for the five features listed in this file this is still three rows.
    n_cols = 2
    n_rows = max(1, (len(CONFIDENCE_FEATURES) + n_cols - 1) // n_cols)

    plt.figure(figsize=(15, 10))

    # NOTE: zip() stops at the shorter sequence, so features beyond the
    # palette length would not be plotted.
    for i, (col, color) in enumerate(zip(CONFIDENCE_FEATURES, colors), 1):
        ax = plt.subplot(n_rows, n_cols, i)
        # Address the Axes explicitly rather than relying on pyplot's
        # "current axes" state.
        sns.histplot(confidence_df[col], kde=True, bins=15, color=color, ax=ax)
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel("Confidence Score")

        ax.text(0.95, 0.95, f'n = {n_size}',
                transform=ax.transAxes,
                ha='right', va='top',
                bbox=dict(facecolor='white', edgecolor='none', alpha=0.7))

    plt.tight_layout()
    plt.show()

###############################################################################
# EXAMPLE FINDING AND ANALYSIS
###############################################################################
def find_mixed_confidence_examples(df: pd.DataFrame,
                                   low_threshold: float = 0.3,
                                   high_threshold: float = 0.7,
                                   specific_high: float = 0.9) -> pd.DataFrame:
    """
    Find samples with mixed confidence scores.

    A row is kept when all three conditions hold across the
    CONFIDENCE_FEATURES columns:
      * its highest score is strictly above ``high_threshold``,
      * its lowest score is strictly below ``low_threshold``,
      * at least one score equals ``specific_high`` exactly.

    Args:
        df: DataFrame holding one column per entry of CONFIDENCE_FEATURES.
        low_threshold: Upper bound (exclusive) for the row minimum.
        high_threshold: Lower bound (exclusive) for the row maximum.
        specific_high: Score value that must appear in the row. The comparison
            is exact float equality.

    Returns:
        The matching rows of ``df``, with all of its columns.
    """
    scores = df[CONFIDENCE_FEATURES]
    has_high = scores.max(axis=1) > high_threshold
    has_low = scores.min(axis=1) < low_threshold
    has_specific = (scores == specific_high).any(axis=1)
    return df[has_high & has_low & has_specific]

def find_specific_example(df: pd.DataFrame, author_id: str, chunk_number: int) -> None:
    """
    Print the row(s) of ``df`` for one author/chunk combination.

    Args:
        df: DataFrame with 'author_id' and 'chunk_number' columns.
        author_id: Value to match against the 'author_id' column.
        chunk_number: Value to match against the 'chunk_number' column.

    Matching rows are printed transposed (one field per line); if nothing
    matches, a "No data found" message is printed instead.
    """
    same_author = df['author_id'] == author_id
    same_chunk = df['chunk_number'] == chunk_number
    hits = df.loc[same_author & same_chunk]

    if hits.empty:
        print(f"\nNo data found for author '{author_id}' in chunk {chunk_number}")
        return

    print(f"\nDetails for author '{author_id}' in chunk {chunk_number}:")
    print(hits.T.to_string(header=False))

###############################################################################
# MAIN EXECUTION
###############################################################################
def main(df: pd.DataFrame) -> None:
    """
    Main execution function for confidence score analysis.

    Runs the full pipeline on the loaded extraction results: flattens the
    model outputs, pivots them to one column per feature, prints statistics,
    shows the plots, prints per-author means and prints example rows.

    Args:
        df: Raw feature-extraction results as passed to
            extract_feature_data().
    """
    # Extract and process feature data
    features_df = extract_feature_data(df)
    confidence_df = create_confidence_pivot(features_df)

    # Generate statistics and visualizations
    print_confidence_statistics(confidence_df)
    plot_confidence_boxplots(confidence_df)
    plot_confidence_distributions(confidence_df)

    # Author-level analysis
    author_means = confidence_df.groupby('author_id')[CONFIDENCE_FEATURES].mean()
    print("\nAverage confidence scores per author:")
    print(author_means.head())

    # Find example cases; only the first match is printed
    mixed_samples = find_mixed_confidence_examples(confidence_df)
    if not mixed_samples.empty:
        print("\nExample of mixed confidence scores:")
        print(mixed_samples.iloc[0])

    # Find specific example (hard-coded author/chunk pair)
    find_specific_example(confidence_df, '1997_891831', 7)

def load_feature_data(filepath: str) -> pd.DataFrame:
    """
    Load feature extraction data from a JSON Lines file.

    Args:
        filepath: Path to the JSON lines file containing feature extraction results

    Returns:
        DataFrame containing the loaded data

    Raises:
        RuntimeError: If the file cannot be read or parsed. The message names
            the path, and the underlying error is attached as ``__cause__``.
    """
    try:
        return pd.read_json(filepath, orient='records', lines=True)
    except Exception as e:
        # Chain explicitly so the original traceback (missing file, malformed
        # JSON, ...) stays visible to whoever handles the error.
        raise RuntimeError(f"Error loading data from {filepath}: {e}") from e

if __name__ == "__main__":
    # Script entry point: load the feature extraction results from DATA_PATH
    # and run the full confidence analysis on them.

    df = load_feature_data(DATA_PATH)
    main(df)