In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

In [None]:
# Config
# Path of the CSV file passed to run_comprehensive_eda when this script is run directly.
INPUT_FILE = "dataset_final.csv"
# Chart files are written below this directory (joined with each chart's file name).
OUTPUT_DIR = "eda_charts" # Directory to save the charts

def create_bar_chart(data, column_name, title, filename, palette='viridis'):
    """Create a horizontal bar chart of value counts for one column and save it.

    The chart is written as a PNG into ``OUTPUT_DIR`` (created on demand).
    If the column contains no countable values, no file is written.

    Args:
        data: DataFrame that holds ``column_name``.
        column_name: Column whose distinct values are counted and plotted.
        title: Title drawn above the chart.
        filename: File name of the chart inside ``OUTPUT_DIR``.
        palette: Seaborn palette name used to colour the bars.

    Raises:
        KeyError: If ``column_name`` is not a column of ``data``.
    """
    print(f"Generating chart for '{column_name}'...")

    counts = data[column_name].value_counts().reset_index()
    counts.columns = [column_name, 'Count']

    # value_counts() ignores missing values, so an all-null column yields
    # nothing to draw; skip instead of saving an empty chart.
    if counts.empty:
        print(f"  - No values found in '{column_name}'. Skipping chart.")
        return

    # Make the helper usable on its own, not only after the caller has
    # created the output directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, filename)

    plt.style.use('seaborn-v0_8-whitegrid')
    fig = plt.figure(figsize=(12, 8))
    try:
        # Passing `palette` without `hue` is deprecated in seaborn 0.13 and
        # removed in 0.14; mapping the category to `hue` with the legend
        # turned off gives the same per-bar colouring.
        sns.barplot(
            x='Count',
            y=column_name,
            data=counts,
            hue=column_name,
            palette=palette,
            legend=False,
        )

        plt.title(title, fontsize=16, weight='bold')
        plt.xlabel("Number of Prompts", fontsize=12)
        plt.ylabel(column_name, fontsize=12)
        plt.tight_layout()

        plt.savefig(output_path, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"  - Chart saved to '{output_path}'")

def create_histogram(data, column_name, title, filename, color='skyblue'):
    """Create a histogram with a KDE overlay for one column and save it.

    The chart is written as a PNG into ``OUTPUT_DIR`` (created on demand).
    If the column has no non-null values, no file is written.

    Args:
        data: DataFrame that holds ``column_name``.
        column_name: Column whose values are binned (20 bins).
        title: Title drawn above the chart.
        filename: File name of the chart inside ``OUTPUT_DIR``.
        color: Matplotlib colour used for the bars and the KDE line.

    Raises:
        KeyError: If ``column_name`` is not a column of ``data``.
    """
    print(f"Generating histogram for '{column_name}'...")

    # Missing values carry no information for a histogram; an all-null
    # column would otherwise be handed to the KDE estimator with no data.
    values = data[column_name].dropna()
    if values.empty:
        print(f"  - No values found in '{column_name}'. Skipping chart.")
        return

    # Make the helper usable on its own, not only after the caller has
    # created the output directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, filename)

    plt.style.use('seaborn-v0_8-whitegrid')
    fig = plt.figure(figsize=(10, 6))
    try:
        sns.histplot(values, kde=True, bins=20, color=color)

        plt.title(title, fontsize=16, weight='bold')
        plt.xlabel("Score", fontsize=12)
        plt.ylabel("Frequency", fontsize=12)
        plt.tight_layout()

        plt.savefig(output_path, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"  - Chart saved to '{output_path}'")

def run_comprehensive_eda(filepath):
    """Load the CSV at ``filepath`` and save one distribution chart per metadata column.

    Four bar charts (service domain, intent type, reasoning complexity,
    target age group) and one histogram (confidence score) are written into
    ``OUTPUT_DIR``. A column that is absent from the CSV is reported with a
    warning and its chart is skipped.

    Errors are reported on stdout rather than raised, so the function always
    returns ``None``.

    Args:
        filepath: Path of the CSV file to analyse.
    """
    print("--- Starting Comprehensive EDA Script ---")

    try:
        # 1. Load and Prepare Data
        print(f"Loading data from '{filepath}'...")
        df = pd.read_csv(filepath)
        print(f"Successfully loaded {len(df)} records.")

        # Create output directory if it doesn't exist (no exists()/makedirs race).
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # One entry per chart: (helper, column, title, file name, palette/colour).
        chart_specs = [
            (create_bar_chart, 'serviceDomain',
             'Distribution of Prompts by Service Domain',
             '1_service_domain_distribution.png', 'viridis'),
            (create_bar_chart, 'promptIntentType',
             'Distribution of Prompts by Intent Type',
             '2_intent_type_distribution.png', 'plasma'),
            (create_bar_chart, 'reasoningComplexity',
             'Distribution of Prompts by Reasoning Complexity',
             '3_reasoning_complexity_distribution.png', 'magma'),
            (create_bar_chart, 'targetAgeGroup',
             'Distribution of Prompts by Target Age Group',
             '4_age_group_distribution.png', 'cividis'),
            (create_histogram, 'confidenceScore',
             'Distribution of Model Confidence Scores',
             '5_confidence_score_histogram.png', 'steelblue'),
        ]

        # 2. Check for required columns. Charts for missing columns are
        # really skipped, instead of being drawn from a placeholder column.
        charts_to_draw = []
        for spec in chart_specs:
            column = spec[1]
            if column in df.columns:
                charts_to_draw.append(spec)
            else:
                print(f"Warning: Metadata column '{column}' not found. Skipping its chart.")

        # 3. Generate and Save Charts
        print("\n--- Generating Visualizations ---")
        for make_chart, column, title, filename, colouring in charts_to_draw:
            make_chart(df, column, title, filename, colouring)

        print("\n--- EDA Script Finished ---")

    except FileNotFoundError:
        print(f"FATAL ERROR: Input file not found at '{filepath}'")
    except Exception as e:
        # Top-level boundary of the script: report the problem instead of
        # surfacing a traceback in the notebook.
        print(f"An error occurred during analysis: {e}")

# Run the analysis on the configured input file only when executed directly, not when imported.
if __name__ == "__main__":
    run_comprehensive_eda(INPUT_FILE)


--- Starting Comprehensive EDA Script ---
Loading data from 'dataset_final.csv'...
Successfully loaded 9959 records.

--- Generating Visualizations ---
Generating chart for 'serviceDomain'...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


  - Chart saved to 'eda_charts/1_service_domain_distribution.png'
Generating chart for 'promptIntentType'...
  - Chart saved to 'eda_charts/2_intent_type_distribution.png'
Generating chart for 'reasoningComplexity'...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


  - Chart saved to 'eda_charts/3_reasoning_complexity_distribution.png'
Generating chart for 'targetAgeGroup'...
  - Chart saved to 'eda_charts/4_age_group_distribution.png'
Generating histogram for 'confidenceScore'...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


  - Chart saved to 'eda_charts/5_confidence_score_histogram.png'

--- EDA Script Finished ---
