# Data Shape Analysis and Visualization

In [23]:
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from bundle.DataCraft import *



def _find_eeg_sample_item(data):
    """Return the first item of *data* with a non-empty 'eeg_chunk', or None."""
    return next(
        (item for item in data if 'eeg_chunk' in item and item['eeg_chunk']),
        None,
    )


def _plot_eeg_sample(sample, character, output_dir):
    """Save a heatmap and a line plot of one EEG sample into *output_dir*.

    *sample* is transposed for the heatmap and indexed as ``sample[:, i]``
    for the line plot, i.e. it is treated as a 2-D array whose first axis is
    plotted as time steps and whose second axis is plotted as channels.
    """
    # Heatmap: transpose so channels run along the y axis.
    plt.figure(figsize=(14, 8))
    sns.heatmap(sample.T, cmap='viridis', cbar_kws={'label': 'Amplitude'})
    plt.title(f"EEG Sample Heatmap for Character '{character}'", fontsize=16)
    plt.xlabel('Time Steps', fontsize=14)
    plt.ylabel('Channels', fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "eeg_sample_heatmap.png"))
    plt.close()
    print("EEG sample heatmap saved")

    # Line plot: at most the first five channels, to keep the figure legible.
    plt.figure(figsize=(14, 8))
    num_channels_to_plot = min(5, sample.shape[1])
    for i in range(num_channels_to_plot):
        plt.plot(sample[:, i], label=f'Channel {i+1}')

    plt.title(f"EEG Signal for First {num_channels_to_plot} Channels", fontsize=16)
    plt.xlabel('Time Steps', fontsize=14)
    plt.ylabel('Amplitude', fontsize=14)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "eeg_sample_lineplot.png"))
    plt.close()
    print("EEG sample line plot saved")


def _plot_prob_chunk(prob_chunk, character, output_dir):
    """Save a line plot of the first two columns of *prob_chunk*."""
    num_column_to_plot = 2  # Plot both columns

    plt.figure(figsize=(14, 8))
    for i in range(num_column_to_plot):
        plt.plot(prob_chunk[:, i], label=f'Column {i+1}')

    plt.title(f"Probability Values for Character '{character}'", fontsize=16)
    plt.xlabel('Row Index (0-77)', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "prob_sample_lineplot.png"))
    plt.close()
    print("Probability matrix line plot saved")


def create_visualizations(data, output_dir="../../visualizations"):
    """Create and save visualizations of the data structure, then print a summary.

    The first item with a non-empty 'eeg_chunk' is used as the sample: the
    first array of its chunk is drawn as a heatmap and a line plot, and its
    'prob_chunk' is drawn as a line plot. When no item has EEG data the plots
    are skipped and only the summary is printed.

    Args:
        data: Sequence of dict-like items. Every item must have a 'sentence'
            key; the sampled item is also read for 'eeg_chunk', 'character'
            and 'prob_chunk'.
        output_dir: Directory the PNG files are written to. It is created if
            it does not exist.

    Returns:
        ``output_dir``, or ``None`` when *data* is empty.
    """
    if not data:
        print("No data to visualize.")
        return

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    print(f"Generating visualizations in {output_dir}...")

    # Use the matching item itself: selecting its successor could pick an item
    # without EEG data, or nothing at all when the match is the last item.
    sample_item = _find_eeg_sample_item(data)

    if sample_item is None:
        print("No item with EEG data found; skipping sample plots.")
    else:
        # Get the first sample from the chunk
        sample = sample_item['eeg_chunk'][0]
        character = sample_item['character']
        _plot_eeg_sample(sample, character, output_dir)

        # The probability plot belongs to the same sampled item, so it is only
        # attempted when a sample exists and carries a 'prob_chunk'.
        prob_chunk = sample_item.get('prob_chunk')
        if prob_chunk is None:
            print("Sample item has no 'prob_chunk'; skipping probability plot.")
        else:
            _plot_prob_chunk(prob_chunk, character, output_dir)

    # Collect key statistics
    total_items = len(data)
    unique_sentences = {item['sentence'] for item in data}
    unique_sentences_count = len(unique_sentences)
    avg_sentence_length = np.mean([len(s) for s in unique_sentences])

    if sample_item is not None:
        sample = sample_item['eeg_chunk'][0]
        eeg_shape = f"{sample.shape[0]} × {sample.shape[1]}"
        avg_chunk_size = np.mean([
            len(item['eeg_chunk'])
            for item in data
            if 'eeg_chunk' in item and item['eeg_chunk']
        ])
    else:
        eeg_shape = "N/A"
        avg_chunk_size = 0

    # Create text for the summary
    summary_text = (
        "DATA SHAPE SUMMARY\n"
        "==================\n\n"
        f"Total Items: {total_items}\n"
        f"Unique Sentences: {unique_sentences_count}\n"
        f"Avg. Sentence Length: {avg_sentence_length:.2f} chars\n"
        f"EEG Sample Shape: {eeg_shape}\n"
        f"Avg. Chunk Size: {avg_chunk_size:.2f} samples\n"
    )

    print(f"Data summary:{summary_text}")

    print(f"\nAll visualizations saved to {output_dir}/")
    return output_dir


# Load and Analyze Data

In [24]:
# Load the dataset by calling the loader with no arguments (its default path)
data = load_sentence_eeg_prob_data()

# Bail out with a hint when nothing was loaded; otherwise render the figures
# and tell the user where they were written.
if not data:
    print("Failed to load data. Please check the file path.")
else:
    output_dir = create_visualizations(data)
    print(f"\nTo view the visualizations, check the files in the {output_dir} directory.")

Attempting to load processed data from: ../../data/sentences_eeg.pkl
Successfully loaded processed data.
Generating visualizations in ../../visualizations...
EEG sample heatmap saved
EEG sample line plot saved
Probability matrix line plot saved
Data summary:DATA SHAPE SUMMARY

Total Items: 16270
Unique Sentences: 552
Avg. Sentence Length: 24.32 chars
EEG Sample Shape: 78 × 64
Avg. Chunk Size: 30.00 samples


All visualizations saved to ../../visualizations/

To view the visualizations, check the files in the ../../visualizations directory.
