In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
import math
from tqdm import tqdm
import seaborn as sns
import os

# Function to calculate permutation pattern
def get_permutation_pattern(time_series, start_idx, embedding_dim, delay):
    """
    Extract the permutation pattern from a time series segment.
    
    Args:
        time_series: Input time series data
        start_idx: Starting index
        embedding_dim: Embedding dimension
        delay: Time delay
        
    Returns:
        Pattern index representing the ordinal pattern
    """
    # Extract values for the pattern
    pattern_values = [time_series[start_idx + i * delay] for i in range(embedding_dim)]
    
    # Get the indices that would sort the pattern
    sorted_indices = np.argsort(pattern_values)
    
    # Convert to a single integer representation (factorial number system)
    pattern_index = 0
    base = 1
    for i in range(embedding_dim - 1, -1, -1):
        pattern_index += sorted_indices[i] * base
        base *= (i + 1)
    
    return pattern_index

# Function to calculate permutation entropy
def calculate_permutation_entropy(time_series, embedding_dim, delay, normalize=True):
    """
    Calculate permutation entropy for a time series.
    
    Args:
        time_series: Input time series data
        embedding_dim: Embedding dimension
        delay: Time delay
        normalize: Whether to normalize the entropy value
        
    Returns:
        Permutation entropy value
    """
    # Ensure time series is NumPy array
    time_series = np.array(time_series)
    
    # Calculate the number of patterns possible
    n_patterns = math.factorial(embedding_dim)
    
    # Array to store pattern counts
    pattern_counts = np.zeros(n_patterns)
    
    # Calculate valid length for patterns
    valid_length = len(time_series) - (embedding_dim - 1) * delay
    
    # Count patterns
    for i in range(valid_length):
        pattern = get_permutation_pattern(time_series, i, embedding_dim, delay)
        pattern_counts[pattern] += 1
    
    # Calculate probabilities
    pattern_probs = pattern_counts / valid_length
    
    # Remove zero probabilities for entropy calculation
    pattern_probs = pattern_probs[pattern_probs > 0]
    
    # Calculate entropy
    pe = entropy(pattern_probs, base=math.e)
    
    # Normalize if requested
    if normalize and pe != 0:
        pe = pe / np.log(n_patterns)
    
    return pe

# Function to calculate complexity based on permutation entropy
def calculate_complexity(time_series, embedding_dim, delay):
    """
    Calculate statistical complexity based on permutation entropy.
    
    Args:
        time_series: Input time series data
        embedding_dim: Embedding dimension
        delay: Time delay
        
    Returns:
        Complexity value
    """
    # Calculate permutation entropy (normalized)
    pe = calculate_permutation_entropy(time_series, embedding_dim, delay, normalize=True)
    
    # Calculate disequilibrium (using Jensen-Shannon divergence approximation)
    n_patterns = math.factorial(embedding_dim)
    
    # Uniform distribution (maximum entropy)
    uniform_prob = 1.0 / n_patterns
    
    # Calculate pattern probabilities
    pattern_counts = np.zeros(n_patterns)
    valid_length = len(time_series) - (embedding_dim - 1) * delay
    
    for i in range(valid_length):
        pattern = get_permutation_pattern(time_series, i, embedding_dim, delay)
        pattern_counts[pattern] += 1
    
    pattern_probs = pattern_counts / valid_length
    
    # Calculate disequilibrium using Jensen-Shannon divergence
    # Avoid zero probabilities
    pattern_probs = np.clip(pattern_probs, 1e-10, 1.0)
    disequilibrium = 0
    
    for prob in pattern_probs:
        if prob > 0:
            mean_prob = (prob + uniform_prob) / 2
            disequilibrium += (prob * np.log(prob / mean_prob) + 
                             uniform_prob * np.log(uniform_prob / mean_prob)) / 2
    
    # Normalize disequilibrium
    max_disequilibrium = -0.5 * np.log(1.0 / n_patterns)
    disequilibrium = disequilibrium / max_disequilibrium
    
    # Calculate complexity as product of normalized entropy and disequilibrium
    complexity = pe * disequilibrium
    
    return complexity

# Function to process data for a single subject, activity, and axis
def process_signal(signal, subject, activity, axis, embedded_dims, embedded_delays, signal_lengths):
    """
    Process a signal and calculate permutation entropy and complexity.
    
    Args:
        signal: Input signal data
        subject: Subject identifier
        activity: Activity label
        axis: Axis label
        embedded_dims: List of embedding dimensions
        embedded_delays: List of embedding delays
        signal_lengths: List of signal lengths
        
    Returns:
        DataFrame with results
    """
    results = []
    
    for embedded_dim in embedded_dims:
        for embedded_delay in embedded_delays:
            for signal_length in signal_lengths:
                # Skip if signal is too short
                if len(signal) < signal_length:
                    print(f"Warning: Signal for {subject}, {activity}, {axis} has length {len(signal)} < {signal_length}")
                    continue
                
                # Use first signal_length points
                signal_segment = signal[:signal_length]
                
                # Calculate metrics
                try:
                    pe = calculate_permutation_entropy(signal_segment, embedded_dim, embedded_delay)
                    complexity = calculate_complexity(signal_segment, embedded_dim, embedded_delay)
                    
                    # Append results
                    results.append({
                        'subject': subject,
                        'activity': activity,
                        'axis': axis,
                        'embedded_dim': embedded_dim,
                        'embedded_delay': embedded_delay,
                        'signal_length': signal_length,
                        'permutation_entropy': pe,
                        'complexity': complexity
                    })
                except Exception as e:
                    print(f"Error calculating metrics for {subject}, {activity}, {axis}, " +
                          f"dim={embedded_dim}, delay={embedded_delay}, len={signal_length}: {e}")
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    return results_df

# Function to visualize results
def visualize_results(results_df, output_dir='permutation_entropy_results'):
    """
    Generate visualizations for permutation entropy results.
    
    Args:
        results_df: DataFrame with permutation entropy results
        output_dir: Directory to save visualizations
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # ------ Visualization 1: Permutation entropy by activity and embedding dimension ------
    plt.figure(figsize=(12, 8))
    
    for activity in results_df['activity'].unique():
        activity_data = results_df[results_df['activity'] == activity]
        
        # Average over axes, delays, signal lengths and subjects
        avg_data = activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
        std_data = activity_data.groupby('embedded_dim')['permutation_entropy'].std()
        
        plt.errorbar(
            avg_data.index, 
            avg_data.values, 
            yerr=std_data.values,
            marker='o',
            label=activity
        )
    
    plt.xlabel('Embedding Dimension')
    plt.ylabel('Average Permutation Entropy')
    plt.title('Permutation Entropy by Activity and Embedding Dimension')
    plt.legend()
    plt.grid(True)
    
    plt.savefig(os.path.join(output_dir, 'permutation_entropy_by_activity.png'))
    plt.close()
    
    # ------ Visualization 2: Complexity vs permutation entropy by activity ------
    plt.figure(figsize=(12, 8))
    
    for activity in results_df['activity'].unique():
        activity_data = results_df[results_df['activity'] == activity]
        
        plt.scatter(
            activity_data['permutation_entropy'], 
            activity_data['complexity'], 
            alpha=0.5,
            label=activity
        )
    
    plt.xlabel('Permutation Entropy')
    plt.ylabel('Complexity')
    plt.title('Complexity vs Permutation Entropy by Activity')
    plt.legend()
    plt.grid(True)
    
    plt.savefig(os.path.join(output_dir, 'complexity_vs_entropy.png'))
    plt.close()
    
    # ------ Visualization 3: Permutation entropy by activity, axis, and embedding dimension ------
    for axis in results_df['axis'].unique():
        plt.figure(figsize=(12, 8))
        
        for activity in results_df['activity'].unique():
            axis_activity_data = results_df[(results_df['activity'] == activity) & (results_df['axis'] == axis)]
            
            # Average over delays, signal lengths and subjects
            avg_data = axis_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            std_data = axis_activity_data.groupby('embedded_dim')['permutation_entropy'].std()
            
            plt.errorbar(
                avg_data.index, 
                avg_data.values, 
                yerr=std_data.values,
                marker='o',
                label=activity
            )
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Permutation Entropy by Activity (Axis: {axis})')
        plt.legend()
        plt.grid(True)
        
        plt.savefig(os.path.join(output_dir, f'permutation_entropy_axis_{axis}.png'))
        plt.close()
    
    # ------ Visualization 4: Effect of embedding delay ------
    plt.figure(figsize=(15, 5))
    
    for i, delay in enumerate(sorted(results_df['embedded_delay'].unique())):
        plt.subplot(1, len(results_df['embedded_delay'].unique()), i+1)
        
        for activity in results_df['activity'].unique():
            delay_activity_data = results_df[(results_df['activity'] == activity) & 
                                            (results_df['embedded_delay'] == delay)]
            
            # Average over axes, signal lengths and subjects
            avg_data = delay_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            
            plt.plot(avg_data.index, avg_data.values, 'o-', label=activity)
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Delay = {delay}')
        if i == len(results_df['embedded_delay'].unique()) - 1:
            plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'embedding_delay_effect.png'))
    plt.close()
    
    # ------ Visualization 5: Effect of signal length ------
    plt.figure(figsize=(15, 5))
    
    for i, length in enumerate(sorted(results_df['signal_length'].unique())):
        plt.subplot(1, len(results_df['signal_length'].unique()), i+1)
        
        for activity in results_df['activity'].unique():
            length_activity_data = results_df[(results_df['activity'] == activity) & 
                                             (results_df['signal_length'] == length)]
            
            # Average over axes, delays and subjects
            avg_data = length_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            
            plt.plot(avg_data.index, avg_data.values, 'o-', label=activity)
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Signal Length = {length}')
        if i == len(results_df['signal_length'].unique()) - 1:
            plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'signal_length_effect.png'))
    plt.close()
    
    # ------ Visualization 6: Box plots of PE by activity ------
    plt.figure(figsize=(12, 8))
    
    sns.boxplot(x='activity', y='permutation_entropy', data=results_df)
    plt.xlabel('Activity')
    plt.ylabel('Permutation Entropy')
    plt.title('Distribution of Permutation Entropy by Activity')
    plt.grid(True, axis='y')
    
    plt.savefig(os.path.join(output_dir, 'permutation_entropy_boxplot.png'))
    plt.close()
    
    # ------ Visualization 7: Complexity distribution by activity ------
    plt.figure(figsize=(12, 8))
    
    sns.boxplot(x='activity', y='complexity', data=results_df)
    plt.xlabel('Activity')
    plt.ylabel('Complexity')
    plt.title('Distribution of Complexity by Activity')
    plt.grid(True, axis='y')
    
    plt.savefig(os.path.join(output_dir, 'complexity_boxplot.png'))
    plt.close()
    
    # ------ Visualization 8: Heatmap of average PE for different parameter combinations ------
    dim_delay_data = results_df.groupby(['embedded_dim', 'embedded_delay'])['permutation_entropy'].mean().reset_index()
    pivot_data = dim_delay_data.pivot('embedded_dim', 'embedded_delay', 'permutation_entropy')
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(pivot_data, annot=True, cmap='viridis', fmt='.3f')
    plt.title('Average Permutation Entropy for Different Parameter Combinations')
    plt.ylabel('Embedded Dimension')
    plt.xlabel('Embedded Delay')
    
    plt.savefig(os.path.join(output_dir, 'parameter_heatmap.png'))
    plt.close()

# Function to generate sample data (for testing)
def generate_sample_data():
    """
    Generate sample accelerometer data for testing.
    
    Returns:
        DataFrame with sample data
    """
    np.random.seed(42)
    
    # Define parameters
    subjects = range(1, 16)  # 15 subjects
    activities = ['walking', 'running', 'climbing_up', 'climbing_down']
    axes = ['x', 'y', 'z']
    sample_length = 5000
    
    # Generate sample data
    data = []
    
    for subject in subjects:
        for activity in activities:
            for axis in axes:
                # Generate different patterns based on activity
                if activity == 'walking':
                    # Walking: periodic with moderate noise
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) + 0.2 * np.random.normal(size=sample_length)
                elif activity == 'running':
                    # Running: faster periodic with higher amplitude and noise
                    t = np.linspace(0, 10, sample_length)
                    signal = 1.5 * np.sin(4 * np.pi * t) + 0.4 * np.random.normal(size=sample_length)
                elif activity == 'climbing_up':
                    # Climbing up: asymmetric pattern
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) + 0.5 * np.sin(4 * np.pi * t) + 0.3 * np.random.normal(size=sample_length)
                else:  # climbing_down
                    # Climbing down: different asymmetric pattern
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) - 0.5 * np.sin(4 * np.pi * t) + 0.3 * np.random.normal(size=sample_length)
                
                # Add axis-specific modifications
                if axis == 'x':
                    signal = signal * 1.2
                elif axis == 'y':
                    signal = signal * 0.8 + 0.5
                else:  # z axis
                    signal = signal * 1.0 - 0.2
                
                # Add subject-specific noise
                subject_noise = 0.1 * (subject / 15) * np.random.normal(size=sample_length)
                signal = signal + subject_noise
                
                # Process signal and calculate PE and complexity
                subject_id = f'subject_{subject}'
                embedded_dims = [3, 4, 5, 6]
                embedded_delays = [1, 2, 3]
                signal_lengths = [1024, 2048, 4096]
                
                results = process_signal(
                    signal, subject_id, activity, axis, 
                    embedded_dims, embedded_delays, signal_lengths
                )
                
                # Add to data
                data.append(results)
    
    # Combine all results
    combined_data = pd.concat(data, ignore_index=True)
    
    return combined_data

# Main function
def main():
    """
    Main function to execute the permutation entropy analysis.
    """
    try:
        # Ask the user whether to use sample data or load from disk
        print("Permutation Entropy and Complexity Analysis")
        print("------------------------------------------")
        print("1. Use sample generated data (for testing)")
        print("2. Load processed data from CSV file")
        choice = input("Enter your choice (1/2): ")
        
        results_df = None
        
        if choice == '1':
            print("Generating sample data...")
            results_df = generate_sample_data()
        elif choice == '2':
            file_path = input("Enter the path to the CSV file: ")
            if os.path.exists(file_path):
                print(f"Loading data from {file_path}...")
                results_df = pd.read_csv(file_path)
                
                # Check if the loaded data has the required columns
                required_columns = ['subject', 'activity', 'axis', 'embedded_dim', 
                                    'embedded_delay', 'signal_length', 
                                    'permutation_entropy', 'complexity']
                
                missing_columns = [col for col in required_columns if col not in results_df.columns]
                
                if missing_columns:
                    print(f"Warning: Loaded data is missing columns: {missing_columns}")
                    print("Cannot proceed with visualization.")
                    return
            else:
                print(f"Error: File not found at {file_path}")
                return
        else:
            print("Invalid choice. Exiting.")
            return
        
        if results_df is not None and not results_df.empty:
            # Get summary statistics
            print("\nSummary Statistics:")
            print(f"Number of samples: {len(results_df)}")
            print(f"Number of subjects: {len(results_df['subject'].unique())}")
            print(f"Activities: {results_df['activity'].unique()}")
            print(f"Axes: {results_df['axis'].unique()}")
            print(f"Embedding dimensions: {sorted(results_df['embedded_dim'].unique())}")
            print(f"Embedding delays: {sorted(results_df['embedded_delay'].unique())}")
            print(f"Signal lengths: {sorted(results_df['signal_length'].unique())}")
            
            # Average PE and complexity by activity
            activity_stats = results_df.groupby('activity')[['permutation_entropy', 'complexity']].agg(['mean', 'std'])
            print("\nPermutation Entropy and Complexity by Activity:")
            print(activity_stats)
            
            # Generate visualizations
            print("\nGenerating visualizations...")
            visualize_results(results_df)
            
            print("\nAnalysis complete. Results saved to 'permutation_entropy_results' directory.")
        else:
            print("No data to analyze.")
    
    except Exception as e:
        print(f"Error in main function: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Permutation Entropy and Complexity Analysis
------------------------------------------
1. Use sample generated data (for testing)
2. Load processed data from CSV file
Enter your choice (1/2): 2
Enter the path to the CSV file: /Users/rosalinatorres/Downloads/processed_permutation_entropy_complexity.csv
Loading data from /Users/rosalinatorres/Downloads/processed_permutation_entropy_complexity.csv...
Cannot proceed with visualization.


In [None]:
https://drive.google.com/drive/folders/1JneWV9NL1v2R8LpiTNKw2oqTgd3o2Sgx?usp=drive_link

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
import math
from tqdm import tqdm
import seaborn as sns
import os

# Function to calculate permutation pattern
def get_permutation_pattern(time_series, start_idx, embedding_dim, delay):
    """
    Extract the permutation pattern from a time series segment.
    
    Args:
        time_series: Input time series data
        start_idx: Starting index
        embedding_dim: Embedding dimension
        delay: Time delay
        
    Returns:
        Pattern index representing the ordinal pattern
    """
    # Extract values for the pattern
    pattern_values = [time_series[start_idx + i * delay] for i in range(embedding_dim)]
    
    # Get the indices that would sort the pattern
    sorted_indices = np.argsort(pattern_values)
    
    # Convert to a single integer representation (factorial number system)
    pattern_index = 0
    base = 1
    for i in range(embedding_dim - 1, -1, -1):
        pattern_index += sorted_indices[i] * base
        base *= (i + 1)
    
    return pattern_index

# Function to calculate permutation entropy
def calculate_permutation_entropy(time_series, embedding_dim, delay, normalize=True):
    """
    Calculate permutation entropy for a time series.
    
    Args:
        time_series: Input time series data
        embedding_dim: Embedding dimension
        delay: Time delay
        normalize: Whether to normalize the entropy value
        
    Returns:
        Permutation entropy value
    """
    # Ensure time series is NumPy array
    time_series = np.array(time_series)
    
    # Calculate the number of patterns possible
    n_patterns = math.factorial(embedding_dim)
    
    # Array to store pattern counts
    pattern_counts = np.zeros(n_patterns)
    
    # Calculate valid length for patterns
    valid_length = len(time_series) - (embedding_dim - 1) * delay
    
    # Count patterns
    for i in range(valid_length):
        pattern = get_permutation_pattern(time_series, i, embedding_dim, delay)
        pattern_counts[pattern] += 1
    
    # Calculate probabilities
    pattern_probs = pattern_counts / valid_length
    
    # Remove zero probabilities for entropy calculation
    pattern_probs = pattern_probs[pattern_probs > 0]
    
    # Calculate entropy
    pe = entropy(pattern_probs, base=math.e)
    
    # Normalize if requested
    if normalize and pe != 0:
        pe = pe / np.log(n_patterns)
    
    return pe

# Function to calculate complexity based on permutation entropy
def calculate_complexity(time_series, embedding_dim, delay):
    """
    Calculate statistical complexity based on permutation entropy.
    
    Args:
        time_series: Input time series data
        embedding_dim: Embedding dimension
        delay: Time delay
        
    Returns:
        Complexity value
    """
    # Calculate permutation entropy (normalized)
    pe = calculate_permutation_entropy(time_series, embedding_dim, delay, normalize=True)
    
    # Calculate disequilibrium (using Jensen-Shannon divergence approximation)
    n_patterns = math.factorial(embedding_dim)
    
    # Uniform distribution (maximum entropy)
    uniform_prob = 1.0 / n_patterns
    
    # Calculate pattern probabilities
    pattern_counts = np.zeros(n_patterns)
    valid_length = len(time_series) - (embedding_dim - 1) * delay
    
    for i in range(valid_length):
        pattern = get_permutation_pattern(time_series, i, embedding_dim, delay)
        pattern_counts[pattern] += 1
    
    pattern_probs = pattern_counts / valid_length
    
    # Calculate disequilibrium using Jensen-Shannon divergence
    # Avoid zero probabilities
    pattern_probs = np.clip(pattern_probs, 1e-10, 1.0)
    disequilibrium = 0
    
    for prob in pattern_probs:
        if prob > 0:
            mean_prob = (prob + uniform_prob) / 2
            disequilibrium += (prob * np.log(prob / mean_prob) + 
                             uniform_prob * np.log(uniform_prob / mean_prob)) / 2
    
    # Normalize disequilibrium
    max_disequilibrium = -0.5 * np.log(1.0 / n_patterns)
    disequilibrium = disequilibrium / max_disequilibrium
    
    # Calculate complexity as product of normalized entropy and disequilibrium
    complexity = pe * disequilibrium
    
    return complexity

# Function to process data for a single subject, activity, and axis
def process_signal(signal, subject, activity, axis, embedded_dims, embedded_delays, signal_lengths):
    """
    Process a signal and calculate permutation entropy and complexity.
    
    Args:
        signal: Input signal data
        subject: Subject identifier
        activity: Activity label
        axis: Axis label
        embedded_dims: List of embedding dimensions
        embedded_delays: List of embedding delays
        signal_lengths: List of signal lengths
        
    Returns:
        DataFrame with results
    """
    results = []
    
    for embedded_dim in embedded_dims:
        for embedded_delay in embedded_delays:
            for signal_length in signal_lengths:
                # Skip if signal is too short
                if len(signal) < signal_length:
                    print(f"Warning: Signal for {subject}, {activity}, {axis} has length {len(signal)} < {signal_length}")
                    continue
                
                # Use first signal_length points
                signal_segment = signal[:signal_length]
                
                # Calculate metrics
                try:
                    pe = calculate_permutation_entropy(signal_segment, embedded_dim, embedded_delay)
                    complexity = calculate_complexity(signal_segment, embedded_dim, embedded_delay)
                    
                    # Append results
                    results.append({
                        'subject': subject,
                        'activity': activity,
                        'axis': axis,
                        'embedded_dim': embedded_dim,
                        'embedded_delay': embedded_delay,
                        'signal_length': signal_length,
                        'permutation_entropy': pe,
                        'complexity': complexity
                    })
                except Exception as e:
                    print(f"Error calculating metrics for {subject}, {activity}, {axis}, " +
                          f"dim={embedded_dim}, delay={embedded_delay}, len={signal_length}: {e}")
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    return results_df

# Function to visualize results
def visualize_results(results_df, output_dir='permutation_entropy_results'):
    """
    Generate visualizations for permutation entropy results.
    
    Args:
        results_df: DataFrame with permutation entropy results
        output_dir: Directory to save visualizations
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # ------ Visualization 1: Permutation entropy by activity and embedding dimension ------
    plt.figure(figsize=(12, 8))
    
    for activity in results_df['activity'].unique():
        activity_data = results_df[results_df['activity'] == activity]
        
        # Average over axes, delays, signal lengths and subjects
        avg_data = activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
        std_data = activity_data.groupby('embedded_dim')['permutation_entropy'].std()
        
        plt.errorbar(
            avg_data.index, 
            avg_data.values, 
            yerr=std_data.values,
            marker='o',
            label=activity
        )
    
    plt.xlabel('Embedding Dimension')
    plt.ylabel('Average Permutation Entropy')
    plt.title('Permutation Entropy by Activity and Embedding Dimension')
    plt.legend()
    plt.grid(True)
    
    plt.savefig(os.path.join(output_dir, 'permutation_entropy_by_activity.png'))
    plt.close()
    
    # ------ Visualization 2: Complexity vs permutation entropy by activity ------
    plt.figure(figsize=(12, 8))
    
    for activity in results_df['activity'].unique():
        activity_data = results_df[results_df['activity'] == activity]
        
        plt.scatter(
            activity_data['permutation_entropy'], 
            activity_data['complexity'], 
            alpha=0.5,
            label=activity
        )
    
    plt.xlabel('Permutation Entropy')
    plt.ylabel('Complexity')
    plt.title('Complexity vs Permutation Entropy by Activity')
    plt.legend()
    plt.grid(True)
    
    plt.savefig(os.path.join(output_dir, 'complexity_vs_entropy.png'))
    plt.close()
    
    # ------ Visualization 3: Permutation entropy by activity, axis, and embedding dimension ------
    for axis in results_df['axis'].unique():
        plt.figure(figsize=(12, 8))
        
        for activity in results_df['activity'].unique():
            axis_activity_data = results_df[(results_df['activity'] == activity) & (results_df['axis'] == axis)]
            
            # Average over delays, signal lengths and subjects
            avg_data = axis_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            std_data = axis_activity_data.groupby('embedded_dim')['permutation_entropy'].std()
            
            plt.errorbar(
                avg_data.index, 
                avg_data.values, 
                yerr=std_data.values,
                marker='o',
                label=activity
            )
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Permutation Entropy by Activity (Axis: {axis})')
        plt.legend()
        plt.grid(True)
        
        plt.savefig(os.path.join(output_dir, f'permutation_entropy_axis_{axis}.png'))
        plt.close()
    
    # ------ Visualization 4: Effect of embedding delay ------
    plt.figure(figsize=(15, 5))
    
    for i, delay in enumerate(sorted(results_df['embedded_delay'].unique())):
        plt.subplot(1, len(results_df['embedded_delay'].unique()), i+1)
        
        for activity in results_df['activity'].unique():
            delay_activity_data = results_df[(results_df['activity'] == activity) & 
                                            (results_df['embedded_delay'] == delay)]
            
            # Average over axes, signal lengths and subjects
            avg_data = delay_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            
            plt.plot(avg_data.index, avg_data.values, 'o-', label=activity)
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Delay = {delay}')
        if i == len(results_df['embedded_delay'].unique()) - 1:
            plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'embedding_delay_effect.png'))
    plt.close()
    
    # ------ Visualization 5: Effect of signal length ------
    plt.figure(figsize=(15, 5))
    
    for i, length in enumerate(sorted(results_df['signal_length'].unique())):
        plt.subplot(1, len(results_df['signal_length'].unique()), i+1)
        
        for activity in results_df['activity'].unique():
            length_activity_data = results_df[(results_df['activity'] == activity) & 
                                             (results_df['signal_length'] == length)]
            
            # Average over axes, delays and subjects
            avg_data = length_activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
            
            plt.plot(avg_data.index, avg_data.values, 'o-', label=activity)
        
        plt.xlabel('Embedding Dimension')
        plt.ylabel('Average Permutation Entropy')
        plt.title(f'Signal Length = {length}')
        if i == len(results_df['signal_length'].unique()) - 1:
            plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'signal_length_effect.png'))
    plt.close()
    
    # ------ Visualization 6: Box plots of PE by activity ------
    plt.figure(figsize=(12, 8))
    
    sns.boxplot(x='activity', y='permutation_entropy', data=results_df)
    plt.xlabel('Activity')
    plt.ylabel('Permutation Entropy')
    plt.title('Distribution of Permutation Entropy by Activity')
    plt.grid(True, axis='y')
    
    plt.savefig(os.path.join(output_dir, 'permutation_entropy_boxplot.png'))
    plt.close()
    
    # ------ Visualization 7: Complexity distribution by activity ------
    plt.figure(figsize=(12, 8))
    
    sns.boxplot(x='activity', y='complexity', data=results_df)
    plt.xlabel('Activity')
    plt.ylabel('Complexity')
    plt.title('Distribution of Complexity by Activity')
    plt.grid(True, axis='y')
    
    plt.savefig(os.path.join(output_dir, 'complexity_boxplot.png'))
    plt.close()
    
    # ------ Visualization 8: Heatmap of average PE for different parameter combinations ------
    dim_delay_data = results_df.groupby(['embedded_dim', 'embedded_delay'])['permutation_entropy'].mean().reset_index()
    pivot_data = dim_delay_data.pivot('embedded_dim', 'embedded_delay', 'permutation_entropy')
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(pivot_data, annot=True, cmap='viridis', fmt='.3f')
    plt.title('Average Permutation Entropy for Different Parameter Combinations')
    plt.ylabel('Embedded Dimension')
    plt.xlabel('Embedded Delay')
    
    plt.savefig(os.path.join(output_dir, 'parameter_heatmap.png'))
    plt.close()

# Function to generate sample data (for testing)
def generate_sample_data():
    """
    Generate sample accelerometer data for testing.
    
    Returns:
        DataFrame with sample data
    """
    np.random.seed(42)
    
    # Define parameters
    subjects = range(1, 16)  # 15 subjects
    activities = ['walking', 'running', 'climbing_up', 'climbing_down']
    axes = ['x', 'y', 'z']
    sample_length = 5000
    
    # Generate sample data
    data = []
    
    for subject in subjects:
        for activity in activities:
            for axis in axes:
                # Generate different patterns based on activity
                if activity == 'walking':
                    # Walking: periodic with moderate noise
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) + 0.2 * np.random.normal(size=sample_length)
                elif activity == 'running':
                    # Running: faster periodic with higher amplitude and noise
                    t = np.linspace(0, 10, sample_length)
                    signal = 1.5 * np.sin(4 * np.pi * t) + 0.4 * np.random.normal(size=sample_length)
                elif activity == 'climbing_up':
                    # Climbing up: asymmetric pattern
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) + 0.5 * np.sin(4 * np.pi * t) + 0.3 * np.random.normal(size=sample_length)
                else:  # climbing_down
                    # Climbing down: different asymmetric pattern
                    t = np.linspace(0, 10, sample_length)
                    signal = np.sin(2 * np.pi * t) - 0.5 * np.sin(4 * np.pi * t) + 0.3 * np.random.normal(size=sample_length)
                
                # Add axis-specific modifications
                if axis == 'x':
                    signal = signal * 1.2
                elif axis == 'y':
                    signal = signal * 0.8 + 0.5
                else:  # z axis
                    signal = signal * 1.0 - 0.2
                
                # Add subject-specific noise
                subject_noise = 0.1 * (subject / 15) * np.random.normal(size=sample_length)
                signal = signal + subject_noise
                
                # Process signal and calculate PE and complexity
                subject_id = f'subject_{subject}'
                embedded_dims = [3, 4, 5, 6]
                embedded_delays = [1, 2, 3]
                signal_lengths = [1024, 2048, 4096]
                
                results = process_signal(
                    signal, subject_id, activity, axis, 
                    embedded_dims, embedded_delays, signal_lengths
                )
                
                # Add to data
                data.append(results)
    
    # Combine all results
    combined_data = pd.concat(data, ignore_index=True)
    
    return combined_data

# Main function
def main():
    """
    Main function to execute the permutation entropy analysis.
    """
    try:
        # Ask the user whether to use sample data or load from disk
        print("Permutation Entropy and Complexity Analysis")
        print("------------------------------------------")
        print("1. Use sample generated data (for testing)")
        print("2. Load processed data from CSV file")
        choice = input("Enter your choice (1/2): ")
        
        results_df = None
        
        if choice == '1':
            print("Generating sample data...")
            results_df = generate_sample_data()
        elif choice == '2':
            file_path = input("Enter the path to the CSV file: ")
            if os.path.exists(file_path):
                print(f"Loading data from {file_path}...")
                results_df = pd.read_csv(file_path)
                
                # Check if the loaded data has the required columns
                required_columns = ['subject', 'activity', 'axis', 'embedded_dim', 
                                    'embedded_delay', 'signal_length', 
                                    'permutation_entropy', 'complexity']
                
                missing_columns = [col for col in required_columns if col not in results_df.columns]
                
                if missing_columns:
                    print(f"Warning: Loaded data is missing columns: {missing_columns}")
                    print("Cannot proceed with visualization.")
                    return
            else:
                print(f"Error: File not found at {file_path}")
                return
        else:
            print("Invalid choice. Exiting.")
            return
        
        if results_df is not None and not results_df.empty:
            # Get summary statistics
            print("\nSummary Statistics:")
            print(f"Number of samples: {len(results_df)}")
            print(f"Number of subjects: {len(results_df['subject'].unique())}")
            print(f"Activities: {results_df['activity'].unique()}")
            print(f"Axes: {results_df['axis'].unique()}")
            print(f"Embedding dimensions: {sorted(results_df['embedded_dim'].unique())}")
            print(f"Embedding delays: {sorted(results_df['embedded_delay'].unique())}")
            print(f"Signal lengths: {sorted(results_df['signal_length'].unique())}")
            
            # Average PE and complexity by activity
            activity_stats = results_df.groupby('activity')[['permutation_entropy', 'complexity']].agg(['mean', 'std'])
            print("\nPermutation Entropy and Complexity by Activity:")
            print(activity_stats)
            
            # Generate visualizations
            print("\nGenerating visualizations...")
            visualize_results(results_df)
            
            print("\nAnalysis complete. Results saved to 'permutation_entropy_results' directory.")
        else:
            print("No data to analyze.")
    
    except Exception as e:
        print(f"Error in main function: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Permutation Entropy and Complexity Analysis
------------------------------------------
1. Use sample generated data (for testing)
2. Load processed data from CSV file
Enter your choice (1/2): 2
Enter the path to the CSV file: processed_permutation_entropy_complexity.csv
Loading data from processed_permutation_entropy_complexity.csv...
Cannot proceed with visualization.


In [4]:
import os
import glob

# Common places to check
potential_locations = [
    os.path.expanduser("~/Downloads"),
    os.path.expanduser("~/Documents"),
    os.path.expanduser("~/Desktop"),
    os.getcwd()  # Current working directory
]

# Search for CSV files
for location in potential_locations:
    if os.path.exists(location):
        print(f"Checking {location}...")
        csv_files = glob.glob(os.path.join(location, "**/*.csv"), recursive=True)
        if csv_files:
            print(f"Found {len(csv_files)} CSV files in {location}:")
            for file in csv_files[:5]:  # Show first 5 files
                print(f"  - {file}")
            if len(csv_files) > 5:
                print(f"  ... and {len(csv_files) - 5} more files")
        else:
            print(f"No CSV files found in {location}")

Checking /Users/rosalinatorres/Downloads...
Found 135 CSV files in /Users/rosalinatorres/Downloads:
  - /Users/rosalinatorres/Downloads/processed_permutation_entropy_complexity.csv
  - /Users/rosalinatorres/Downloads/Pandas-Data-Science-Tasks-master/SalesAnalysis/Output/all_data.csv
  - /Users/rosalinatorres/Downloads/Pandas-Data-Science-Tasks-master/SalesAnalysis/Sales_Data/Sales_December_2019.csv
  - /Users/rosalinatorres/Downloads/Pandas-Data-Science-Tasks-master/SalesAnalysis/Sales_Data/Sales_April_2019.csv
  - /Users/rosalinatorres/Downloads/Pandas-Data-Science-Tasks-master/SalesAnalysis/Sales_Data/Sales_February_2019.csv
  ... and 130 more files
Checking /Users/rosalinatorres/Documents...
Found 1 CSV files in /Users/rosalinatorres/Documents:
  - /Users/rosalinatorres/Documents/processed_permutation_entropy_complexity.csv
Checking /Users/rosalinatorres/Desktop...
Found 31 CSV files in /Users/rosalinatorres/Desktop:
  - /Users/rosalinatorres/Desktop/Taxes/myenv/lib/python3.13/site-

In [None]:
import os
print("Current working directory:", os.getcwd())
print("Files in this directory:", os.listdir())

In [5]:
pip install gdown

Note: you may need to restart the kernel to use updated packages.


In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy
import math
from tqdm import tqdm
import seaborn as sns
import os

def analyze_csv_file(file_path):
    """
    Analyze and visualize data from a CSV file with flexible column mapping.
    
    Args:
        file_path: Path to the CSV file
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Error: File not found at {file_path}")
            return
        
        # Try to load the file
        df = pd.read_csv(file_path)
        
        # Basic information
        print(f"\nCSV File Analysis: {file_path}")
        print(f"Number of rows: {len(df)}")
        print(f"Number of columns: {len(df.columns)}")
        print("\nColumn names:")
        for i, col in enumerate(df.columns):
            print(f"  {i+1}. {col}")
        
        # Show first few rows
        print("\nFirst 3 rows:")
        print(df.head(3).to_string())
        
        # Check numeric columns for potentially relevant data
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        print(f"\nNumeric columns (potential entropy/complexity values): {numeric_cols}")
        
        # Check for categorical columns
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        print(f"\nCategorical columns (potential activity/subject labels): {categorical_cols}")
        
        # For small number of columns, try to guess mappings
        if len(df.columns) < 20:
            print("\nAttempting to guess column mappings...")
            guessed_mappings = guess_column_mappings(df)
            for expected, guessed in guessed_mappings.items():
                if guessed:
                    print(f"  '{expected}' might be '{guessed}'")
                else:
                    print(f"  No clear match for '{expected}'")
            
            # Ask if the user wants to use the guessed mappings or define their own
            print("\nDo you want to:")
            print("1. Use these guessed mappings")
            print("2. Define your own mappings")
            print("3. Proceed without mappings (try to use columns as-is)")
            choice = input("Enter your choice (1/2/3): ")
            
            if choice == '1':
                # Use guessed mappings
                column_mapping = {v: k for k, v in guessed_mappings.items() if v}
                mapped_df = apply_column_mapping(df, column_mapping)
                generate_visualizations(mapped_df)
            elif choice == '2':
                # Define custom mappings
                column_mapping = define_custom_mappings(df)
                mapped_df = apply_column_mapping(df, column_mapping)
                generate_visualizations(mapped_df)
            else:
                # Try to use columns as-is
                print("\nProceeding with direct analysis...")
                try_direct_analysis(df)
        else:
            # Too many columns to guess, ask user to define mappings
            print("\nToo many columns to guess mappings automatically.")
            print("Do you want to define your own mappings? (y/n)")
            choice = input("Enter y/n: ")
            
            if choice.lower() == 'y':
                column_mapping = define_custom_mappings(df)
                mapped_df = apply_column_mapping(df, column_mapping)
                generate_visualizations(mapped_df)
            else:
                # Try to use columns as-is
                print("\nProceeding with direct analysis...")
                try_direct_analysis(df)
    
    except Exception as e:
        print(f"Error analyzing CSV file: {e}")
        import traceback
        traceback.print_exc()

def guess_column_mappings(df):
    """
    Attempt to guess column mappings based on column names and data properties.
    
    Args:
        df: DataFrame to analyze
        
    Returns:
        Dictionary of guessed mappings (expected_column: guessed_column)
    """
    expected_columns = [
        'subject', 'activity', 'axis', 
        'embedded_dim', 'embedded_delay', 'signal_length', 
        'permutation_entropy', 'complexity'
    ]
    
    guessed_mappings = {col: None for col in expected_columns}
    
    # Look for each expected column based on name similarity
    for col in df.columns:
        col_lower = col.lower()
        
        # Subject
        if 'subject' in col_lower or 'subj' in col_lower or 'person' in col_lower or 'individual' in col_lower:
            guessed_mappings['subject'] = col
        
        # Activity
        elif 'activity' in col_lower or 'action' in col_lower or 'movement' in col_lower or 'motion' in col_lower:
            guessed_mappings['activity'] = col
        
        # Axis
        elif 'axis' in col_lower or 'dimension' in col_lower or ('x' == col_lower or 'y' == col_lower or 'z' == col_lower):
            guessed_mappings['axis'] = col
        
        # Embedded dimension
        elif ('embedded' in col_lower and 'dim' in col_lower) or 'embed_dim' in col_lower or 'embdim' in col_lower:
            guessed_mappings['embedded_dim'] = col
        
        # Embedded delay
        elif ('embedded' in col_lower and 'delay' in col_lower) or 'embed_delay' in col_lower or 'tau' in col_lower:
            guessed_mappings['embedded_delay'] = col
        
        # Signal length
        elif ('signal' in col_lower and 'length' in col_lower) or 'window' in col_lower or 'size' in col_lower:
            guessed_mappings['signal_length'] = col
        
        # Permutation entropy
        elif ('permutation' in col_lower and 'entropy' in col_lower) or 'pe' == col_lower or 'perm_ent' in col_lower:
            guessed_mappings['permutation_entropy'] = col
        
        # Complexity
        elif 'complexity' in col_lower or 'complex' == col_lower:
            guessed_mappings['complexity'] = col
    
    # If we didn't find matches by name, try to guess based on data properties
    # Look for numeric columns that might be parameters
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # For embedded_dim, look for columns with small integers (3-6)
    if guessed_mappings['embedded_dim'] is None and numeric_cols:
        for col in numeric_cols:
            unique_values = sorted(df[col].unique())
            if len(unique_values) <= 4 and set(unique_values).issubset({3, 4, 5, 6}):
                guessed_mappings['embedded_dim'] = col
                break
    
    # For embedded_delay, look for columns with small integers (1-3)
    if guessed_mappings['embedded_delay'] is None and numeric_cols:
        for col in numeric_cols:
            unique_values = sorted(df[col].unique())
            if len(unique_values) <= 3 and set(unique_values).issubset({1, 2, 3}):
                guessed_mappings['embedded_delay'] = col
                break
    
    # For signal_length, look for columns with values like 1024, 2048, 4096
    if guessed_mappings['signal_length'] is None and numeric_cols:
        for col in numeric_cols:
            unique_values = sorted(df[col].unique())
            if len(unique_values) <= 3 and set(unique_values).issubset({1024, 2048, 4096}):
                guessed_mappings['signal_length'] = col
                break
    
    # For permutation_entropy and complexity, look for continuous values between 0 and 1
    remaining_numeric_cols = [col for col in numeric_cols if col not in guessed_mappings.values()]
    
    # Permutation entropy is usually close to 1 for random signals
    if guessed_mappings['permutation_entropy'] is None and remaining_numeric_cols:
        for col in remaining_numeric_cols:
            if df[col].min() >= 0 and df[col].max() <= 1:
                guessed_mappings['permutation_entropy'] = col
                remaining_numeric_cols.remove(col)
                break
    
    # Complexity is usually smaller than permutation entropy
    if guessed_mappings['complexity'] is None and remaining_numeric_cols:
        for col in remaining_numeric_cols:
            if df[col].min() >= 0 and df[col].max() <= 1:
                guessed_mappings['complexity'] = col
                break
    
    return guessed_mappings

def define_custom_mappings(df):
    """
    Let the user define custom column mappings.
    
    Args:
        df: DataFrame to map
        
    Returns:
        Dictionary of column mappings (original_column: mapped_column)
    """
    expected_columns = [
        'subject', 'activity', 'axis', 
        'embedded_dim', 'embedded_delay', 'signal_length', 
        'permutation_entropy', 'complexity'
    ]
    
    column_mapping = {}
    
    print("\nDefine column mappings:")
    for expected_col in expected_columns:
        print(f"\nMap '{expected_col}' to:")
        for i, col in enumerate(df.columns):
            print(f"  {i+1}. {col}")
        print("  0. None (Skip this column)")
        
        selection = input("Enter number: ")
        if selection.isdigit() and int(selection) > 0 and int(selection) <= len(df.columns):
            selected_col = df.columns[int(selection)-1]
            column_mapping[selected_col] = expected_col
            print(f"Mapping {selected_col} -> {expected_col}")
        else:
            print(f"Skipping {expected_col}")
    
    return column_mapping

def apply_column_mapping(df, column_mapping):
    """
    Apply column mapping to the DataFrame.
    
    Args:
        df: DataFrame to map
        column_mapping: Dictionary of column mappings (original_column: mapped_column)
        
    Returns:
        Mapped DataFrame
    """
    if not column_mapping:
        print("No column mappings provided. Returning original DataFrame.")
        return df
    
    # Create a copy with mapped columns
    mapped_df = df.rename(columns=column_mapping)
    
    print("\nMapped DataFrame preview:")
    print(mapped_df.head(3).to_string())
    
    # Check if all expected columns are present
    expected_columns = [
        'subject', 'activity', 'axis', 
        'embedded_dim', 'embedded_delay', 'signal_length', 
        'permutation_entropy', 'complexity'
    ]
    
    missing_columns = [col for col in expected_columns if col not in mapped_df.columns]
    
    if missing_columns:
        print(f"\nWarning: Mapped DataFrame is missing columns: {missing_columns}")
        print("Some visualizations may not work properly.")
    
    # Check data types and convert if necessary
    for col in mapped_df.columns:
        if col in ['embedded_dim', 'embedded_delay', 'signal_length']:
            if not pd.api.types.is_numeric_dtype(mapped_df[col]):
                try:
                    mapped_df[col] = pd.to_numeric(mapped_df[col])
                    print(f"Converted '{col}' to numeric type.")
                except:
                    print(f"Warning: Could not convert '{col}' to numeric type.")
    
    return mapped_df

def try_direct_analysis(df):
    """
    Try to analyze the DataFrame directly without mappings.
    
    Args:
        df: DataFrame to analyze
    """
    print("\nAttempting direct analysis...")
    
    # Check if we have both 'permutation_entropy' and 'complexity' as columns
    if 'permutation_entropy' in df.columns and 'complexity' in df.columns:
        print("Found 'permutation_entropy' and 'complexity' columns directly.")
        generate_visualizations(df)
        return
    
    # Look for columns that might contain entropy and complexity values
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    if len(numeric_cols) >= 2:
        # Take the first two numeric columns as entropy and complexity
        pe_col = numeric_cols[0]
        complexity_col = numeric_cols[1]
        
        print(f"Using '{pe_col}' as permutation entropy and '{complexity_col}' as complexity.")
        
        # Create a new DataFrame with renamed columns
        df_renamed = df.copy()
        df_renamed.rename(columns={pe_col: 'permutation_entropy', complexity_col: 'complexity'}, inplace=True)
        
        # Look for potential categorical columns for grouping
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        
        if categorical_cols:
            group_col = categorical_cols[0]
            print(f"Using '{group_col}' for grouping (as activity).")
            df_renamed.rename(columns={group_col: 'activity'}, inplace=True)
        
        generate_visualizations(df_renamed)
    else:
        print("Could not identify appropriate columns for analysis.")
        print("Please use the CSV analyzer to understand your data structure better.")

def generate_visualizations(df, output_dir='permutation_entropy_results'):
    """
    Generate visualizations based on the DataFrame.
    
    Args:
        df: DataFrame with entropy and complexity data
        output_dir: Directory to save visualizations
    """
    try:
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        print(f"\nGenerating visualizations in '{output_dir}'...")
        
        # Check required columns
        required_cols = ['permutation_entropy', 'complexity']
        missing_required = [col for col in required_cols if col not in df.columns]
        
        if missing_required:
            print(f"Error: Missing required columns: {missing_required}")
            return
        
        # Check grouping columns
        has_activity = 'activity' in df.columns
        has_embedded_dim = 'embedded_dim' in df.columns
        has_embedded_delay = 'embedded_delay' in df.columns
        has_signal_length = 'signal_length' in df.columns
        has_axis = 'axis' in df.columns
        
        # ------ Visualization 1: Permutation Entropy Distribution ------
        plt.figure(figsize=(10, 6))
        sns.histplot(df['permutation_entropy'], kde=True)
        plt.xlabel('Permutation Entropy')
        plt.ylabel('Frequency')
        plt.title('Distribution of Permutation Entropy')
        plt.savefig(os.path.join(output_dir, 'permutation_entropy_distribution.png'))
        plt.close()
        
        # ------ Visualization 2: Complexity Distribution ------
        plt.figure(figsize=(10, 6))
        sns.histplot(df['complexity'], kde=True)
        plt.xlabel('Complexity')
        plt.ylabel('Frequency')
        plt.title('Distribution of Complexity')
        plt.savefig(os.path.join(output_dir, 'complexity_distribution.png'))
        plt.close()
        
        # ------ Visualization 3: Complexity vs Permutation Entropy ------
        plt.figure(figsize=(10, 8))
        
        if has_activity:
            sns.scatterplot(
                x='permutation_entropy', 
                y='complexity', 
                hue='activity',
                data=df,
                alpha=0.7
            )
            plt.title('Complexity vs Permutation Entropy by Activity')
        else:
            sns.scatterplot(
                x='permutation_entropy', 
                y='complexity', 
                data=df,
                alpha=0.7
            )
            plt.title('Complexity vs Permutation Entropy')
            
        plt.xlabel('Permutation Entropy')
        plt.ylabel('Complexity')
        plt.grid(True)
        
        plt.savefig(os.path.join(output_dir, 'complexity_vs_entropy.png'))
        plt.close()
        
        # ------ Visualization 4: Box plots if activity available ------
        if has_activity:
            plt.figure(figsize=(12, 8))
            sns.boxplot(x='activity', y='permutation_entropy', data=df)
            plt.xlabel('Activity')
            plt.ylabel('Permutation Entropy')
            plt.title('Distribution of Permutation Entropy by Activity')
            plt.grid(True, axis='y')
            plt.savefig(os.path.join(output_dir, 'permutation_entropy_boxplot.png'))
            plt.close()
            
            plt.figure(figsize=(12, 8))
            sns.boxplot(x='activity', y='complexity', data=df)
            plt.xlabel('Activity')
            plt.ylabel('Complexity')
            plt.title('Distribution of Complexity by Activity')
            plt.grid(True, axis='y')
            plt.savefig(os.path.join(output_dir, 'complexity_boxplot.png'))
            plt.close()
        
        # ------ Visualization 5: Embedded dimension effect if available ------
        if has_embedded_dim and has_activity:
            plt.figure(figsize=(12, 8))
            
            for activity in df['activity'].unique():
                activity_data = df[df['activity'] == activity]
                
                # Average over other parameters
                avg_data = activity_data.groupby('embedded_dim')['permutation_entropy'].mean()
                std_data = activity_data.groupby('embedded_dim')['permutation_entropy'].std()
                
                plt.errorbar(
                    avg_data.index, 
                    avg_data.values, 
                    yerr=std_data.values,
                    marker='o',
                    label=activity
                )
            
            plt.xlabel('Embedding Dimension')
            plt.ylabel('Average Permutation Entropy')
            plt.title('Permutation Entropy by Activity and Embedding Dimension')
            plt.legend()
            plt.grid(True)
            
            plt.savefig(os.path.join(output_dir, 'permutation_entropy_by_embedding_dim.png'))
            plt.close()
        
        # ------ Visualization 6: Signal length effect if available ------
        if has_signal_length and has_activity:
            plt.figure(figsize=(12, 8))
            
            for activity in df['activity'].unique():
                activity_data = df[df['activity'] == activity]
                
                # Average over other parameters
                avg_data = activity_data.groupby('signal_length')['permutation_entropy'].mean()
                std_data = activity_data.groupby('signal_length')['permutation_entropy'].std()
                
                plt.errorbar(
                    avg_data.index, 
                    avg_data.values, 
                    yerr=std_data.values,
                    marker='o',
                    label=activity
                )
            
            plt.xlabel('Signal Length')
            plt.ylabel('Average Permutation Entropy')
            plt.title('Permutation Entropy by Activity and Signal Length')
            plt.legend()
            plt.grid(True)
            
            plt.savefig(os.path.join(output_dir, 'permutation_entropy_by_signal_length.png'))
            plt.close()
        
        # ------ Visualization 7: Parameters heatmap if available ------
        if has_embedded_dim and has_embedded_delay:
            dim_delay_data = df.groupby(['embedded_dim', 'embedded_delay'])['permutation_entropy'].mean().reset_index()
            
            # Create pivot table
            pivot_data = dim_delay_data.pivot('embedded_dim', 'embedded_delay', 'permutation_entropy')
            
            plt.figure(figsize=(10, 8))
            sns.heatmap(pivot_data, annot=True, cmap='viridis', fmt='.3f')
            plt.title('Average Permutation Entropy for Different Parameter Combinations')
            plt.ylabel('Embedded Dimension')
            plt.xlabel('Embedded Delay')
            
            plt.savefig(os.path.join(output_dir, 'parameter_heatmap.png'))
            plt.close()
        
        # ------ Visualization 8: Axis comparison if available ------
        if has_axis and has_activity:
            plt.figure(figsize=(12, 8))
            
            sns.boxplot(x='activity', y='permutation_entropy', hue='axis', data=df)
            plt.xlabel('Activity')
            plt.ylabel('Permutation Entropy')
            plt.title('Permutation Entropy by Activity and Axis')
            plt.grid(True, axis='y')
            
            plt.savefig(os.path.join(output_dir, 'permutation_entropy_by_axis.png'))
            plt.close()
        
        print(f"Visualization complete! {len(os.listdir(output_dir))} files saved to '{output_dir}'")
        
        # Generate summary statistics
        print("\nSummary Statistics:")
        
        # Overall statistics
        print("\nOverall Permutation Entropy:")
        print(df['permutation_entropy'].describe())
        
        print("\nOverall Complexity:")
        print(df['complexity'].describe())
        
        # Statistics by activity if available
        if has_activity:
            print("\nPermutation Entropy by Activity:")
            print(df.groupby('activity')['permutation_entropy'].describe())
            
            print("\nComplexity by Activity:")
            print(df.groupby('activity')['complexity'].describe())
            
            # Look for statistically significant differences
            if len(df['activity'].unique()) >= 2:
                print("\nActivity Comparison (mean values):")
                activity_means = df.groupby('activity')[['permutation_entropy', 'complexity']].mean()
                print(activity_means)
                
                # Calculate differences between activities
                activities = df['activity'].unique()
                print("\nPairwise Differences in Permutation Entropy:")
                for i, act1 in enumerate(activities):
                    for act2 in activities[i+1:]:
                        diff = activity_means.loc[act1, 'permutation_entropy'] - activity_means.loc[act2, 'permutation_entropy']
                        print(f"  {act1} vs {act2}: {diff:.4f}")
        
    except Exception as e:
        print(f"Error generating visualizations: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    print("Flexible Permutation Entropy Analysis")
    print("------------------------------------")
    file_path = input("Enter the path to the CSV file: ")
    analyze_csv_file(file_path)

Flexible Permutation Entropy Analysis
------------------------------------
Enter the path to the CSV file: /Users/rosalinatorres/Downloads/project2_data_chest
Error analyzing CSV file: [Errno 21] Is a directory: '/Users/rosalinatorres/Downloads/project2_data_chest'


Traceback (most recent call last):
  File "/var/folders/7d/38n8qyc974dfs9nc462v_ln40000gn/T/ipykernel_12384/3618087377.py", line 24, in analyze_csv_file
    df = pd.read_csv(file_path)
         ^^^^^^^^^^^^^^^^^^^^^^
  File "/Applications/anaconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Applications/anaconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Applications/anaconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Applications/anaconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py", line 1880, in _make_engine
    self.handles = 

In [17]:
# To view the analysis results
cat output.csv

# To view the plot (this will open it in your default image viewer)
open output_plot.png

SyntaxError: invalid syntax (1207426452.py, line 2)