# LucidBench Benchmark Analysis

This notebook provides a comprehensive analysis of filesystem benchmark results across different storage devices and filesystems.

## Table of Contents
1. [Setup and Data Loading](#setup)
2. [Overview and Summary Statistics](#overview)
3. [Performance Analysis by Storage Type](#storage-analysis)
4. [Filesystem Comparison](#filesystem-analysis)
5. [I/O Pattern Analysis](#io-analysis)
6. [Resource Utilization](#resource-analysis)
7. [Statistical Analysis](#statistical-analysis)
8. [Comparative Analysis](#comparative-analysis)
9. [Recommendations and Conclusions](#conclusions)

## 1. Setup and Data Loading <a name="setup"></a>

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy import stats
from pathlib import Path

# Set plot style
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the bare 'seaborn' name, so prefer the new name
# and fall back to the old one only on installations that predate the rename.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Configure display options
pd.set_option('display.max_columns', None)  # never elide columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [None]:
def load_benchmark_data(run_dir):
    """Load all benchmark and monitoring data from a run directory.

    Every sub-directory of ``run_dir`` whose name has the form
    ``<storage_type>_<device>_<filesystem>_<test_type>`` (the test type may
    itself contain underscores) is read.  A sub-directory contributes a row
    only when it contains both ``test.json`` and ``monitoring.json``.
    Entries that are not directories, or whose name has fewer than four
    ``_``-separated parts, are skipped.

    Args:
        run_dir: Path (``str`` or ``Path``) of the run directory to scan.

    Returns:
        A ``pandas.DataFrame`` with one row per complete test directory and
        the columns ``storage_type``, ``device``, ``filesystem``,
        ``test_type``, ``test_data`` and ``monitor_data``; the last two hold
        the parsed JSON documents.  The columns are present even when no
        directory qualifies.

    Raises:
        json.JSONDecodeError: If one of the JSON files is malformed.
    """
    columns = [
        'storage_type',
        'device',
        'filesystem',
        'test_type',
        'test_data',
        'monitor_data',
    ]
    records = []

    # Sorted so the row order does not depend on the directory listing order.
    for test_dir in sorted(Path(run_dir).glob('*')):
        if not test_dir.is_dir():
            continue

        # Parse test directory name
        parts = test_dir.name.split('_')
        if len(parts) < 4:
            continue

        test_file = test_dir / 'test.json'
        monitor_file = test_dir / 'monitoring.json'
        # Both files must exist for *this* directory.  Checking the files
        # (rather than whether a variable happens to exist) prevents JSON
        # loaded for an earlier directory from being attributed to this one.
        if not (test_file.exists() and monitor_file.exists()):
            continue

        with open(test_file, encoding='utf-8') as f:
            test_data = json.load(f)
        with open(monitor_file, encoding='utf-8') as f:
            monitor_data = json.load(f)

        records.append({
            'storage_type': parts[0],
            'device': parts[1],
            'filesystem': parts[2],
            # Everything after the third field belongs to the test type.
            'test_type': '_'.join(parts[3:]),
            'test_data': test_data,
            'monitor_data': monitor_data,
        })

    return pd.DataFrame(records, columns=columns)

# Load data from the most recent run
results_dir = Path('../results')
# Only directories can be runs.  Fail with an actionable message instead of
# the bare "max() arg is an empty sequence" when nothing has been recorded.
run_dirs = [path for path in results_dir.glob('run_*') if path.is_dir()]
if not run_dirs:
    raise FileNotFoundError(f"No run_* directories found in {results_dir.resolve()}")
# NOTE: getctime is the creation time on Windows but the last metadata change
# time on Unix, so "most recent" follows the platform's definition.
latest_run = max(run_dirs, key=os.path.getctime)
df = load_benchmark_data(latest_run)
print(f"Loaded data from {latest_run.name}")

## 2. Overview and Summary Statistics <a name="overview"></a>

In [None]:
def extract_performance_metrics(row):
    """Extract key performance metrics from test data.

    Only the first entry of ``row['test_data']['jobs']`` is inspected.  The
    job's ``read`` section is used unless it reports zero IOPS and a
    ``write`` section is available, in which case ``write`` is used.  The
    layout looks like fio JSON output, where (presumably - confirm against
    the benchmark runner) both sections are always emitted and the unused
    direction is zeroed; keying purely on the presence of ``read`` would
    then report zeros for every write-only test.

    Args:
        row: Mapping or DataFrame row with a ``test_data`` entry holding the
            parsed benchmark JSON.

    Returns:
        A ``pandas.Series`` with the entries ``iops``, ``bandwidth``,
        ``latency`` (the mean of ``lat_ns``) and ``runtime``.

    Raises:
        KeyError: If the job has neither a usable ``read`` nor a ``write``
            section, or an expected field is missing.
    """
    job = row['test_data']['jobs'][0]

    # Prefer read statistics, but fall through to write when the read side
    # recorded no I/O at all and write statistics exist.
    if 'read' in job and (job['read']['iops'] > 0 or 'write' not in job):
        io_stats = job['read']
    else:
        io_stats = job['write']

    metrics = {
        'iops': io_stats['iops'],
        'bandwidth': io_stats['bw'],
        'latency': io_stats['lat_ns']['mean'],
        'runtime': job['runtime'],
    }

    return pd.Series(metrics)

# Extract performance metrics
performance_df = df.apply(extract_performance_metrics, axis=1)
# Drop metric columns left behind by an earlier execution of this cell first;
# otherwise re-running it would append duplicate 'iops'/'bandwidth'/... columns
# and the aggregations below would no longer address a single column.
df = pd.concat(
    [df.drop(columns=performance_df.columns, errors='ignore'), performance_df],
    axis=1,
)

# Display summary statistics
print("\nSummary Statistics by Storage Type and Filesystem:")
summary = df.groupby(['storage_type', 'filesystem']).agg({
    'iops': ['mean', 'std', 'min', 'max'],
    'bandwidth': ['mean', 'std', 'min', 'max'],
    'latency': ['mean', 'std', 'min', 'max']
}).round(2)
display(summary)

## 3. Performance Analysis by Storage Type <a name="storage-analysis"></a>

In [None]:
def plot_storage_performance(df, metric):
    """Draw a box plot of one metric per storage type, split by filesystem.

    Args:
        df: DataFrame providing the 'storage_type' and 'filesystem' columns
            plus the column named by ``metric``.
        metric: Name of the column plotted on the y-axis; its title-cased
            form is used in the chart title and the y-axis label.
    """
    label = metric.title()

    plt.figure(figsize=(12, 6))

    # seaborn draws on the current axes and hands them back.
    ax = sns.boxplot(data=df, x='storage_type', y=metric, hue='filesystem')

    ax.set_title(f'{label} by Storage Type and Filesystem')
    ax.set_xlabel('Storage Type')
    ax.set_ylabel(label)
    plt.xticks(rotation=45)
    ax.legend(title='Filesystem')
    plt.tight_layout()
    plt.show()

# Plot performance metrics: one box-plot figure per metric, in this order
for metric in ['iops', 'bandwidth', 'latency']:
    plot_storage_performance(df, metric)

## 4. Filesystem Comparison <a name="filesystem-analysis"></a>

In [None]:
def analyze_filesystem_performance(df):
    """Analyze filesystem performance across different test types.

    For each of 'iops', 'bandwidth' and 'latency', builds a pivot table of
    the metric's mean indexed by (storage_type, test_type) with one column
    per filesystem, prints and displays it, and draws it as a grouped bar
    chart.

    Args:
        df: DataFrame with 'storage_type', 'test_type' and 'filesystem'
            columns plus the three metric columns.
    """
    metrics = ['iops', 'bandwidth', 'latency']

    for metric in metrics:
        pivot = pd.pivot_table(
            df,
            values=metric,
            index=['storage_type', 'test_type'],
            columns='filesystem',
            aggfunc='mean'
        )

        print(f"\n{metric.upper()} Comparison:")
        display(pivot)

        # Plot comparison.  DataFrame.plot opens a figure of its own unless it
        # is handed an Axes, so create the 12x6 figure here and pass its axes;
        # a separate plt.figure() call would only leave an empty figure behind
        # and the requested size would not apply to the chart.
        fig, ax = plt.subplots(figsize=(12, 6))
        pivot.plot(kind='bar', ax=ax)
        ax.set_title(f'{metric.title()} Comparison by Filesystem')
        ax.set_xlabel('Storage Type and Test Type')
        ax.set_ylabel(metric.title())
        plt.xticks(rotation=45)
        ax.legend(title='Filesystem')
        fig.tight_layout()
        plt.show()

# Print and plot the per-filesystem pivot tables for the loaded benchmark data.
analyze_filesystem_performance(df)

## 5. I/O Pattern Analysis <a name="io-analysis"></a>

In [None]:
def analyze_io_patterns(df):
    """Summarize and plot the metrics grouped by test type.

    Displays mean/std/min/max of 'iops', 'bandwidth' and 'latency' per
    'test_type' (rounded to two decimals), then draws a 2x2 figure: three
    box plots (one per metric) and an IOPS-versus-bandwidth scatter plot
    colored by test type.

    Args:
        df: DataFrame with a 'test_type' column and the three metric columns.
    """
    tracked = ['iops', 'bandwidth', 'latency']
    aggregations = {name: ['mean', 'std', 'min', 'max'] for name in tracked}
    io_stats = df.groupby('test_type').agg(aggregations).round(2)

    print("\nI/O Pattern Statistics:")
    display(io_stats)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # The first three panels are box plots that differ only in metric/title.
    box_panels = [
        (axes[0, 0], 'iops', 'IOPS by Test Type'),
        (axes[0, 1], 'bandwidth', 'Bandwidth by Test Type'),
        (axes[1, 0], 'latency', 'Latency by Test Type'),
    ]
    for panel, column, heading in box_panels:
        sns.boxplot(data=df, x='test_type', y=column, ax=panel)
        panel.set_title(heading)
        panel.set_xticklabels(panel.get_xticklabels(), rotation=45)

    # The last panel relates the two throughput metrics to each other.
    scatter_panel = axes[1, 1]
    sns.scatterplot(data=df, x='iops', y='bandwidth', hue='test_type', ax=scatter_panel)
    scatter_panel.set_title('IOPS vs Bandwidth')

    plt.tight_layout()
    plt.show()

# Display the per-test-type statistics and the 2x2 I/O pattern figure.
analyze_io_patterns(df)

## 6. Resource Utilization <a name="resource-analysis"></a>

In [None]:
def analyze_resource_utilization(df):
    """Plot the monitoring samples recorded during the benchmarks.

    Flattens every entry of ``row['monitor_data']['stats']`` into one record
    tagged with the row's storage type, filesystem and test type, then draws
    a 2x2 grid of box plots (CPU, memory, disk read bytes, disk write bytes)
    by storage type, split by filesystem.

    Args:
        df: DataFrame with 'storage_type', 'filesystem', 'test_type' and
            'monitor_data' columns.
    """
    # One record per monitoring sample, labelled with its originating test.
    samples = [
        {
            'storage_type': row['storage_type'],
            'filesystem': row['filesystem'],
            'test_type': row['test_type'],
            'timestamp': stat['timestamp'],
            'cpu_percent': stat['cpu_percent'],
            'memory_percent': stat['memory_percent'],
            'disk_read_bytes': stat['disk_read_bytes'],
            'disk_write_bytes': stat['disk_write_bytes'],
        }
        for _, row in df.iterrows()
        for stat in row['monitor_data']['stats']
    ]
    monitor_df = pd.DataFrame(samples)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Panels are filled row by row, matching the order of axes.flat.
    panels = [
        ('cpu_percent', 'CPU Utilization by Storage Type'),
        ('memory_percent', 'Memory Utilization by Storage Type'),
        ('disk_read_bytes', 'Disk Read Bytes by Storage Type'),
        ('disk_write_bytes', 'Disk Write Bytes by Storage Type'),
    ]
    for panel, (column, heading) in zip(axes.flat, panels):
        sns.boxplot(data=monitor_df, x='storage_type', y=column, hue='filesystem', ax=panel)
        panel.set_title(heading)

    plt.tight_layout()
    plt.show()

# Plot CPU, memory and disk activity from the monitoring samples of every test.
analyze_resource_utilization(df)

## 7. Statistical Analysis <a name="statistical-analysis"></a>

In [None]:
def perform_statistical_analysis(df):
    """Perform statistical analysis on benchmark results.

    For each of 'iops', 'bandwidth' and 'latency', runs a one-way ANOVA
    across storage types and another across filesystems and prints the
    F-statistic and p-value.  A test is skipped with a message when fewer
    than two groups exist, because ``scipy.stats.f_oneway`` needs at least
    two samples to compare.

    The correlation matrix between the three metrics does not depend on
    which metric is being tested, so it is displayed and plotted once after
    the ANOVA results rather than once per metric.

    Args:
        df: DataFrame with 'storage_type' and 'filesystem' columns plus the
            three metric columns.
    """
    metrics = ['iops', 'bandwidth', 'latency']
    # (grouping column, label used in the printed report)
    factors = [('storage_type', 'Storage Type'), ('filesystem', 'Filesystem')]

    for metric in metrics:
        print(f"\nANOVA Test for {metric.upper()}:")

        for column, label in factors:
            groups = [values for _, values in df.groupby(column)[metric]]
            if len(groups) < 2:
                print(f"{label} ANOVA: skipped (needs at least two groups, found {len(groups)})")
                continue
            f_stat, p_val = stats.f_oneway(*groups)
            print(f"{label} ANOVA: F-statistic = {f_stat:.2f}, p-value = {p_val:.4f}")

    # Correlation analysis across all three metrics
    corr_matrix = df[metrics].corr()
    print("\nCorrelation Matrix:")
    display(corr_matrix)

    # Plot correlation heatmap; center=0 keeps the diverging palette symmetric.
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

# Print the ANOVA results and show the metric correlations for the loaded data.
perform_statistical_analysis(df)

## 8. Comparative Analysis <a name="comparative-analysis"></a>

In [None]:
def perform_comparative_analysis(df):
    """Compare mean metrics per test type across storage types and filesystems.

    For each of 'iops', 'bandwidth' and 'latency', builds two pivot tables
    of the metric's mean indexed by test type (one with a column per storage
    type, one with a column per filesystem), prints and displays both, and
    shows them side by side as annotated heatmaps.

    Args:
        df: DataFrame with 'test_type', 'storage_type' and 'filesystem'
            columns plus the three metric columns.
    """
    def mean_by(value_column, split_column):
        # Mean of value_column per test type, one column per split value.
        return pd.pivot_table(
            df,
            values=value_column,
            index='test_type',
            columns=split_column,
            aggfunc='mean',
        )

    for metric in ('iops', 'bandwidth', 'latency'):
        by_storage = mean_by(metric, 'storage_type')
        print(f"\n{metric.upper()} Comparison by Storage Type:")
        display(by_storage)

        by_filesystem = mean_by(metric, 'filesystem')
        print(f"\n{metric.upper()} Comparison by Filesystem:")
        display(by_filesystem)

        # Left panel: storage types; right panel: filesystems.
        fig, (left, right) = plt.subplots(1, 2, figsize=(15, 6))

        sns.heatmap(by_storage, annot=True, cmap='YlOrRd', ax=left)
        left.set_title(f'{metric.upper()} by Storage Type')

        sns.heatmap(by_filesystem, annot=True, cmap='YlOrRd', ax=right)
        right.set_title(f'{metric.upper()} by Filesystem')

        plt.tight_layout()
        plt.show()

# Display the per-test-type comparison tables and heatmaps for the loaded data.
perform_comparative_analysis(df)

## 9. Recommendations and Conclusions <a name="conclusions"></a>

In [None]:
def generate_recommendations(df):
    """Score every benchmark row and summarize the scores per configuration.

    Each metric is divided by its maximum over ``df``; the score is
    (iops share + bandwidth share - latency share) / 3, so higher throughput
    raises the score and higher latency lowers it.  The score is written to
    a new 'performance_score' column on ``df`` itself (the caller's frame is
    modified in place).  The per-(storage_type, filesystem) means of the
    score and the three metrics are displayed, and the scores are drawn as a
    bar chart.

    Args:
        df: DataFrame with 'storage_type' and 'filesystem' columns plus the
            'iops', 'bandwidth' and 'latency' columns.
    """
    # Normalize each metric against its largest observed value.
    iops_share = df['iops'] / df['iops'].max()
    bandwidth_share = df['bandwidth'] / df['bandwidth'].max()
    latency_share = df['latency'] / df['latency'].max()
    df['performance_score'] = (iops_share + bandwidth_share - latency_share) / 3

    # Average the score and the raw metrics per configuration.
    averaged = ['performance_score', 'iops', 'bandwidth', 'latency']
    recommendations = (
        df.groupby(['storage_type', 'filesystem'])
        .agg({column: 'mean' for column in averaged})
        .round(2)
    )

    print("\nPerformance Scores and Recommendations:")
    display(recommendations)

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=df, x='storage_type', y='performance_score', hue='filesystem')
    ax.set_title('Overall Performance Score by Storage Type and Filesystem')
    ax.set_xlabel('Storage Type')
    ax.set_ylabel('Performance Score')
    ax.legend(title='Filesystem')
    plt.tight_layout()
    plt.show()

# Score the loaded data; this also adds a 'performance_score' column to df.
generate_recommendations(df)