# Processing Time Analysis by Prompt Strategy

This notebook analyzes the processing time performance across different prompt strategies:
- **base_version**: Basic version with minimal context
- **with_geom**: Version with geospatial features
- **with_geom_time**: Full version with temporal + geospatial analysis

The analysis uses the `processing_time` column (in seconds) of the prediction CSV files found under `results/<anchor>/<model>/<strategy>/`; the `DEV` directory is skipped.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import os
from collections import defaultdict

# Keep cell output readable.
# NOTE(review): this silences *every* warning, including pandas/seaborn
# deprecation notices -- consider narrowing the filter.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Set paths
# The project root defaults to the original cluster location; export
# LLM_MOB_BASE_PATH to run the notebook from a different checkout.
base_path = Path(os.environ.get(
    'LLM_MOB_BASE_PATH',
    '/leonardo_work/IscrC_LLM-Mob/LLM-Mob-As-Mobility-Interpreter',
))
results_path = base_path / 'results'

print(f"Base path: {base_path}")
print(f"Results path: {results_path}")
print(f"Results path exists: {results_path.exists()}")

Base path: /leonardo_work/IscrC_LLM-Mob/LLM-Mob-As-Mobility-Interpreter
Results path: /leonardo_work/IscrC_LLM-Mob/LLM-Mob-As-Mobility-Interpreter/results
Results path exists: True


: 

In [None]:
def _sorted_subdirs(parent):
    """Return the sub-directories of ``parent`` sorted by name (stable across runs)."""
    return sorted((p for p in parent.iterdir() if p.is_dir()), key=lambda p: p.name)


def load_all_processing_times(results_dir=None):
    """
    Load processing times from all prediction CSV files in a results directory.

    The directory is scanned as ``<results_dir>/<anchor>/<model>/<strategy>/*.csv``.
    The top-level ``DEV`` directory is skipped. Files without a
    ``processing_time`` column contribute no rows. Values in that column that
    are missing or not parseable as numbers are dropped. A file that cannot be
    read is reported on stdout and skipped as a whole.

    Args:
        results_dir: Root directory to scan. When omitted, the notebook-level
            ``results_path`` is used.

    Returns:
        DataFrame with columns
        [strategy, model, anchor, dataset, file, processing_time].
        When nothing is found the frame is empty but still has these columns.
    """
    columns = ['strategy', 'model', 'anchor', 'dataset', 'file', 'processing_time']
    root = Path(results_path if results_dir is None else results_dir)

    # Define strategy mapping (directory name -> display label); unknown
    # directory names are kept as their own label.
    strategy_names = {
        'base_version': 'Base Version',
        'with_geom': 'With Geometry',
        'with_geom_time': 'With Geometry + Time'
    }

    frames = []

    # Scan all subdirectories excluding DEV
    for anchor_dir in _sorted_subdirs(root):
        if anchor_dir.name == 'DEV':
            continue
        anchor_type = anchor_dir.name  # middle, penultimate, etc.

        for model_dir in _sorted_subdirs(anchor_dir):
            model_name = model_dir.name

            for strategy_dir in _sorted_subdirs(model_dir):
                strategy_label = strategy_names.get(strategy_dir.name, strategy_dir.name)

                # Load all CSV files in this strategy directory
                for csv_file in sorted(strategy_dir.glob('*.csv')):
                    try:
                        df = pd.read_csv(csv_file, on_bad_lines='skip', engine='python')

                        if 'processing_time' in df.columns:
                            # Coerce instead of float(): a single malformed cell
                            # must not discard the remainder of the file.
                            times = pd.to_numeric(df['processing_time'], errors='coerce').dropna()

                            if not times.empty:
                                # Dataset name is the part of the stem before '_pred_';
                                # split() returns the whole stem when the marker is absent.
                                dataset_name = csv_file.stem.split('_pred_')[0]

                                # Scalars are broadcast against the times array.
                                frames.append(pd.DataFrame({
                                    'strategy': strategy_label,
                                    'model': model_name,
                                    'anchor': anchor_type,
                                    'dataset': dataset_name,
                                    'file': csv_file.name,
                                    'processing_time': times.to_numpy(dtype=float),
                                }))

                        print(f"Loaded {len(df)} records from {csv_file.name} ({strategy_label})")

                    except Exception as e:
                        # Best effort: report the unreadable file and keep going.
                        print(f"Error loading {csv_file}: {e}")
                        continue

    if not frames:
        # Keep the schema so callers can still select columns on an empty result.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

# Run the loader once and report what was found.
print("Loading processing times from all result files...")
df_times = load_all_processing_times()

record_total = len(df_times)
print(f"\nLoaded {record_total} processing time records")

# Distinct values of each grouping dimension, in order of first appearance.
found_strategies = df_times['strategy'].unique()
print(f"Strategies found: {found_strategies}")

found_models = df_times['model'].unique()
print(f"Models found: {found_models}")

found_anchors = df_times['anchor'].unique()
print(f"Anchor types found: {found_anchors}")

Loading processing times from all result files...
Loaded 6969 records from dati_2020_pred_20250907_124943.csv (Base Version)
Loaded 65891 records from dati_2014_pred_20250906_131918.csv (Base Version)
Loaded 70261 records from dati_2016_pred_20250906_203001.csv (Base Version)
Loaded 71222 records from veronacard_2019_original_pred_20250907_131333.csv (Base Version)
Loaded 71223 records from dati_2019_pred_20250907_085717.csv (Base Version)
Loaded 66674 records from dati_2015_pred_20250906_165241.csv (Base Version)
Loaded 72903 records from veronacard_2022_original_pred_20250907_184148.csv (Base Version)
Loaded 10706 records from veronacard_2023_original_parziale_pred_20250907_223658.csv (Base Version)
Loaded 78380 records from dati_2018_pred_20250907_044207.csv (Base Version)
Loaded 21983 records from veronacard_2021_original_pred_20250907_173000.csv (Base Version)
Loaded 81340 records from dati_2017_pred_20250907_001908.csv (Base Version)
Loaded 6970 records from veronacard_2020_origi

In [None]:
# Headline numbers for the whole data set, then a per-strategy breakdown.
print("=== DATA OVERVIEW ===")
print(f"Total records: {len(df_times):,}")

all_times = df_times['processing_time']
print(f"Processing time range: {all_times.min():.2f} - {all_times.max():.2f} seconds")
print(f"Mean processing time: {all_times.mean():.2f} seconds")
print(f"Median processing time: {all_times.median():.2f} seconds")

# Number of timing records available for each strategy (largest first).
print("\n=== RECORDS PER STRATEGY ===")
strategy_counts = df_times['strategy'].value_counts()
for strategy, count in strategy_counts.items():
    print(f"{strategy}: {count:,} records")

# Descriptive statistics per strategy, rounded for display.
print("\n=== BASIC STATISTICS BY STRATEGY ===")
summary_columns = ['count', 'mean', 'median', 'std', 'min', 'max']
strategy_stats = (
    df_times
    .groupby('strategy')['processing_time']
    .agg(summary_columns)
    .round(3)
)
print(strategy_stats)

In [None]:
# Four views of the per-strategy processing time distribution on one figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_box, ax_violin), (ax_hist, ax_mean) = axes

# Top-left: box plot
sns.boxplot(data=df_times, x='strategy', y='processing_time', ax=ax_box)
ax_box.set_title('Processing Time Distribution by Strategy (Box Plot)')
ax_box.set_xlabel('Strategy')
ax_box.set_ylabel('Processing Time (seconds)')
plt.setp(ax_box.get_xticklabels(), rotation=45)

# Top-right: violin plot
sns.violinplot(data=df_times, x='strategy', y='processing_time', ax=ax_violin)
ax_violin.set_title('Processing Time Distribution by Strategy (Violin Plot)')
ax_violin.set_xlabel('Strategy')
ax_violin.set_ylabel('Processing Time (seconds)')
plt.setp(ax_violin.get_xticklabels(), rotation=45)

# Bottom-left: overlaid histograms, one per strategy in order of first appearance
for strategy, strategy_data in df_times.groupby('strategy', sort=False)['processing_time']:
    ax_hist.hist(strategy_data, alpha=0.7, bins=50, label=strategy)
ax_hist.set_title('Processing Time Histogram by Strategy')
ax_hist.set_xlabel('Processing Time (seconds)')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()

# Bottom-right: mean processing time per strategy, fastest first
strategy_means = df_times.groupby('strategy')['processing_time'].mean().sort_values()
bar_positions = range(len(strategy_means))
bars = ax_mean.bar(bar_positions, strategy_means.values)
ax_mean.set_title('Mean Processing Time by Strategy')
ax_mean.set_xlabel('Strategy')
ax_mean.set_ylabel('Mean Processing Time (seconds)')
ax_mean.set_xticks(bar_positions)
ax_mean.set_xticklabels(strategy_means.index, rotation=45)

# Write each mean just above its bar.
for bar in bars:
    height = bar.get_height()
    ax_mean.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{height:.2f}s',
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()

In [None]:
# Detailed Statistics Table
print("=== DETAILED PROCESSING TIME STATISTICS BY STRATEGY ===")

# Per-strategy descriptive statistics; tuples are (column name, function).
detailed_stats = df_times.groupby('strategy')['processing_time'].agg([
    'count',
    'mean',
    'median',
    'std',
    'min',
    ('q25', lambda x: x.quantile(0.25)),
    ('q75', lambda x: x.quantile(0.75)),
    'max',
    ('range', lambda x: x.max() - x.min())
]).round(3)

print(detailed_stats)

# Statistical significance test (ANOVA)
from scipy import stats

# One array of timings per strategy, in order of first appearance.
strategies = df_times['strategy'].unique()
groups = [df_times[df_times['strategy'] == s]['processing_time'].values for s in strategies]

print("\n=== STATISTICAL SIGNIFICANCE TEST ===")
if len(groups) < 2:
    # f_oneway raises when given fewer than two samples, so there is nothing
    # to compare when only one strategy has been loaded.
    print("Fewer than two strategies available - skipping ANOVA and pairwise comparisons")
else:
    # Perform one-way ANOVA
    f_stat, p_value = stats.f_oneway(*groups)

    print(f"One-way ANOVA F-statistic: {f_stat:.4f}")
    print(f"P-value: {p_value:.2e}")
    print(f"Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

    # Pairwise comparisons (Tukey HSD); with exactly two groups the ANOVA
    # above already is the pairwise comparison.
    if len(strategies) > 2:
        tukey_result = stats.tukey_hsd(*groups)
        print("\n=== PAIRWISE COMPARISONS (Tukey HSD) ===")

        # Report each unordered pair once (upper triangle of the p-value matrix).
        for i, strategy1 in enumerate(strategies):
            for j, strategy2 in enumerate(strategies):
                if i < j:
                    print(f"{strategy1} vs {strategy2}: p-value = {tukey_result.pvalue[i, j]:.4f}")

In [None]:
# Processing Time by Model and Strategy
fig = plt.figure(figsize=(16, 10))

# Mean processing time with models as rows and strategies as columns.
model_strategy_stats = df_times.groupby(['model', 'strategy'])['processing_time'].mean().unstack()

# Subplot 1: Heatmap of mean processing times
ax_heat = fig.add_subplot(2, 2, 1)
sns.heatmap(model_strategy_stats, annot=True, fmt='.2f', cmap='YlOrRd', ax=ax_heat)
ax_heat.set_title('Mean Processing Time by Model and Strategy (seconds)')
ax_heat.set_xlabel('Strategy')
ax_heat.set_ylabel('Model')

# Subplot 2: Grouped bar chart
ax_bar = fig.add_subplot(2, 2, 2)
# Same means with strategies as rows, so each strategy becomes one bar group.
model_strategy_mean = model_strategy_stats.T
# ax= is required: without it DataFrame.plot opens a brand-new figure, which
# leaves this panel empty and sends every later call to the wrong figure.
model_strategy_mean.plot(kind='bar', width=0.8, ax=ax_bar)
ax_bar.set_title('Mean Processing Time by Strategy and Model')
ax_bar.set_xlabel('Strategy')
ax_bar.set_ylabel('Processing Time (seconds)')
ax_bar.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax_bar.get_xticklabels(), rotation=45)

# Subplot 3: Processing time distribution across all combinations (full-width bottom row)
ax_box = fig.add_subplot(2, 1, 2)
# Create combination column for better visualization.
# NOTE: this adds a 'model_strategy' column to df_times itself.
df_times['model_strategy'] = df_times['model'] + ' - ' + df_times['strategy']
sns.boxplot(data=df_times, x='model_strategy', y='processing_time', ax=ax_box)
ax_box.set_title('Processing Time Distribution by Model-Strategy Combinations')
ax_box.set_xlabel('Model - Strategy')
ax_box.set_ylabel('Processing Time (seconds)')
plt.setp(ax_box.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

In [None]:
# Performance Summary Report
print("=== PROCESSING TIME PERFORMANCE SUMMARY ===")
print("\nStrategy Performance Ranking (by mean processing time):")

# One row per strategy, fastest (lowest mean) first.
strategy_ranking = df_times.groupby('strategy')['processing_time'].agg([
    'mean', 'median', 'count'
]).sort_values('mean')

# The loop variable is deliberately not called `stats`: that name would
# overwrite the scipy.stats module imported by the statistics cell.
for i, (strategy, row) in enumerate(strategy_ranking.iterrows(), 1):
    print(f"{i}. {strategy}:")
    print(f"   Mean: {row['mean']:.2f}s")
    print(f"   Median: {row['median']:.2f}s")
    # iterrows upcasts the whole row to float; cast back so the count is
    # printed as "71,222" rather than "71,222.0".
    print(f"   Records: {int(row['count']):,}")
    print()

# Efficiency metrics
print("\n=== EFFICIENCY ANALYSIS ===")
base_mean = strategy_ranking.loc['Base Version', 'mean'] if 'Base Version' in strategy_ranking.index else None

# A zero or missing baseline cannot be used as the denominator below.
if base_mean is not None and base_mean > 0:
    print(f"Base Version mean processing time: {base_mean:.2f}s")
    print("\nOverhead compared to Base Version:")

    for strategy, row in strategy_ranking.iterrows():
        if strategy != 'Base Version':
            delta = row['mean'] - base_mean
            overhead = (delta / base_mean) * 100
            # Signed formatting: a strategy faster than the baseline prints
            # "-5.0%" instead of the malformed "+-5.0%".
            print(f"{strategy}: {overhead:+.1f}% ({delta:+.2f}s)")

# Processing rate (predictions per second)
print("\n=== PROCESSING RATE ===")
for strategy, row in strategy_ranking.iterrows():
    rate = 1 / row['mean']
    print(f"{strategy}: {rate:.3f} predictions/second")

In [None]:
# Export summary statistics to CSV for further analysis
output_path = base_path / 'notebook' / 'processing_time_summary.csv'
# Make sure the target folder exists so to_csv does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Prepare detailed summary: one record per strategy, in order of first
# appearance. Iterating the groups avoids re-filtering the full frame for
# every strategy.
summary_data = []

for strategy, strategy_data in df_times.groupby('strategy', sort=False)['processing_time']:
    mean_time = strategy_data.mean()

    summary_data.append({
        'strategy': strategy,
        'count': len(strategy_data),
        'mean': mean_time,
        'median': strategy_data.median(),
        'std': strategy_data.std(),
        'min': strategy_data.min(),
        'max': strategy_data.max(),
        'q25': strategy_data.quantile(0.25),
        'q75': strategy_data.quantile(0.75),
        # Reciprocal of the mean time per record.
        'predictions_per_second': 1 / mean_time
    })

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(output_path, index=False)

print(f"Summary statistics exported to: {output_path}")
print("\nSummary table:")
print(summary_df.round(3))

In [None]:
# Dashboard: six panels on a 3x3 grid summarising the whole analysis.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Panel 1 (top-left): mean processing time per strategy, fastest first
ax_mean = fig.add_subplot(gs[0, 0])
strategy_means = df_times.groupby('strategy')['processing_time'].mean().sort_values()
mean_positions = range(len(strategy_means))
bars = ax_mean.bar(mean_positions, strategy_means.values, color=['#ff9999', '#66b3ff', '#99ff99'])
ax_mean.set_title('Mean Processing Time by Strategy', fontsize=12, fontweight='bold')
ax_mean.set_xlabel('Strategy')
ax_mean.set_ylabel('Seconds')
ax_mean.set_xticks(mean_positions)
ax_mean.set_xticklabels(strategy_means.index, rotation=45, ha='right')

# Value label above each bar
for bar in bars:
    height = bar.get_height()
    ax_mean.text(
        bar.get_x() + bar.get_width()/2., height,
        f'{height:.2f}s', ha='center', va='bottom', fontweight='bold',
    )

# Panel 2 (top, two columns wide): distribution per strategy
ax_violin = fig.add_subplot(gs[0, 1:3])
sns.violinplot(data=df_times, x='strategy', y='processing_time', ax=ax_violin)
ax_violin.set_title('Processing Time Distribution by Strategy', fontsize=12, fontweight='bold')
ax_violin.set_xticklabels(ax_violin.get_xticklabels(), rotation=45, ha='right')

# Panel 3 (middle row): model x strategy heatmap of mean times
ax_heat = fig.add_subplot(gs[1, :])
model_strategy_pivot = (
    df_times
    .groupby(['model', 'strategy'])['processing_time']
    .mean()
    .unstack()
)
sns.heatmap(model_strategy_pivot, annot=True, fmt='.2f', cmap='RdYlBu_r', ax=ax_heat)
ax_heat.set_title('Mean Processing Time: Model vs Strategy Heatmap', fontsize=12, fontweight='bold')

# Panel 4 (bottom-left): processing rate, the reciprocal of the mean time
ax_rate = fig.add_subplot(gs[2, 0])
processing_rates = 1 / strategy_means
rate_positions = range(len(processing_rates))
bars = ax_rate.bar(rate_positions, processing_rates.values, color=['#ffcc99', '#c2c2f0', '#ccffcc'])
ax_rate.set_title('Processing Rate by Strategy', fontsize=12, fontweight='bold')
ax_rate.set_xlabel('Strategy')
ax_rate.set_ylabel('Predictions/Second')
ax_rate.set_xticks(rate_positions)
ax_rate.set_xticklabels(processing_rates.index, rotation=45, ha='right')

# Value label above each bar
for bar in bars:
    height = bar.get_height()
    ax_rate.text(
        bar.get_x() + bar.get_width()/2., height,
        f'{height:.3f}', ha='center', va='bottom', fontweight='bold',
    )

# Panel 5 (bottom-centre): share of records contributed by each strategy
ax_pie = fig.add_subplot(gs[2, 1])
strategy_counts = df_times['strategy'].value_counts()
colors = plt.cm.Set3(range(len(strategy_counts)))
wedges, texts, autotexts = ax_pie.pie(
    strategy_counts.values,
    labels=strategy_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax_pie.set_title('Data Distribution by Strategy', fontsize=12, fontweight='bold')

# Panel 6 (bottom-right): summary table drawn on an otherwise blank axes
ax_table = fig.add_subplot(gs[2, 2])
ax_table.axis('tight')
ax_table.axis('off')

# One table row per strategy, in the same order as the bar charts.
per_strategy_times = [
    (strategy, df_times[df_times['strategy'] == strategy]['processing_time'])
    for strategy in strategy_means.index
]
table_data = [
    [
        strategy,
        f"{strategy_data.mean():.2f}s",
        f"{strategy_data.median():.2f}s",
        f"{len(strategy_data):,}",
    ]
    for strategy, strategy_data in per_strategy_times
]

table = ax_table.table(
    cellText=table_data,
    colLabels=['Strategy', 'Mean', 'Median', 'Count'],
    cellLoc='center',
    loc='center',
)
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 1.5)
ax_table.set_title('Summary Statistics', fontsize=12, fontweight='bold')

fig.suptitle('LLM-Mob Processing Time Analysis Dashboard', fontsize=16, fontweight='bold')
plt.show()

# Closing recap of what was covered.
print("\n=== ANALYSIS COMPLETE ===")
print(f"Total processing time records analyzed: {len(df_times):,}")
print(f"Strategies analyzed: {', '.join(df_times['strategy'].unique())}")
print(f"Models covered: {', '.join(df_times['model'].unique())}")