In [16]:
import json
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np # Usado para lidar com dados ausentes (NaN)
import collections # Para usar defaultdict

In [17]:
with open('results.json', 'r', encoding='utf-8') as arquivo:
    all_data  = json.load(arquivo)

print(all_data)

{'tree-0': {'data-0': {'radius-0': {'avg_time': 3.88789, 'disk_access': 218.26, 'avg_dist_calc': 358.38, 'avg_obj_result': 1.006, 'radius': 0, 'num_consults': 500}, 'radius-11151': {'avg_time': 9.12274, 'disk_access': 238.024, 'avg_dist_calc': 3143.81, 'avg_obj_result': 1.242, 'radius': 11151, 'num_consults': 500}, 'radius-22302': {'avg_time': 14.1467, 'disk_access': 257.28, 'avg_dist_calc': 5771.01, 'avg_obj_result': 13.696, 'radius': 22302, 'num_consults': 500}, 'radius-33453': {'avg_time': 17.5907, 'disk_access': 274.984, 'avg_dist_calc': 8140.77, 'avg_obj_result': 62.012, 'radius': 33453, 'num_consults': 500}, 'radius-44604': {'avg_time': 22.9583, 'disk_access': 290.954, 'avg_dist_calc': 10248.8, 'avg_obj_result': 165.388, 'radius': 44604, 'num_consults': 500}, 'radius-55755': {'avg_time': 25.6551, 'disk_access': 305.546, 'avg_dist_calc': 12101.3, 'avg_obj_result': 367.748, 'radius': 55755, 'num_consults': 500}, 'radius-66906': {'avg_time': 29.7855, 'disk_access': 318.328, 'avg_dis

In [18]:
# Block to ensure the DataFrame df exists and has the necessary columns
try:
    if 'df' not in locals() or not isinstance(df, pd.DataFrame): # Check if df exists
        print("DataFrame 'df' not found or invalid. Creating from 'all_data'...")
        flat_data_list = []
        for tree_key, tree_content in all_data.items():
            if not isinstance(tree_content, dict): continue
            for data_key, data_content in tree_content.items():
                if not isinstance(data_content, dict): continue
                for radius_key, metrics in data_content.items():
                    if not isinstance(metrics, dict): continue
                    record = metrics.copy()
                    record['tree'] = tree_key
                    record['data'] = data_key
                    if 'radius' not in record:
                        try:
                            record['radius'] = int(radius_key.split('-')[-1])
                        except (ValueError, IndexError):
                            record['radius'] = -1
                    flat_data_list.append(record)

        if not flat_data_list:
             raise ValueError("No valid data found in 'all_data' to create DataFrame.")

        df = pd.DataFrame(flat_data_list)
        numeric_cols = ['avg_time', 'disk_access', 'avg_dist_calc', 'avg_obj_result', 'radius']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                 print(f"Warning: Column '{col}' not found in source data.")

        # Ensure 'radius' and 'avg_obj_result' are not null for this plot type
        required_cols = ['radius', 'avg_obj_result', 'tree', 'data']
        if not all(col in df.columns for col in required_cols):
             raise ValueError(f"DataFrame missing one or more required columns: {required_cols}")

        df.dropna(subset=['radius', 'avg_obj_result'], inplace=True) # Check for avg_obj_result
        df = df[df['radius'] >= 0].copy()
        print("DataFrame 'df' created/regenerated successfully.")
    else:
        print("DataFrame 'df' already exists. Skipping creation.")

except Exception as e:
    print(f"Error processing data to create DataFrame: {e}")
    df = pd.DataFrame() # Create empty df to prevent later errors
# --- End of Loading and Preparation Block ---


# --- Mappings (English) ---
tree_title_map = {
    "tree-0": "Index - Haar Transform: 0 Levels",
    "tree-1": "Index - Haar Transform: 1 Level",
    "tree-2": "Index - Haar Transform: 2 Levels",
    # Add more if needed, but script filters for 0, 1, 2
}

data_label_map = {
    "data-0": "Haar Lvl 0 (256 bins)",
    "data-1": "Haar Lvl 1 (128 bins)",
    "data-2": "Haar Lvl 2 (64 bins)",
    "data-3": "Haar Lvl 3 (32 bins)",
    "data-4": "Haar Lvl 4 (16 bins)",
    "data-5": "Haar Lvl 5 (8 bins)",
    "data-6": "Haar Lvl 6 (4 bins)",
    "data-7": "Haar Lvl 7 (2 bins)",
    # Add more mappings if your data has more data-X keys
}

# --- Plot Generation: Average Objects Returned vs. Radius with y=ax^n Fit ---
target_tree = 'tree-0'
target_data_keys = ['data-0', 'data-1', 'data-2'] # Data keys for subplots

# Define marker symbols for different data traces
marker_symbols = ['circle', 'square', 'triangle-up', 'cross', 'diamond', 'star']

# Get the mapped title for the target tree
tree_title = tree_title_map.get(target_tree, target_tree)

print(f"\nGenerating side-by-side plots for {tree_title} with y=ax^n fit...")

# 1. Check if DataFrame is valid and required column exists
if df.empty:
    print("Error: DataFrame 'df' is empty. Cannot generate plots.")
elif 'avg_obj_result' not in df.columns:
     print("Error: Column 'avg_obj_result' not found in the DataFrame.")
else:
    # Filter the DataFrame for the target tree
    df_tree_filtered = df[df['tree'] == target_tree].copy()

    # 2. Pre-processing: Identify data keys with sufficient data for fitting
    plots_to_make = []
    valid_data_keys = [] # Original keys
    subplot_titles = []  # Mapped labels
    fit_params = {}      # Dictionary to store parameters (a, n) for each fit

    for data_key in target_data_keys:
        df_plot = df_tree_filtered[df_tree_filtered['data'] == data_key].copy()
        # Filter points for log-log fit: radius > 0 and avg_obj_result > 0
        # Use a small epsilon to avoid log(0) issues
        epsilon = 1e-9
        df_fit_data = df_plot[(df_plot['radius'] >= epsilon) & (df_plot['avg_obj_result'] > epsilon)].copy()

        if len(df_fit_data) >= 2: # Need at least 2 points for regression
            df_plot.sort_values(by='radius', inplace=True) # Sort original data for plotting
            plots_to_make.append(df_plot)
            valid_data_keys.append(data_key)
            subplot_titles.append(data_label_map.get(data_key, data_key)) # Add mapped title

            # Calculate 'n' and 'a' for this data_key using log-log regression
            try:
                log_x = np.log(df_fit_data['radius'])
                log_y = np.log(df_fit_data['avg_obj_result'])
                # Fit a polynomial of degree 1 (linear fit in log-log space)
                coefficients = np.polyfit(log_x, log_y, 1)
                n = coefficients[0] # Slope in log-log is the exponent n
                a = np.exp(coefficients[1]) # Intercept in log-log is log(a)
                fit_params[data_key] = {'n': n, 'a': a} # Store parameters
                print(f"Fit for '{data_label_map.get(data_key, data_key)}': n = {n:.4f}, a = {a:.4e}")
            except Exception as e:
                print(f"Error calculating fit for '{data_label_map.get(data_key, data_key)}': {e}")
                fit_params[data_key] = None # Mark fit as failed
        elif not df_plot.empty:
             # Data exists but not enough points for fitting (e.g., only radius=0)
             print(f"Warning: Insufficient data for curve fitting in '{data_label_map.get(data_key, data_key)}'. Plotting data only.")
             df_plot.sort_values(by='radius', inplace=True)
             plots_to_make.append(df_plot)
             valid_data_keys.append(data_key)
             subplot_titles.append(data_label_map.get(data_key, data_key))
             fit_params[data_key] = None # No fit performed

    # 3. Check if there are plots to be made
    num_plots = len(plots_to_make)
    if num_plots == 0:
        print(f"Warning: No valid data found for plotting in {tree_title}.")
    else:
        # 4. Create the figure with subplots using mapped titles
        fig = make_subplots(rows=1, cols=num_plots,
                            subplot_titles=subplot_titles)

        # 5. Add traces (data and fitted curve) for each subplot
        for i, df_plot in enumerate(plots_to_make):
            col_num = i + 1
            original_data_key = valid_data_keys[i]
            mapped_data_label = subplot_titles[i]
            marker_symbol = marker_symbols[i % len(marker_symbols)] # Select marker

            # 5.1 Add trace for the original data with unique marker
            fig.add_trace(
                go.Scatter(
                    x=df_plot['radius'],
                    y=df_plot['avg_obj_result'],
                    mode='lines+markers',
                    name=mapped_data_label, # Use mapped label for legend
                    marker=dict(size=6, symbol=marker_symbol), # Apply unique marker
                    legendgroup=f'data_{i}', # Group legend items if needed
                    hovertemplate=( # Custom hover text for data
                        f'<b>{mapped_data_label}</b><br>' +
                        'Radius: %{x}<br>' +
                        'Avg Objects: %{y:.3f}<extra></extra>'
                    )
                ),
                row=1, col=col_num
            )

            # 5.2 Add trace for the fitted curve (if fit was successful)
            params = fit_params.get(original_data_key)
            if params:
                n = params['n']
                a = params['a']

                # Calculate y = a * x^n using the radius values from the plot data
                x_fit = df_plot['radius'].copy()
                # Use the same epsilon for calculation to avoid issues at x=0
                mask = (x_fit >= epsilon) | (n >= 0) # Calculate where x>0 or if n>=0
                y_fit = np.full(x_fit.shape, np.nan) # Initialize with NaN

                # Calculate fit safely, avoiding warnings for 0**negative_n
                with np.errstate(divide='ignore', invalid='ignore'):
                    y_fit[mask] = a * (x_fit[mask] ** n)
                    # Handle x=0 case explicitly if n > 0 (y should be 0)
                    if 0 in x_fit.values and n > 0:
                         zero_mask = (x_fit == 0)
                         y_fit[zero_mask] = 0


                # Add the fitted curve trace
                fig.add_trace(
                    go.Scatter(
                        x=x_fit,
                        y=y_fit,
                        mode='lines',
                        name=f'Fit (a={a:.2e}, n={n:.3f})', # Include params in name
                        line=dict(dash='dash', color='red'), # Style for fit line
                        legendgroup=f'fit_{i}', # Different group from data
                        hovertemplate=( # Custom hover text for fit
                            f'<b>Fit: {mapped_data_label}</b><br>' +
                            'Radius (x): %{x}<br>' +
                            'Fit (axⁿ): %{y:.3f}<br>'+
                            f'a = {a:.2e}, n = {n:.3f}<extra></extra>'
                        )
                    ),
                    row=1, col=col_num
                )

        # 6. Update overall figure layout and axes
        fig.update_layout(
            title_text=f'{tree_title} - Average Objects Returned vs. Query Radius (y=axⁿ Fit)', # Mapped title
            showlegend=True,
            legend_title_text="Data / Fit" # English legend title
        )
        fig.update_xaxes(
            title_text='Query Radius', # English
            rangemode='tozero',
            row=1
        )
        fig.update_yaxes(
            title_text='Average Objects Returned', # English
            rangemode='tozero',
            row=1
        )

        # 7. Show the combined figure
        fig.show()

print("\nEnd of side-by-side plot generation with y=ax^n fit.")

DataFrame 'df' not found or invalid. Creating from 'all_data'...
DataFrame 'df' created/regenerated successfully.

Generating side-by-side plots for Index - Haar Transform: 0 Levels with y=ax^n fit...
Fit for 'Haar Lvl 0 (256 bins)': n = 3.5716, a = 4.2091e-15
Fit for 'Haar Lvl 1 (128 bins)': n = 3.5580, a = 5.8750e-14
Fit for 'Haar Lvl 2 (64 bins)': n = 3.5515, a = 7.5466e-13



End of side-by-side plot generation with y=ax^n fit.


In [33]:
# Block to ensure the DataFrame df exists and has the necessary columns
try:
    if 'df' not in locals() or not isinstance(df, pd.DataFrame): # Check if df exists
        print("DataFrame 'df' not found or invalid. Creating from 'all_data'...")
        flat_data_list = []
        for tree_key, tree_content in all_data.items():
            if not isinstance(tree_content, dict): continue
            for data_key, data_content in tree_content.items():
                if not isinstance(data_content, dict): continue
                for radius_key, metrics in data_content.items():
                    if not isinstance(metrics, dict): continue
                    record = metrics.copy()
                    record['tree'] = tree_key
                    record['data'] = data_key
                    if 'radius' not in record:
                        try:
                            record['radius'] = int(radius_key.split('-')[-1])
                        except (ValueError, IndexError):
                            record['radius'] = -1
                    flat_data_list.append(record)

        if not flat_data_list:
             raise ValueError("No valid data found in 'all_data' to create DataFrame.")

        df = pd.DataFrame(flat_data_list)
        numeric_cols = ['avg_time', 'disk_access', 'avg_dist_calc', 'avg_obj_result', 'radius']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                 print(f"Warning: Column '{col}' not found in source data.")

        # Ensure 'radius' and 'disk_access' are not null for this plot type
        required_cols = ['radius', 'disk_access', 'tree', 'data']
        if not all(col in df.columns for col in required_cols):
             raise ValueError(f"DataFrame missing one or more required columns: {required_cols}")

        df.dropna(subset=['radius', 'disk_access'], inplace=True) # Check for disk_access
        df = df[df['radius'] >= 0].copy()
        print("DataFrame 'df' created/regenerated successfully.")
    else:
        print("DataFrame 'df' already exists. Skipping creation.")

except Exception as e:
    print(f"Error processing data to create DataFrame: {e}")
    df = pd.DataFrame() # Create empty df to prevent later errors

# --- End of Loading and Preparation Block ---


# --- Mappings (English) ---
tree_title_map = {
    "tree-0": "Index - Haar Transform: 0 Levels",
    "tree-1": "Index - Haar Transform: 1 Level",
    "tree-2": "Index - Haar Transform: 2 Levels",
    # Add more if needed, but script filters for 0, 1, 2
}

data_label_map = {
    "data-0": "Haar Lvl 0 (256 bins)",
    "data-1": "Haar Lvl 1 (128 bins)",
    "data-2": "Haar Lvl 2 (64 bins)",
    "data-3": "Haar Lvl 3 (32 bins)",
    "data-4": "Haar Lvl 4 (16 bins)",
    "data-5": "Haar Lvl 5 (8 bins)",
    "data-6": "Haar Lvl 6 (4 bins)",
    "data-7": "Haar Lvl 7 (2 bins)",
    # Add more mappings if your data has more data-X keys
}

# --- Plot Generation: Disk Access vs. Radius ---
target_tree = 'tree-0'
target_data_keys = ['data-0', 'data-1', 'data-2'] # Data keys for subplots
horizontal_line_y = 332 # Reference value for disk access
horizontal_line_label = "Sequential Scan Ref." # English label

# Define marker symbols for different traces
marker_symbols = ['circle', 'square', 'triangle-up', 'cross', 'diamond', 'star'] # Add more if needed

# Get the mapped title for the target tree
tree_title = tree_title_map.get(target_tree, target_tree)

print(f"\nGenerating Disk Access vs. Radius plots for {tree_title}...")

# 1. Check if DataFrame is valid and column 'disk_access' exists
if df.empty:
     print(f"Error: DataFrame 'df' is empty. Cannot generate plots.")
elif 'disk_access' not in df.columns:
    print(f"Error: Column 'disk_access' not found in the DataFrame.")
else:
    # Filter for the target tree
    df_tree_filtered = df[df['tree'] == target_tree].copy()

    # 2. Pre-processing: Identify valid data keys and prepare DataFrames for plotting
    plots_to_make = []
    valid_data_keys = [] # Original keys
    subplot_titles = [] # Mapped labels

    for data_key in target_data_keys:
        # Select data, drop NaNs in 'disk_access', and ensure it's not empty
        df_plot = df_tree_filtered[df_tree_filtered['data'] == data_key].dropna(subset=['disk_access']).copy()
        if not df_plot.empty:
            df_plot.sort_values(by='radius', inplace=True)
            plots_to_make.append(df_plot)
            valid_data_keys.append(data_key)
            subplot_titles.append(data_label_map.get(data_key, data_key))

    # 3. Check if there are plots to create
    num_plots = len(plots_to_make)
    if num_plots == 0:
        print(f"Warning: No valid data found for plotting 'disk_access' for keys {target_data_keys} in {tree_title}.")
    else:
        # 4. Create the figure with subplots using mapped titles
        fig = make_subplots(rows=1, cols=num_plots,
                            subplot_titles=subplot_titles)

        # 5. Add traces and shapes/annotations to each subplot
        for i, df_plot in enumerate(plots_to_make):
            col_num = i + 1
            original_data_key = valid_data_keys[i]
            mapped_data_label = subplot_titles[i]
            # Select a marker symbol based on index 'i'
            marker_symbol = marker_symbols[i % len(marker_symbols)]

            # 5.1 Add Main Data Trace (Disk Access) with unique marker
            fig.add_trace(
                go.Scatter(
                    x=df_plot['radius'],
                    y=df_plot['disk_access'], # Plotting disk access
                    mode='lines+markers',
                    name=mapped_data_label, # Mapped label for legend
                    marker=dict(size=7, symbol=marker_symbol), # Assign unique symbol <<< CHANGE
                    legendgroup=f'group{i}',
                     hovertemplate = ( # Custom hover text
                        f'<b>{mapped_data_label}</b><br>' +
                        'Radius: %{x}<br>' +
                        'Disk Access: %{y:.2f}<extra></extra>' # Use .2f for float display
                    )
                ),
                row=1, col=col_num
            )

            # 5.2 Add Horizontal Reference Line using add_shape
            xmax_plot = df_plot['radius'].max() if not df_plot.empty else 0
            fig.add_shape(
                type="line",
                x0=0, y0=horizontal_line_y,
                x1=xmax_plot,
                y1=horizontal_line_y,
                line=dict(color="grey", width=2, dash="dot"),
                row=1, col=col_num
            )

            # 5.3 Add Annotation for the Reference Line
            fig.add_annotation(
                text=horizontal_line_label, # English label
                align='right',
                x=xmax_plot,
                y=horizontal_line_y,
                xref=f"x{col_num}",
                yref=f"y{col_num}",
                showarrow=False,
                yshift=-10,
                xanchor="right"
            )

        # 6. Update the overall figure layout
        fig.update_layout(
            title_text=f'{tree_title} - Disk Access vs. Query Radius', # Mapped Title
            showlegend=True,
            legend_title_text="Data Representation", # English,
                                  width=1200,  # Largura
                                  height=600
        )
        # Update axes titles and range
        fig.update_xaxes(
            title_text='Query Radius', # English
            rangemode='tozero',
            row=1
        )
        fig.update_yaxes(
            title_text='Disk Access', # English
            range=[200, None],
            rangemode='tozero',
            row=1
        )

        # 7. Show the figure
        fig.show()

print("\nEnd of Disk Access plot generation.")

DataFrame 'df' already exists. Skipping creation.

Generating Disk Access vs. Radius plots for Index - Haar Transform: 0 Levels...



End of Disk Access plot generation.


In [32]:
# Block to ensure the DataFrame df exists and has the necessary columns
try:
    if 'df' not in locals() or not isinstance(df, pd.DataFrame): # Check if df exists
        print("DataFrame 'df' not found or invalid. Creating from 'all_data'...")
        flat_data_list = []
        for tree_key, tree_content in all_data.items():
            if not isinstance(tree_content, dict): continue # Skip invalid tree entries
            for data_key, data_content in tree_content.items():
                if not isinstance(data_content, dict): continue # Skip invalid data entries
                for radius_key, metrics in data_content.items():
                     # Ensure metrics is a dict before processing
                    if not isinstance(metrics, dict): continue # Skip invalid metric entries
                    record = metrics.copy()
                    record['tree'] = tree_key
                    record['data'] = data_key
                    # Ensure 'radius' exists or try to parse from key
                    if 'radius' not in record:
                        try:
                            record['radius'] = int(radius_key.split('-')[-1])
                        except (ValueError, IndexError):
                            record['radius'] = -1 # Assign default if parsing fails
                    flat_data_list.append(record)

        if not flat_data_list:
             raise ValueError("No valid data found in 'all_data' to create DataFrame.")

        df = pd.DataFrame(flat_data_list)
        numeric_cols = ['avg_time', 'disk_access', 'avg_dist_calc', 'avg_obj_result', 'radius']
        # Convert numeric columns, coercing errors to NaN
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Column '{col}' not found in source data.")

        # Ensure 'radius' and 'avg_dist_calc' are not null for this specific plot type
        required_cols = ['radius', 'avg_dist_calc', 'tree', 'data']
        if not all(col in df.columns for col in required_cols):
             raise ValueError(f"DataFrame missing one or more required columns: {required_cols}")

        df.dropna(subset=['radius', 'avg_dist_calc'], inplace=True)
        # Filter out negative radius values that might have resulted from parsing errors
        df = df[df['radius'] >= 0].copy()
        print("DataFrame 'df' created/regenerated successfully.")
    else:
        print("DataFrame 'df' already exists. Skipping creation.")

except Exception as e:
    print(f"Error processing data to create DataFrame: {e}")
    df = pd.DataFrame() # Create an empty DataFrame to avoid later errors

# --- End of Loading and Preparation Block ---


# --- Mappings (English) ---
tree_title_map = {
    "tree-0": "Index - Haar Transform: 0 Levels",
    "tree-1": "Index - Haar Transform: 1 Level",
    "tree-2": "Index - Haar Transform: 2 Levels",
    # Add more if needed, but script filters for 0, 1, 2
}

data_label_map = {
    "data-0": "Haar Lvl 0 (256 bins)",
    "data-1": "Haar Lvl 1 (128 bins)",
    "data-2": "Haar Lvl 2 (64 bins)",
    "data-3": "Haar Lvl 3 (32 bins)",
    "data-4": "Haar Lvl 4 (16 bins)",
    "data-5": "Haar Lvl 5 (8 bins)",
    "data-6": "Haar Lvl 6 (4 bins)",
    "data-7": "Haar Lvl 7 (2 bins)",
    # Add more mappings if your data has more data-X keys
}

# --- Plot Generation: Average Distance Calculations vs. Radius (Subplots) ---
target_tree = 'tree-0' # Choose the tree to plot
target_data_keys = ['data-0', 'data-1', 'data-2'] # Choose data representations for subplots
horizontal_line_y = 20580
horizontal_line_label = "Distance Reference" # English label for annotation

# Define marker symbols for different traces within subplots
marker_symbols = ['circle', 'square', 'triangle-up', 'cross', 'diamond', 'star'] # Add more if needed

# Get the mapped title for the target tree
tree_title = tree_title_map.get(target_tree, target_tree) # Use mapped name or key if not found

print(f"\nGenerating Average Distance Calculations vs. Radius plots for {tree_title}...")

# 1. Check if DataFrame is valid and column 'avg_dist_calc' exists
if df.empty:
     print(f"Error: DataFrame 'df' is empty. Cannot generate plots.")
elif 'avg_dist_calc' not in df.columns:
    print(f"Error: Column 'avg_dist_calc' not found in the DataFrame.")
else:
    # Filter for the target tree first
    df_tree_filtered = df[df['tree'] == target_tree].copy()

    # 2. Pre-processing: Identify which target_data_keys have valid data for this tree
    plots_to_make = []
    valid_data_keys = [] # Store original keys ('data-0', etc.)
    subplot_titles = [] # Store mapped labels for titles

    for data_key in target_data_keys:
        # Select data for the specific data_key and drop rows where 'avg_dist_calc' is NaN
        df_plot = df_tree_filtered[df_tree_filtered['data'] == data_key].dropna(subset=['avg_dist_calc']).copy()

        # Only proceed if there's data left after filtering
        if not df_plot.empty:
            df_plot.sort_values(by='radius', inplace=True) # Sort by radius for line plot
            plots_to_make.append(df_plot)
            valid_data_keys.append(data_key)
            # Get the mapped label for the subplot title, default to original key if not found
            subplot_titles.append(data_label_map.get(data_key, data_key))

    # 3. Check if there are any valid plots to create
    num_plots = len(plots_to_make)
    if num_plots == 0:
        print(f"Warning: No valid data found for plotting 'avg_dist_calc' for keys {target_data_keys} in {tree_title}.")
    else:
        # 4. Create the figure with subplots, using mapped titles
        fig = make_subplots(rows=1, cols=num_plots,
                            subplot_titles=subplot_titles) # Use mapped labels here

        # 5. Add traces and shapes/annotations to each subplot
        for i, df_plot in enumerate(plots_to_make):
            col_num = i + 1
            original_data_key = valid_data_keys[i] # Original key ('data-0')
            mapped_data_label = subplot_titles[i] # Mapped label ('Haar Lvl 0 (256 bins)')
            # Select marker symbol based on the subplot index 'i'
            marker_symbol = marker_symbols[i % len(marker_symbols)]

            # 5.1 Add Main Data Trace (Average Distance Calculations) with unique marker
            fig.add_trace(
                go.Scatter(
                    x=df_plot['radius'],
                    y=df_plot['avg_dist_calc'],
                    mode='lines+markers',
                    name=mapped_data_label, # Use mapped label for legend entry
                    marker=dict(size=6, symbol=marker_symbol), # <<< APPLY UNIQUE MARKER
                    legendgroup=f'group{i}', # Assign a legend group per subplot trace
                    hovertemplate = ( # Custom hover text
                        f'<b>{mapped_data_label}</b><br>' +
                        'Radius: %{x}<br>' +
                        'Avg Dist Calc: %{y:.2f}<extra></extra>' # Format y-value
                    )
                ),
                row=1, col=col_num
            )

            # 5.2 Add Horizontal Reference Line using add_shape
            xmax_plot = df_plot['radius'].max() if not df_plot.empty else 0
            fig.add_shape(
                type="line",
                x0=0, y0=horizontal_line_y,
                x1=xmax_plot,
                y1=horizontal_line_y,
                line=dict(color="grey", width=2, dash="dot"),
                row=1, col=col_num # Associate shape with the subplot
            )

            # 5.3 Add Annotation for the Reference Line
            fig.add_annotation(
                text=horizontal_line_label, # Use defined English label
                align='right',
                x=xmax_plot, # Position annotation near the end of the line
                y=horizontal_line_y,
                xref=f"x{col_num}", # Reference subplot's x-axis
                yref=f"y{col_num}", # Reference subplot's y-axis
                showarrow=False,
                yshift=-10, # Shift below the line
                xanchor="right" # Anchor text to the right of the x coordinate
            )


        # 6. Update the overall figure layout
        fig.update_layout(
            title_text=f'{tree_title} - Average Distance Calculations vs. Query Radius', # Mapped Tree Title
            showlegend=True, # Show legend for the main traces
            legend_title_text="Data Representation",
            width=1200,  # Largura
            height=600
        )
        # Update axes titles and range for all subplots
        fig.update_xaxes(
            title_text='Query Radius', # English
            rangemode='tozero',
            row=1 # Apply to all x-axes in row 1
        )
        fig.update_yaxes(
            title_text='Average Distance Calculations', # English
            rangemode='tozero',
            row=1 # Apply to all y-axes in row 1
        )

        # 7. Show the figure
        fig.show()

print("\nEnd of Average Distance Calculations plot generation.")

DataFrame 'df' already exists. Skipping creation.

Generating Average Distance Calculations vs. Radius plots for Index - Haar Transform: 0 Levels...



End of Average Distance Calculations plot generation.


In [34]:
# --- Mappings ---
tree_title_map = {
    "tree-0": "Haar Transform Index - 0 Levels",
    "tree-1": "Haar Transform Index - 1 Level",
    "tree-2": "Haar Transform Index - 2 Levels",
}

data_label_map = {
    "data-0": "Haar Lvl 0 (256 bins)",
    "data-1": "Haar Lvl 1 (128 bins)",
    "data-2": "Haar Lvl 2 (64 bins)",
    "data-3": "Haar Lvl 3 (32 bins)",
    "data-4": "Haar Lvl 4 (16 bins)",
    "data-5": "Haar Lvl 5 (8 bins)",
    "data-6": "Haar Lvl 6 (4 bins)",
    "data-7": "Haar Lvl 7 (2 bins)",
}

# --- Constants ---
SEQUENTIAL_ACCESS_VALUE = 332
TARGET_TREES = ["tree-0", "tree-1", "tree-2"]
# Define marker symbols for different rank lines
marker_symbols = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up',
                  'pentagon', 'hexagon', 'star', 'triangle-down'] # Add more if needed

# --- Plotting Logic ---
for tree_name, tree_data in all_data.items():
    # Filter for target trees
    if tree_name not in TARGET_TREES:
        continue

    if not isinstance(tree_data, dict):
        print(f"Skipping {tree_name}: Data is not in the expected dictionary format.")
        continue

    fig = go.Figure()
    original_data_labels = sorted([k for k in tree_data.keys() if k.startswith('data-')])
    if not original_data_labels:
        print(f"Skipping {tree_name}: No 'data-X' keys found.")
        continue
    x_axis_labels = [data_label_map.get(lbl, lbl) for lbl in original_data_labels]

    processed_data = {}
    max_rank = 0

    # --- Step 1: Extract and Rank Data ---
    for data_label in original_data_labels:
        data_set = tree_data.get(data_label, {})
        if not isinstance(data_set, dict):
            print(f"Warning: Skipping {data_label} in {tree_name}: Data is not a dictionary.")
            processed_data[data_label] = []
            continue

        sorted_radii = sorted(
            [item for item in data_set.items() if isinstance(item[1], dict) and 'radius' in item[1]],
            key=lambda item: item[1].get('radius', -1),
            reverse=True
        )
        processed_data[data_label] = sorted_radii
        max_rank = max(max_rank, len(sorted_radii))

    if max_rank == 0:
        print(f"Skipping {tree_name}: No valid radius data found across all data keys.")
        continue

    # --- Step 2: Structure Data by Rank Across Datasets ---
    for rank in range(max_rank):
        rank_disk_access = []
        rank_avg_obj = []
        for data_label in original_data_labels:
            sorted_radii = processed_data.get(data_label, [])
            if rank < len(sorted_radii):
                radius_key, radius_data = sorted_radii[rank]
                if isinstance(radius_data, dict):
                    rank_disk_access.append(radius_data.get('disk_access'))
                    rank_avg_obj.append(radius_data.get('avg_obj_result'))
                else:
                    rank_disk_access.append(None)
                    rank_avg_obj.append(None)
            else:
                rank_disk_access.append(None)
                rank_avg_obj.append(None)

        # Only add trace if there is at least one non-None value for this rank
        if not all(v is None for v in rank_disk_access):
            valid_avg_objs = [obj for obj in rank_avg_obj if obj is not None]
            avg_legend = np.nanmean(valid_avg_objs) if valid_avg_objs else float('nan')
            legend_name = f"Rank {rank + 1} (Avg Obj: {avg_legend:.3f})"
            # Select marker symbol for this rank
            marker_symbol = marker_symbols[rank % len(marker_symbols)]

            fig.add_trace(go.Scatter(
                x=x_axis_labels,
                y=rank_disk_access,
                mode='lines+markers',
                name=legend_name,
                marker=dict(symbol=marker_symbol, size=7), # <<< ADDED/MODIFIED MARKER
                line=dict(width=2), # Control line width if desired
                hovertemplate = (
                    f"<b>{tree_title_map.get(tree_name, tree_name)}</b><br>" +
                    'Representation: %{x}<br>' +
                    'Disk Access: %{y:.2f}<br>' + # Format y value
                    f'Rank: {rank + 1}<br>' +
                    f'Avg Obj Result (Line Avg): {avg_legend:.3f}' +
                    '<extra></extra>'
                )
            ))

    # --- Step 3: Add Reference Line and Finalize Layout ---
    fig.add_hline(
        y=SEQUENTIAL_ACCESS_VALUE,
        line_dash="dot",
        annotation_text="Sequential Scan Ref.",
        annotation_position="bottom right"
    )

    fig.update_layout(
        title=f"{tree_title_map.get(tree_name, tree_name)} - Disk Access by Radius Rank", # Updated title
        xaxis_title="Data Representation",
        yaxis_title="Disk Access",
        legend_title="Rank (Highest Radius = Rank 1)",
        hovermode="x unified",
        xaxis={'categoryorder':'array', 'categoryarray':x_axis_labels},
        width=1200,  # Largura
        height=600
    )

    # --- Step 4: Show Plot ---
    fig.show()

print("\nDisk Access plots by rank generated for selected trees.")



Disk Access plots by rank generated for selected trees.


In [35]:
# --- Mappings ---
tree_title_map = {
    "tree-0": "Haar Transform Index - 0 Levels",
    "tree-1": "Haar Transform Index - 1 Level",
    "tree-2": "Haar Transform Index - 2 Levels",
}

data_label_map = {
    "data-0": "Haar Lvl 0 (256 bins)",
    "data-1": "Haar Lvl 1 (128 bins)",
    "data-2": "Haar Lvl 2 (64 bins)",
    "data-3": "Haar Lvl 3 (32 bins)",
    "data-4": "Haar Lvl 4 (16 bins)",
    "data-5": "Haar Lvl 5 (8 bins)",
    "data-6": "Haar Lvl 6 (4 bins)",
    "data-7": "Haar Lvl 7 (2 bins)",
}

# --- Constants ---
REFERENCE_DISTANCE_VALUE = 20580
TARGET_TREES = ["tree-0", "tree-1", "tree-2"]
# Define marker symbols for different rank lines
marker_symbols = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up',
                  'pentagon', 'hexagon', 'star', 'triangle-down'] # Add more if needed

# --- Plotting Logic ---
for tree_name, tree_data in all_data.items():
    # Filter for target trees
    if tree_name not in TARGET_TREES:
        continue

    if not isinstance(tree_data, dict):
        print(f"Skipping {tree_name}: Data is not in the expected dictionary format.")
        continue

    fig = go.Figure()
    original_data_labels = sorted([k for k in tree_data.keys() if k.startswith('data-')])
    if not original_data_labels:
        print(f"Skipping {tree_name}: No 'data-X' keys found.")
        continue
    x_axis_labels = [data_label_map.get(lbl, lbl) for lbl in original_data_labels]

    processed_data = {}
    max_rank = 0

    # --- Step 1: Extract and Rank Data ---
    for data_label in original_data_labels:
        data_set = tree_data.get(data_label, {})
        if not isinstance(data_set, dict):
            print(f"Warning: Skipping {data_label} in {tree_name}: Data is not a dictionary.")
            processed_data[data_label] = []
            continue

        sorted_radii = sorted(
            [item for item in data_set.items() if isinstance(item[1], dict) and 'radius' in item[1]],
            key=lambda item: item[1].get('radius', -1),
            reverse=True
        )
        processed_data[data_label] = sorted_radii
        max_rank = max(max_rank, len(sorted_radii))

    if max_rank == 0:
        print(f"Skipping {tree_name}: No valid radius data found across all data keys.")
        continue

    # --- Step 2: Structure Data by Rank Across Datasets ---
    for rank in range(max_rank):
        rank_avg_dist = [] # Changed variable name
        rank_avg_obj = []
        for data_label in original_data_labels:
            sorted_radii = processed_data.get(data_label, [])
            if rank < len(sorted_radii):
                radius_key, radius_data = sorted_radii[rank]
                if isinstance(radius_data, dict):
                     # Get 'avg_dist_calc'
                    rank_avg_dist.append(radius_data.get('avg_dist_calc'))
                    rank_avg_obj.append(radius_data.get('avg_obj_result'))
                else:
                    rank_avg_dist.append(None)
                    rank_avg_obj.append(None)
            else:
                rank_avg_dist.append(None)
                rank_avg_obj.append(None)

        # Only add trace if there is at least one non-None value for this rank
        if not all(v is None for v in rank_avg_dist):
            valid_avg_objs = [obj for obj in rank_avg_obj if obj is not None]
            avg_legend = np.nanmean(valid_avg_objs) if valid_avg_objs else float('nan')
            legend_name = f"Rank {rank + 1} (Avg Obj: {avg_legend:.3f})"
            # Select marker symbol for this rank
            marker_symbol = marker_symbols[rank % len(marker_symbols)]

            fig.add_trace(go.Scatter(
                x=x_axis_labels,
                y=rank_avg_dist, # Use avg_dist data
                mode='lines+markers',
                name=legend_name,
                marker=dict(symbol=marker_symbol, size=7), # <<< ADDED/MODIFIED MARKER
                line=dict(width=2),
                hovertemplate = (
                    f"<b>{tree_title_map.get(tree_name, tree_name)}</b><br>" +
                    'Representation: %{x}<br>' +
                    'Avg Dist Calc: %{y:.2f}<br>' + # Updated label and formatting
                    f'Rank: {rank + 1}<br>' +
                    f'Avg Obj Result (Line Avg): {avg_legend:.3f}' +
                    '<extra></extra>'
                )
            ))

    # --- Step 3: Add Reference Line and Finalize Layout ---
    fig.add_hline(
        y=REFERENCE_DISTANCE_VALUE,
        line_dash="dot",
        annotation_text="Distance Reference",
        annotation_position="bottom right"
    )

    fig.update_layout(
        title=f"{tree_title_map.get(tree_name, tree_name)} - Avg Distance Calc by Radius Rank", # Updated title
        xaxis_title="Data Representation",
        yaxis_title="Average Distance Calculations",
        legend_title="Rank (Highest Radius = Rank 1)",
        hovermode="x unified",
        xaxis={'categoryorder':'array', 'categoryarray':x_axis_labels},
        width=1200,  # Largura
        height=600
    )

    # --- Step 4: Show Plot ---
    fig.show()

print("\nAverage Distance Calculation plots by rank generated for selected trees.")


Average Distance Calculation plots by rank generated for selected trees.


In [10]:
import plotly.graph_objects as go
import json

with open('results_page.json', 'r', encoding='utf-8') as arquivo:
    dados  = json.load(arquivo)

print(dados)

# Gráfico de porcentagem para data-0
tamanhos_pagina_0 = []
porcentagens_0 = []

for chave, valores in dados["tree-0"]["data-0"].items():
    tamanho = int(chave.split("-")[1])
    tamanhos_pagina_0.append(tamanho)
    if valores["seq_disk_access"] != 0:
        porcentagem = (valores["disk_access"] / valores["seq_disk_access"]) * 100
        porcentagens_0.append(porcentagem)
    else:
        porcentagens_0.append(0)  # Ou outra forma de lidar com divisão por zero

fig_porcentagem_data0 = go.Figure()
fig_porcentagem_data0.add_trace(go.Scatter(x=tamanhos_pagina_0, y=porcentagens_0, mode='lines+markers', name='Disk Access / Seq. Disk Access (%)'))
fig_porcentagem_data0.update_layout(title='Percentage of Disk Accesses (0 - Haar Transform (256 bins))',
                                  xaxis_title='Page Size (bytes)',
                                  yaxis_title='Disk Access / Total Disk Pages',
                                  width=800,  # Largura
                                  height=600)
fig_porcentagem_data0.show()

# Gráfico de porcentagem para data-1
tamanhos_pagina_1 = []
porcentagens_1 = []

for chave, valores in dados["tree-0"]["data-1"].items():
    tamanho = int(chave.split("-")[1])
    tamanhos_pagina_1.append(tamanho)
    if valores["seq_disk_access"] != 0:
        porcentagem = (valores["disk_access"] / valores["seq_disk_access"]) * 100
        porcentagens_1.append(porcentagem)
    else:
        porcentagens_1.append(0)  # Ou outra forma de lidar com divisão por zero

fig_porcentagem_data1 = go.Figure()
fig_porcentagem_data1.add_trace(go.Scatter(x=tamanhos_pagina_1, y=porcentagens_1, mode='lines+markers', name='Disk Access / Seq. Disk Access (%)'))
fig_porcentagem_data1.update_layout(title='Percentage of Disk Accesses (1 - Haar Transform (128 bins))',
                                  xaxis_title='Page Size (bytes)',
                                  yaxis_title='Disk Access / Total Disk Pages',
                                  width=800,  # Largura
                                  height=600)
fig_porcentagem_data1.show()

{'tree-0': {'data-0': {'page_size-32768': {'avg_time': 17.7356, 'disk_access': 1245.31, 'avg_dist_calc': 6830.79, 'avg_obj_result': 15.044, 'radius': 22830, 'num_consults': 500, 'seq_disk_access': 1372}, 'page_size-65536': {'avg_time': 14.5775, 'disk_access': 523.162, 'avg_dist_calc': 6016.61, 'avg_obj_result': 15.044, 'radius': 22830, 'num_consults': 500, 'seq_disk_access': 664}, 'page_size-131072': {'avg_time': 13.8582, 'disk_access': 258.174, 'avg_dist_calc': 5889.81, 'avg_obj_result': 15.044, 'radius': 22830, 'num_consults': 500, 'seq_disk_access': 332}, 'page_size-262144': {'avg_time': 13.8663, 'disk_access': 135.91, 'avg_dist_calc': 5935.15, 'avg_obj_result': 15.044, 'radius': 22830, 'num_consults': 500, 'seq_disk_access': 165}}, 'data-1': {'page_size-32768': {'avg_time': 36.2339, 'disk_access': 1246.5, 'avg_dist_calc': 9307.16, 'avg_obj_result': 15.09, 'radius': 11320, 'num_consults': 500, 'seq_disk_access': 1372}, 'page_size-65536': {'avg_time': 33.7726, 'disk_access': 523.536,