# **Batch Size Result Analysis**

This notebook analyzes the raw energy consumption data collected with different inference batch sizes. For each tracked parameter, the results are averaged over the five runs and the corresponding standard deviation is computed.

In [185]:
# import required libraries
import pandas as pd
import numpy as np
import glob
import os
import re

In [None]:
# Locate the working directory (and its parent); the result files analyzed below
# are looked up relative to the working directory
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
print(f"Current Working Directory: {current_dir}")
print(f"Parent Directory: {parent_dir}")

## **Helper Functions**
Useful functions to properly format labels and to obtain final results from raw inference energy consumption data.

In [187]:
def _last_dir_name(direct):
    """Return the last component of `direct`, ignoring trailing slashes or backslashes."""
    return re.split(r"[\\/]", direct.rstrip("\\/"))[-1]


def _load_runs(files, columns):
    """Read every CSV in `files`, keep only `columns`, and stack all rows into one DataFrame."""
    return pd.concat([pd.read_csv(file)[columns] for file in files], ignore_index=True)


def _combine_group_rows(per_group):
    """Collapse one-row-per-group data into one row: 'emissions_rate' is averaged, every other column is summed."""
    return pd.DataFrame([{
        col: per_group[col].mean() if col == "emissions_rate" else per_group[col].sum()
        for col in per_group.columns
    }])


def get_meanstd(mdl, direct, n_runs=5):
    """
    Compute mean and standard deviation for emission data across multiple runs.

    Processes CSV files containing emission data from codecarbon runs.
    The results are saved into new CSV files inside `direct`.

    Two layouts are handled, selected by the last folder name of `direct`
    (trailing '/' or '\\' separators are ignored):

    - "batch1", "batch2" or "batch5": files named
      "{mdl}-emissions-{batch}-n<group>-run<k>.csv" are grouped by their
      "n<group>" part. For every group the column means and sample variances
      are written to "averaged_{batch}_{group}.csv" and "var_{batch}_{group}.csv".
      The groups are then combined (sum of the group values, except
      "emissions_rate", which is averaged) into "total.csv"; "total_std.csv"
      holds sqrt(combined variance / n_runs).
    - any other folder: all files matching "{mdl}-emissions-batch10*" are
      pooled; their column means go to "averaged_batch10.csv" and their sample
      standard deviations to "std_batch10.csv".

    Parameters:
    -----------
    mdl : str
        The model name, used to identify the relevant emission files.
        Files are expected to start with "{model}-emissions-...".

    direct : str
        The directory path where the emission CSV files are located.

    n_runs : int, optional (default=5)
        Number of runs per group, used as divisor when computing "total_std.csv".
        Only used for the "batch1"/"batch2"/"batch5" layout.

    Returns:
    --------
    None
        The function does not return any value but saves the computed
        mean and standard deviation results to CSV files within the input directory.

    Raises:
    -------
    ValueError
        If no matching emission file is found, or if `n_runs` is not positive.
    """
    columns_to_average = [
        "emissions", "emissions_rate", "cpu_power", "gpu_power", "ram_power",
        "cpu_energy", "gpu_energy", "ram_energy", "energy_consumed"
    ]

    batch_name = _last_dir_name(direct)
    # Escape the literal parts so characters such as '[' are not read as wildcards
    search_dir = glob.escape(direct)
    search_mdl = glob.escape(mdl)

    if batch_name in ("batch1", "batch2", "batch5"):
        if n_runs <= 0:
            raise ValueError(f"n_runs must be positive, got {n_runs}")

        file_pattern = os.path.join(search_dir, f"{search_mdl}-emissions-{batch_name}-n*-run*.csv")
        # Sorted so that the processing order does not depend on the file system
        files = sorted(glob.glob(file_pattern))

        # Group the run files by the "n..." part of their name
        pattern = re.compile(rf"^{re.escape(mdl)}-emissions-{batch_name}-(n.+?)-run")
        file_groups = {}
        for file in files:
            basename = os.path.basename(file)
            match = pattern.search(basename)
            if match:
                file_groups.setdefault(match.group(1), []).append(file)
            else:
                print(f"Filename {basename} did not match expected pattern.")

        if not file_groups:
            raise ValueError(
                f"No '{mdl}-emissions-{batch_name}-n*-run*.csv' files found in {direct!r}"
            )

        group_means = []
        group_variances = []
        for n_group, group_files in file_groups.items():
            runs = _load_runs(group_files, columns_to_average)
            # One-row frames: column means and sample variances (ddof=1) over all runs of the group
            averaged_df = pd.DataFrame([runs.mean(numeric_only=True)])
            variances_df = pd.DataFrame([runs.var(numeric_only=True)])[columns_to_average]

            averaged_df.to_csv(os.path.join(direct, f"averaged_{batch_name}_{n_group}.csv"), index=False)
            variances_df.to_csv(os.path.join(direct, f"var_{batch_name}_{n_group}.csv"), index=False)

            group_means.append(averaged_df)
            group_variances.append(variances_df)

        final_df = _combine_group_rows(pd.concat(group_means, ignore_index=True))
        final_df.to_csv(os.path.join(direct, "total.csv"), index=False)

        # NOTE(review): the group variances are summed (averaged for "emissions_rate")
        # and divided by n_runs before the square root; confirm this is the intended
        # uncertainty, in particular for the averaged "emissions_rate" column.
        final_variance_df = _combine_group_rows(pd.concat(group_variances, ignore_index=True))
        final_std = np.sqrt(final_variance_df / n_runs)
        final_std.to_csv(os.path.join(direct, "total_std.csv"), index=False)
        return

    # Handle files for the 10 batch experiments in the main folder
    files = sorted(glob.glob(os.path.join(search_dir, f"{search_mdl}-emissions-batch10*")))
    if not files:
        raise ValueError(f"No '{mdl}-emissions-batch10*' files found in {direct!r}")

    runs = _load_runs(files, columns_to_average)

    averaged_df = pd.DataFrame([runs.mean(numeric_only=True)])
    averaged_df.to_csv(os.path.join(direct, "averaged_batch10.csv"), index=False)

    # NOTE(review): this is the plain sample standard deviation; unlike "total_std.csv"
    # of the other batch sizes it is not divided by the number of runs.
    std_df = pd.DataFrame([runs.std(numeric_only=True)])
    std_df.to_csv(os.path.join(direct, "std_batch10.csv"), index=False)
    return

In [188]:
def insert_model_name(paths, modello):
    """
    Insert the model name (`modello`) as an extra path element right before
    the last element of every path in `paths`.

    Leading and trailing "/" characters of each input path are dropped first;
    every returned path starts with a single "/".

    Parameters:
    -----------
    paths : list[str]
        "/"-separated paths to modify.

    modello : str
        The model name to insert into each path.

    Returns:
    --------
    list[str]
        The modified paths, in the same order as the input.
    """
    def _with_model(raw_path):
        # str.split always yields at least one element, so the unpacking is safe
        *leading, last = raw_path.strip("/").split("/")
        return "/" + "/".join([*leading, modello, last])

    return [_with_model(raw_path) for raw_path in paths]

In [189]:
def get_emission_data(perc, mdl, option="summary"):
    """
    Print, for one model, a table per metric with "mean ± std" for every batch size.

    Reads the aggregated CSV files found under `perc` (the files written by
    `get_meanstd`):

    - "batch1/total.csv" and "batch1/total_std.csv" (likewise for "batch2" and "batch5")
    - "averaged_batch10.csv" and "std_batch10.csv"

    and prints a header with the model name followed by one formatted table
    per requested metric. Nothing is written to disk.

    Parameters:
    -----------
    perc : str
        Folder containing the aggregated result files of the model.

    mdl : str
        Name of the model being analyzed. Only used for the printed header.

    option : str, optional (default="summary")
        Specifies which metric is printed.
        Valid options include:

        - "gpu_energy": Energy at inference (GPU)
        - "emissions": Carbon emissions
        - "emissions_rate": Emission rate
        - "energy_rate": Energy rate (derived, see below)
        - "summary": All of the above, in this order

    Returns:
    --------
    None
        The tables are printed to the terminal.

    Raises:
    -------
    KeyError
        If `option` is not one of the values listed above. The per-component
        "power"/"energy" views are not available here because the metric
        table has no entry for them.
    """
    # Each entry contains: [column_name, color_set, display_title, unit, description]
    emissdict = {"gpu_energy": ["gpu_energy", "rocket_r", "Energy at inference (GPU)", "[kWh]", "Energy at inference (GPU)"],
                 "emissions": ["emissions", "crest", "Total emissions", "[Kg]", "Emissions in CO$_2$-equivalents"],
                 "emissions_rate": ["emissions_rate", "crest", "Emission Rate", "[Kg/s]", "Emissions divided per duration"],
                 "energy_rate": ["energy_rate", "crest", "Energy consumption rate", "[kW]", "Energy consumption rate"]}

    if option == "summary":
        metrics = list(emissdict)
    elif option in emissdict:
        metrics = [option]
    else:
        raise KeyError(f"Unknown option {option!r}; expected 'summary' or one of {list(emissdict)}")

    # Mean file, std file and table label for every batch size
    batches = [
        {"path": os.path.join(perc, f"batch{size}", "total.csv"),
         "std_path": os.path.join(perc, f"batch{size}", "total_std.csv"),
         "label": str(size)}
        for size in (1, 2, 5)
    ]
    batches.append({"path": os.path.join(perc, "averaged_batch10.csv"),
                    "std_path": os.path.join(perc, "std_batch10.csv"),
                    "label": "10"})

    separator = "-" * 80
    print(separator)
    print(mdl.center(80))
    print(separator)

    # Load every file once; the same data is reused for all metrics
    all_batches = []
    all_stds = []
    for batch in batches:
        all_batches.append(pd.read_csv(batch["path"]).assign(batch_n=batch["label"]))
        all_stds.append(pd.read_csv(batch["std_path"]).assign(batch_n=batch["label"]))
    mean_all = pd.concat(all_batches, ignore_index=True)
    std_all = pd.concat(all_stds, ignore_index=True)

    for metric in metrics:
        # Work on copies so that columns added for one metric never leak into the next one
        df_mean = mean_all.copy()
        df_std = std_all.copy()

        if metric == "energy_rate":
            # duration = emissions / emissions_rate, divided by 3600
            # (presumably seconds -> hours, so that energy / duration matches the "[kW]" label)
            df_mean['duration'] = df_mean['emissions'] / df_mean['emissions_rate'] / 3600
            df_mean['energy_rate'] = df_mean['energy_consumed'] / df_mean['duration']
            # NOTE(review): the same formula is applied to the std columns, i.e. the value
            # printed after "±" is a ratio of standard deviations, not a propagated
            # uncertainty of the rate - confirm this is intended.
            df_std['duration'] = df_std['emissions'] / df_std['emissions_rate'] / 3600
            df_std['energy_rate'] = df_std['energy_consumed'] / df_std['duration']

        # Metric title
        print(f"'{emissdict[metric][2]}'")
        print(format_rest(df_mean, emissdict, metric, df_std, printed=True).to_string(index=False) + "\n \n")

In [190]:
def melt_and_map(df, option):
    """
    Reshape the per-component columns of `df` from wide to long format.

    The columns "cpu_{option}", "gpu_{option}" and "ram_{option}" are stacked
    into two columns: "{option}_type", holding a readable hardware label, and
    "{option}", holding the value. "batch_n" is kept as identifier column.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with a "batch_n" column and the three component columns.

    option : str
        The type of metric being processed. Either "power" or "energy".

    Returns:
    --------
    pandas.DataFrame
        Long-format DataFrame with columns "batch_n", "{option}_type" and "{option}".
    """
    hardware_labels = {
        "cpu": "CPU (AMD EPYC 7313)",
        "gpu": "GPU (NVIDIA A40)",
        "ram": "RAM (64 GB)",
    }
    # Source column name -> label shown in the "{option}_type" column
    column_to_label = {f"{part}_{option}": label for part, label in hardware_labels.items()}
    type_column = f"{option}_type"

    long_df = df.melt(
        id_vars=["batch_n"],
        value_vars=list(column_to_label),
        var_name=type_column,
        value_name=f"{option}",
    )
    long_df[type_column] = long_df[type_column].map(column_to_label)
    return long_df

In [191]:
def _powen_pivot(frame, emissdict, option):
    """Pivot long-format data to one row per batch size, ordered by the number in 'batch_n', with display headers."""
    wide = frame.pivot(index="batch_n", columns=emissdict[option][0], values=f"{option}").reset_index()
    # expand=False always yields a Series, so a table with a single batch row sorts as well
    batch_numbers = wide['batch_n'].str.extract(r'(\d+)', expand=False).astype(int)
    wide = wide.loc[batch_numbers.sort_values().index]
    # Positional rename: pivot orders the component columns alphabetically (CPU, GPU, RAM)
    wide.columns = ['Batch size', 'CPU (AMD EPYC 7313)', 'GPU (NVIDIA A40)', 'RAM (64 GB)']
    return wide


def format_powen(df, emissdict, option, std=pd.DataFrame(), printed=False):
    """
    Formats a long-format DataFrame of per-component metrics into one row per
    batch size, with units appended to the values.
    Works with either "energy" or "power" options.

    Parameters:
    -----------
    df : pandas.DataFrame
        Long-format input with a 'batch_n' column, a component column named
        `emissdict[option][0]` and a value column named `option`.

    emissdict : dict
        Dictionary mapping metric options to their configuration; element 0 is
        the name of the component column and element 3 the unit string.

    option : str
        Specifies the type of metric to be formatted. Must be a valid key in emissdict.

    std : pandas.DataFrame
        Same layout as `df`, holding the values shown after "±".
        Only used (and then required) when `printed` is True.

    printed : bool
        If True, values are rounded to 8 decimals and rendered as
        "<value> ± <std> <unit>"; otherwise as "<value> <unit>".

    Returns:
    --------
    pandas.DataFrame
        Formatted DataFrame with:
        - Rows sorted by the number contained in 'batch_n'
        - Columns: 'Batch size', 'CPU (AMD EPYC 7313)', 'GPU (NVIDIA A40)', 'RAM (64 GB)'
        - Values as strings with the unit appended
    """
    component_cols = ['CPU (AMD EPYC 7313)', 'GPU (NVIDIA A40)', 'RAM (64 GB)']
    unit_suffix = f" {emissdict[option][3]}"

    df_pivot = _powen_pivot(df, emissdict, option)

    if printed:
        std_pivot = _powen_pivot(std, emissdict, option)
        for col in component_cols:
            value_str = df_pivot[col].round(8).astype(str)
            std_str = std_pivot[col].apply(lambda b: "{:.1e}".format(b))
            # The Series are combined by index label, so mean and std rows stay matched
            df_pivot[col] = value_str + " ± " + std_str + unit_suffix
    else:
        for col in component_cols:
            df_pivot[col] = df_pivot[col].astype(str) + unit_suffix
    return df_pivot

In [192]:
def format_rest(df, emissdict, option, std=pd.DataFrame(), printed=False):
    """
    Builds a two-column table with the batch size and the selected metric,
    rendered as text with its unit. The input DataFrames are left unchanged.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with a 'batch_n' column and a column named `option`.

    emissdict : dict
        A dictionary containing metadata for the metrics; element 2 of each
        entry is the display title and element 3 the unit string.

    option : str
        The specific metric being processed. Must be a key present in the emissdict dictionary.

    std : pandas.DataFrame
        DataFrame with a column named `option` holding the values shown after "±".
        Only used (and then required) when `printed` is True.

    printed : bool
        If True, values are rounded to 8 decimals and rendered as
        "<value> ± <std> <unit>"; otherwise as "<value> <unit>".

    Returns:
    --------
    pandas.DataFrame
        Table with the columns 'Batch size' and the metric's display title,
        indexed like `df`.
    """
    unit_suffix = f" {emissdict[option][3]}"

    if printed:
        value_str = df[f'{option}'].round(8).astype(str)
        std_str = std[f'{option}'].apply(lambda m: "{:.1e}".format(m))
        annotated = value_str + " ± " + std_str + unit_suffix
    else:
        annotated = df[f'{option}'].astype(str) + unit_suffix

    # Build the table on a copy so that no helper column is added to the caller's DataFrame
    table = df[['batch_n']].copy()
    table[f'{option} '] = annotated
    table.columns = ['Batch size', f'{emissdict[option][2]}']
    return table

## **Data analysis**
Using the previously defined functions, we can now analyze the data obtained from the five different runs for all considered models.

In [193]:
# Models whose batch-size experiments are analyzed
model_list = [
    "AudioLDM", "AudioLDM2", "Make-an-Audio", "Make-an-Audio-2",
    "Stable Audio Open", "Tango", "Tango2",
]
# Sub-folders handed to get_meanstd for every model, in processing order;
# the empty entry stands for the model folder itself (batch size 10 files)
batch_subdirs = ("batch1\\", "batch2\\", "batch5\\", "")

path_list = []
# Build the results folder of every model and aggregate its raw run files
for x in model_list:
    # Local path of the model's emission data
    path = fr"{current_dir}\results\batch_size\{x}\\"
    path_list.append(path)
    for subdir in batch_subdirs:
        get_meanstd(x, path + subdir)

In [194]:
# View all models: print the summary tables of every model in model_list.
# path_list[i] is the results folder that was built for model_list[i] in the previous cell.
for i, x in enumerate(model_list):
    get_emission_data(path_list[i], x)

--------------------------------------------------------------------------------
                                    AudioLDM                                    
--------------------------------------------------------------------------------
'Energy at inference (GPU)'
Batch size  Energy at inference (GPU)
         1 0.00106286 ± 2.0e-06 [kWh]
         2 0.00072306 ± 2.1e-06 [kWh]
         5 0.00053467 ± 1.5e-06 [kWh]
        10 0.00047264 ± 3.2e-06 [kWh]
 

'Total emissions'
Batch size           Total emissions
         1 0.00066945 ± 1.1e-06 [Kg]
         2 0.00040238 ± 9.7e-07 [Kg]
         5 0.00025846 ± 8.7e-07 [Kg]
        10 0.00022346 ± 2.5e-06 [Kg]
 

'Emission Rate'
Batch size              Emission Rate
         1 2.406e-05 ± 1.2e-07 [Kg/s]
         2 2.802e-05 ± 1.1e-07 [Kg/s]
         5 3.612e-05 ± 1.3e-07 [Kg/s]
        10 3.767e-05 ± 4.0e-07 [Kg/s]
 

'Energy consumption rate'
Batch size   Energy consumption rate
         1 0.26194301 ± 1.3e-03 [kW]
         2 0.30504559

In [195]:
# View a single model: print the summary tables of one selected model only
model_name = "AudioLDM" # other choices: "AudioLDM2", "Make-an-Audio", . . .
# The aggregated result files of this model must already exist
# (they are written by get_meanstd in the cell that processes all models)
# Define the local path for the model's emission data
path = fr"{current_dir}\results\batch_size\{model_name}\\"
get_emission_data(path, model_name)

--------------------------------------------------------------------------------
                                    AudioLDM                                    
--------------------------------------------------------------------------------
'Energy at inference (GPU)'
Batch size  Energy at inference (GPU)
         1 0.00106286 ± 2.0e-06 [kWh]
         2 0.00072306 ± 2.1e-06 [kWh]
         5 0.00053467 ± 1.5e-06 [kWh]
        10 0.00047264 ± 3.2e-06 [kWh]
 

'Total emissions'
Batch size           Total emissions
         1 0.00066945 ± 1.1e-06 [Kg]
         2 0.00040238 ± 9.7e-07 [Kg]
         5 0.00025846 ± 8.7e-07 [Kg]
        10 0.00022346 ± 2.5e-06 [Kg]
 

'Emission Rate'
Batch size              Emission Rate
         1 2.406e-05 ± 1.2e-07 [Kg/s]
         2 2.802e-05 ± 1.1e-07 [Kg/s]
         5 3.612e-05 ± 1.3e-07 [Kg/s]
        10 3.767e-05 ± 4.0e-07 [Kg/s]
 

'Energy consumption rate'
Batch size   Energy consumption rate
         1 0.26194301 ± 1.3e-03 [kW]
         2 0.30504559