In [1]:
!uv pip install numpy pandas tinyflux flatten_dict tqdm

[2mUsing Python 3.10.14 environment at /u/49/anhdun1/unix/git/research/odop/.venv[0m
[2K[2mResolved [1m9 packages[0m [2min 119ms[0m[0m                                         [0m
[2K[2mPrepared [1m2 packages[0m [2min 238ms[0m[0m                                             
[2K[2mInstalled [1m4 packages[0m [2min 51ms[0m[0m                                [0m
 [32m+[39m [1mpandas[0m[2m==2.2.3[0m
 [32m+[39m [1mpytz[0m[2m==2025.2[0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m
 [32m+[39m [1mtzdata[0m[2m==2025.2[0m


In [15]:
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm


In [19]:
import numpy as np
from flatten_dict import unflatten
from tinyflux import TinyFlux


def revert_unit(unit_conversion, converted_report: dict):
    """Map encoded unit values in a flat report back to their unit names.

    Only keys containing ``"unit"`` are considered. The lookup table is
    chosen from the key text, in this order of precedence: ``"frequency"``,
    ``"mem"``, ``"cpu"`` (only when the key also contains ``"usage"``),
    ``"gpu"`` (only when the key also contains ``"usage"``). The first table
    entry whose encoded value equals the report value supplies the name;
    values without a match are left as they are.

    Args:
        unit_conversion: Nested mapping of unit name -> encoded value, with
            the tables ``["frequency"]``, ``["mem"]``, ``["cpu"]["usage"]``
            and ``["gpu"]["usage"]``.
        converted_report: Flat report whose unit entries hold encoded values.

    Returns:
        A shallow copy of ``converted_report`` with the matched unit entries
        replaced by unit names. The input mapping is not modified.
    """

    def _table_for(field_name):
        # Precedence matters: a key is matched against the first category
        # that applies, and a "cpu"/"gpu" key without "usage" has no table.
        if "frequency" in field_name:
            return unit_conversion["frequency"]
        if "mem" in field_name:
            return unit_conversion["mem"]
        if "cpu" in field_name:
            return unit_conversion["cpu"]["usage"] if "usage" in field_name else None
        if "gpu" in field_name:
            return unit_conversion["gpu"]["usage"] if "usage" in field_name else None
        return None

    restored = converted_report.copy()
    for field_name, encoded in converted_report.items():
        if "unit" not in field_name:
            continue
        table = _table_for(field_name)
        if table is None:
            continue
        for unit_name, unit_code in table.items():
            if unit_code == encoded:
                restored[field_name] = unit_name
                break
    return restored


def extract_data(data):
    """Convert stored datapoints back into nested report dictionaries.

    For every datapoint the tags and fields are merged into one flat mapping,
    a ``"timestamp"`` entry is added, encoded unit values are mapped back to
    unit names by ``revert_unit``, and the dot-separated keys are expanded
    into nested dictionaries with ``unflatten``.

    Args:
        data: Iterable of datapoints exposing ``tags`` (mapping containing a
            ``"type"`` entry), ``fields`` (mapping) and ``time`` (an object
            with a ``timestamp()`` method, e.g. a ``datetime``).

    Returns:
        ``(process_data, system_data)``. ``process_data`` maps each pid to
        the list of reports for datapoints tagged ``type == "process"``, in
        input order; ``system_data`` is the list of reports for every other
        datapoint.

    Raises:
        KeyError: If a datapoint has no ``"type"`` tag, or a process report
            has no ``metadata.pid`` entry.
    """
    unit_conversion = {
        "cpu": {"usage": {"milicpu": 1, "cputime": 2, "percentage": 3}},
        "gpu": {"usage": {"percentage": 1}},
        "mem": {"Gb": 1, "Mb": 2, "Kb": 3},
        "frequency": {"GHz": 1, "MHz": 2},
    }

    def _to_report(datapoint):
        # Calling timestamp() on the instance avoids depending on a
        # `datetime` name that no cell of this notebook imports.
        flat = {
            **datapoint.tags,
            **datapoint.fields,
            "timestamp": datapoint.time.timestamp(),
        }
        return unflatten(revert_unit(unit_conversion, flat), "dot")

    converted_process_data = {}
    converted_system_data = []
    for datapoint in data:
        is_process = datapoint.tags["type"] == "process"
        report = _to_report(datapoint)
        if is_process:
            pid = report["metadata"]["pid"]
            converted_process_data.setdefault(pid, []).append(report)
        else:
            converted_system_data.append(report)

    return converted_process_data, converted_system_data


def extract_data_from_file_path(file_path: str):
    """Load every datapoint from a TinyFlux database file and convert it.

    Args:
        file_path: Path of the TinyFlux database file to open.

    Returns:
        The ``(process_data, system_data)`` tuple produced by
        ``extract_data`` for all points stored in the file.
    """
    db = TinyFlux(file_path)
    try:
        data = db.all()
    finally:
        # Release the storage handle even if reading fails; this function is
        # called once per file and the handle was previously left open.
        db.close()
    return extract_data(data)

In [6]:
# Sample metric database file used for the manual, single-file check below.
file_path = "runs_32/run_1-task_data-movement_round-robin/metric_database/nid007960.csv"
# NOTE(review): the exploratory extraction below is commented out, so running
# this cell only assigns `file_path`. The snippet also relies on accumulators
# (`time_taken`, `total_usage`, `total_cores`) that this cell never
# initialises.
#try:
# process_data, system_data = extract_data_from_file_path(file_path)
# time_taken += len(system_data)
# for entity in system_data:
#     cpu_usage = entity["cpu"]["usage"]["value"]
#     filtered_usage = [
#         usage for core, usage in cpu_usage.items() if int(core.split("_")[1]) < 64
#     ]
#     total_usage += sum(filtered_usage)
#     total_cores += len(filtered_usage)
#except Exception as e:
#   print(e)
#   print(entity)

In [28]:
# NOTE(review): no live code in the visible cells assigns a module-level
# `system_data` (it is only a local inside `process_file` and appears in
# commented-out code), so this cell depends on leftover kernel state.
len(system_data)

694

In [26]:
from concurrent.futures import ProcessPoolExecutor
# Node counts to analyse; each entry N selects the `runs_N` directory.
node_config_list = ["4"]
# Collects one summary row per run folder; turned into a DataFrame at the end
# of this cell.
results = []

def process_file(file_path, core_limit=64):
    """Summarise the system-level CPU usage stored in one metric database file.

    Args:
        file_path: Path passed to ``extract_data_from_file_path``.
        core_limit: Only per-core entries whose key has the form
            ``<prefix>_<index>`` with ``index < core_limit`` are counted.
            Defaults to 64, the cut-off that used to be hard-coded.
            NOTE(review): presumably this keeps one hardware thread per
            core -- confirm against the node layout.

    Returns:
        ``(total_usage, total_cores, time_taken)``: the sum of the selected
        per-core usage values, the number of selected per-core readings, and
        the number of system datapoints in the file. ``(0, 0, 0)`` is
        returned (after printing the error) if anything raises, so one bad
        file does not abort a whole batch.
    """
    try:
        _, system_data = extract_data_from_file_path(file_path)
        total_usage = 0
        total_cores = 0
        for entity in system_data:
            # One pass per datapoint: select the cores once, then reuse the
            # selection for both the usage sum and the reading count.
            selected = [
                usage
                for core, usage in entity["cpu"]["usage"]["value"].items()
                if int(core.split("_")[1]) < core_limit
            ]
            total_usage += sum(selected)
            total_cores += len(selected)
        return total_usage, total_cores, len(system_data)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return 0, 0, 0

# Aggregate CPU usage per run folder.
# A single worker pool is shared by all folders (previously a new pool was
# created and torn down for every folder), and each folder's results are
# consumed while the pool is still open.
with ProcessPoolExecutor() as executor:
    for numb_node in node_config_list:
        base_folder = f"runs_{numb_node}"
        for folder in tqdm(sorted(os.listdir(base_folder), reverse=True)):
            sub_folder = os.path.join(base_folder, folder, "metric_database")
            # Skip entries without a metric database directory.
            if not os.path.isdir(sub_folder):
                continue

            # Folder names are split on "_"; parts 2 and 3 are used as the
            # "Optasks" and "Algorithm" labels.
            folder_split = folder.split("_")
            csv_files = [
                os.path.join(sub_folder, name)
                for name in os.listdir(sub_folder)
                if name.endswith(".csv")
            ]

            total_usage = 0
            total_cores = 0
            time_taken = 0
            for usage, cores, samples in executor.map(process_file, csv_files):
                total_usage += usage
                total_cores += cores
                time_taken += samples

            overall_average_usage = total_usage / total_cores if total_cores > 0 else 0
            results.append([
                numb_node,
                folder_split[2],
                folder_split[3],
                overall_average_usage,
                # Datapoint counts are summed over all files of the run and
                # divided by the node count.
                time_taken / int(numb_node),
            ])

df = pd.DataFrame(
    results,
    columns=[
        "Numb node",
        "Optasks",
        "Algorithm",
        "Average CPU Usage (%)",
        "Time taken (s)",
    ],
)

print(df)

100%|█████████████████████████████████████████████████████████████████████| 15/15 [00:13<00:00,  1.09it/s]

   Numb node               Optasks    Algorithm  Average CPU Usage (%)  \
0          4                                                 17.785998   
1          4  reduce-data-movement     priority              18.695302   
2          4  reduce-data-movement         fifo              18.051298   
3          4  reduce-data-movement     best-fit              18.372785   
4          4                reduce  round-robin              18.448070   
5          4                reduce     priority              17.426560   
6          4                reduce         fifo              17.893591   
7          4                reduce     best-fit              17.996417   
8          4                pc-cpu  round-robin              30.091290   
9          4                pc-cpu     priority              30.500693   
10         4                pc-cpu         fifo              29.817168   
11         4                pc-cpu     best-fit              30.398761   
12         4         data-movement  ro




In [29]:
# Node counts to analyse; each entry N selects the `runs_N` directory.
node_config_list = ["32"]
# Collects one summary row per run folder; turned into a DataFrame at the end
# of this cell.
results = []

# NOTE(review): this notebook defines `process_file` in an earlier cell as
# well; the definition is repeated here so that this cell can be run on its
# own. Consider keeping a single definition.
def process_file(file_path, core_limit=64):
    """Summarise the system-level CPU usage stored in one metric database file.

    Args:
        file_path: Path passed to ``extract_data_from_file_path``.
        core_limit: Only per-core entries whose key has the form
            ``<prefix>_<index>`` with ``index < core_limit`` are counted.
            Defaults to 64, the cut-off that used to be hard-coded.
            NOTE(review): presumably this keeps one hardware thread per
            core -- confirm against the node layout.

    Returns:
        ``(total_usage, total_cores, time_taken)``: the sum of the selected
        per-core usage values, the number of selected per-core readings, and
        the number of system datapoints in the file. ``(0, 0, 0)`` is
        returned (after printing the error) if anything raises, so one bad
        file does not abort a whole batch.
    """
    try:
        _, system_data = extract_data_from_file_path(file_path)
        total_usage = 0
        total_cores = 0
        for entity in system_data:
            # One pass per datapoint: select the cores once, then reuse the
            # selection for both the usage sum and the reading count.
            selected = [
                usage
                for core, usage in entity["cpu"]["usage"]["value"].items()
                if int(core.split("_")[1]) < core_limit
            ]
            total_usage += sum(selected)
            total_cores += len(selected)
        return total_usage, total_cores, len(system_data)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return 0, 0, 0

# Aggregate CPU usage per run folder.
# A single worker pool is shared by all folders (previously a new pool was
# created and torn down for every folder), and each folder's results are
# consumed while the pool is still open.
with ProcessPoolExecutor() as executor:
    for numb_node in node_config_list:
        base_folder = f"runs_{numb_node}"
        for folder in tqdm(sorted(os.listdir(base_folder), reverse=True)):
            sub_folder = os.path.join(base_folder, folder, "metric_database")
            # Skip entries without a metric database directory.
            if not os.path.isdir(sub_folder):
                continue

            # Folder names are split on "_"; parts 2 and 3 are used as the
            # "Optasks" and "Algorithm" labels.
            folder_split = folder.split("_")
            csv_files = [
                os.path.join(sub_folder, name)
                for name in os.listdir(sub_folder)
                if name.endswith(".csv")
            ]

            total_usage = 0
            total_cores = 0
            time_taken = 0
            for usage, cores, samples in executor.map(process_file, csv_files):
                total_usage += usage
                total_cores += cores
                time_taken += samples

            overall_average_usage = total_usage / total_cores if total_cores > 0 else 0
            results.append([
                numb_node,
                folder_split[2],
                folder_split[3],
                overall_average_usage,
                # Datapoint counts are summed over all files of the run and
                # divided by the node count.
                time_taken / int(numb_node),
            ])

df = pd.DataFrame(
    results,
    columns=[
        "Numb node",
        "Optasks",
        "Algorithm",
        "Average CPU Usage (%)",
        "Time taken (s)",
    ],
)

print(df)

100%|█████████████████████████████████████████████████████████████████████| 13/13 [01:38<00:00,  7.55s/it]

   Numb node        Optasks    Algorithm  Average CPU Usage (%)  \
0         32                                          17.439153   
1         32         reduce  round-robin              17.519794   
2         32         reduce     priority              17.211960   
3         32         reduce         fifo              17.035072   
4         32         reduce     best-fit              17.450913   
5         32         pc-cpu  round-robin              29.991062   
6         32         pc-cpu     priority              29.856786   
7         32         pc-cpu         fifo              29.908088   
8         32         pc-cpu     best-fit              29.904649   
9         32  data-movement  round-robin              16.718184   
10        32  data-movement     priority              17.440737   
11        32  data-movement         fifo              16.556026   
12        32  data-movement     best-fit              16.472471   

    Time taken (s)  
0        700.15625  
1        719.46875 




In [45]:
# Export the summary table as LaTeX.
# escape=True is required: the column header "Average CPU Usage (%)" contains
# "%", which starts a comment in LaTeX. Left unescaped it comments out the
# rest of the header row, including its "\\" row terminator.
latex_table = df.to_latex(index=False, float_format="%.2f", escape=True)
with open("cpu_usage_table.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)

# Print LaTeX table
print(latex_table)

\begin{tabular}{lllr}
\toprule
Numb node & Optasks & Algorithm & Average CPU Usage (%) \\
\midrule
4 &  &  & 17.79 \\
4 & reduce & round-robin & 18.45 \\
4 & reduce & priority & 17.43 \\
4 & reduce & fifo & 17.89 \\
4 & reduce & best-fit & 18.00 \\
4 & pc-cpu & round-robin & 30.09 \\
4 & pc-cpu & priority & 30.50 \\
4 & pc-cpu & fifo & 29.82 \\
4 & pc-cpu & best-fit & 30.40 \\
\bottomrule
\end{tabular}

