In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from yaml import load, Loader
import subprocess
from json import dump

## Parameters for Processing

In [15]:
# --- Input locations: everything is derived from the run directory ---------
output_path = Path("carbonate_run1")
raw_path = output_path.joinpath("raw_data")
actual_path = raw_path.joinpath("actual_time.yaml")
theoretical_path = raw_path.joinpath("theoretical_time.yaml")
metadata_path = output_path.joinpath("metadata.yaml")
mem_folder_path = raw_path.joinpath("memory_output")

# --- Output locations -------------------------------------------------------
data_dump_path = output_path.joinpath("processed_output")
# Order matters: the aggregation cell picks these file names by position.
output_fnames = (
    "actual_runtime.csv",
    "theoretical_runtime.csv",
    "bytes.csv",
    "all_fields.json",
)

# --- Machine throughput -----------------------------------------------------
# MFLOPS value for the run being processed; the values recorded for other
# runs are kept commented out so they can be swapped in.
MFLOPS = 751.310861 # carbonate run 1
# MFLOPS = 866.458302 # quartz run 3
# MFLOPS = 1250 # macbook run 1

# NOTE(review): the literal 10E6 evaluates to 1e7, so FLOPS is ten times
# MFLOPS expressed in flop/s. The value is deliberately left unchanged here
# because code that divides by FLOPS may rely on it -- audit every use before
# correcting the factor.
FLOPS = MFLOPS * 10E6

# Regressor names processed by the later cells.
MODELS = [
    "tf-necd",
    "tf-cod",
    "pytorch-qrcp",
    "pytorch-qr",
    "pytorch-svd",
    "pytorch-svddc",
    "sklearn-svddc",
]

## Loading Metadata

In [16]:
# Parse the YAML file at `metadata_path` into `metadata`.
# NOTE(review): yaml's full `Loader` can construct arbitrary Python objects;
# only point this at files produced by a trusted run.
with metadata_path.open("r") as metadata_file:
    metadata = load(metadata_file, Loader=Loader)

# Bare last expression so the notebook renders the parsed metadata.
metadata

{'dataset_shape': '750000000 x 10',
 'failed_regs': [],
 'failed_regs_exceptions': [],
 'repeat': 1,
 'rows_in_experiment': [10,
  15,
  25,
  39,
  63,
  100,
  158,
  251,
  398,
  630,
  1000,
  1584,
  2511,
  3981,
  6309,
  10000,
  15848,
  25118,
  39810,
  63095,
  100000,
  158489,
  251188,
  398107,
  630957,
  1000000,
  1584893,
  2511886,
  3981071,
  6309573,
  10000000,
  15848931,
  25118864,
  39810717,
  63095734,
  100000000,
  158489319,
  251188643,
  398107170,
  630957344],
 'timer_method': 'process in nanoseconds'}

## Loading Actual Times

In [17]:
# Parse the measured timings: a mapping from regressor name to a sequence of
# (row_count, nanoseconds) pairs.
# NOTE(review): yaml's full `Loader` can construct arbitrary Python objects;
# only point this at files produced by a trusted run.
with open(actual_path, "r") as f:
    actual_times = load(f, Loader=Loader)


def _mean_time_by_row_count(samples):
    """Average the timings recorded for each row count.

    Parameters
    ----------
    samples : iterable of (row_count, nanoseconds) pairs
        Repeated row counts are treated as repeated measurements.

    Returns
    -------
    dict
        Maps each row count, in first-seen order, to the ``np.mean`` of its
        timings.
    """
    # Single pass: bucket the timings by row count instead of rescanning the
    # whole sample list once per sample.
    grouped = {}
    for row_count, nanoseconds in samples:
        grouped.setdefault(row_count, []).append(nanoseconds)
    return {row_count: np.mean(timings) for row_count, timings in grouped.items()}


# One {row_count: mean nanoseconds} mapping per regressor.
actual_times_dict = {
    reg_name: _mean_time_by_row_count(samples)
    for reg_name, samples in actual_times.items()
}
actual_times_dict

{'pytorch-qr': {10: 1352550.0,
  15: 467393.0,
  25: 463622.0,
  39: 453137.0,
  63: 439054.0,
  100: 428975.0,
  158: 476150.0,
  251: 672388.0,
  398: 442520.0,
  630: 468264.0,
  1000: 474951.0,
  1584: 506794.0,
  2511: 548645.0,
  3981: 600238.0,
  6309: 768044.0,
  10000: 1025042.0,
  15848: 1363962.0,
  25118: 1981874.0,
  39810: 3558143.0,
  63095: 5431610.0,
  100000: 8273313.0,
  158489: 12213175.0,
  251188: 18533394.0,
  398107: 27798812.0,
  630957: 48130967.0,
  1000000: 102834308.0,
  1584893: 192216703.0,
  2511886: 315616921.0,
  3981071: 511314944.0,
  6309573: 842394902.0,
  10000000: 1352906980.0,
  15848931: 2143429477.0,
  25118864: 3405443022.0,
  39810717: 5397195815.0,
  63095734: 8542677216.0,
  100000000: 13558421851.0,
  158489319: 21547695997.0,
  251188643: 32623499766.0,
  398107170: 53573543011.0,
  630957344: 93609084932.0},
 'pytorch-qrcp': {10: 73436394.0,
  15: 468439.0,
  25: 1058864.0,
  39: 949907.0,
  63: 537679.0,
  100: 474765.0,
  158: 470684.

## Loading Theoretical Flop Counts

In [18]:
# Parse the theoretical flop counts: a mapping from regressor name to a
# sequence of (row_count, flops) pairs.
# NOTE(review): yaml's full `Loader` can construct arbitrary Python objects;
# only point this at files produced by a trusted run.
with open(theoretical_path, "r") as f:
    theoretical_flops = load(f, Loader=Loader)

# Convert each flop count to a predicted runtime in nanoseconds:
#   seconds = flops / (flop/s),   nanoseconds = seconds * 1e9.
# MFLOPS is used directly with exact 1e6 / 1e9 factors rather than FLOPS,
# whose 10E6 literal is 1e7 and would need a matching 10E9 (1e10) here to
# cancel out.
flops_per_second = MFLOPS * 1e6
ns_per_second = 1e9
theoretical_times_dict = {
    reg_name: {
        row_count: (flops / flops_per_second) * ns_per_second
        for row_count, flops in flop_counts
    }
    for reg_name, flop_counts in theoretical_flops.items()
}
# theoretical_times_dict

## Loading / Processing Binary Memory Files

In [19]:
def make_mempath(reg_name, row_count, iter, ext):
    """Build the path of one memray artifact inside ``mem_folder_path``.

    Parameters
    ----------
    reg_name : regressor name used in the file name.
    row_count : row count used in the file name.
    iter : repeat index used in the file name (parameter name kept so existing
        keyword callers keep working; it only shadows the builtin locally).
    ext : file extension. ``"bin"`` selects the ``mem_`` prefix; any other
        value selects the ``memray-csv-mem_`` prefix.

    Returns
    -------
    pathlib.Path
        ``mem_folder_path / "<prefix>_<reg_name>_<row_count>_<iter>.<ext>"``.
    """
    prefix = "mem" if ext == "bin" else "memray-csv-mem"
    return mem_folder_path / f"{prefix}_{reg_name}_{row_count}_{iter}.{ext}"


# mem_dict[reg_name][row_count] -> mean over repeats of the summed "size"
# column of the memray csv for that run.
mem_dict = {}
for reg_name in MODELS:
    mem_dict[reg_name] = {}
    for row_count in metadata["rows_in_experiment"]:
        runs = []
        # `run_idx` rather than `iter`, so the builtin is not shadowed for the
        # rest of the kernel session.
        for run_idx in range(metadata["repeat"]):
            # use memray transform to actually create csv
            input_path = make_mempath(reg_name, row_count, run_idx, "bin")
            print(f"{input_path =}")
            # check=True: raise CalledProcessError if the transform fails
            # instead of silently reading a stale or missing csv below.
            subprocess.run(["memray", "transform", "csv", input_path, "-f"], check=True)

            # load csv normally, sum all mem usage
            # Named `csv_path` so the run-directory variable `output_path`
            # from the parameters cell is not overwritten.
            csv_path = make_mempath(reg_name, row_count, run_idx, "csv")
            single_mem_df = pd.read_csv(csv_path, header=0, index_col=None)
            bytes_used = single_mem_df["size"].sum()
            runs.append(bytes_used)

        mem_dict[reg_name][row_count] = np.mean(runs)

input_path =PosixPath('carbonate_run1/raw_data/memory_output/mem_tf-necd_10_0.bin')

⚠  No symbol information was found for the Python interpreter  ⚠

Without symbolic information reports showing native traces may not accurately 
reflect stack traces. Please use an interpreter built with debug symbols for 
best results. Check https://bloomberg.github.io/memray/native_mode.html for more
information regarding how memray resolves symbols.


Wrote carbonate_run1/raw_data/memory_output/memray-csv-mem_tf-necd_10_0.csv
input_path =PosixPath('carbonate_run1/raw_data/memory_output/mem_tf-necd_15_0.bin')

⚠  No symbol information was found for the Python interpreter  ⚠

Without symbolic information reports showing native traces may not accurately 
reflect stack traces. Please use an interpreter built with debug symbols for 
best results. Check https://bloomberg.github.io/memray/native_mode.html for more
information regarding how memray resolves symbols.


Wrote carbonate_run1/raw_data/memory_out

## Aggregate and Output Data

In [20]:
# Combine the three measurements into one nested record per model and row
# count, for the JSON export.
json_prep_dict = {
    reg_name: {
        row_count: {
            "real_runtime": actual_times_dict[reg_name][row_count],
            "theoretical_runtime": theoretical_times_dict[reg_name][row_count],
            "bytes": mem_dict[reg_name][row_count],
        }
        for row_count in metadata["rows_in_experiment"]
    }
    for reg_name in MODELS
}

# One table per measurement, built straight from the per-model dicts.
df_real_runtime = pd.DataFrame(actual_times_dict)
df_theoretical_runtime = pd.DataFrame(theoretical_times_dict)
df_memory = pd.DataFrame(mem_dict)

data_dump_path.mkdir(exist_ok=True)

# The first three entries of `output_fnames` are the csv names, in the same
# order as the frames listed here.
csv_frames = (df_real_runtime, df_theoretical_runtime, df_memory)
for frame, csv_name in zip(csv_frames, output_fnames):
    frame.to_csv(data_dump_path / csv_name, header=True, index=True)

# The fourth entry is the JSON file holding every field together.
json_target = data_dump_path / output_fnames[3]
with json_target.open("w") as json_file:
    dump(json_prep_dict, json_file)

print(f"All data has been dumped to: {data_dump_path.resolve()}")

All data has been dumped to: /Users/joshuaelms/Desktop/github_repos/Machine-Learning-Project/BetaDataExper/BigOTest/postprocessing/carbonate_run1/processed_output
