In [1]:
from loaders import *

In [42]:
config = dict(
    matrix_size_M_dim=2048,
    matrix_size_K_dim=2048,
    matrix_size_N_dim=2048,
    
    DRAM_factor_M=1024,
    DRAM_factor_N=2048,
    DRAM_factor_K=2048,
    DRAM_permutation=['K', 'N', 'M'],
    
    ClusterArray_factor_M=2,
    ClusterArray_factor_N=1,
    ClusterArray_factor_K=1,
    ClusterArray_permutation=['K', 'N', 'M'],
    
    GLB_Cluster_factor_M=1,
    GLB_Cluster_factor_N=1,
    GLB_Cluster_factor_K=1,
    GLB_Cluster_permutation=['K', 'N', 'M'],

    glb_factor_M=1, 
    glb_factor_N=1, 
    glb_factor_K=1, 
    glb_permutation=['K', 'N', 'M'],

    PE_Cluster_factor_M=1,
    PE_Cluster_factor_N=1,
    PE_Cluster_factor_K=1,
    PE_Cluster_permutation=['K', 'N', 'M'],

    iact_spad_factor_M=1,
    iact_spad_factor_N=1,
    iact_spad_factor_K=1,
    iact_spad_permutation=['K', 'N', 'M'],

    weight_spad_factor_M=1,
    weight_spad_factor_N=1,
    weight_spad_factor_K=1,
    weight_spad_permutation=['K', 'N', 'M'],

    psum_factor_M=1,
    psum_factor_N=1,
    psum_factor_K=1,
    psum_permutation=['K', 'N', 'M'],

    reg_factor_M=1,
    reg_factor_N=1,
    reg_factor_K=1,
    reg_permutation=['K', 'N', 'M'],

    density_weights=1, 
    density_inputs=1
) 

In [43]:
'''
Modeling lower triangular matrices: 
|--|--|
|Q1|Q2|
|--|--|
|Q4|Q3|
|__|__|

Q2: all-zeroes -- input density = 0
Q4: input density = 1 (actual values)
Q1 & Q3: input density = 0.5
'''

q1_config, q2_config, q4_config = config.copy(), config.copy(), config.copy()
q1_config['density_inputs'] = 0.5
q2_config['density_inputs'] = 0
q4_config['density_inputs'] = 1.0

In [48]:
q1_out = run_timeloop_model(
    q1_config,
    problem='designs/attention/quadrant_problem.yaml',
    mapping='designs/baseline/baseline_mapping.yaml',
    sparse_optimizations='designs/baseline/baseline_sparse_opt.yaml', 
)
q1_stats = open('./output_dir/timeloop-model.stats.txt', 'r').read()
# print(q1_stats)

q2_out = run_timeloop_model(
    q2_config,
    problem='designs/attention/quadrant_problem.yaml',
    mapping='designs/baseline/baseline_mapping.yaml',
    sparse_optimizations='designs/baseline/baseline_sparse_opt.yaml', 
)
q2_stats = open('./output_dir/timeloop-model.stats.txt', 'r').read()

q4_out = run_timeloop_model(
    q4_config,
    problem='designs/attention/quadrant_problem.yaml',
    mapping='designs/baseline/baseline_mapping.yaml',
    sparse_optimizations='designs/baseline/baseline_sparse_opt.yaml', 
)

q4_stats = open('./output_dir/timeloop-model.stats.txt', 'r').read()

[INFO] 2025-04-29 17:39:01,903 - pytimeloop.accelergy_interface - Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


INFO:pytimeloop.accelergy_interface:Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


[INFO] 2025-04-29 17:39:05,590 - pytimeloop.accelergy_interface - Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


INFO:pytimeloop.accelergy_interface:Running Accelergy with command: accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml -o ./output_dir/ -v


Exception: 

========================================================================================================================
Timeloop model failed with return code 134. Please check the output files in ./output_dir for more information. To debug, you can edit the file:
	./output_dir/parsed-processed-input.yaml
and run 
	timeloop model ./output_dir/parsed-processed-input.yaml
to see the error. If you're running the mapper and Timeloop can't find a vaild mapping, try setting 'diagnostics: true' in the mapper input specification.

In [51]:
!timeloop model ./output_dir/parsed-processed-input.yaml

Running apps: model
Found parsed-processed-input.yaml in input files. Running Timeloop without parsing or processing steps. If this is not the intended behavior, please name the input files differently.
input file: /home/workspace/final_project/output_dir/parsed-processed-input.yaml
execute:/usr/local/bin/accelergy /home/workspace/final_project/output_dir/parsed-processed-input.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
timeloop-model: src/sparse-analysis/compute-gs-analyzer.cpp:378: void sparse::CalculateFineGrainedComputeAccesses2Operand(const sparse::SparseAnalysisState&, tiling::CompoundTileNest&): Assertion `abs(skipped_compute + random_compute + gated_compute + nonexistent_compute - total_compute) < total_compute * 0.00001' failed.
Aborted


In [26]:
import re
from collections import defaultdict

def parse_timeloop_stats(content):
    
    results = {
        'energy_per_compute_fJ': {},      # fJ/Compute per component
        'total_energy_uJ': None,          # Total energy
        'memory_energy_pJ': {},           # Total memory energy in pJ
        'memory_traffic': {},             # Scalar accesses per memory level
        'utilization_percent': {},        # Utilization per component and overall
        'computes_per_cycle': None,       # Throughput
        'actual_computes': None,          # Needed for later calc
        'cycles': None                    # Needed for later calc
    }

    # 1. Energy per compute (fJ/Compute)
    energy_compute_matches = re.findall(r"fJ/Compute\s+((?:.|\n)+?)\n\n", content)
    if energy_compute_matches:
        for line in energy_compute_matches[0].split('\n'):
            match = re.match(r"\s*(\S+)\s*=\s*([\d.]+)", line)
            if match:
                component, value = match.groups()
                results['energy_per_compute_fJ'][component] = float(value)

    # 2. Total energy (Summary Stats)
    total_energy_match = re.search(r"Energy:\s+([\d.]+)\s*uJ", content)
    if total_energy_match:
        results['total_energy_uJ'] = float(total_energy_match.group(1))

    # 3. Actual Computes and Cycles
    actual_compute_match = re.search(r"Actual Computes\s*=\s*(\d+)", content)
    if actual_compute_match:
        results['actual_computes'] = int(actual_compute_match.group(1))

    cycle_match = re.search(r"Cycles:\s*(\d+)", content)
    if cycle_match:
        results['cycles'] = int(cycle_match.group(1))

    if results['actual_computes'] and results['cycles']:
        results['computes_per_cycle'] = results['actual_computes'] / results['cycles']

    # 4. Memory traffic (scalar accesses per level)
    memory_traffic_matches = re.findall(r"=== (\w+) ===\n\s*Total scalar accesses\s*:\s*(\d+)", content)
    for level, accesses in memory_traffic_matches:
        results['memory_traffic'][level] = int(accesses)

    # 5. Memory energy: per component from `Energy (total)` lines
    energy_total_matches = re.findall(r"=== (\w+) ===.+?Energy \(total\)\s*:\s*([\d.]+)\s*pJ", content, re.DOTALL)
    for level, energy in energy_total_matches:
        results['memory_energy_pJ'][level] = float(energy)

    # 6. Utilization (MAC unit + overall)
    mac_util_match = re.search(r"MAC.+?Utilized instances \(average\)\s*:\s*([\d.]+)", content, re.DOTALL)
    if mac_util_match:
        results['utilization_percent']['MAC'] = float(mac_util_match.group(1))

    overall_util_match = re.search(r"Utilization:\s*([\d.]+)%", content)
    if overall_util_match:
        results['utilization_percent']['overall'] = float(overall_util_match.group(1))

    return results


parse_timeloop_stats(q1_stats)

{'energy_per_compute_fJ': {'MAC': 15574.6,
  'reg': 51.66,
  'psum_spad': 574.69,
  'weight_spad': 2036.82,
  'iact_spad': 215.6,
  'glb': 12124.79,
  'DRAM': 192062.5,
  'Total': 222640.65},
 'total_energy_uJ': 956234.3,
 'memory_energy_pJ': {'MAC': 66892397648.28,
  'reg': 221861689.64,
  'psum_spad': 2468241020.55,
  'weight_spad': 8748023748.23,
  'iact_spad': 925981205.12,
  'glb': 34711410290.2,
  'DRAM': 549755813888.0},
 'memory_traffic': {'reg': 8589934592,
  'psum_spad': 17175674880,
  'weight_spad': 17179869184,
  'iact_spad': 8589934592,
  'glb': 25773998080,
  'DRAM': 17184063488},
 'utilization_percent': {'MAC': 1.0, 'overall': 0.06},
 'computes_per_cycle': 1.0,
 'actual_computes': 4294967296,
 'cycles': 4294967296}

In [27]:
from collections import defaultdict

def aggregate_timeloop_runs(parsed_runs):
    
    aggregate = {
        'total_energy_uJ': 0.0,
        'memory_energy_pJ': defaultdict(float),
        'memory_traffic': defaultdict(int),
        'energy_per_compute_fJ': {},
        'utilization_percent': {},
        'computes_per_cycle': 0.0,
    }

    total_computes = 0
    total_cycles = 0
    fj_weighted_sums = defaultdict(float)
    fj_total_computes = defaultdict(float)
    utilization_weighted = defaultdict(float)
    utilization_weights = defaultdict(float)

    for run in parsed_runs:
        # Sum total energy
        aggregate['total_energy_uJ'] += run['total_energy_uJ']

        # Sum memory energy
        for level, energy in run['memory_energy_pJ'].items():
            aggregate['memory_energy_pJ'][level] += energy

        # Sum memory traffic
        for level, accesses in run['memory_traffic'].items():
            aggregate['memory_traffic'][level] += accesses

        # fJ/Compute (weighted by actual computes)
        computes = run['actual_computes']
        for comp, fj in run['energy_per_compute_fJ'].items():
            fj_weighted_sums[comp] += fj * computes
            fj_total_computes[comp] += computes

        # Utilization (weighted by cycles)
        if run['cycles']:
            for key, util in run['utilization_percent'].items():
                utilization_weighted[key] += util * run['cycles']
                utilization_weights[key] += run['cycles']

        # Total actual computes and cycles for throughput
        total_computes += computes
        total_cycles += run['cycles']

    # Finalize fJ/Compute
    for comp in fj_weighted_sums:
        aggregate['energy_per_compute_fJ'][comp] = fj_weighted_sums[comp] / fj_total_computes[comp]

    # Finalize utilization
    for key in utilization_weighted:
        aggregate['utilization_percent'][key] = utilization_weighted[key] / utilization_weights[key]

    # Finalize computes per cycle
    if total_cycles > 0:
        aggregate['computes_per_cycle'] = total_computes / total_cycles

    # Attach raw totals (in case needed later)
    aggregate['total_actual_computes'] = total_computes
    aggregate['total_cycles'] = total_cycles

    return aggregate

In [30]:
all_stats = [q1_stats, q2_stats, q1_stats, q4_stats]
parsed_runs = [parse_timeloop_stats(stats) for stats in all_stats]

combined_stats = aggregate_timeloop_runs(parsed_runs)

combined_stats

{'total_energy_uJ': 3825666.6900000004,
 'memory_energy_pJ': defaultdict(float,
             {'MAC': 267703375394.77,
              'reg': 887890481.93,
              'psum_spad': 9877900564.310001,
              'weight_spad': 34996098332.13,
              'iact_spad': 3705776782.8900003,
              'glb': 138845641160.8,
              'DRAM': 2199023255552.0}),
 'memory_traffic': defaultdict(int,
             {'reg': 34376918237,
              'psum_spad': 68702699520,
              'weight_spad': 68719476736,
              'iact_spad': 34376918237,
              'glb': 103113172189,
              'DRAM': 68736253952}),
 'energy_per_compute_fJ': {'MAC': 15574.6,
  'reg': 51.66,
  'psum_spad': 574.69,
  'weight_spad': 2036.0345113587853,
  'iact_spad': 215.6,
  'glb': 12120.749191202442,
  'DRAM': 191998.5007522446,
  'Total': 222571.82445480584},
 'utilization_percent': {'MAC': 1.0, 'overall': 0.06},
 'computes_per_cycle': 1.0,
 'total_actual_computes': 17188459119,
 'total_cycles