In [None]:
# @title 1. Environment setup and simulator import
import os
import sys
import importlib.util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project path configuration.
DRIVE_FOLDER = "/content/drive/My Drive/projects/TensorMorph"
LOCAL_FOLDER = "/content/tensormorph_local"

# Mount Drive for source access.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Workspace initialization.
os.makedirs(f"{LOCAL_FOLDER}/experimental", exist_ok=True)
os.makedirs(f"{LOCAL_FOLDER}/data", exist_ok=True)

# Syncing local experimental scripts.
print("Syncing experimental scripts from Drive...")
!rsync -av --progress "{DRIVE_FOLDER}/experimental/" "{LOCAL_FOLDER}/experimental/"

os.chdir(LOCAL_FOLDER)

# Direct path import for schema.py to bypass sys.path caching issues.
schema_path = os.path.join(LOCAL_FOLDER, "experimental/schema.py")
spec = importlib.util.spec_from_file_location("schema", schema_path)
schema = importlib.util.module_from_spec(spec)
try:
    spec.loader.exec_module(schema)
    global FEATURES, TARGET
    FEATURES = schema.FEATURES
    TARGET = schema.TARGET
    print(f"Schema loaded from {schema_path}: {len(FEATURES)} features identified.")
except Exception as e:
    print(f"Error: Failed to execute schema.py: {e}")

# Load simulator module.
if f"{LOCAL_FOLDER}/experimental" not in sys.path:
    sys.path.insert(0, f"{LOCAL_FOLDER}/experimental")

try:
    import simulator
    importlib.reload(simulator)
    from simulator import HardwareScenario, generate_balanced_workload
    print("Simulator module loaded successfully.")
except Exception as e:
    print(f"Error: Failed to load simulator: {e}")

print(f"Working directory: {os.getcwd()}")

In [None]:
# @title 2. Scenario generation and labeling

# 1. Define hardware targets for data labeling.
# Memory-bound target: flops_per_sec (1e12) is far larger than bytes_per_sec (1e9).
mem_scenario = HardwareScenario(
    "Memory_Bound",
    flops_per_sec=1e12,
    bytes_per_sec=1e9,
)

# Compute-bound target: the mirror image, bytes_per_sec (1e12) dwarfs flops_per_sec (1e9).
compute_scenario = HardwareScenario(
    "Compute_Bound",
    flops_per_sec=1e9,
    bytes_per_sec=1e12,
)

def generate_and_label_dataset(n, scenario):
    """Generate a workload of ``n`` samples and label it for ``scenario``.

    The workload comes from ``generate_balanced_workload`` (called with
    ``scenario.name`` as ``scenario_type``). Every row is passed to
    ``scenario.run_sim``; element 0 of its result is stored as
    ``latency_base`` and element 1 as ``latency_fused``.

    Args:
        n: Number of samples requested from the generator.
        scenario: Object exposing ``name`` and ``run_sim(row)``.

    Returns:
        A copy of the generated frame with these columns added:
        ``latency_base``, ``latency_fused``, ``profit_ratio``
        (base / fused), ``target`` (1 when ``profit_ratio > 1.0``, else 0)
        and ``hw_id`` (the scenario name).
    """
    # Workload rows come from the shared generator in the simulator module.
    workload = generate_balanced_workload(n=n, scenario_type=scenario.name)

    # Row-wise simulation provides the ground-truth latencies.
    print(f"Labeling {n} samples for {scenario.name}...")
    sim_out = workload.apply(scenario.run_sim, axis=1, result_type='expand')

    labeled = workload.copy()
    labeled['latency_base'] = sim_out[0]
    labeled['latency_fused'] = sim_out[1]

    speedup = labeled['latency_base'] / labeled['latency_fused']
    labeled['profit_ratio'] = speedup

    # target=1 marks rows where the fused latency beats the baseline.
    labeled['target'] = (labeled['profit_ratio'] > 1.0).astype(int)
    labeled['hw_id'] = scenario.name
    return labeled

# 2. Build training sets.
df_mem = generate_and_label_dataset(15000, mem_scenario)
df_comp = generate_and_label_dataset(15000, compute_scenario)

# 3. Save and sync back to Drive.
df_mem.to_csv("data/dataset_memory_bound.csv", index=False)
df_comp.to_csv("data/dataset_compute_bound.csv", index=False)

!mkdir -p "{DRIVE_FOLDER}/data"
!cp -r data/* "{DRIVE_FOLDER}/data/"

print("\nDatasets generated and synced for training.")
display(df_mem.head())

In [None]:
# @title 3. Distribution and balance analysis

# Overlay the profit-ratio distributions of both scenarios.
# Both histograms share one set of bin edges: with an independent `bins=50`
# per call, each series gets its own bin width and the overlaid bar heights
# cannot be compared against each other.
ratio_bin_edges = np.histogram_bin_edges(
    np.concatenate([
        df_mem['profit_ratio'].to_numpy(),
        df_comp['profit_ratio'].to_numpy(),
    ]),
    bins=50,
)

plt.figure(figsize=(12, 6))
plt.hist(df_mem['profit_ratio'], bins=ratio_bin_edges, alpha=0.5, label='Memory-Bound', color='steelblue')
plt.hist(df_comp['profit_ratio'], bins=ratio_bin_edges, alpha=0.5, label='Compute-Bound', color='darkorange')
# Break-even marker at ratio 1.0; labeled so it shows up in the legend.
plt.axvline(1.0, color='red', linestyle='--', linewidth=2, label='Break-even (ratio = 1.0)')

plt.title("Optimization profit distribution across scenarios")
plt.xlabel("Profit ratio (Higher = Fusion is better)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.show()

def check_balance(df, label):
    """Print a one-line fuse/veto split for ``df``.

    Args:
        df: Frame with a ``target`` column; its sum is taken as the fuse count.
        label: Text printed before the percentages.
    """
    n_rows = len(df)
    n_fuse = df['target'].sum()
    fuse_share = n_fuse / n_rows
    veto_share = (n_rows - n_fuse) / n_rows
    print(f"{label}: {fuse_share:.1%} Fuses, {veto_share:.1%} Vetoes")

# One summary line per scenario dataset.
check_balance(df=df_mem, label="Memory-bound")
check_balance(df=df_comp, label="Compute-bound")

In [None]:
# @title 4. Target distribution analysis

def analyze_target_balance(df, label):
    """Print sample count and fuse/veto counts with percentages for ``df``.

    Args:
        df: Frame with a ``target`` column; its sum is taken as the fuse count.
        label: Scenario name shown in the section header.
    """
    n_total = len(df)
    n_fuse = df['target'].sum()
    n_veto = n_total - n_fuse

    report_lines = [
        f"\n--- {label} Scenario ---",
        f"Total samples: {n_total}",
        f"Fuses  (target=1): {n_fuse} ({n_fuse/n_total:.1%})",
        f"Vetoes (target=0): {n_veto} ({n_veto/n_total:.1%})",
    ]
    print("\n".join(report_lines))

# Detailed target breakdown for each scenario dataset.
analyze_target_balance(df=df_mem, label="Memory-bound")
analyze_target_balance(df=df_comp, label="Compute-bound")