In [None]:
# @title 1. Environment setup and simulator import
import os
import sys
import importlib.util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project path configuration.
DRIVE_FOLDER = "/content/drive/My Drive/projects/TensorMorph"
LOCAL_FOLDER = "/content/tensormorph_local"

# Mount Drive for source access.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Workspace initialization.
os.makedirs(f"{LOCAL_FOLDER}/experimental", exist_ok=True)
os.makedirs(f"{LOCAL_FOLDER}/data", exist_ok=True)

# Syncing local experimental scripts.
print("Syncing experimental scripts from Drive...")
!rsync -av --progress "{DRIVE_FOLDER}/experimental/" "{LOCAL_FOLDER}/experimental/"

os.chdir(LOCAL_FOLDER)

# Load schema.py straight from its file path to bypass sys.path caching issues,
# so the freshly synced copy is the one that gets executed.
schema_path = os.path.join(LOCAL_FOLDER, "experimental/schema.py")
spec = importlib.util.spec_from_file_location("schema", schema_path)
schema = importlib.util.module_from_spec(spec)
try:
    spec.loader.exec_module(schema)
    # Cell code already runs at module scope, so plain assignments publish
    # these names to later cells; no `global` statement is required.
    FEATURES = schema.FEATURES
    TARGET = schema.TARGET
    print(f"Schema loaded from {schema_path}: {len(FEATURES)} features identified.")
except Exception as e:
    print(f"Error: Failed to execute schema.py: {e}")
    # Re-raise instead of continuing: later cells need FEATURES/TARGET, and a
    # swallowed failure would let them run against stale values from an earlier
    # execution of this cell (or fail later with an unrelated NameError).
    raise

# Make the synced experimental directory importable (inserted once, at the front).
experimental_dir = f"{LOCAL_FOLDER}/experimental"
if experimental_dir not in sys.path:
    sys.path.insert(0, experimental_dir)

try:
    # Reload only when the module is already imported (i.e. this cell is being
    # re-run), so freshly synced edits are picked up without executing the
    # module twice on the very first import.
    if "simulator" in sys.modules:
        simulator = importlib.reload(sys.modules["simulator"])
    else:
        import simulator
    from simulator import HardwareScenario
    print("Simulator module loaded successfully.")
except Exception as e:
    print(f"Error: Failed to load simulator: {e}")
    # Re-raise: the labeling cell needs HardwareScenario, so continuing would
    # only defer the failure to a less informative NameError.
    raise
finally:
    # Always report the working directory, including on failure, where it
    # helps diagnose path problems.
    print(f"Working directory: {os.getcwd()}")

In [None]:
# @title 2. Feature vector generation

def generate_random_scenarios(n, feature_schema, seed=None):
    """
    Generates synthetic tensor scenarios mapped to the shared schema.

    Args:
        n: Number of scenarios (rows) to generate.
        feature_schema: Ordered feature names; they become the DataFrame
            columns, in this order.
        seed: Optional integer seed. When given, sampling uses a private
            ``np.random.RandomState(seed)`` so repeated calls return identical
            data. When None (default), the global ``np.random`` state is used.

    Returns:
        pd.DataFrame with ``n`` rows and one column per entry of
        ``feature_schema``.

    Raises:
        ValueError: If ``feature_schema`` names a feature this generator does
            not produce (previously such a column was silently filled with None).
    """
    spatial_dims = [7, 14, 28, 56, 112, 224]
    channels = [3, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024]
    known_features = (
        'in_h', 'in_w', 'in_c', 'out_c',
        'kernel', 'stride', 'is_dw',
        'chain_len', 'has_act',
    )

    # Schema alignment check: fail loudly rather than emitting all-None columns.
    unknown = [feat for feat in feature_schema if feat not in known_features]
    if unknown:
        raise ValueError(
            f"feature_schema contains features the generator does not produce: {unknown}"
        )

    # The np.random module and RandomState expose the same choice/randint API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = []
    for _ in range(n):
        # The draw order is kept fixed so a given random state always yields
        # the same scenarios.
        dim = rng.choice(spatial_dims)
        ic = rng.choice(channels)
        is_dw = rng.choice([0, 1], p=[0.7, 0.3])
        # When is_dw is set, the output channel count mirrors the input.
        oc = ic if is_dw else rng.choice(channels)
        k = rng.choice([1, 3, 5, 7])
        s = rng.choice([1, 2])
        cl = rng.randint(1, 6)  # upper bound exclusive: values 1..5
        act = rng.choice([0, 1])

        scenario = {
            'in_h': dim, 'in_w': dim, 'in_c': ic, 'out_c': oc,
            'kernel': k, 'stride': s, 'is_dw': is_dw,
            'chain_len': cl, 'has_act': act
        }

        # Emit values in schema order so columns line up with feature_schema.
        data.append([scenario[feat] for feat in feature_schema])

    return pd.DataFrame(data, columns=feature_schema)

# Pool size and seed are named so they are easy to tune; the fixed seed makes
# a Restart & Run All regenerate exactly the same pool.
N_SCENARIOS = 15000
RANDOM_SEED = 42

if 'FEATURES' not in globals():
    print("Error: FEATURES undefined. Ensure Cell 1 executed successfully.")
else:
    # generate_random_scenarios draws from NumPy's global random state by
    # default, so seeding it here pins the generated scenarios.
    np.random.seed(RANDOM_SEED)
    raw_df = generate_random_scenarios(N_SCENARIOS, FEATURES)
    print(f"Pool generated: {len(raw_df)} samples. Columns aligned with schema.py.")
    display(raw_df.head())

In [None]:
# @title 3. Scenario generation and labeling

# Two opposite hardware profiles built from the same pair of rates (1e12 and 1e9).

# Memory-bound profile: high compute rate paired with low memory bandwidth.
mem_scenario = HardwareScenario("Memory_Bound", flops_per_sec=1e12, bytes_per_sec=1e9)

# Compute-bound profile: low compute rate paired with high memory bandwidth.
compute_scenario = HardwareScenario("Compute_Bound", flops_per_sec=1e9, bytes_per_sec=1e12)

def label_data(df, scenario):
    """Return a labeled copy of ``df`` for one hardware scenario.

    ``scenario.run_sim`` is invoked once per row; the first two values it
    returns are stored as ``latency_base`` and ``latency_fused``. The result
    also carries ``profit_ratio`` (base latency divided by fused latency) and
    ``hw_id`` (the scenario's ``name``). The input frame is not modified.
    """
    # Row-wise simulation; 'expand' spreads each returned sequence over
    # integer-named columns (0, 1, ...).
    sim_out = df.apply(scenario.run_sim, axis=1, result_type='expand')
    base_latency = sim_out[0]
    fused_latency = sim_out[1]

    # assign() builds a new frame, appending the columns in keyword order.
    return df.assign(
        latency_base=base_latency,
        latency_fused=fused_latency,
        profit_ratio=base_latency / fused_latency,
        hw_id=scenario.name,
    )

# Generate labeled datasets for both scenarios.
df_mem = label_data(raw_df, mem_scenario)
df_comp = label_data(raw_df, compute_scenario)

# Save the results to CSV files locally.
df_mem.to_csv("data/dataset_memory_bound.csv", index=False)
df_comp.to_csv("data/dataset_compute_bound.csv", index=False)

# Sync the generated data back to Google Drive.
!mkdir -p "{DRIVE_FOLDER}/data"
!cp -r data/* "{DRIVE_FOLDER}/data/"

print(f"Saved and synced records for {mem_scenario.name} and {compute_scenario.name}.")

In [None]:
# @title 4. Scenario distribution analysis

# Overlay both profit-ratio histograms on one axes for direct comparison.
fig, ax = plt.subplots(figsize=(12, 6))

# Memory-bound distribution (blue) and compute-bound distribution (orange).
ax.hist(df_mem['profit_ratio'], bins=50, alpha=0.5, label='Memory-Bound', color='steelblue')
ax.hist(df_comp['profit_ratio'], bins=50, alpha=0.5, label='Compute-Bound', color='darkorange')

# Dashed red reference line at a ratio of 1.0, where base and fused latency are equal.
ax.axvline(1.0, color='red', linestyle='--', linewidth=2)

# Title, axis labels, legend and a light horizontal grid.
ax.set_title("Optimization profit across hardware scenarios")
ax.set_xlabel("Profit ratio")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.show()

# Median profit ratio per scenario, printed for a quick numeric check.
mem_median = df_mem['profit_ratio'].median()
comp_median = df_comp['profit_ratio'].median()
print(f"Memory-Bound Median: {mem_median:.2f}")
print(f"Compute-Bound Median: {comp_median:.2f}")