In [3]:
import json
import os
import re
from datetime import datetime

import pandas as pd

# Helper functions
def convert_to_bytes(size_str):
    """Parse a size cell into a float.

    Returns None when the cell is missing (as judged by ``pd.isna``); otherwise
    the value is handed to ``float`` as-is, with no unit scaling applied.
    """
    return None if pd.isna(size_str) else float(size_str)

def convert_to_gbps(bandwidth_str):
    """Scale a raw bandwidth cell down by 1e9 (e.g. byte/s -> GB/s).

    Missing cells (per ``pd.isna``) yield None.
    """
    if not pd.isna(bandwidth_str):
        # Dividing by 1e9 expresses the raw figure in giga-units
        return float(bandwidth_str) / 1e9
    return None

def convert_to_flops(flops_str):
    """Parse a throughput cell (FLOP/s or OP/s) into a float.

    The value is not rescaled; missing cells (per ``pd.isna``) yield None.
    """
    is_missing = pd.isna(flops_str)
    if is_missing:
        return None
    parsed = float(flops_str)
    return parsed

def clean_date(date_str):
    """Normalise a ``YYYY-MM-DD`` date string.

    Args:
        date_str: The raw cell value; expected to be a string in ``%Y-%m-%d`` form.

    Returns:
        The date re-formatted as a zero-padded ``YYYY-MM-DD`` string, or None when
        the cell is missing (per ``pd.isna``), is not a string, or does not match
        the expected format.
    """
    if pd.isna(date_str):
        return None
    try:
        # Round-trip through datetime so only real calendar dates survive
        return datetime.strptime(date_str, '%Y-%m-%d').strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: non-string input.
        # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
        return None

# Import epoch.ai hardware database to json
def process_csv_to_json(csv_path="../input/ML Hardware.csv", json_path="../input/hardware.json"):
    """Convert the "ML Hardware" CSV export into a nested JSON catalog.

    Each CSV row becomes one entry under ``output["hardware"]``, keyed by the ID
    returned by ``generate_hardware_id``. Rows that produce the same ID overwrite
    earlier ones. Missing (NaN) cells are written as JSON ``null`` so every entry
    exposes the same set of keys.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the location that was
            previously hard-coded.
        json_path: Path of the JSON file to write. Defaults to the location that
            was previously hard-coded.

    Returns:
        None. The catalog is written to ``json_path`` and a confirmation naming
        the output file is printed.

    Raises:
        KeyError: If an expected column is absent from the CSV.
        ValueError: If a non-missing cell cannot be converted to its target type
            (including a "Last modified" value not in ``%m/%d/%Y %I:%M%p`` form).
    """

    def _optional(value, cast=None):
        """Map a missing cell to None; otherwise return it, cast if requested."""
        if pd.isna(value):
            return None
        return value if cast is None else cast(value)

    def _parse_price(value):
        """Strip '$' and ',' from a price cell and parse the remainder as float."""
        return float(str(value).replace('$', '').replace(',', ''))

    def _to_giga(value):
        """Scale a raw figure down by 1e9."""
        return float(value) / 1e9

    def _parse_last_modified(value):
        """Reduce a '%m/%d/%Y %I:%M%p' timestamp to a 'YYYY-MM-DD' date string."""
        return datetime.strptime(value, '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d')

    # Read CSV file
    df = pd.read_csv(csv_path)

    # Output structure: hardware ID -> nested entry
    output = {"hardware": {}}

    # Process each row
    for _, row in df.iterrows():
        # Generate hardware ID
        hw_id = generate_hardware_id(row["Hardware name"], row["Manufacturer"])

        hardware_entry = {
            "basic_info": {
                "name": row["Hardware name"],
                "manufacturer": row["Manufacturer"],
                "type": _optional(row["Type"]),
                "release_date": clean_date(row["Release date"]),
                "release_price_USD": _optional(row["Release price (USD)"], _parse_price)
            },
            "performance": {
                "compute": {
                    "FLOPs": {
                        "FP64": convert_to_flops(row["FP64 (double precision) performance (FLOP/s)"]),
                        "FP32": convert_to_flops(row["FP32 (single precision) performance (FLOP/s)"]),
                        "FP16": convert_to_flops(row["FP16 (half precision) performance (FLOP/s)"]),
                        "TF32": convert_to_flops(row["TF32 (TensorFloat-32) performance (FLOP/s)"]),
                        "tensor_FP16_BF16": convert_to_flops(row["Tensor-FP16/BF16 performance (FLOP/s)"])
                    },
                    "TOPs": {
                        "INT16": convert_to_flops(row["INT16 performance (OP/s)"]),
                        "INT8": convert_to_flops(row["INT8 performance (OP/s)"]),
                        "INT4": convert_to_flops(row["INT4 performance (OP/s)"])
                    }
                },
                "memory": {
                    # Bytes -> GB
                    "capacity_GB": _optional(row["Memory size per board (Byte)"], _to_giga),
                    "bandwidth_GBps": convert_to_gbps(row["Memory bandwidth (byte/s)"])
                },
                "interconnect": {
                    "intranode_bandwidth_GBps": convert_to_gbps(row["Intranode bandwidth (byte/s)"]),
                    # bit/s -> Gbit/s (note: bits, unlike the byte-based fields above)
                    "internode_bandwidth_Gbps": _optional(row["Internode bandwidth (bit/s)"], _to_giga)
                }
            },
            "technical_specs": {
                "die_size_mm2": _optional(row["Die Size (mm^2)"], float),
                "tdp_W": _optional(row["TDP (W)"], float),
                "clocks": {
                    "base_MHz": _optional(row["Base clock (MHz)"], float),
                    "boost_MHz": _optional(row["Boost clock (MHz)"], float),
                    "memory_MHz": _optional(row["Memory clock (MHz)"], float)
                },
                "memory_bus_width": _optional(row["Memory bus (bit)"], int),
                "tensor_cores": _optional(row["Tensor cores"], int),
                "process": {
                    "node_nm": _optional(row["Process size (nm)"], int),
                    "foundry": _optional(row["Foundry"])
                },
                "transistor_count_M": _optional(row["Number of transistors (millions)"], float)
            },
            # Comma-separated model names -> list; a missing cell becomes an empty list
            "ml_models": [] if pd.isna(row["ML models"]) else [model.strip() for model in str(row["ML models"]).split(",")],
            "metadata": {
                "last_modified": _optional(row["Last modified"], _parse_last_modified),
                "sources": {
                    "datasheet_url": _optional(row["Link to datasheet"]),
                    "price_source": _optional(row["Source for the price"])
                }
            }
        }

        # NOTE: an earlier top-level "remove None values" filter was dropped here.
        # It never removed anything (every top-level value is a dict or a list),
        # so nested nulls have always been written; they are kept to leave the
        # output schema unchanged.

        # Add to output using ID as key
        output["hardware"][hw_id] = hardware_entry

    # Write to file
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    # Report the file name only, matching the message printed for the default path
    print(f"Conversion completed. Output saved to '{os.path.basename(json_path)}'")

# Hardware ID generation from Epoch data
def generate_hardware_id(name, manufacturer):
    """Build a lowercase, underscore-separated ID from a hardware name and maker.

    Both inputs are lowercased and every character outside ``[a-zA-Z0-9]`` is
    replaced by ``_`` (runs are not collapsed). The manufacturer is prepended
    unless its cleaned form already occurs somewhere in the cleaned name.
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    name_slug = non_alnum.sub('_', name.lower())
    maker_slug = non_alnum.sub('_', manufacturer.lower())

    # Only prefix the manufacturer when the name does not already carry it
    if maker_slug not in name_slug:
        return f"{maker_slug}_{name_slug}"
    return name_slug

In [4]:
# Run the conversion: reads "../input/ML Hardware.csv" and (re)writes
# ../input/hardware.json, which generate_system_card() reads later on.
process_csv_to_json()

Conversion completed. Output saved to 'hardware.json'


In [5]:
"""
# Read JSON file
with open('../input/hardware.json', 'r') as file:
    data = json.load(file)

# Flatten the nested structure, preserving the key
hardware_df = pd.json_normalize(
    [
        {**{'hardware_id': hardware_id}, **hardware_data} 
        for hardware_id, hardware_data in data['hardware'].items()
    ],
    record_path=None,
    meta=[
        'hardware_id',  # Add the key as a metadata field
        
        # Basic Info
        ['basic_info', 'name'],
        ['basic_info', 'manufacturer'],
        ['basic_info', 'type'],
        ['basic_info', 'release_date'],
        ['basic_info', 'release_price_USD'],
        
        # Performance - Compute FLOPs
        ['performance', 'compute', 'FLOPs', 'FP64'],
        ['performance', 'compute', 'FLOPs', 'FP32'],
        ['performance', 'compute', 'FLOPs', 'TF32'],
        ['performance', 'compute', 'FLOPs', 'tensor_FP16_BF16'],
        
        # Performance - Compute TOPs
        ['performance', 'compute', 'TOPs', 'INT8'],
        
        # Technical Specs
        ['technical_specs', 'tdp_W'],
        ['technical_specs', 'process', 'node_nm'],
        ['technical_specs', 'process', 'foundry'],
        
        # Metadata
        ['metadata', 'last_modified'],
        ['metadata', 'sources', 'datasheet_url']
    ]
)
"""

"\n# Read JSON file\nwith open('../input/hardware.json', 'r') as file:\n    data = json.load(file)\n\n# Flatten the nested structure, preserving the key\nhardware_df = pd.json_normalize(\n    [\n        {**{'hardware_id': hardware_id}, **hardware_data} \n        for hardware_id, hardware_data in data['hardware'].items()\n    ],\n    record_path=None,\n    meta=[\n        'hardware_id',  # Add the key as a metadata field\n        \n        # Basic Info\n        ['basic_info', 'name'],\n        ['basic_info', 'manufacturer'],\n        ['basic_info', 'type'],\n        ['basic_info', 'release_date'],\n        ['basic_info', 'release_price_USD'],\n        \n        # Performance - Compute FLOPs\n        ['performance', 'compute', 'FLOPs', 'FP64'],\n        ['performance', 'compute', 'FLOPs', 'FP32'],\n        ['performance', 'compute', 'FLOPs', 'TF32'],\n        ['performance', 'compute', 'FLOPs', 'tensor_FP16_BF16'],\n        \n        # Performance - Compute TOPs\n        ['performance', 

In [6]:
"""
# Load the JSON file
with open('../input/systems.json', 'r') as file:
    data = json.load(file)

# Flatten the nested structure
systems_df = pd.json_normalize(
    [
        {
            'system_id': system_id,
            'system_name': system_data['name'],
            'vendor': system_data['vendor'],
            'release_date': system_data['release_date'],
            'configuration_id': config_id,
            'accelerator_id': config_data['components']['accelerator']['id'],
            'accelerator_quantity': config_data['components']['accelerator']['quantity'],
            'intranode_interconnect': config_data['interconnect']['intranode'],
            'internode_interconnect': config_data['interconnect']['internode'],
            'internode_ports': config_data['interconnect']['internode_ports'],
            'form_factor': config_data['form_factor'],
            'msrp_usd': config_data['msrp_usd'],
            'source': config_data['source']
        }
        for system_id, system_data in data['systems'].items()
        for config_id, config_data in system_data['configurations'].items()
    ]
)
"""


"\n# Load the JSON file\nwith open('../input/systems.json', 'r') as file:\n    data = json.load(file)\n\n# Flatten the nested structure\nsystems_df = pd.json_normalize(\n    [\n        {\n            'system_id': system_id,\n            'system_name': system_data['name'],\n            'vendor': system_data['vendor'],\n            'release_date': system_data['release_date'],\n            'configuration_id': config_id,\n            'accelerator_id': config_data['components']['accelerator']['id'],\n            'accelerator_quantity': config_data['components']['accelerator']['quantity'],\n            'intranode_interconnect': config_data['interconnect']['intranode'],\n            'internode_interconnect': config_data['interconnect']['internode'],\n            'internode_ports': config_data['interconnect']['internode_ports'],\n            'form_factor': config_data['form_factor'],\n            'msrp_usd': config_data['msrp_usd'],\n            'source': config_data['source']\n        }\n    

In [7]:
"""
# Load the interconnect JSON
with open('../input/interconnect.json', 'r') as file:
    interconnect_data = json.load(file)

# Normalize Interconnects
interconnect_df = pd.json_normalize(
    [
        {**{'interconnect_id': interconnect_id}, **interconnect_data}
        for interconnect_id, interconnect_data in interconnect_data['interconnect'].items()
    ]
)
"""

"\n# Load the interconnect JSON\nwith open('../input/interconnect.json', 'r') as file:\n    interconnect_data = json.load(file)\n\n# Normalize Interconnects\ninterconnect_df = pd.json_normalize(\n    [\n        {**{'interconnect_id': interconnect_id}, **interconnect_data}\n        for interconnect_id, interconnect_data in interconnect_data['interconnect'].items()\n    ]\n)\n"

In [8]:
def generate_system_card(
    systems_json,
    hardware_json,
    interconnect_json,
    system_id
):
    """Assemble a summary "system card" for one system from the JSON catalogs.

    Args:
        systems_json: Path to a JSON file with a top-level ``"systems"`` mapping.
        hardware_json: Path to a JSON file with a top-level ``"hardware"`` mapping.
        interconnect_json: Path to a JSON file with a top-level ``"interconnect"``
            mapping. It is loaded (so a missing or malformed file still raises)
            but its contents are not used when building the card.
        system_id: Key of the system to describe within ``"systems"``.

    Returns:
        dict: identifiers, accelerator count, form factor, memory figures, a
        nested ``'FLOPs'`` dict, cost and total TDP. Optional fields whose source
        value is absent *or* JSON ``null`` are reported as ``'N/A'``; an absent or
        null TDP is counted as 0.

    Raises:
        KeyError: If ``system_id``, the referenced accelerator, or a required
            section is not present.
        IndexError: If the system has no configurations.
    """

    def _load_section(path, section):
        """Read a JSON file and return the mapping stored under ``section``."""
        # hardware.json is written as UTF-8 with ensure_ascii=False, so decode
        # explicitly instead of relying on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)[section]

    def _value_or(mapping, key, default):
        """Like dict.get, but a stored None (JSON null) also yields ``default``."""
        value = mapping.get(key)
        return default if value is None else value

    # Load JSON files
    systems_data = _load_section(systems_json, 'systems')
    hardware_data = _load_section(hardware_json, 'hardware')
    # Loaded for validation only; nothing below consults the interconnect catalog
    _load_section(interconnect_json, 'interconnect')

    # Extract system information
    system = systems_data[system_id]

    # Get first (and typically only) configuration
    configuration = list(system['configurations'].values())[0]

    # Get accelerator details
    accelerator_ref = configuration['components']['accelerator']
    accelerator_id = accelerator_ref['id']
    quantity = accelerator_ref['quantity']
    accelerator = hardware_data[accelerator_id]

    memory = accelerator['performance']['memory']
    flops = accelerator['performance']['compute']['FLOPs']

    # Construct system card
    system_card = {
        # Identifiers
        'System ID': system_id,
        'Accelerator ID': accelerator_id,
        'Intranode Interconnect': configuration['interconnect']['intranode'],
        'Internode Interconnect': configuration['interconnect']['internode'],

        # Quantity and Physical Specs
        'Num. Accelerators': quantity,
        'Rack Units': configuration['form_factor'],

        # Memory Details
        'Memory (GB)': _value_or(memory, 'capacity_GB', 'N/A'),
        'Memory Bandwidth (GB/s)': _value_or(memory, 'bandwidth_GBps', 'N/A'),

        # Performance (FLOPs)
        'FLOPs': {
            'FP64': _value_or(flops, 'FP64', 'N/A'),
            'FP32': _value_or(flops, 'FP32', 'N/A'),
            'TF32': _value_or(flops, 'TF32', 'N/A'),
            'Tensor FP16/BF16': _value_or(flops, 'tensor_FP16_BF16', 'N/A')
        },

        # Cost and Power
        'Cost': _value_or(configuration, 'msrp_usd', 'N/A'),
        # Per-accelerator TDP times accelerator count; a null TDP must not
        # reach the multiplication (None * int raises TypeError).
        'Max TDP (Watts)': _value_or(accelerator['technical_specs'], 'tdp_W', 0) * quantity
    }

    return system_card

In [9]:
# Pretty-printer for system cards, followed by an example run
def print_system_card(system_card):
    """Print a system card as a bulleted list on stdout.

    Scalar entries are written as ``* key: value``. Entries whose value is a dict
    get a ``* key:`` heading followed by one indented ``-`` line per sub-entry.
    """
    print("> SYSTEM CARD:")
    print("--------------------------------")
    for label, entry in system_card.items():
        if not isinstance(entry, dict):
            print('*', f"{label}: {entry}")
            continue
        # Nested section: heading first, then its items one level deeper
        print('*', f"{label}:")
        for inner_label, inner_entry in entry.items():
            print(' ', "-", f"  {inner_label}: {inner_entry}")

# Generate and print system card
# Builds the card for the 'nvidia_dgx_h100' entry of systems.json; its accelerator
# is looked up in hardware.json, so that file (written by process_csv_to_json())
# must already exist under ../input/.
system_card = generate_system_card(
    '../input/systems.json', 
    '../input/hardware.json', 
    '../input/interconnect.json', 
    'nvidia_dgx_h100'
)

# Emit the card to stdout in the bulleted text layout
print_system_card(system_card)

> SYSTEM CARD:
--------------------------------
* System ID: nvidia_dgx_h100
* Accelerator ID: nvidia_h100_sxm5_80gb
* Intranode Interconnect: nvlink_4.0
* Internode Interconnect: infiniband_ndr_400
* Num. Accelerators: 8
* Rack Units: 5U
* Memory (GB): 80.0
* Memory Bandwidth (GB/s): 3350.0
* FLOPs:
  -   FP64: 33450000000000.0
  -   FP32: 66910000000000.0
  -   TF32: 494500000000000.0
  -   Tensor FP16/BF16: 989500000000000.0
* Cost: 350000
* Max TDP (Watts): 5600.0
