# 🏭 Notebook 0: Synthetic Wafer Manufacturing Data Generation

## Overview

This notebook generates synthetic semiconductor wafer manufacturing data for the **Snowflake ML wafer yield forecasting demo**.

### Tables Generated

| Table | Description | Rows |
|-------|-------------|------|
| `WAFER_PROCESS_DATA` | Process telemetry per wafer per step | ~750K |
| `WAFER_DEFECT_LOGS` | Defect inspection results | ~150K |
| `FINAL_YIELD_LABELS` | Yield outcomes (ML target) | ~50K |

### Prerequisites

Run `setup/01_snowflake_setup.sql` first to create the database and tables.


In [None]:
# ============================================================================
# IMPORTS & SEED
# ============================================================================

import json
import os
from datetime import datetime, timedelta
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd

from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, lit, parse_json

# Reproducibility: seed NumPy's legacy global RNG once, up front. Every
# np.random.* call in the generator cells below draws from this single shared
# stream, so the generated data only repeats when the cells are run in order
# after this one.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print(f"‚úÖ Imports complete. Seed: {RANDOM_SEED}")


In [None]:
# ============================================================================
# SNOWFLAKE SESSION
# ============================================================================

# Prefer the session that is already active (e.g. inside a Snowflake Notebook);
# otherwise fall back to building one for local development.
try:
    from snowflake.snowpark.context import get_active_session
    session = get_active_session()
    print("‚úÖ Using Snowflake Notebook session")
except Exception as active_session_error:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate instead of silently triggering the fallback.
    #
    # Local development: credentials are read from environment variables so
    # that no secret is ever typed into (or committed with) the notebook.
    credential_env_vars = {
        "account": "SNOWFLAKE_ACCOUNT",
        "user": "SNOWFLAKE_USER",
        "password": "SNOWFLAKE_PASSWORD",
        "role": "SNOWFLAKE_ROLE",
    }
    missing_env_vars = [
        env_name for env_name in credential_env_vars.values()
        if not os.environ.get(env_name)
    ]
    if missing_env_vars:
        raise RuntimeError(
            "No active Snowflake session was found and the following environment "
            "variables are not set: " + ", ".join(missing_env_vars)
        ) from active_session_error

    connection_parameters = {
        param: os.environ[env_name] for param, env_name in credential_env_vars.items()
    }
    connection_parameters.update({
        "warehouse": "WAFER_DEMO_WH",
        "database": "WAFER_YIELD_DEMO",
        "schema": "RAW_DATA",
    })
    session = Session.builder.configs(connection_parameters).create()
    print("‚úÖ Created local session")

# Set context explicitly: a notebook-provided session may point at a different
# database / schema / warehouse than the one this demo writes to.
session.sql("USE DATABASE WAFER_YIELD_DEMO").collect()
session.sql("USE SCHEMA RAW_DATA").collect()
session.sql("USE WAREHOUSE WAFER_DEMO_WH").collect()
print(f"üìç Database: WAFER_YIELD_DEMO.RAW_DATA")


In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG = dict(
    # Population size: 2,000 lots x 25 wafers per lot = 50,000 wafers.
    num_wafers=50_000,
    num_lots=2_000,
    wafers_per_lot=25,

    # Process flow, in the order the generators iterate over it.
    process_steps=[
        "CLEAN",
        "OXIDATION",
        "LITHOGRAPHY_1",
        "ETCH_1",
        "DEPOSITION_1",
        "CMP_1",
        "LITHOGRAPHY_2",
        "IMPLANT",
        "ANNEAL",
        "DEPOSITION_2",
        "ETCH_2",
        "CMP_2",
        "METALLIZATION",
        "PASSIVATION",
        "FINAL_TEST",
    ],
    num_equipment_per_step=5,

    # (min, max) bounds from which each step's target temperature is sampled.
    temp_ranges=dict(
        CLEAN=(20, 80),
        OXIDATION=(800, 1100),
        LITHOGRAPHY_1=(20, 25),
        ETCH_1=(20, 200),
        DEPOSITION_1=(300, 600),
        CMP_1=(20, 40),
        LITHOGRAPHY_2=(20, 25),
        IMPLANT=(20, 400),
        ANNEAL=(900, 1050),
        DEPOSITION_2=(350, 650),
        ETCH_2=(20, 250),
        CMP_2=(20, 40),
        METALLIZATION=(200, 450),
        PASSIVATION=(300, 500),
        FINAL_TEST=(20, 30),
    ),

    # Categorical vocabularies used by the defect-log and yield-label generators.
    defect_types=[
        "PARTICLE",
        "SCRATCH",
        "PATTERN_DEFECT",
        "FILM_VOID",
        "CONTAMINATION",
        "OVERLAY_ERROR",
        "ETCH_RESIDUE",
        "HILLOCKS",
    ],
    inspection_tools=[
        "KLA_SURFSCAN",
        "AMAT_BRIGHTFIELD",
        "HITACHI_SEM",
        "KLA_ARCHER",
        "NOVA_OCD",
    ],
    root_causes=[
        "EQUIPMENT_DRIFT",
        "CONTAMINATION",
        "PROCESS_EXCURSION",
        "MATERIAL_DEFECT",
        "HUMAN_ERROR",
        "UNKNOWN",
        "NORMAL_VARIATION",
    ],

    # Calendar window from which wafer start times are drawn.
    start_date=datetime(2024, 1, 1),
    end_date=datetime(2024, 6, 30),
)

_config_wafer_count = CONFIG["num_wafers"]
_config_step_count = len(CONFIG["process_steps"])
print(f"‚úÖ Config: {_config_wafer_count:,} wafers, {_config_step_count} steps")


In [None]:
# ============================================================================
# HELPER FUNCTIONS - Profile Generation
# ============================================================================

def generate_temperature_profile(step: str, num_points: int = 10, anomaly_prob: float = 0.05) -> Tuple[List[float], bool]:
    """Generate a ramp / hold / cool-down temperature profile for one process step.

    Args:
        step: Process step name; looked up in ``CONFIG["temp_ranges"]`` and
            falling back to ``(20, 100)`` when the step is not listed.
        num_points: Number of samples in the profile. The first two thirds
            (integer division) are ramp and hold; the remainder is cool-down.
        anomaly_prob: Probability that a single sample receives a spike of
            +50 to +150 on top of its nominal value.

    Returns:
        ``(profile, is_anomaly)`` where ``profile`` is a list of native Python
        floats rounded to 2 decimals and ``is_anomaly`` is a native ``bool``.
    """
    temp_min, temp_max = CONFIG["temp_ranges"].get(step, (20, 100))
    target_temp = np.random.uniform(temp_min, temp_max)

    # Ramp-up from 25, noisy hold at the target, then cool-down back to 25.
    segment_len = num_points // 3
    ramp = np.linspace(25, target_temp, segment_len)
    hold = np.full(segment_len, target_temp) + np.random.normal(0, 2, segment_len)
    cool = np.linspace(target_temp, 25, num_points - 2 * segment_len)
    profile = np.concatenate([ramp, hold, cool])

    # The anomaly draw always happens (keeps the random stream unchanged), but a
    # spike can only be injected when there is at least one sample to perturb;
    # previously an empty profile crashed inside np.random.randint(0).
    is_anomaly = bool(np.random.random() < anomaly_prob) and len(profile) > 0
    if is_anomaly:
        profile[np.random.randint(len(profile))] += np.random.uniform(50, 150)

    # .tolist() yields native floats, matching the declared return type.
    return np.round(profile, 2).tolist(), is_anomaly


# Step-specific (min, max) pressure ranges in mTorr. Steps that are not listed
# fall back to (100, 200). Defined once here instead of being rebuilt on every
# call of generate_pressure_profile.
_PRESSURE_RANGES_MTORR = {
    "DEPOSITION_1": (100, 500), "DEPOSITION_2": (100, 500),
    "ETCH_1": (10, 100), "ETCH_2": (10, 100),
    "IMPLANT": (0.001, 0.1), "OXIDATION": (500, 760),
}


def generate_pressure_profile(step: str, num_points: int = 10, anomaly_prob: float = 0.03) -> Tuple[List[float], bool]:
    """Generate a pressure profile (mTorr) around a step-specific base pressure.

    Args:
        step: Process step name used to pick the pressure range.
        num_points: Number of samples in the profile.
        anomaly_prob: Probability that the whole profile is perturbed by
            per-sample offsets between -30% and +50% of the base pressure.

    Returns:
        ``(profile, is_anomaly)`` where ``profile`` is a list of native Python
        floats rounded to 4 decimals and ``is_anomaly`` is a native ``bool``.
    """
    p_min, p_max = _PRESSURE_RANGES_MTORR.get(step, (100, 200))
    base = np.random.uniform(p_min, p_max)

    # Nominal profile: 2% Gaussian jitter around the base, kept within a
    # 20% margin of the step's range.
    profile = base + np.random.normal(0, base * 0.02, num_points)
    profile = np.clip(profile, p_min * 0.8, p_max * 1.2)

    # The excursion is applied after clipping, so anomalous profiles may
    # leave the clipped range.
    is_anomaly = bool(np.random.random() < anomaly_prob)
    if is_anomaly:
        profile += np.random.uniform(-base * 0.3, base * 0.5, num_points)

    # .tolist() yields native floats, matching the declared return type.
    return np.round(profile, 4).tolist(), is_anomaly

print("‚úÖ Helper functions defined")


## 📊 Generate WAFER_PROCESS_DATA

Each wafer passes through 15 process steps → ~750K records total.


In [None]:
# ============================================================================
# GENERATE WAFER_PROCESS_DATA
# ============================================================================

def generate_wafer_process_data() -> Tuple[pd.DataFrame, Dict[str, int]]:
    """Build one telemetry record per (wafer, process step).

    Returns:
        A tuple ``(df, wafer_anomalies)``: the records as a DataFrame, and a
        mapping from wafer id to the number of steps on which the temperature
        or pressure profile (or both) was flagged anomalous.
    """
    print("üîÑ Generating WAFER_PROCESS_DATA...")

    steps = CONFIG["process_steps"]
    wafers_per_lot = CONFIG["wafers_per_lot"]
    start = CONFIG["start_date"]
    span_days = (CONFIG["end_date"] - start).days

    lots = [f"LOT_{n:05d}" for n in range(CONFIG["num_lots"])]
    tools_by_step = {
        name: [f"{name}_EQ_{k:02d}" for k in range(CONFIG["num_equipment_per_step"])]
        for name in steps
    }
    # Nominal gas flow per step; steps not listed use 100.
    nominal_gas_flow = {"DEPOSITION_1": 500, "ETCH_1": 200, "OXIDATION": 1000}

    rows = []
    anomalies_by_wafer = {}

    for wafer_no in range(CONFIG["num_wafers"]):
        wafer_id = f"WFR_{wafer_no:07d}"
        # Consecutive wafers share a lot; the modulo wraps around if there are
        # more wafers than lots * wafers_per_lot.
        lot_id = lots[wafer_no // wafers_per_lot % len(lots)]

        # NOTE: the order of np.random calls below determines the generated
        # data for a given seed -- do not reorder.
        day_offset = np.random.randint(0, span_days)
        hour_offset = np.random.randint(0, 24)
        base_time = start + timedelta(days=day_offset, hours=hour_offset)

        # Humidity baseline follows a 24-hour sinusoid of the start hour.
        base_humidity = 40 + 10 * np.sin(base_time.hour * np.pi / 12)
        anomalous_steps = 0

        for step_idx, step in enumerate(steps):
            temp_profile, temp_anom = generate_temperature_profile(step)
            pres_profile, pres_anom = generate_pressure_profile(step)
            # A step counts once even when both profiles are anomalous.
            if temp_anom or pres_anom:
                anomalous_steps += 1

            equipment_id = np.random.choice(tools_by_step[step])
            gas_flow = round(nominal_gas_flow.get(step, 100) * np.random.uniform(0.9, 1.1), 2)
            humidity = round(base_humidity + np.random.normal(0, 3), 2)

            rows.append({
                "WAFER_ID": wafer_id,
                "LOT_ID": lot_id,
                "EQUIPMENT_ID": equipment_id,
                "PROCESS_STEP": step,
                "TEMPERATURE_PROFILE": json.dumps(temp_profile),
                "PRESSURE_PROFILE": json.dumps(pres_profile),
                "GAS_FLOW_RATE": gas_flow,
                "AMBIENT_HUMIDITY": humidity,
                # Steps are spaced two hours apart from the wafer's start time.
                "TIMESTAMP": base_time + timedelta(hours=step_idx * 2),
            })

        anomalies_by_wafer[wafer_id] = anomalous_steps

        completed = wafer_no + 1
        if completed % 10000 == 0:
            print(f"   {completed:,} wafers...")

    df = pd.DataFrame(rows)
    print(f"‚úÖ Generated {len(df):,} records")
    return df, anomalies_by_wafer

# Materialise the process telemetry; the per-wafer anomaly counts are kept
# because the defect-log and yield-label generators below take them as input.
process_df, wafer_anomalies = generate_wafer_process_data()


## 🔍 Generate WAFER_DEFECT_LOGS

Defect counts correlate with process anomalies (higher anomalies → more defects).


In [None]:
# ============================================================================
# GENERATE WAFER_DEFECT_LOGS
# ============================================================================

def generate_defect_logs(wafer_anomalies: Dict[str, int]) -> pd.DataFrame:
    """Generate defect inspection records whose counts grow with process anomalies.

    Args:
        wafer_anomalies: Mapping of wafer id to its number of anomalous
            process steps.

    Returns:
        DataFrame with one row per inspection record (1-5 per wafer) and the
        columns WAFER_ID, DEFECT_TYPE, DEFECT_COUNT, INSPECTION_TOOL,
        SEVERITY_SCORE and TIMESTAMP.
    """
    print("üîÑ Generating WAFER_DEFECT_LOGS...")

    records = []
    # Sampling probabilities, positionally aligned with CONFIG["defect_types"].
    defect_weights = [0.25, 0.15, 0.20, 0.10, 0.12, 0.08, 0.05, 0.05]
    severity_base = {"PARTICLE": 3, "SCRATCH": 7, "PATTERN_DEFECT": 8, "FILM_VOID": 6,
                     "CONTAMINATION": 5, "OVERLAY_ERROR": 9, "ETCH_RESIDUE": 4, "HILLOCKS": 5}
    # Derive the day span from CONFIG (as the process-data generator does)
    # instead of a hard-coded 180, so inspection dates track the configured window.
    date_range = (CONFIG["end_date"] - CONFIG["start_date"]).days

    for wafer_id, anomaly_count in wafer_anomalies.items():
        # np.random.randint's upper bound is exclusive: 1 to 5 records per wafer.
        for _ in range(np.random.randint(1, 6)):
            defect_type = np.random.choice(CONFIG["defect_types"], p=defect_weights)

            # Anomalies increase defect rate; 2% chance of outlier
            if np.random.random() < 0.02:
                defect_count = np.random.randint(100, 500)
            else:
                defect_count = np.random.poisson(5 + anomaly_count * 10)

            # Severity = type baseline + noise + a count-driven term, clamped to [0, 10].
            severity = min(10, max(0, severity_base[defect_type] +
                                   np.random.normal(0, 1.5) + defect_count / 50))

            records.append({
                "WAFER_ID": wafer_id,
                "DEFECT_TYPE": defect_type,
                "DEFECT_COUNT": defect_count,
                "INSPECTION_TOOL": np.random.choice(CONFIG["inspection_tools"]),
                "SEVERITY_SCORE": round(severity, 2),
                "TIMESTAMP": CONFIG["start_date"] + timedelta(days=np.random.randint(0, date_range))
            })

    df = pd.DataFrame(records)
    print(f"‚úÖ Generated {len(df):,} defect records")
    return df

# Build the defect logs from the anomaly counts produced by the process-data
# step; defect_df is also an input to the yield-label generator.
defect_df = generate_defect_logs(wafer_anomalies)


## 🎯 Generate FINAL_YIELD_LABELS

Yield is modeled as a logistic function of anomalies + defects + severity.


In [None]:
# ============================================================================
# GENERATE FINAL_YIELD_LABELS
# ============================================================================

def generate_yield_labels(wafer_anomalies: Dict[str, int], defect_df: pd.DataFrame) -> pd.DataFrame:
    """Generate one yield label per wafer from anomalies, defect totals and severity.

    Args:
        wafer_anomalies: Mapping of wafer id to its number of anomalous
            process steps. One output row is produced per key, in order.
        defect_df: Defect log with at least the columns WAFER_ID,
            DEFECT_COUNT and SEVERITY_SCORE.

    Returns:
        DataFrame with the columns WAFER_ID, YIELD_GOOD (0/1),
        YIELD_SCORE (clamped to 0-100) and ROOT_CAUSE_CATEGORY.
    """
    print("üîÑ Generating FINAL_YIELD_LABELS...")

    # Aggregate once, then use plain dict lookups inside the loop. Filtering
    # the stats frame with a boolean mask for every wafer is quadratic in the
    # number of wafers.
    defect_stats = defect_df.groupby("WAFER_ID").agg(
        {"DEFECT_COUNT": "sum", "SEVERITY_SCORE": "max"}
    )
    total_defects_by_wafer = dict(zip(defect_stats.index, defect_stats["DEFECT_COUNT"]))
    max_severity_by_wafer = dict(zip(defect_stats.index, defect_stats["SEVERITY_SCORE"]))

    records = []
    for wafer_id, anomaly_count in wafer_anomalies.items():
        # Wafers without any defect record fall back to 5 defects / severity 3.
        total_defects = total_defects_by_wafer.get(wafer_id, 5)
        max_severity = max_severity_by_wafer.get(wafer_id, 3)

        # Logistic yield model
        logit = 3.0 - 0.5 * anomaly_count - 0.02 * total_defects - 0.3 * max_severity + np.random.normal(0, 0.5)
        yield_prob = 1 / (1 + np.exp(-logit))
        yield_good = 1 if np.random.random() < yield_prob else 0
        yield_score = max(0, min(100, round(yield_prob * 100 + np.random.normal(0, 5), 2)))

        # Root cause assignment: good wafers are always NORMAL_VARIATION; bad
        # wafers are attributed by anomaly count first, then by defect volume,
        # and otherwise drawn uniformly from all configured categories.
        if yield_good == 1:
            root_cause = "NORMAL_VARIATION"
        elif anomaly_count > 2:
            root_cause = np.random.choice(["EQUIPMENT_DRIFT", "PROCESS_EXCURSION"], p=[0.6, 0.4])
        elif total_defects > 50:
            root_cause = np.random.choice(["CONTAMINATION", "MATERIAL_DEFECT"], p=[0.7, 0.3])
        else:
            root_cause = np.random.choice(CONFIG["root_causes"])

        records.append({
            "WAFER_ID": wafer_id,
            "YIELD_GOOD": yield_good,
            "YIELD_SCORE": yield_score,
            "ROOT_CAUSE_CATEGORY": root_cause
        })

    df = pd.DataFrame(records)
    print(f"‚úÖ Generated {len(df):,} yield labels")
    print(f"   Yield rate: {df['YIELD_GOOD'].mean()*100:.1f}%")
    return df

# Combine the per-wafer anomaly counts with the defect logs into the final
# yield labels.
yield_df = generate_yield_labels(wafer_anomalies, defect_df)


## 📤 Upload to Snowflake

Using Snowpark's `create_dataframe()` and `save_as_table()` per best practices.


In [None]:
# ============================================================================
# UPLOAD WAFER_PROCESS_DATA
# ============================================================================

print("üì§ Uploading WAFER_PROCESS_DATA...")

# The profile columns arrive from pandas as JSON strings. Parse them into
# VARIANT *before* the table is written: an UPDATE after the fact cannot change
# a column's data type, so the table would keep string columns and the UPDATE
# would only rewrite every row.
profile_columns = {"TEMPERATURE_PROFILE", "PRESSURE_PROFILE"}
process_sdf = session.create_dataframe(process_df).select([
    # An explicit select (rather than with_column) keeps the original column order.
    parse_json(col(name)).alias(name) if name in profile_columns else col(name)
    for name in process_df.columns
])
# "overwrite" replaces any existing WAFER_PROCESS_DATA table.
process_sdf.write.mode("overwrite").save_as_table("WAFER_PROCESS_DATA")

print(f"‚úÖ WAFER_PROCESS_DATA: {session.table('WAFER_PROCESS_DATA').count():,} rows")


In [None]:
# ============================================================================
# UPLOAD WAFER_DEFECT_LOGS
# ============================================================================

print("üì§ Uploading WAFER_DEFECT_LOGS...")

# Push the pandas frame through Snowpark; "overwrite" replaces any existing table.
defect_writer = session.create_dataframe(defect_df).write.mode("overwrite")
defect_writer.save_as_table("WAFER_DEFECT_LOGS")

# Read the row count back from Snowflake to confirm what was written.
defect_row_count = session.table("WAFER_DEFECT_LOGS").count()
print(f"‚úÖ WAFER_DEFECT_LOGS: {defect_row_count:,} rows")


In [None]:
# ============================================================================
# UPLOAD FINAL_YIELD_LABELS
# ============================================================================

print("üì§ Uploading FINAL_YIELD_LABELS...")

# Push the pandas frame through Snowpark; "overwrite" replaces any existing table.
yield_writer = session.create_dataframe(yield_df).write.mode("overwrite")
yield_writer.save_as_table("FINAL_YIELD_LABELS")

# Read the row count back from Snowflake to confirm what was written.
yield_row_count = session.table("FINAL_YIELD_LABELS").count()
print(f"‚úÖ FINAL_YIELD_LABELS: {yield_row_count:,} rows")


## ✅ Validation


In [None]:
# ============================================================================
# VALIDATION - Verify yield correlates with defects
# ============================================================================

print("üîó Validating correlations...")

# For each yield class (0 = bad, 1 = good): average defect count and severity
# over the joined defect rows, plus the number of distinct wafers in the class.
yield_vs_defects_sql = """
    SELECT 
        y.YIELD_GOOD,
        AVG(d.DEFECT_COUNT) as AVG_DEFECTS,
        AVG(d.SEVERITY_SCORE) as AVG_SEVERITY,
        COUNT(DISTINCT y.WAFER_ID) as WAFER_COUNT
    FROM FINAL_YIELD_LABELS y
    LEFT JOIN WAFER_DEFECT_LOGS d ON y.WAFER_ID = d.WAFER_ID
    GROUP BY y.YIELD_GOOD
    ORDER BY y.YIELD_GOOD
"""
yield_vs_defects = session.sql(yield_vs_defects_sql)
yield_vs_defects.show()

print("‚úÖ Bad yield wafers should have higher defect counts & severity")


In [None]:
# ============================================================================
# SUMMARY
# ============================================================================

# Closing banner: where the data landed and which notebook to run next.
summary_lines = (
    "\nüéâ Notebook 0 Complete!",
    "üìç Database: WAFER_YIELD_DEMO.RAW_DATA",
    "üìã Tables: WAFER_PROCESS_DATA, WAFER_DEFECT_LOGS, FINAL_YIELD_LABELS",
    "\n‚û°Ô∏è Proceed to Notebook 1: Feature Engineering",
)
for summary_line in summary_lines:
    print(summary_line)


In [None]:
# End of notebook
