In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # BioPilot Quick Start Guide
# MAGIC 
# MAGIC **Gamified Bioreactor Simulation Platform**
# MAGIC 
# MAGIC This notebook demonstrates the complete BioPilot workflow:
# MAGIC 1. Run a simulation with fault injection
# MAGIC 2. Perform anomaly detection
# MAGIC 3. Engage the AI copilot
# MAGIC 4. Save results to Delta Lake
# MAGIC 5. Analyze and visualize outcomes
# MAGIC 
# MAGIC ---

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Setup & Imports

# COMMAND ----------

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import os
from typing import Dict

# Set project paths
# The checkout location can be overridden with the BIOPILOT_PROJECT_ROOT
# environment variable, so the notebook also runs from a differently named
# Repos folder. When the variable is unset or empty, the original path is used.
DEFAULT_PROJECT_ROOT = '/Workspace/Repos/synthetic-twin/synthetic_twin'
PROJECT_ROOT = os.environ.get('BIOPILOT_PROJECT_ROOT') or DEFAULT_PROJECT_ROOT
MODULES_PATH = os.path.join(PROJECT_ROOT, 'modules')
# Prepend only when absent, so re-running this cell does not stack duplicate
# entries on sys.path.
if MODULES_PATH not in sys.path:
    sys.path.insert(0, MODULES_PATH)

# Import BioPilot components
from config import (
    SIMULATION_PARAMS, INITIAL_STATE, KINETIC_PARAMS, 
    REACTOR_PARAMS, SENSOR_PARAMS, FAULT_TEMPLATES,
    SCENARIOS, SCORING_CONFIG
)
from models import BioreactorSimulation, FaultManager
from anomaly_detection import (
    AnomalyDetectionEngine, 
    create_default_bioreactor_config
)
from agent_copilot import (
    MultiAgentCopilot, 
    create_default_copilot_config,
    AgentObservation
)
from data_lake import BioreactorDataLake
from run_simulation_workflow import BioPilotWorkflow, visualize_run

# Only reached when every import above resolved. The project root is echoed so
# a wrong checkout path is easy to spot in the cell output.
print("✅ All modules imported successfully!")
print(f"📍 Project root: {PROJECT_ROOT}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Quick Test - Run Single Simulation

# COMMAND ----------

# Assemble configuration
# Bundle the parameter sets imported from `config` under the keys handed to
# BioPilotWorkflow as `config_dict`.
config = {
    'SIMULATION_PARAMS': SIMULATION_PARAMS,
    'INITIAL_STATE': INITIAL_STATE,
    'KINETIC_PARAMS': KINETIC_PARAMS,
    'REACTOR_PARAMS': REACTOR_PARAMS,
    'SENSOR_PARAMS': SENSOR_PARAMS,
    'FAULT_TEMPLATES': FAULT_TEMPLATES
}

# Scenario and feed rate for this demo run, named once so the log message and
# the calls below cannot drift apart.
quickstart_scenario = "overfeed"
quickstart_feed_rate = 0.1

# Initialize workflow
# `spark` is the session object provided by the notebook runtime.
workflow = BioPilotWorkflow(
    spark=spark,
    config_dict=config,
    enable_agent=True,
    enable_anomaly_detection=True
)

# Inject a fault scenario
# NOTE(review): the "t=20h" onset in the message is not derived from anything
# visible in this notebook; confirm it matches the scenario definition.
print(f"🔧 Injecting '{quickstart_scenario}' fault at t=20h...")
workflow.inject_scenario_faults(scenario=quickstart_scenario)

# Run simulation
# save_to_lake=True is what makes the data-lake queries further down return
# rows for this run.
print("\n🚀 Running simulation...")
results = workflow.run_with_monitoring(
    base_feed_rate=quickstart_feed_rate,
    save_to_lake=True
)

# Headline numbers read back from the returned results mapping
print(f"\n✅ Simulation complete!")
print(f"   Run ID: {results['run_id']}")
print(f"   Final Titer: {results['final_titer']:.2f} mg/mL")
print(f"   Anomalies Detected: {results['num_anomalies']}")
print(f"   Agent Actions: {results['num_actions']}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Visualize Results

# COMMAND ----------

# Plot the run that was just executed
visualize_run(results)

# Print the copilot's report under a banner, but only when one was produced
agent_report = results['agent_report']
if agent_report:
    banner = "="*60
    print("\n" + banner)
    print("AGENT COPILOT REPORT")
    print(banner)
    print(agent_report)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Inspect Data Lake

# COMMAND ----------

# Load the stored telemetry for the run above (is_observed=True)
run_id = results['run_id']
telemetry_df = workflow.data_lake.get_run_telemetry(spark, run_id, is_observed=True)

print(f"📊 Telemetry Data Shape: {telemetry_df.shape}")

# Distinct signal names present in the result
signal_names = telemetry_df['signal_name'].unique()
print(f"\nSignals captured: {signal_names}")

# Span of the time axis, in hours
time_axis = telemetry_df['time_h']
print(f"Time range: {time_axis.min():.1f} - {time_axis.max():.1f} hours")

# First ten rows as a preview
display(telemetry_df.head(10))

# COMMAND ----------

# Load the anomaly records for this run (only_detected=True)
anomalies_df = workflow.data_lake.get_anomalies(spark, run_id, only_detected=True)

print(f"🚨 Anomalies Detected: {len(anomalies_df)}")

if anomalies_df.empty:
    print("✅ No anomalies detected in this run")
else:
    # One row-count breakdown per grouping column, each under its own heading
    breakdowns = (
        ("\nAnomalies by signal:", 'signal_name'),
        ("\nAnomalies by method:", 'method'),
    )
    for heading, column in breakdowns:
        print(heading)
        print(anomalies_df.groupby(column).size())

    display(anomalies_df.head(10))

# COMMAND ----------

# Load the copilot actions recorded for this run
actions_df = workflow.data_lake.get_agent_actions(spark, run_id)

print(f"🤖 Agent Actions Taken: {len(actions_df)}")

if actions_df.empty:
    print("ℹ️ No agent actions were taken")
else:
    # Row count per action type, followed by the full action table
    print("\nActions by type:")
    actions_by_type = actions_df.groupby('action_type').size()
    print(actions_by_type)

    display(actions_df)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. SQL Analysis (Delta Lake)

# COMMAND ----------

# MAGIC %sql
# MAGIC -- List the 10 most recent simulation runs (newest start_time first)
# MAGIC SELECT 
# MAGIC   run_id,
# MAGIC   scenario,
# MAGIC   start_time,
# MAGIC   final_titer,
# MAGIC   num_anomalies,
# MAGIC   num_actions,
# MAGIC   success,
# MAGIC   score
# MAGIC FROM main.biopilot.run_metadata
# MAGIC ORDER BY start_time DESC
# MAGIC LIMIT 10

# COMMAND ----------

# MAGIC %sql
# MAGIC -- Anomaly statistics across all runs
# MAGIC -- Only rows with is_anomaly = true are counted, so avg_score is the mean
# MAGIC -- score of flagged rows per (signal_name, method) pair, not of all rows.
# MAGIC SELECT 
# MAGIC   signal_name,
# MAGIC   method,
# MAGIC   COUNT(*) as anomaly_count,
# MAGIC   AVG(score) as avg_score
# MAGIC FROM main.biopilot.anomaly_scores
# MAGIC WHERE is_anomaly = true
# MAGIC GROUP BY signal_name, method
# MAGIC ORDER BY anomaly_count DESC

# COMMAND ----------

# MAGIC %sql
# MAGIC -- Agent action effectiveness
# MAGIC -- The LEFT JOIN keeps runs that have no rows in agent_actions; for those,
# MAGIC -- COUNT(aa.action_id) yields 0 because it counts only non-NULL action ids.
# MAGIC SELECT 
# MAGIC   rm.run_id,
# MAGIC   rm.scenario,
# MAGIC   rm.final_titer,
# MAGIC   COUNT(aa.action_id) as num_actions
# MAGIC FROM main.biopilot.run_metadata rm
# MAGIC LEFT JOIN main.biopilot.agent_actions aa ON rm.run_id = aa.run_id
# MAGIC GROUP BY rm.run_id, rm.scenario, rm.final_titer
# MAGIC ORDER BY rm.final_titer DESC

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Batch Scenario Testing

# COMMAND ----------

from run_simulation_workflow import run_batch_scenarios

# Test multiple scenarios with replicates
# The replicate count is named once so the progress messages and the actual
# call to run_batch_scenarios always agree.
scenarios_to_test = ['baseline', 'overfeed', 'DO_drop', 'contamination']
num_replicates = 3
total_runs = len(scenarios_to_test) * num_replicates

print("🔬 Running batch scenario analysis...")
print(f"   Scenarios: {scenarios_to_test}")
print(f"   Replicates per scenario: {num_replicates}")
print(f"   Total runs: {total_runs}")
print("\nThis may take several minutes...\n")

# The returned summary is grouped by its 'scenario' column in the next cell.
batch_summary = run_batch_scenarios(
    spark=spark,
    scenarios=scenarios_to_test,
    num_replicates=num_replicates
)

print("\n✅ Batch analysis complete!")
display(batch_summary)

# COMMAND ----------

# Per-scenario aggregates of the batch results, rounded to three decimals
aggregations = {
    'final_titer': ['mean', 'std', 'min', 'max'],
    'final_biomass': ['mean', 'std'],
    'num_anomalies': 'mean',
    'num_actions': 'mean'
}
runs_by_scenario = batch_summary.groupby('scenario')
summary_stats = runs_by_scenario.agg(aggregations).round(3)

print("\n📈 Summary Statistics by Scenario:")
display(summary_stats)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Advanced Analysis - Compare Scenarios

# COMMAND ----------

import seaborn as sns

# Fetch data for comparison
# The query reads every stored run of the four listed scenarios, so runs from
# earlier sessions are included alongside the ones produced above.
comparison_query = """
SELECT 
    rm.scenario,
    rm.final_titer,
    rm.final_biomass,
    rm.num_anomalies,
    rm.num_actions,
    rm.success,
    rm.score
FROM main.biopilot.run_metadata rm
WHERE rm.scenario IN ('baseline', 'overfeed', 'DO_drop', 'contamination')
"""

comparison_df = spark.sql(comparison_query).toPandas()

# The query has no ORDER BY, so row order (and with it seaborn's default
# category order) is not guaranteed. Pin one alphabetical order and use it for
# every panel so the x-axes line up with each other and with the bar chart.
scenario_order = sorted(comparison_df['scenario'].unique())

# Create comparison plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axis, column, title, y-label) for the three boxplot panels
boxplot_panels = [
    (axes[0, 0], 'final_titer', 'Final Titer by Scenario', 'Titer [mg/mL]'),
    (axes[0, 1], 'num_anomalies', 'Anomalies Detected by Scenario', 'Number of Anomalies'),
    (axes[1, 0], 'num_actions', 'Agent Actions by Scenario', 'Number of Actions'),
]
for ax, column, title, ylabel in boxplot_panels:
    sns.boxplot(data=comparison_df, x='scenario', y=column,
                order=scenario_order, ax=ax)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Success rate by scenario
# The mean of the success flag per scenario, scaled to a percentage, then
# reindexed so the bars follow the same order as the boxplots.
success_rate = comparison_df.groupby('scenario')['success'].mean() * 100
success_rate = success_rate.reindex(scenario_order)
success_rate.plot(kind='bar', ax=axes[1, 1], color='skyblue')
axes[1, 1].set_title('Success Rate by Scenario', fontsize=12, fontweight='bold')
axes[1, 1].set_ylabel('Success Rate [%]')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].set_ylim([0, 100])

plt.tight_layout()
plt.show()

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Custom Experiment - Parameter Sweep

# COMMAND ----------

# Sensitivity study: rerun the simulation for several growth rates (mu_max)
mu_max_values = [0.03, 0.04, 0.05, 0.06]
sweep_results = []

print("🔬 Running parameter sweep: mu_max")
print(f"   Testing values: {mu_max_values}")

for mu_max in mu_max_values:
    print(f"\n   Testing mu_max = {mu_max}...")

    # Build a per-iteration config: copy the kinetic parameters, override
    # mu_max on the copy, and swap it into a shallow copy of the base config so
    # neither `config` nor KINETIC_PARAMS is modified.
    kinetics = KINETIC_PARAMS.copy()
    kinetics['mu_max'] = mu_max
    custom_config = {**config, 'KINETIC_PARAMS': kinetics}

    # Agent and anomaly detection are switched off for speed
    workflow_sweep = BioPilotWorkflow(
        spark=spark,
        config_dict=custom_config,
        enable_agent=False,
        enable_anomaly_detection=False
    )
    results_sweep = workflow_sweep.run_with_monitoring(
        base_feed_rate=0.1,
        save_to_lake=True
    )

    # Keep only the two outcomes plotted below
    sweep_results.append(dict(
        mu_max=mu_max,
        final_titer=results_sweep['final_titer'],
        final_biomass=results_sweep['final_biomass'],
    ))

# Collect the records into a single frame in one go
sweep_df = pd.DataFrame(sweep_results)
print("\n✅ Parameter sweep complete!")
display(sweep_df)

# One panel per outcome, both plotted against mu_max
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# (axis, column, y-label, title, extra line kwargs)
panels = (
    (ax1, 'final_titer', 'Final Titer [mg/mL]', 'Titer vs Growth Rate', {}),
    (ax2, 'final_biomass', 'Final Biomass [g/L]', 'Biomass vs Growth Rate', {'color': 'green'}),
)
for axis, column, y_label, heading, line_kwargs in panels:
    axis.plot(sweep_df['mu_max'], sweep_df[column], 'o-',
              linewidth=2, markersize=8, **line_kwargs)
    axis.set_xlabel('mu_max [1/h]', fontsize=11)
    axis.set_ylabel(y_label, fontsize=11)
    axis.set_title(heading, fontweight='bold')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# COMMAND ----------

# MAGIC %md
# MAGIC ## 9. Export Results for Reporting

# COMMAND ----------

# Get comprehensive run summary
# NOTE(review): run_summary is not used by the export below. It is kept so it
# can be inspected interactively; confirm whether the lookup is still wanted.
run_summary = workflow.data_lake.get_run_summary(spark, results['run_id'])

# Export settings, named here instead of being buried in the code below
export_dir = "/tmp"
exported_scenario = 'overfeed'   # must match the scenario injected for `results`
# Titer (mg/mL) above which the run is labelled a success in the exported CSV.
# NOTE(review): computed locally; confirm it agrees with the `success` column
# stored in run_metadata.
success_titer_threshold = 5.0

os.makedirs(export_dir, exist_ok=True)

# Save summary to file
output_path = os.path.join(export_dir, f"biopilot_run_{results['run_id']}_summary.csv")

# Create summary DataFrame with native Python types
# The explicit str/float/int/bool casts convert whatever scalar types the
# results hold into plain Python values before export.
summary_export = pd.DataFrame([{
    'run_id': str(results['run_id']),
    'scenario': exported_scenario,
    'final_titer_mg_mL': float(results['final_titer']),
    'final_biomass_g_L': float(results['final_biomass']),
    'total_anomalies': int(results['num_anomalies']),
    'total_actions': int(results['num_actions']),
    'success': bool(results['final_titer'] > success_titer_threshold),
    'timestamp': datetime.now()
}])

summary_export.to_csv(output_path, index=False)
print(f"✅ Summary exported to: {output_path}")

# Also save telemetry
telemetry_path = os.path.join(export_dir, f"biopilot_run_{results['run_id']}_telemetry.csv")
results['observed_history'].to_csv(telemetry_path, index=False)
print(f"✅ Telemetry exported to: {telemetry_path}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 10. Next Steps & Resources
# MAGIC 
# MAGIC ### ✅ What You've Learned
# MAGIC - Running BioPilot simulations with fault injection
# MAGIC - Anomaly detection on telemetry data
# MAGIC - Agent copilot recommendations
# MAGIC - Delta Lake persistence and querying
# MAGIC - Batch scenario testing
# MAGIC - Parameter sensitivity analysis
# MAGIC 
# MAGIC ### 🚀 Try These Next
# MAGIC 1. **Create custom fault scenarios** - Edit `FAULT_TEMPLATES` in config.py
# MAGIC 2. **Build ML models** - Train predictive models on historical data
# MAGIC 3. **Develop new detection methods** - Extend `anomaly_detection.py`
# MAGIC 4. **Optimize agent rules** - Tune thresholds in `agent_copilot.py`
# MAGIC 5. **Build a dashboard** - Deploy `dashboard.py` with Streamlit
# MAGIC 
# MAGIC ### 📚 Documentation
# MAGIC - **README**: `/Workspace/Repos/synthetic-twin/README.md`
# MAGIC - **Config Reference**: `modules/config.py`
# MAGIC - **API Docs**: See docstrings in each module
# MAGIC 
# MAGIC ### 💡 Pro Tips
# MAGIC - Use Delta Lake time travel to compare run versions
# MAGIC - Create views for common queries (success rate, avg titer by scenario)
# MAGIC - Set up scheduled jobs for automated scenario testing
# MAGIC - Export results to external tools (Tableau, PowerBI) via JDBC
# MAGIC 
# MAGIC ---
# MAGIC 
# MAGIC **BioPilot v1.0** | Developed for Bioprocess Training & Education

# COMMAND ----------

# Closing banner: recap the quick-start run (nothing is cleaned up here)
banner = "="*60
print("\n" + banner)
print("🎉 QUICKSTART COMPLETE!")
print(banner)

final_run_id = results['run_id']
final_titer = results['final_titer']
print(f"\nYour run ID: {final_run_id}")
print(f"Final titer: {final_titer:.2f} mg/mL")
print(f"Data saved to: main.biopilot.*")
print("\n👉 Try modifying scenarios and running your own experiments!")
print(banner)
