In [None]:
# Optional environment setup: uncomment the lines below to install the
# requirements file and this project in editable mode (`-e .`).
# !pip install -r requirements-py37.txt
# !pip install -e .

In [2]:
from pathlib import Path
from backend.dataaccess import DataAccess
from core.mediator import Mediator
import pandas as pd

In [3]:
# Filesystem locations used by the notebook. All paths are relative, so they
# resolve against the current working directory.
DATA_DIR = Path("backend") / "data"

KB_PATH = Path("core") / "knowledge-base"        # knowledge base handed to Mediator
DB_PATH = DATA_DIR / "mediator.db"               # database file handed to DataAccess
CSV_PATH = DATA_DIR / "synthetic_input_data.csv" # CSV handed to load_csv_to_input

In [4]:
# Open the data-access handle on the database file
da = DataAccess(db_path=str(DB_PATH))

# Make sure the tables exist; pass drop=True instead to recreate them
da.create_db(drop=False)

# Print one summary line per table: row count and patient count
stats = da.get_table_stats()
for table_name, table_info in stats.items():
    n_rows = table_info['rows']
    n_patients = table_info['n_patients']
    print(f"{table_name}: {n_rows} rows, {n_patients} patients")

[Info] Creating tables from DDL...
[Info]: DB initiated successfully!
[Info]: Total tables created: 4
[Info]: Table 'sqlite_sequence' - Rows: 2
[Info]: Table 'InputPatientData' - Rows: 1604
[Info]: Table 'OutputPatientData' - Rows: 370
[Info]: Table 'PatientQAScores' - Rows: 0
InputPatientData: 1604 rows, 12 patients
OutputPatientData: 370 rows, 5 patients
PatientQAScores: 0 rows, 0 patients


In [None]:
# Ingest the CSV file into the InputPatientData table.
# NOTE(review): with if_exists='append', re-running this cell presumably adds
# the same rows a second time -- confirm before executing against a DB that
# already holds this data.
csv_file = str(CSV_PATH)
total_rows = da.load_csv_to_input(
    csv_path=csv_file,
    if_exists='append',          # accepted values: 'append' or 'replace'
    clear_output_and_qa=False,   # switch to True to clear outputs
    yes=True,                    # auto-confirm
)
print(f"Loaded {total_rows} rows")

In [6]:
# Create the mediator from the knowledge-base path and the open data-access handle
mediator = Mediator(knowledge_base_path=KB_PATH, data_access=da)

# Build the TAK repository and keep a handle on it for the summary below
repo = mediator.build_repository()

# Headline count, then the size of each TAK family exposed by the mediator
print(f"✅ Loaded {len(repo.taks)} TAKs:")
print(f"  - Raw Concepts: {len(mediator.raw_concepts)}")
print(f"  - Events:       {len(mediator.events)}")
print(f"  - States:       {len(mediator.states)}")
print(f"  - Trends:       {len(mediator.trends)}")
print(f"  - Contexts:     {len(mediator.contexts)}")
print(f"  - Patterns:     {len(mediator.patterns)}")

# Every TAK name, in sorted order
print("\nTAK Names:")
sorted_tak_names = sorted(repo.taks.keys())
for tak_name in sorted_tak_names:
    print(f"  - {tak_name}")


PHASE 1: Building TAK Repository


Loading TAKs: 100%|██████████| 25/25 [00:00<00:00, 271.33file/s, Contexts: DIABETES_DIAGNOSYS_CONTEXT]          


[Validation] Running business-logic checks on TAK repository...

✅ TAK Repository Built Successfully
  Raw Concepts: 11
  Events:       5
  States:       4
  Trends:       1
  Contexts:     4
  Patterns:     0
  TOTAL TAKs:   25

✅ Loaded 25 TAKs:
  - Raw Concepts: 11
  - Events:       5
  - States:       4
  - Trends:       1
  - Contexts:     4
  - Patterns:     0

TAK Names:
  - ADMISSION
  - ADMISSION_EVENT
  - ANTIDIABETIC_DRUGS_IV_BITZUA
  - ANTIDIABETIC_DRUGS_IV_BITZUA_CONTEXT
  - ANTIDIABETIC_DRUGS_IV_BITZUA_STATE
  - BASAL_BITZUA
  - BASAL_BITZUA_CONTEXT
  - BASAL_BITZUA_STATE
  - BOLUS_BITZUA
  - BOLUS_BITZUA_CONTEXT
  - BOLUS_BITZUA_STATE
  - DEATH
  - DEATH_EVENT
  - DIABETES_DIAGNOSYS
  - DIABETES_DIAGNOSYS_CONTEXT
  - DISGLYCEMIA_EVENT
  - GLUCOSE_MEASURE
  - GLUCOSE_MEASURE_STATE
  - GLUCOSE_MEASURE_TREND
  - HYPERGLYCEMIA
  - HYPOGLYCEMIA
  - MEAL
  - MEAL_EVENT
  - RELEASE
  - RELEASE_EVENT





In [7]:
# Process specific patients (Jupyter-compatible)
patient_ids = [1000, 1001, 1002]
patient_stats = await mediator.run_async(
    max_concurrent=4,
    patient_subset=patient_ids
)

# Print results
for pid, stats in patient_stats.items():
    if "error" in stats:
        print(f"❌ Patient {pid}: {stats['error']}")
    else:
        total = sum(v for k, v in stats.items() if isinstance(v, int))
        print(f"✅ Patient {pid}: {total} output rows")


PHASE 1: Building TAK Repository


Loading TAKs: 100%|██████████| 25/25 [00:00<00:00, 203.50file/s, Contexts: DIABETES_DIAGNOSYS_CONTEXT]          
Loading TAKs: 100%|██████████| 25/25 [00:00<00:00, 203.50file/s, Contexts: DIABETES_DIAGNOSYS_CONTEXT]



[Validation] Running business-logic checks on TAK repository...

✅ TAK Repository Built Successfully
  Raw Concepts: 22
  Events:       10
  States:       8
  Trends:       2
  Contexts:     8
  Patterns:     0
  TOTAL TAKs:   25


PHASE 2: Processing 3 Patients (max_concurrent=4)
         Patient Subset: [1000, 1001, 1002]



Processing patients: 100%|██████████| 3/3 [00:04<00:00,  1.38s/patient]


✅ Patient Processing Complete
  Patients processed: 3
  Total rows written: 175
  Errors:             0

✅ Patient 1000: 42 output rows
✅ Patient 1001: 94 output rows
✅ Patient 1002: 39 output rows





In [8]:
# Pull the abstraction rows stored in OutputPatientData for patients 1000-1002.
# NOTE(review): the saved output lists 190 rows while the processing run
# reported 175 rows written -- presumably rows from an earlier run are still
# in the table; confirm whether outputs should be cleared first.
query = """
SELECT PatientId, ConceptName, StartDateTime, EndDateTime, Value
FROM OutputPatientData
WHERE PatientId IN (1000, 1001, 1002)
"""
df_results = pd.read_sql_query(sql=query, con=da.conn)
df_results

Unnamed: 0,PatientId,ConceptName,StartDateTime,EndDateTime,Value
0,1000,BASAL_BITZUA_STATE,2025-01-21 21:00:00,2025-01-22 21:00:00,Medium
1,1000,BASAL_BITZUA_STATE,2025-01-22 21:00:00,2025-01-23 21:00:00,Low
2,1000,BASAL_BITZUA_STATE,2025-01-23 21:00:00,2025-01-24 13:59:59,SubCutaneous Low
3,1000,BOLUS_BITZUA_STATE,2025-01-21 13:06:00,2025-01-21 19:12:00,SubCutaneous Low
4,1000,BOLUS_BITZUA_STATE,2025-01-21 19:12:00,2025-01-22 03:12:00,SubCutaneous Medium
...,...,...,...,...,...
186,1002,BOLUS_BITZUA_CONTEXT,2025-01-23 19:12:00,2025-01-24 03:12:00,Medium
187,1002,BOLUS_BITZUA_CONTEXT,2025-01-24 07:56:00,2025-01-24 15:56:00,Medium
188,1002,BOLUS_BITZUA_CONTEXT,2025-01-25 08:15:00,2025-01-25 12:15:00,Low
189,1002,BOLUS_BITZUA_CONTEXT,2025-01-25 13:07:00,2025-01-25 13:59:59,Medium


In [9]:
# Tally the OutputPatientData rows whose Value column is NULL
null_count = pd.read_sql_query(
    sql="SELECT COUNT(*) AS NullRows FROM OutputPatientData WHERE Value IS NULL",
    con=da.conn,
)
n_null_rows = null_count["NullRows"].iloc[0]  # single-row, single-column result
print(f"Rows with NULL Value: {n_null_rows}")

Rows with NULL Value: 0


In [10]:
# Look for Value entries that resemble a tuple, i.e. text beginning with '('
tuple_like = pd.read_sql_query(
    sql="SELECT PatientId, ConceptName, StartDateTime, EndDateTime, Value FROM OutputPatientData WHERE Value LIKE '(%'",
    con=da.conn,
)
n_tuple_like = tuple_like.shape[0]
print(f"Rows with tuple-like Value: {n_tuple_like}")
# Show at most the first 50 matches
display(tuple_like.head(50))

Rows with tuple-like Value: 0


Unnamed: 0,PatientId,ConceptName,StartDateTime,EndDateTime,Value
