In [1]:
import pandas as pd
import pandas_flow

# Load the two input tables up front, before pandas_flow tracking is configured.
patients = pd.read_csv("patients.csv")
exams = pd.read_csv("exams.csv")

# Report how many rows each table holds.
for table_name, table in (("Patients", patients), ("Exams", exams)):
    print(f"{table_name}: {len(table)} rows")


Patients: 100 rows
Exams: 240 rows


In [2]:
# Configure the pandas_flow tracker, including a scatter plot.
# Each box of the flowchart gets a scatter plot of age vs result_value.
tracker_options = dict(
    track_row_count=True,
    track_variables={"patient_id": "n_unique"},
    stats_variable="age",
    stats_types=["min", "max", "mean", "histogram"],
    # scatter_variables draws a simple scatter with blue dots
    scatter_variables=("age", "result_value"),
    modern=False,
)
flow = pandas_flow.setup(**tracker_options)

print("Tracker configured with scatter: age vs result_value")


Tracker configured with scatter: age vs result_value


In [3]:
def _report_rows(label, frame):
    """Print the number of rows in ``frame`` under the given label."""
    print(f"{label}: {len(frame)} rows")


# Step 1: inner-join patients with their exams on patient_id
combined = patients.merge(
    exams,
    on="patient_id",
    how="inner",
)
_report_rows("Combined", combined)

# Step 2: keep only rows where the patient is an adult (age 18 or older)
adults = combined.query("age >= 18")
_report_rows("Adults", adults)

# Step 3: keep only exams whose status is 'completed'
completed = adults.query("status == 'completed'")
_report_rows("Completed exams", completed)

# Step 4: drop repeated (patient_id, exam_date) rows
dedup_keys = ["patient_id", "exam_date"]
clean_data = completed.drop_duplicates(subset=dedup_keys)
_report_rows("Clean data", clean_data)


Combined: 240 rows
Adults: 121 rows
Completed exams: 96 rows
Clean data: 96 rows


In [4]:
# Render the flowchart, with the scatter plot embedded, to an HTML file.
html_path = "scatter_example.html"
flow.render(html_path, title="Patient Analysis with Scatter Plot")

# Tell the reader where the file went and what to look for in it.
closing_notes = [
    f"\nFlowchart saved to: {html_path}",
    "\nOpen the HTML file in a browser to see:",
    "  - Histogram of 'age' variable",
    "  - Scatter plot of 'age' vs 'result_value'",
    "  - Both plots embedded inside the boxes!",
]
print("\n".join(closing_notes))



Flowchart saved to: scatter_example.html

Open the HTML file in a browser to see:
  - Histogram of 'age' variable
  - Scatter plot of 'age' vs 'result_value'
  - Both plots embedded inside the boxes!


In [5]:
# Print the tracker's text summary of the operations it recorded.
summary_text = flow.summary()
print(summary_text)


PANDAS FLOW SUMMARY
Total operations: 4

1. Merge (inner) (merge)
   → 240 rows × 10 cols
   • patient_id: 86 unique
   • age: 43 unique

2. Query (query)
   → 121 rows × 10 cols
   • patient_id: 43 unique
   • age: 32 unique

3. Query (query)
   → 96 rows × 10 cols
   • patient_id: 40 unique
   • age: 32 unique

4. Drop Duplicates (drop_duplicates)
   → 96 rows × 10 cols
   • patient_id: 40 unique
   • age: 32 unique

