In [None]:
# Breast Cancer EDA — Human + Agentic Workflow

**Goal:** EDA that’s reproducible and aligned with our agentic pipeline.  
We load the dataset from the web (and freeze a local copy), explore distributions/correlations,
and then show how our agents (EDA → Modeling → Explain) operate on the same data.

In [5]:
# --- Global Setup for Reproducibility ---
#%pip install -q pandas numpy matplotlib seaborn shap scikit-learn plotly

import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# Ensure the repo root is the working directory and importable.
# The repo root is the nearest ancestor folder that contains `agents`. We walk
# upward and only change directory once such a folder is actually found, so
# re-running this cell never climbs further than the real root.
start_dir = os.getcwd()
repo_root = start_dir
while not os.path.isdir(os.path.join(repo_root, 'agents')):
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root:  # hit the filesystem root: nothing found
        repo_root = None
        break
    repo_root = parent_dir

if repo_root is None:
    # Stay where we are instead of guessing; later agent imports will fail
    # loudly, which is easier to debug than a silently wrong working directory.
    repo_root = start_dir
    print('WARNING: no `agents` folder found at or above', start_dir)
else:
    if repo_root != start_dir:
        os.chdir(repo_root)  # move up if notebook inside /notebooks
    print('Environment & Path Ready')

if repo_root not in sys.path:
    sys.path.append(repo_root)

print('Repo root:', repo_root)


Note: you may need to restart the kernel to use updated packages.
Environment & Path Ready
Repo root: C:\Users\rajni\Documents\breast-cancer-agentic


In [1]:
# Basic setup for EDA
%pip install -q pandas numpy matplotlib seaborn scikit-learn shap plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import warnings
# Ignore all warnings raised after this point so cell outputs stay uncluttered.
# NOTE(review): with no category/module given this is a blanket filter, so it
# also hides deprecation notices — consider narrowing it before sharing.
warnings.filterwarnings('ignore')


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import the data file.
# Prefer a frozen local copy so the notebook is reproducible and runs offline;
# on first use, download from GitHub and save that copy for later runs.
from pathlib import Path

URL = 'https://raw.githubusercontent.com/rajnimassoun/breast-cancer-agentic/main/data/raw/breast_cancer_with_columns.csv'
LOCAL_COPY = Path('data') / 'raw' / 'breast_cancer_with_columns.csv'  # relative to the working directory

if LOCAL_COPY.exists():
    df = pd.read_csv(LOCAL_COPY)
    print('Loaded from local copy:', df.shape)
else:
    df = pd.read_csv(URL)
    LOCAL_COPY.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(LOCAL_COPY, index=False)  # freeze what was downloaded
    print('Loaded from GitHub:', df.shape)

# Fail early if the file is not the dataset the rest of the notebook expects.
assert 'diagnosis' in df.columns, 'expected a `diagnosis` target column'

df.head()


Loaded from GitHub: (569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
### Data Source Rationale

The dataset used here, `breast_cancer_with_columns.csv`, is a cleaned version of the original UCI Breast Cancer Wisconsin (Diagnostic) dataset.  
Our AI agents were developed and tested on this processed file to ensure consistent results.  
The original raw dataset (without headers) was explored during early EDA and preprocessing steps handled by the agents, 
but this version is retained for reproducibility and clarity.


In [8]:
import sys, types, inspect
import importlib

# Load the EDA agent module and list every callable name it exposes
# (this includes callables the module itself imported).
m = importlib.import_module('agents.eda_agent')
fns = [n for n in dir(m) if callable(getattr(m, n))]
print('agents.eda_agent exports:\n', fns)

# Try to pick a sensible entrypoint: the first candidate name the module has.
candidate_names = ['run', 'main', 'eda', 'analyze', 'analyze_eda', 'execute', 'start']
entry = next((getattr(m, name) for name in candidate_names if name in fns), None)


def _call_entry(fn, dataset_path, target_col):
    """Call `fn` with the dataset path and target column in a form it accepts.

    Keywords `dataset_path=` / `target_col=` are used whenever the signature
    can bind them. Otherwise the values are passed positionally, as many as
    the function declares (0, 1 or 2), so an entrypoint with different
    parameter names does not fail with a TypeError about unexpected keywords.
    """
    try:
        sig = inspect.signature(fn)
    except (TypeError, ValueError):
        # Signature cannot be introspected: fall back to the plain keyword call.
        return fn(dataset_path=dataset_path, target_col=target_col)

    try:
        sig.bind(dataset_path=dataset_path, target_col=target_col)
    except TypeError:
        pass  # keyword names do not fit this signature; go positional below
    else:
        return fn(dataset_path=dataset_path, target_col=target_col)

    params = list(sig.parameters.values())
    positional = [p for p in params if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]
    accepts_varargs = any(p.kind is p.VAR_POSITIONAL for p in params)
    if accepts_varargs or len(positional) >= 2:
        return fn(dataset_path, target_col)
    if len(positional) == 1:
        return fn(dataset_path)
    return fn()


# If no obvious entrypoint, create a minimal adapter
if entry is None:
    print('No obvious EDA entrypoint found. Creating a minimal adapter `run_eda` here.')
    def run_eda(dataset_path, target_col='diagnosis'):
        """Minimal stand-in EDA: summarise the CSV at `dataset_path`.

        Drops an `id` column if present, encodes M/B labels in `target_col`
        as 1/0, displays the first ten rows of the transposed `describe()`
        table, and returns a dict with keys 'shape', 'class_balance',
        'missing_total' and 'describe_head'.
        """
        import pandas as pd
        df = pd.read_csv(dataset_path)
        if 'id' in df.columns:
            df = df.drop(columns=['id'])
        # "Not numeric" covers object columns as well as dedicated string
        # dtypes, which a plain `dtype == object` comparison would miss.
        if not pd.api.types.is_numeric_dtype(df[target_col]):
            df[target_col] = df[target_col].map({'M':1, 'B':0})
        out = {
            'shape': df.shape,
            'class_balance': df[target_col].value_counts().to_dict(),
            'missing_total': int(df.isna().sum().sum()),
            'describe_head': df.describe().T.head(10)
        }
        display(out['describe_head'])
        return out
else:
    print(f'Found entrypoint: {entry.__name__}')
    def run_eda(dataset_path, target_col='diagnosis'):
        """Standardized call into the agent's discovered entrypoint."""
        return _call_entry(entry, dataset_path, target_col)

print('run_eda is ready to call.')


agents.eda_agent exports:
 ['Path', 'encode_target', 'find_target_col', 'resolve_path', 'run', 'train_test_split']
Found entrypoint: run
run_eda is ready to call.


In [7]:
# import agent robustly + call it; fallback to built-in EDA if needed
import os, sys, importlib, inspect
from pathlib import Path
import pandas as pd

# Where the dataset lives and which column holds the label
DATA_URL = 'https://raw.githubusercontent.com/rajnimassoun/breast-cancer-agentic/main/data/raw/breast_cancer_with_columns.csv'
TARGET = 'diagnosis'

# 1) Find repo root containing agents/eda_agent.py
# Check the working directory first, then each ancestor in turn; the result
# is None when no folder on that chain holds the agent module.
here = Path.cwd()
repo_root = next(
    (
        folder
        for folder in (here, *here.parents)
        if (folder / 'agents' / 'eda_agent.py').exists()
    ),
    None,
)

def builtin_eda(url, target):
    """Run a minimal, self-contained EDA, used when the agent is unavailable.

    Parameters
    ----------
    url : str, path or file-like
        Location of the CSV; passed straight to `pandas.read_csv`.
    target : str
        Name of the label column.

    Returns
    -------
    dict
        'shape' (rows, columns after dropping `id`), 'class_balance'
        (label -> count), 'missing_total' (number of NaN cells) and
        'describe_T_head10' (first ten rows of the transposed `describe()`).

    Raises
    ------
    KeyError
        If `target` is not a column of the loaded data.
    """
    df = pd.read_csv(url)
    if 'id' in df.columns:
        df = df.drop(columns=['id'])
    # Encode M/B labels as 1/0. Testing for "not numeric" covers object
    # columns as well as dedicated string dtypes, which a plain
    # `dtype == object` comparison would miss (leaving labels unencoded).
    if not pd.api.types.is_numeric_dtype(df[target]):
        df[target] = df[target].map({'M':1,'B':0})
    out = {
        'shape': df.shape,
        'class_balance': df[target].value_counts().to_dict(),
        # Counted after encoding, so labels other than M/B show up as missing.
        'missing_total': int(df.isna().sum().sum()),
        'describe_T_head10': df.describe().T.head(10)
    }
    display(out['describe_T_head10'])
    print(' Fallback EDA completed')
    return out

# 2) Run the EDA agent when it can be found and called; otherwise (or on any
#    agent failure) fall back to the built-in EDA so the notebook still runs.
import traceback

if not repo_root:
    print(' Could not find agents/eda_agent.py — running fallback EDA.')
    res = builtin_eda(DATA_URL, TARGET)
else:
    # Keep the repo root first on sys.path without stacking a duplicate
    # entry every time this cell is re-run.
    root_str = str(repo_root)
    if root_str in sys.path:
        sys.path.remove(root_str)
    sys.path.insert(0, root_str)
    try:
        eda_mod = importlib.import_module('agents.eda_agent')
        if not hasattr(eda_mod, 'run'):
            print(' agents.eda_agent has no `run`; using fallback.')
            res = builtin_eda(DATA_URL, TARGET)
        else:
            run_fn = getattr(eda_mod, 'run')
            sig = inspect.signature(run_fn)
            params = [p for p in sig.parameters.values() if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]
            # Decide how many args to pass: none, the URL only, or URL + target,
            # depending on how many positional parameters `run` declares.
            call_args = (DATA_URL, TARGET)[:len(params)]
            res = run_fn(*call_args)
            print(' EDA agent finished successfully')
    except Exception as e:
        # Deliberate best-effort: any agent failure falls back to the built-in
        # EDA, but the full traceback is shown so the root cause stays visible.
        print(' Agent import/call failed:', e, '\n→ Running fallback EDA.')
        traceback.print_exc()
        res = builtin_eda(DATA_URL, TARGET)

# Light preview of result object: its type and, for a dict, its first ten keys.
# (No try/except here: a bare `except` would also swallow KeyboardInterrupt,
# and nothing below is expected to fail for a dict result.)
print('Type:', type(res))
if isinstance(res, dict):
    print('Keys:', list(res.keys())[:10])


⚠️ Agent import/call failed: string indices must be integers, not 'str' 
→ Running fallback EDA.


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diagnosis,569.0,0.372583,0.483918,0.0,0.0,0.0,1.0,1.0
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304


✅ Fallback EDA completed
Type: <class 'dict'>
Keys: ['shape', 'class_balance', 'missing_total', 'describe_T_head10']
