In [None]:
%pip install ipywidgets --quiet

import os
import time
import pandas as pd
from IPython.display import display

# try to import ipywidgets but tolerate environments where it's unavailable
try:
    import ipywidgets as widgets
except Exception as e:
    widgets = None
    print("Warning: ipywidgets could not be imported. Upload widget will be disabled. Error:", e)

fn = "breast_cancer.csv"

# try to locate in current tree
if not os.path.exists(fn):
    found = None
    for root, _, files in os.walk(os.getcwd()):
        if fn in files:
            found = os.path.join(root, fn)
            break
    if found:
        fn = found
        print("Found file at:", fn)
    else:
        # If widgets available, offer upload widget; otherwise instruct user how to proceed
        if widgets is None:
            print("File not found. ipywidgets not available, so upload widget is disabled.")
            print("Please place 'breast_cancer.csv' in the current working directory or set 'fn' to the correct absolute path and re-run.")
        else:
            print("File not found. Please upload it using the widget below (accepts .csv).")
            uploader = widgets.FileUpload(accept='.csv', multiple=False)
            display(uploader)

            # wait for upload (simple loop)
            while not uploader.value:
                time.sleep(0.5)
            # handle different ipywidgets versions (value can be dict or tuple)
            uploaded = next(iter(uploader.value.values())) if isinstance(uploader.value, dict) else uploader.value[0]
            content = uploaded.get('content', uploaded.get('content', None)) if isinstance(uploaded, dict) else uploaded['content']
            filename = uploaded.get('name') if isinstance(uploaded, dict) else uploaded['name']
            out_path = os.path.join(os.getcwd(), filename)
            with open(out_path, "wb") as f:
                f.write(content)
            fn = out_path
            print("Saved uploaded file to:", fn)

# Load the CSV at `fn` into `df`. `df` stays None when the file is missing
# or cannot be read, so the diagnostics that follow can report the failure
# instead of the whole cell aborting with a traceback.
df = None
if os.path.exists(fn):
    try:
        df = pd.read_csv(fn)
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError, OSError) as exc:
        print("Failed to read", fn, "-", repr(exc))
    else:
        print("Loaded:", fn, "shape:", df.shape)
else:
    print("CSV still not found. Set 'fn' to the correct absolute path and re-run.")

def coerce_numeric(series):
    """Coerce *series* to numbers after removing thousands separators.

    Values are stringified, commas are stripped literally (``regex=False`` so
    the behaviour does not depend on the pandas version's default), and the
    result is passed to ``pd.to_numeric``; anything unparseable becomes NaN.
    """
    cleaned = series.astype(str).str.replace(",", "", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")


if df is not None:
    print("Columns (repr):", [repr(c) for c in df.columns])
    print("\nHead:")
    print(df.head())

    # If single-column, attempt auto-detect separator
    if df.shape[1] == 1:
        print("\nSingle column detected, retrying with sep=None (engine='python') to auto-detect delimiter...")
        try:
            df2 = pd.read_csv(fn, sep=None, engine="python")
            print("Sniffed shape:", df2.shape)
            print("Columns (repr):", [repr(c) for c in df2.columns])
            print(df2.head())
            df = df2
        except Exception as e:
            # Best-effort retry: report the failure and keep the original frame.
            print("Auto-sniff failed:", repr(e))

    # Find diagnosis-like column (allow whitespace/case differences)
    diag_col = next(
        (c for c in df.columns if str(c).strip().lower() == "diagnosis"),
        None,
    )
    if diag_col is None:
        print("\nNo exact 'diagnosis' column found. Columns (repr) above — inspect for typos/trailing spaces.")
    else:
        print("\nDiagnosis column detected as repr:", repr(diag_col))
        print("Unique diagnosis values (repr, up to 50):", [repr(x) for x in df[diag_col].dropna().unique()[:50]])

    # Candidate feature columns (exclude id-like/diagnosis/priority)
    exclude = {c for c in df.columns if str(c).strip().lower() in {"id", "diagnosis", "priority"}}
    candidate_cols = [c for c in df.columns if c not in exclude]
    print("\nCandidate feature columns:", candidate_cols)

    # Coerce each candidate once; both reports below use the same result so
    # they cannot disagree about what counts as numeric.
    coerced_by_col = {c: coerce_numeric(df[c]) for c in candidate_cols}

    # Show sample non-numeric entries for each candidate column.
    # "Non-numeric" means: present in the data but not parseable as a number.
    # Missing values are excluded here; they show up under na_after below.
    print("\nNon-numeric samples per candidate column (up to 10):")
    for c in candidate_cols:
        nonnum_mask = df[c].notna() & coerced_by_col[c].isna()
        samples = df[c][nonnum_mask].astype(str).unique()[:10]
        print(f" - {repr(c)}: {list(samples)} (count non-numeric: {int(nonnum_mask.sum())})")

    # Coercion summary
    print("\nCoercion attempt summary (before_nonnull, after_numeric_nonnull, na_after):")
    for c in candidate_cols:
        coerced = coerced_by_col[c]
        before = int(df[c].notna().sum())
        after = int(coerced.notna().sum())
        na_after = int(coerced.isna().sum())
        print(f" - {repr(c)}: ({before}, {after}, {na_after})")

    print("\nSample rows of candidate columns (first 10):")
    print(df[candidate_cols].head(10))

else:
    print("\nCould not load the CSV into a DataFrame. Check file encoding/delimiter or file path.")