In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries
**Purpose:** Import all the required libraries for ML modeling, explainability, visualization, and Google ADK/ Gemini integration.

In [2]:
import os
import json
import logging
from typing import Dict, Any, List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from lightgbm import LGBMClassifier
import shap
# Gemini
from google import genai
from google.genai import types as genai_types
# ADK core
from google.adk.agents import LlmAgent
from google.adk.runners import InMemoryRunner
from google.adk.tools import AgentTool
from google.adk.sessions import InMemorySessionService
from google.adk.models import LlmRequest
from google.adk.a2a.utils.agent_to_a2a import to_a2a
print("Imports complete.")

Imports complete.


## Configuration and Authentication
**Purpose:** Set up logging, authenticate with Google API, initialize Gemini client, and create global memory storage.

In [3]:
import os
from kaggle_secrets import UserSecretsClient

logging.basicConfig(
   level=logging.INFO,
   format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
logger = logging.getLogger("mlinsightx")
#Gemini Config
try:
    GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
    print("âœ… Setup and authentication complete.")
except Exception as e:
    print(
        f"ðŸ”‘ Authentication Error: Please make sure you have added 'GOOGLE_API_KEY' to your Kaggle secrets. Details: {e}"
    )

GEMINI_MODEL = "gemini-2.0-flash-exp" 
if GOOGLE_API_KEY:
   genai_client = genai.Client(api_key=GOOGLE_API_KEY)
   logger.info("Gemini client initialized with model %s", GEMINI_MODEL)
else:
   genai_client = None
   logger.warning("No GOOGLE_API_KEY found. Using fallback text only.")

artifacts: Dict[str, Any] = {}
print("Config + logging ready; MemoryBank initialized.")

âœ… Setup and authentication complete.
Config + logging ready; MemoryBank initialized.


## Load Dataset
**Purpose:** Load the Bank Marketing dataset from Kaggle input directory.

In [5]:
from pathlib import Path
def load_bank_marketing() -> pd.DataFrame:
   logger.info("Loading Bank Marketing dataset from local files...")
   possible_paths =  [
        "/kaggle/input/bank-csv/bank.csv",
    ]
   for p in possible_paths:
       if Path(p).exists():
           logger.info("Found dataset at: %s", p)
           df = pd.read_csv(p)
          # df = pd.read_csv(p, sep=";")
           print ("Using dataset:", p)
           return df
   raise FileNotFoundError(
       "bank.csv not found. Please attach a Bank Marketing dataset to this Kaggle notebook."
   )
df = load_bank_marketing()

print(df.head())
print("Shape:", df.shape)

Using dataset: /kaggle/input/bank-csv/bank.csv
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  
Shape: (4521, 17)


## Data Preprocessing & Model Training
**Purpose:** Prepare data, build ML pipeline with preprocessing, train LightGBM classifier, and evaluate performance. 

In [6]:
TARGET_COL = "y"
assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found."

y = (df[TARGET_COL].astype(str).str.lower().isin(["yes", "1", "true", "success"])).astype(int)
X = df.drop(columns=[TARGET_COL])
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(exclude=["int64", "float64"]).columns.tolist()
logger.info("Numeric features: %s", numeric_features)
logger.info("Categorical features: %s", categorical_features)
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42, stratify=y
)
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
   steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer(
   transformers=[
       ("num", numeric_transformer, numeric_features),
       ("cat", categorical_transformer, categorical_features),
   ]
)
lgbm = LGBMClassifier(
   random_state=42,
   n_estimators=200,
   learning_rate=0.05,
   max_depth=-1,
)
pipeline = Pipeline(
   steps=[
       ("preprocessor", preprocessor),
       ("model", lgbm),
   ]
)
logger.info("Training LightGBM pipeline...")
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]
metrics = {
   "accuracy": float(accuracy_score(y_test, y_pred)),
   "f1": float(f1_score(y_test, y_pred)),
   "roc_auc": float(roc_auc_score(y_test, y_proba)),
}
logger.info("Metrics: %s", metrics)

artifacts["pipeline"] = pipeline
artifacts["X_train"] = X_train
artifacts["X_test"] = X_test
artifacts["y_train"] = y_train
artifacts["y_test"] = y_test
artifacts["metrics"] = metrics
artifacts["numeric_features"] = numeric_features
artifacts["categorical_features"] = categorical_features
print("Training complete. Metrics:", metrics)

[LightGBM] [Info] Number of positive: 417, number of negative: 3199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 953
[LightGBM] [Info] Number of data points in the train set: 3616, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115321 -> initscore=-2.037507
[LightGBM] [Info] Start training from score -2.037507
Training complete. Metrics: {'accuracy': 0.8950276243093923, 'f1': 0.46327683615819215, 'roc_auc': 0.9093560933448573}


## Explainability Functions
**Purpose:** Define functions for computing SHAP values, permutation importance, and partial dependence plots.

In [7]:
def compute_shap_values(
   pipeline: Pipeline,
   X_background: pd.DataFrame,
   X_sample: pd.DataFrame
):
   """
   Computes SHAP values using LightGBM's TreeExplainer.
   """
   logger.info("Computing SHAP values (TreeExplainer, LightGBM)...")
   model = pipeline.named_steps["model"]
   preprocessor = pipeline.named_steps["preprocessor"]
   # Transform background & sample using the pipeline's preprocessor
   X_bg_trans = preprocessor.transform(X_background)
   X_sample_trans = preprocessor.transform(X_sample)
   # Feature names after transformation
   feature_names = preprocessor.get_feature_names_out()
   # TreeExplainer works directly on model inputs
   explainer = shap.TreeExplainer(model)
   shap_values = explainer.shap_values(X_sample_trans)
   logger.info("SHAP calculation done. Shape: %s", np.array(shap_values).shape)
   return {
       "shap_values": shap_values,
       "expected_value": explainer.expected_value,
       "feature_names": list(feature_names),
   }

def compute_permutation_importance(
   pipeline: Pipeline,
   X_test: pd.DataFrame,
   y_test: pd.Series
):
   """
   Computes permutation importance using transformed input.
   """
   logger.info("Computing permutation importance...")
   preprocessor = pipeline.named_steps["preprocessor"]
   model = pipeline.named_steps["model"]
   # Transform before permutation importance
   X_test_trans = preprocessor.transform(X_test)
   feature_names = preprocessor.get_feature_names_out()
   result = permutation_importance(
       model,
       X_test_trans,
       y_test,
       n_repeats=10,
       random_state=42,
       n_jobs=-1,
   )
   logger.info("Permutation importance done.")
   return {
       "importances_mean": result.importances_mean.tolist(),
       "importances_std": result.importances_std.tolist(),
       "feature_names": list(feature_names),
   }

def compute_pdp(
   pipeline: Pipeline,
   X: pd.DataFrame,
   features: List[str],
   output_dir: str = "pdp_plots",
):
   """
   Computes PDP for transformed features.
   """
   logger.info("Computing PDP for features: %s", features)
   os.makedirs(output_dir, exist_ok=True)
   preprocessor = pipeline.named_steps["preprocessor"]
   model = pipeline.named_steps["model"]
   # Transform X
   X_trans = preprocessor.transform(X)
   feature_names = list(preprocessor.get_feature_names_out())
   saved_paths = []
   # Map categorical names to feature indices
   name_to_index = {name: idx for idx, name in enumerate(feature_names)}
   for feat in features:

       matching_features = [
           name for name in feature_names if name.startswith(feat + "_") or name == feat
       ]
       if not matching_features:
           logger.warning("Feature %s not found in transformed feature set.", feat)
           continue
       for mf in matching_features:
           idx = name_to_index[mf]
           fig, ax = plt.subplots()
           try:
               PartialDependenceDisplay.from_estimator(
                   model,
                   X_trans,
                   [idx],
                   kind="average",
                   ax=ax,
                   grid_resolution=50
               )
               ax.set_title(f"PDP - {mf}")
               path = os.path.join(output_dir, f"pdp_{mf}.png")
               fig.savefig(path, bbox_inches="tight")
               plt.close(fig)
               saved_paths.append(path)
           except Exception as e:
               logger.warning(f"PDP failed for {mf}: {e}")
               plt.close(fig)
               continue
   logger.info("PDP generation done. Saved %d plots.", len(saved_paths))
   return saved_paths

print("Explainability functions redefined (SHAP, Perm, PDP).")

Explainability functions redefined (SHAP, Perm, PDP).


## Execute Explainability Analysis
**Purpose**: Run all explainability tools and create a global explanation summary.

In [8]:
logger.info("Running explainability tools...")
pipeline = artifacts["pipeline"]
X_train = artifacts["X_train"]
X_test = artifacts["X_test"]
y_test = artifacts["y_test"]
# 1. Prepare SHAP background 
X_background = X_train.sample(n=min(200, len(X_train)), random_state=42)
X_sample = X_test.sample(n=min(200, len(X_test)), random_state=42)
# 2. Run SHAP
shap_result = compute_shap_values(
   pipeline=pipeline,
   X_background=X_background,
   X_sample=X_sample
)
# 3. Run Permutation Importance
perm_result = compute_permutation_importance(
   pipeline=pipeline,
   X_test=X_test,
   y_test=y_test
)
# 4. Run PDP for top features 
feature_names = shap_result["feature_names"]
pdp_paths = compute_pdp(
   pipeline=pipeline,
   X=X_test,
   features=list(set([fn.split("_")[0] for fn in feature_names])), 
   output_dir="pdp_plots"
)
# 5. Store results
artifacts["shap_result"] = shap_result
artifacts["perm_result"] = perm_result
artifacts["pdp_paths"] = pdp_paths
print("Explainability completed.")
print("SHAP values shape:", np.array(shap_result["shap_values"]).shape)
print("Permutation features:", perm_result["feature_names"])
print("Saved PDP plots:", len(pdp_paths))

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
Attempting to set identical low and high ylims makes transformation singular; automatically expanding.
Attempting to set identical low and high ylims makes transformation singular; automatically expanding.
Attempting to set identical low and high ylims makes transformation singular; automatically expanding.


Explainability completed.
SHAP values shape: (2, 200, 51)
Permutation features: ['num__age', 'num__balance', 'num__day', 'num__duration', 'num__campaign', 'num__pdays', 'num__previous', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur', 'cat__job_housemaid', 'cat__job_management', 'cat__job_retired', 'cat__job_self-employed', 'cat__job_services', 'cat__job_student', 'cat__job_technician', 'cat__job_unemployed', 'cat__job_unknown', 'cat__marital_divorced', 'cat__marital_married', 'cat__marital_single', 'cat__education_primary', 'cat__education_secondary', 'cat__education_tertiary', 'cat__education_unknown', 'cat__default_no', 'cat__default_yes', 'cat__housing_no', 'cat__housing_yes', 'cat__loan_no', 'cat__loan_yes', 'cat__contact_cellular', 'cat__contact_telephone', 'cat__contact_unknown', 'cat__month_apr', 'cat__month_aug', 'cat__month_dec', 'cat__month_feb', 'cat__month_jan', 'cat__month_jul', 'cat__month_jun', 'cat__month_mar', 'cat__month_may', 'cat__month_nov', 'ca

In [9]:
# Build Global Explanation Text
shap_text = f"Top SHAP features: {shap_result['feature_names'][:10]}"
perm_text = f"Top permutation features: {perm_result['feature_names'][:10]}"
pdp_text = f"PDP plots saved: {len(pdp_paths)}"
global_explanation = (
   "MODEL EXPLAINABILITY SUMMARY\n\n"
   "Key SHAP drivers:\n"
   f"{shap_text}\n\n"
   "Important features (permutation importance):\n"
   f"{perm_text}\n\n"
   "PDP summary:\n"
   f"{pdp_text}\n"
)
# Save this combined text block for Narrative Agent
artifacts["global_explanation_text"] = global_explanation
print("Global explanation summary created.")

Global explanation summary created.


## Generate Business Narrative with Gemini
**Purpose**: Use Gemini to create a stakeholder-friendly narrative explaining the model.

In [10]:
NARRATIVE_SYSTEM_PROMPT = """
You are an expert data scientist explaining a binary classification model
to business stakeholders. Summarize:
- What the model predicts
- The most important drivers from permutation importance and PDP
- Key patterns (e.g., thresholds, saturation)
- 3 concise, actionable recommendations
Keep it under 200 words. Avoid equations and low-level technical jargon.
"""
def generate_narrative_with_gemini(
   metrics: Dict[str, float],
   global_explanation_text: str,
   target_description: str = "whether a customer subscribes to a term deposit",
) -> str:
   if genai_client is None:
       logger.warning("No Gemini client; using fallback narrative.")
       return (
           f"This model predicts {target_description} using campaign, customer, and contact features. "
           f"On the held-out test data it reaches accuracy of {metrics.get('accuracy', 0):.3f} and ROC-AUC of "
           f"{metrics.get('roc_auc', 0):.3f}. The most important drivers include call duration and certain "
           "campaign attributes, which influence the probability of subscription. Longer and more targeted "
           "conversations tend to increase conversion, while repeated contacts with poor timing reduce it. "
           "Recommended next steps: monitor these top features, refine contact strategies for high-potential "
           "segments, and run targeted experiments to improve uplift."
       )
   user_prompt = f"""
MODEL METRICS:
{json.dumps(metrics, indent=2)}
GLOBAL EXPLANATIONS:
{global_explanation_text}
TARGET:
{target_description}
"""
   try:
       response = genai_client.models.generate_content(
           model=GEMINI_MODEL,
           contents=user_prompt,
           config=genai_types.GenerateContentConfig(
               temperature=1.0,
               max_output_tokens=400,
               system_instruction=NARRATIVE_SYSTEM_PROMPT,
           ),
       )
       text = response.text.strip()
       logger.info("Narrative generated with Gemini, length=%d chars.", len(text))
       return text
   except Exception as e:
       logger.error("Error calling Gemini for narrative: %s", e)
       return (
           f"This model predicts {target_description}. It achieves accuracy {metrics.get('accuracy', 0):.3f} "
           f"and ROC-AUC {metrics.get('roc_auc', 0):.3f}. Features related to campaign interactions and "
           "call duration appear most important. Longer, well-timed calls tend to increase subscription odds, "
           "while repeated unsuccessful contacts may hurt performance. Recommended actions: monitor these "
           "features, adjust call strategies, and test targeted outreach to high-probability groups."
       )
metrics = artifacts["metrics"]
global_explanation_text = artifacts["global_explanation_text"]
narrative_text = generate_narrative_with_gemini(
   metrics=metrics,
   global_explanation_text=global_explanation_text,
)
artifacts["narrative_text"] = narrative_text
print(narrative_text)

This model predicts whether a customer will subscribe to a term deposit. It's quite accurate overall (89.5% accuracy, AUC 0.91) but less precise at identifying *who will subscribe* (F1 score 0.46).

The most important factors are the customer's age, account balance, the day of the month they were last contacted, the duration of the last call, and the number of contacts during the campaign. Also influential are attributes of their job.

Here are some key patterns:
*   Older customers are more likely to subscribe
*   Customers with higher balances are more likely to subscribe

Recommendations:

1.  **Target older customers and those with higher balances.** Focus marketing efforts on these segments as they show a higher propensity to subscribe.
2.  **Increase call duration.** Longer call durations have a positive impact on subscription rates.
3.  **Optimize contact timing.** Focus on making calls on the days of the month when subscription rates are highest.


## LLM-as-Judge Evaluation
**Purpose**: Use Gemini to evaluate the quality of the generated narrative.

In [11]:
JUDGE_SYSTEM_RUBRIC = """
You are an expert ML reviewer evaluating an explanation of a supervised ML model.
You will receive:
- DATASET SUMMARY
- MODEL SUMMARY
- GLOBAL EXPLANATIONS
- NARRATIVE
Score the narrative in [0,1] for:
- technical accuracy
- coverage of key drivers
- clarity for stakeholders
- actionability 
Return ONLY a JSON object:
{
 "score_overall": <float>,
 "score_technical": <float>,
 "score_coverage": <float>,
 "score_clarity": <float>,
 "score_actionable": <float>,
 "verdict": "pass" or "needs_improvement",
 "comment": "short summary"
}
"""
def judge_explanation_with_gemini(
   dataset_summary: str,
   model_summary: str,
   global_explanations_text: str,
   narrative_text: str,
) -> Dict[str, Any]:
   if genai_client is None:
       logger.warning("No Gemini client; using fallback judge scores.")
       return {
           "score_overall": 0.8,
           "score_technical": 0.8,
           "score_coverage": 0.8,
           "score_clarity": 0.8,
           "score_actionable": 0.7,
           "verdict": "pass",
           "comment": "Fallback judge: narrative appears reasonable but Gemini was unavailable.",
       }
   prompt = f"""
DATASET SUMMARY:
{dataset_summary}
MODEL SUMMARY:
{model_summary}
GLOBAL EXPLANATIONS:
{global_explanations_text}
NARRATIVE:
{narrative_text}
"""
   try:
       response = genai_client.models.generate_content(
           model=GEMINI_MODEL,
           contents=prompt,
           config=genai_types.GenerateContentConfig(
               temperature=0.2,
               max_output_tokens=512,
               system_instruction=JUDGE_SYSTEM_RUBRIC,
           ),
       )
       raw = response.text.strip()
       try:
           result = json.loads(raw)
       except json.JSONDecodeError:
           start = raw.find("{")
           end = raw.rfind("}")
           result = json.loads(raw[start : end + 1])
       logger.info("Judge result: %s", result)
       return result
   except Exception as e:
       logger.error("Error calling Gemini judge: %s", e)
       return {
           "score_overall": 0.6,
           "score_technical": 0.6,
           "score_coverage": 0.6,
           "score_clarity": 0.6,
           "score_actionable": 0.5,
           "verdict": "needs_improvement",
           "comment": "LLM judge failed; rely on metrics and plots instead.",
       }
dataset_summary = (
   "Bank Marketing dataset: binary classification predicting if a customer subscribes "
   "to a term deposit based on demographics, contact history, and campaign features."
)
model_summary = (
   "LightGBM classifier with numeric scaling and one-hot encoding for categoricals. "
   f"Test metrics: accuracy={metrics['accuracy']:.3f}, F1={metrics['f1']:.3f}, "
   f"ROC-AUC={metrics['roc_auc']:.3f}."
)
judge_result = judge_explanation_with_gemini(
   dataset_summary=dataset_summary,
   model_summary=model_summary,
   global_explanations_text=global_explanation_text,
   narrative_text=narrative_text,
)
artifacts["judge_result"] = judge_result
print(json.dumps(judge_result, indent=2))

{
  "score_overall": 0.85,
  "score_technical": 0.9,
  "score_coverage": 0.8,
  "score_clarity": 0.9,
  "score_actionable": 0.8,
  "verdict": "pass",
  "comment": "Good explanation of the model and its predictions. It covers the key drivers and provides actionable recommendations. Could benefit from mentioning the limitations of relying solely on global explanations and the importance of considering individual predictions."
}


## Generate Reports
**Purpose**: Save results as JSON and PDF reports for stakeholders.

In [19]:
def save_json_report(
   artifacts: Dict[str, Any],
   path: str = "mlinsightx_report.json",
) -> str:
   data = {
       "metrics": artifacts.get("metrics", {}),
       "global_explanation_text": artifacts.get("global_explanation_text", ""),
       "narrative_text": artifacts.get("narrative_text", ""),
       "judge_result": artifacts.get("judge_result", {}),
       "pdp_features": artifacts.get("pdp_features", []),
       "pdp_paths": artifacts.get("pdp_paths", []),
   }
   with open(path, "w") as f:
       json.dump(data, f, indent=2)
   logger.info("JSON report saved at %s", path)
   return path
    
def save_pdf_report_simple(
   metrics: Dict[str, float],
   narrative_text: str,
   judge_result: Dict[str, Any],
   path: str = "mlinsightx_report.pdf",
) -> str:

   fig, ax = plt.subplots(figsize=(8.5, 11))
   ax.axis("off")
   lines = []
   lines.append("MLInsightX â€“ Model Explainer Summary")
   lines.append("")
   lines.append(f"Accuracy: {metrics.get('accuracy', 0):.3f}")
   lines.append(f"F1 Score: {metrics.get('f1', 0):.3f}")
   lines.append(f"ROC-AUC: {metrics.get('roc_auc', 0):.3f}")
   lines.append("")
   lines.append("Narrative:")
   for line in narrative_text.split("\n"):
       lines.append(line)
   lines.append("")
   lines.append("LLM-as-Judge Summary:")
   lines.append(json.dumps(judge_result, indent=2))
   y = 0.95
   for line in lines:
       ax.text(
           0.05,
           y,
           line[:120],
           fontsize=8,
           va="top",
           wrap=True,
       )
       y -= 0.03
       if y < 0.05:
           break
   fig.savefig(path, bbox_inches="tight")
   plt.close(fig)
   logger.info("PDF report saved at %s", path)
   return path
json_path = save_json_report(artifacts)
pdf_path = save_pdf_report_simple(
   metrics=artifacts["metrics"],
   narrative_text=artifacts["narrative_text"],
   judge_result=artifacts["judge_result"],
)
artifacts["json_report_path"] = json_path
artifacts["pdf_report_path"] = pdf_path
print("JSON report:", json_path)
print("PDF report:", pdf_path)

JSON report: mlinsightx_report.json
PDF report: mlinsightx_report.pdf


## Access Reports
**Purpose**: Create download links and preview the generated reports.

In [23]:
from IPython.display import FileLink, display
import json
print("=" * 60)
print("REPORTS GENERATED")
print("=" * 60)
# JSON Report
print("\nðŸ“„ JSON Report:")
print(f"Location: {artifacts['json_report_path']}")
display(FileLink(artifacts['json_report_path']))
# Preview JSON content
with open(artifacts['json_report_path'], 'r') as f:
   report_data = json.load(f)
   print("\nJSON Preview:")
   print(json.dumps(report_data, indent=2)[:500] + "...")
# PDF Report  
print("\nðŸ“‹ PDF Report:")
print(f"Location: {artifacts['pdf_report_path']}")
display(FileLink(artifacts['pdf_report_path']))

REPORTS GENERATED

ðŸ“„ JSON Report:
Location: mlinsightx_report.json



JSON Preview:
{
  "metrics": {
    "accuracy": 0.8950276243093923,
    "f1": 0.46327683615819215,
    "roc_auc": 0.9093560933448573
  },
  "global_explanation_text": "MODEL EXPLAINABILITY SUMMARY\n\nKey SHAP drivers:\nTop SHAP features: ['num__age', 'num__balance', 'num__day', 'num__duration', 'num__campaign', 'num__pdays', 'num__previous', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur']\n\nImportant features (permutation importance):\nTop permutation features: ['num__age', 'num__balance',...

ðŸ“‹ PDF Report:
Location: mlinsightx_report.pdf


## Define ADK Callbacks
**Purpose**: Create logging callbacks for agent and model interactions.

In [13]:
def before_agent_callback(**kwargs):
   callback_context = kwargs.get('callback_context')
   logger.info(
       "â–¶ BEFORE_AGENT | agent=%s | invocation_id=%s",
       getattr(callback_context, "agent_name", "unknown"),
       getattr(callback_context, "invocation_id", "n/a"),
   )
   return None
def after_agent_callback(**kwargs):
   callback_context = kwargs.get('callback_context')
   logger.info(
       "â—€ AFTER_AGENT | agent=%s | invocation_id=%s",
       getattr(callback_context, "agent_name", "unknown"),
       getattr(callback_context, "invocation_id", "n/a"),
   )
   return None
def before_model_callback(**kwargs):
   callback_context = kwargs.get('callback_context')
   llm_request = kwargs.get('llm_request')
   logger.info(
       "ðŸ’¬ BEFORE_MODEL | agent=%s | invocation_id=%s",
       getattr(callback_context, "agent_name", "unknown"),
       getattr(callback_context, "invocation_id", "n/a"),
   )
   return None
def after_model_callback(**kwargs):
   callback_context = kwargs.get('callback_context')
   llm_response = kwargs.get('llm_response')
   text_preview = ""
   try:
       if llm_response and hasattr(llm_response, 'content'):
           text_preview = llm_response.content.parts[0].text[:200]
   except Exception:
       pass
   logger.info(
       "ðŸ’¬ AFTER_MODEL | agent=%s | invocation_id=%s | preview=%s",
       getattr(callback_context, "agent_name", "unknown"),
       getattr(callback_context, "invocation_id", "n/a"),
       text_preview.replace("\n", " "),
   )
   return None
print("ADK callbacks defined.")

ADK callbacks defined.


## Define Agent Tools
**Purpose**: Create tool functions that agents can use to access artifacts.


In [14]:
#cell11
def trainer_summary_tool() -> dict:
   return {
       "metrics": artifacts.get("metrics", {}),
       "target": "Predict term deposit subscription (y) in the Bank Marketing dataset.",
       "n_train": len(artifacts.get("X_train", [])),
       "n_test": len(artifacts.get("X_test", [])),
   }
def explainer_summary_tool() -> dict:
   return {
       "global_explanation_text": artifacts.get("global_explanation_text", ""),
       "pdp_features": artifacts.get("pdp_features", []),
       "pdp_paths": artifacts.get("pdp_paths", []),
   }
def narrative_summary_tool() -> dict:
   return {
       "narrative_text": artifacts.get("narrative_text", ""),
   }
def judge_summary_tool() -> dict:
   return artifacts.get("judge_result", {})
def report_paths_tool() -> dict:
   return {
       "json_report_path": artifacts.get("json_report_path", None),
       "pdf_report_path": artifacts.get("pdf_report_path", None),
   }
print("Summary tools for Trainer/Explainer/Narrative/Judge/Reporter defined.")

Summary tools for Trainer/Explainer/Narrative/Judge/Reporter defined.


## Create Sub-Agents
**Purpose**: Define sub-agents for each component of the explainability pipeline.

In [15]:
trainer_agent = LlmAgent(
   name="trainer_agent",
   model=GEMINI_MODEL if GOOGLE_API_KEY else GEMINI_MODEL,
   description="Trainer Agent: summarizes the trained ML model and metrics.",
   instruction=(
       "You are the Trainer Agent. Use the trainer_summary_tool to inspect model performance "
       "and dataset sizes. Provide a short summary (3â€“5 sentences) of how well the model is doing."
   ),
   tools=[trainer_summary_tool],
   before_agent_callback=before_agent_callback,
   after_agent_callback=after_agent_callback,
   before_model_callback=before_model_callback,
   after_model_callback=after_model_callback,
)
explainer_agent = LlmAgent(
   name="explainer_agent",
   model=GEMINI_MODEL if GOOGLE_API_KEY else GEMINI_MODEL,
   description="Explainer Agent: summarizes SHAP / permutation / PDP insights.",
   instruction=(
       "You are the Explainer Agent. Use explainer_summary_tool to read the global explanation text "
       "and PDP features. Summarize the key drivers and any patterns (e.g., thresholds) in 3â€“5 sentences."
   ),
   tools=[explainer_summary_tool],
   before_agent_callback=before_agent_callback,
   after_agent_callback=after_agent_callback,
   before_model_callback=before_model_callback,
   after_model_callback=after_model_callback,
)
narrative_agent = LlmAgent(
   name="narrative_agent",
   model=GEMINI_MODEL if GOOGLE_API_KEY else GEMINI_MODEL,
   description="Narrative Agent: refines the explanation into a stakeholder-facing story.",
   instruction=(
       "You are the Narrative Agent. Use narrative_summary_tool to read the existing narrative text. "
       "Optionally refine or tighten the narrative for a business stakeholder, keeping it under 200 words."
   ),
   tools=[narrative_summary_tool],
   before_agent_callback=before_agent_callback,
   after_agent_callback=after_agent_callback,
   before_model_callback=before_model_callback,
   after_model_callback=after_model_callback,
)
judge_agent = LlmAgent(
   name="judge_agent",
   model=GEMINI_MODEL if GOOGLE_API_KEY else GEMINI_MODEL,
   description="Judge Agent: summarizes the quality evaluation of the narrative.",
   instruction=(
       "You are the Judge Agent. Use judge_summary_tool to see numeric scores and the verdict. "
       "Give a concise summary (2â€“3 sentences) on whether the explanation is strong and what could be improved."
   ),
   tools=[judge_summary_tool],
   before_agent_callback=before_agent_callback,
   after_agent_callback=after_agent_callback,
   before_model_callback=before_model_callback,
   after_model_callback=after_model_callback,
)
reporter_agent = LlmAgent(
   name="reporter_agent",
   model=GEMINI_MODEL if GOOGLE_API_KEY else GEMINI_MODEL,
   description="Reporter Agent: tells the user where to find JSON/PDF artifacts.",
   instruction=(
       "You are the Reporter Agent. Use report_paths_tool to get the JSON and PDF report paths. "
       "Summarize what artifacts exist and how they could be used."
   ),
   tools=[report_paths_tool],
   before_agent_callback=before_agent_callback,
   after_agent_callback=after_agent_callback,
   before_model_callback=before_model_callback,
   after_model_callback=after_model_callback,
)
print("Trainer, Explainer, Narrative, Judge, Reporter agents defined.")

Trainer, Explainer, Narrative, Judge, Reporter agents defined.


## Create Orchestrator & Runner
**Purpose**: Build the orchestrator agent that coordinates all sub-agents and initialize the runner.

In [16]:
callbacks = {
   "before_agent_callback": before_agent_callback,
   "after_agent_callback": after_agent_callback,
   "before_model_callback": before_model_callback,
   "after_model_callback": after_model_callback,
}
trainer_tool = AgentTool(trainer_agent)
explainer_tool = AgentTool(explainer_agent)
narrative_tool = AgentTool(narrative_agent)
judge_tool = AgentTool(judge_agent)
reporter_tool = AgentTool(reporter_agent)
# Orchestrator instruction
orchestrator_instruction = """
You are the Orchestrator Agent for the Insight system.
Follow this workflow:
1. Call trainer_agent to summarize model performance.
2. Call explainer_agent to summarize SHAP, permutation, PDP.
3. Call narrative_agent to create a business-ready explanation.
4. Call judge_agent to evaluate explanation quality.
5. Call reporter_agent to notify about saved JSON/PDF outputs.
Return a structured, multi-paragraph overview for stakeholders.
"""
# Orchestrator Agent 
orchestrator_agent = LlmAgent(
   name="orchestrator_agent",
   model=GEMINI_MODEL,
   instruction=orchestrator_instruction,
   tools=[
       trainer_tool,
       explainer_tool,
       narrative_tool,
       judge_tool,
       reporter_tool
   ],
   **callbacks
)
print("Orchestrator agent defined.")
# Runner
runner = InMemoryRunner(agent=orchestrator_agent)
print("Runner initialized.")

Orchestrator agent defined.
Runner initialized.


In [17]:
# Runner Run
_ = await runner.run_debug(
   "Give me full overview"
)


 ### Created new session: debug_session_id

User > Give me full overview




orchestrator_agent > Here's an overview of the model's performance and key insights:

**Model Performance:** The model demonstrates strong overall prediction performance with an accuracy of 89.5% on the test set and a high ROC AUC score of 0.91, indicating its ability to distinguish between positive and negative classes. However, the F1 score is 0.46, suggesting that the model has relatively low precision and recall. It was trained on 3,616 samples and tested on 905 samples, predicting term deposit subscriptions.

**Key Drivers:** Model predictions are significantly influenced by numerical features such as age, balance, day, duration, campaign, pdays, and previous. Categorical features related to job type (admin, blue-collar, and entrepreneur) also play a crucial role.

**Explanation Quality:** The explanation is well-structured, covering key drivers and providing actionable recommendations. To further enhance its quality, it would benefit from addressing the limitations of relying sol

## Display Results
**Purpose**: Print all key outputs from the multi-agents ML system including metrics, explanations, narratives, and report paths.

In [24]:
print("=== MODEL METRICS ===")
print(json.dumps(artifacts["metrics"], indent=2))
print("\n=== GLOBAL EXPLANATION (Permutation + PDP) ===")
print(artifacts["global_explanation_text"])
print("\n=== NARRATIVE TEXT ===")
print(artifacts["narrative_text"])
print("\n=== LLM-as-Judge RESULT ===")
print(json.dumps(artifacts["judge_result"], indent=2))
print("\nJSON report path:", artifacts["json_report_path"])
print("PDF report path:", artifacts["pdf_report_path"])
print("PDP plots:", artifacts["pdp_paths"])

=== MODEL METRICS ===
{
  "accuracy": 0.8950276243093923,
  "f1": 0.46327683615819215,
  "roc_auc": 0.9093560933448573
}

=== GLOBAL EXPLANATION (Permutation + PDP) ===
MODEL EXPLAINABILITY SUMMARY

Key SHAP drivers:
Top SHAP features: ['num__age', 'num__balance', 'num__day', 'num__duration', 'num__campaign', 'num__pdays', 'num__previous', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur']

Important features (permutation importance):
Top permutation features: ['num__age', 'num__balance', 'num__day', 'num__duration', 'num__campaign', 'num__pdays', 'num__previous', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur']

PDP summary:
PDP plots saved: 51


=== NARRATIVE TEXT ===
This model predicts whether a customer will subscribe to a term deposit. It's quite accurate overall (89.5% accuracy, AUC 0.91) but less precise at identifying *who will subscribe* (F1 score 0.46).

The most important factors are the customer's age, account balance, the day of the mon

## A2A Protocol Wrapper 
**Purpose**: Optional A2A protocol wrapper around the Orchestrator Agent for API endpoint.

In [None]:
try:
   a2a_app = to_a2a(orchestrator_agent, port=8001)
   logger.info(
       "A2A app created for orchestrator_agent on port 8001. "
   )
except Exception as e:
   logger.warning("Could not create A2A app in this environment: %s", e)
print("A2A wrapper code defined.")