In [None]:
# =============================================================================
# Test: Compare functions.py approach vs notebook approach
# Purpose: Debug why Classification visualizers don't render correctly
# =============================================================================

import duckdb
import numpy as np
import os
import io
import base64
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import matplotlib
matplotlib.use('Agg')  # Same as functions.py
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Lift pandas' display truncation: None removes the column and row caps.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [None]:
# =============================================================================
# Configuration (same as test0012)
# =============================================================================
# MinIO connection settings. Each value can be overridden with an environment
# variable of the same name so real credentials never need to be committed with
# the notebook; the fallbacks are the values this notebook has always used.
MINIO_HOST = os.environ.get("MINIO_HOST", "localhost")
MINIO_PORT = os.environ.get("MINIO_PORT", "9000")
MINIO_ENDPOINT = f"{MINIO_HOST}:{MINIO_PORT}"
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")
PROJECT_NAME = "Transaction Fraud Detection"

# Location of the Delta table that the load cell scans.
DELTA_PATH = "s3://lakehouse/delta/transaction_fraud_detection"

# Feature columns, numerical first and categorical second.
TFD_NUMERICAL_FEATURES = ["amount", "account_age_days", "cvv_provided", "billing_address_match"]
TFD_CATEGORICAL_FEATURES = [
    "currency", "merchant_id", "payment_method", "product_category",
    "transaction_type", "browser", "os",
    "year", "month", "day", "hour", "minute", "second",
]
TFD_ALL_FEATURES = TFD_NUMERICAL_FEATURES + TFD_CATEGORICAL_FEATURES
# Positions of the categorical columns inside TFD_ALL_FEATURES. They are only
# meaningful for a feature frame whose columns follow that same order.
TFD_CAT_FEATURE_INDICES = list(range(len(TFD_NUMERICAL_FEATURES), len(TFD_ALL_FEATURES)))

print(f"Project: {PROJECT_NAME}")

In [None]:
# =============================================================================
# DuckDB Connection Setup
# =============================================================================
# Set before any S3 access happens.
# NOTE(review): presumably this stops the AWS client from probing the EC2
# metadata endpoint when running outside AWS - confirm.
os.environ["AWS_EC2_METADATA_DISABLED"] = "true"

# In-memory DuckDB session with the Delta and HTTP/S3 extensions loaded.
conn = duckdb.connect()
for extension in ("delta", "httpfs"):
    conn.execute(f"INSTALL {extension}; LOAD {extension};")

# Register the MinIO endpoint and credentials as an S3 secret on this connection
# (path-style URLs, SSL off).
create_secret_sql = f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_ACCESS_KEY}',
        SECRET '{MINIO_SECRET_KEY}',
        REGION 'us-east-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        URL_STYLE 'path',
        USE_SSL false
    );
"""
conn.execute(create_secret_sql)
print("DuckDB connection ready")

In [None]:
# =============================================================================
# Load Data (simplified - 10K rows for fast testing)
# =============================================================================
# One SQL statement that reads the Delta table at DELTA_PATH and shapes it into
# the model's feature columns plus the is_fraud label:
#   - amount and account_age_days are passed through unchanged;
#   - cvv_provided and billing_address_match are cast to INTEGER;
#   - currency, merchant_id, payment_method, product_category, transaction_type
#     and the browser / os fields extracted from the device_info JSON are each
#     replaced by DENSE_RANK() - 1, i.e. a zero-based integer code ordered by
#     the column's value;
#   - timestamp is split into integer year / month / day / hour / minute / second.
# The column order matches TFD_ALL_FEATURES.
# NOTE(review): the DENSE_RANK windows have no PARTITION BY and LIMIT is applied
# to the final result, so the codes presumably reflect every row of the table
# rather than only the 10,000 returned - confirm this is intended.
# NOTE(review): there is no ORDER BY before LIMIT, so which 10,000 rows come
# back is presumably not guaranteed to be stable between runs - confirm if
# reproducibility of the sample matters.
query = f"""
SELECT
    amount, account_age_days,
    CAST(cvv_provided AS INTEGER) AS cvv_provided,
    CAST(billing_address_match AS INTEGER) AS billing_address_match,
    DENSE_RANK() OVER (ORDER BY currency) - 1 AS currency,
    DENSE_RANK() OVER (ORDER BY merchant_id) - 1 AS merchant_id,
    DENSE_RANK() OVER (ORDER BY payment_method) - 1 AS payment_method,
    DENSE_RANK() OVER (ORDER BY product_category) - 1 AS product_category,
    DENSE_RANK() OVER (ORDER BY transaction_type) - 1 AS transaction_type,
    DENSE_RANK() OVER (ORDER BY json_extract_string(device_info, '$.browser')) - 1 AS browser,
    DENSE_RANK() OVER (ORDER BY json_extract_string(device_info, '$.os')) - 1 AS os,
    CAST(date_part('year', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS year,
    CAST(date_part('month', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS month,
    CAST(date_part('day', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS day,
    CAST(date_part('hour', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS hour,
    CAST(date_part('minute', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS minute,
    CAST(date_part('second', CAST(timestamp AS TIMESTAMP)) AS INTEGER) AS second,
    is_fraud
FROM delta_scan('{DELTA_PATH}')
LIMIT 10000
"""
# Run the query on the DuckDB connection and materialise it as a pandas DataFrame.
df = conn.execute(query).df()
print(f"Loaded {len(df)} rows")

# Separate the label column from the feature columns.
y = df["is_fraud"]
X = df.drop(columns=["is_fraud"])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display labels handed to the visualizers.
class_names = ["Non-Fraud", "Fraud"]
print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

In [None]:
# =============================================================================
# Train Model (fast - 50 iterations)
# =============================================================================
# Deliberately small model so the notebook trains quickly.
catboost_params = {
    "iterations": 50,
    "learning_rate": 0.1,
    "depth": 4,
    "auto_class_weights": 'Balanced',
    # Column positions of the categorical features in the training frame.
    "cat_features": TFD_CAT_FEATURE_INDICES,
    "verbose": False,
    "random_seed": 42,
}
model = CatBoostClassifier(**catboost_params)
model.fit(X_train, y_train)
print("Model trained")

In [None]:
# =============================================================================
# CatBoostWrapper - EXACT same as functions.py
# =============================================================================
from sklearn.base import BaseEstimator, ClassifierMixin

class CatBoostWrapper(BaseEstimator, ClassifierMixin):
    """Scikit-learn style facade over an already-trained CatBoost model.

    Copies ``classes_`` and the feature importances off the wrapped model at
    construction time and forwards ``predict`` / ``predict_proba`` to it, so the
    object can be passed to YellowBrick visualizers.
    """
    _estimator_type = 'classifier'

    def __init__(self, model):
        """Store ``model`` and mirror its class labels and feature importances."""
        self.model = model
        self.classes_ = np.array(model.classes_)
        try:
            importances = model.get_feature_importance()
            if importances is not None:
                importances = np.array(importances)
            self.feature_importances_ = importances
        except Exception:
            # Any failure above falls back to the model's own attribute.
            self.feature_importances_ = model.feature_importances_

    def fit(self, X, y):
        """No-op: the wrapped model is already trained. Returns ``self``."""
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` as a flat array."""
        raw_predictions = self.model.predict(X)
        return raw_predictions.flatten()

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X`` unchanged."""
        probabilities = self.model.predict_proba(X)
        return probabilities

# Wrap the trained CatBoost model once; the ClassificationReport visualizers in
# the test cells are all constructed around this instance.
wrapped_model = CatBoostWrapper(model)
print("Wrapper created")

In [None]:
# =============================================================================
# TEST 1: Original notebook approach (test0012 - WORKS)
# =============================================================================
from yellowbrick.classifier import ClassificationReport

print("TEST 1: Notebook approach (file output)")
print("=" * 50)

test1_png = "test1_notebook.png"

# Report configuration: labelled classes, support column shown, and the wrapped
# model treated as already fitted.
report_options = {
    "classes": class_names,
    "support": True,
    "is_fitted": True,
    "force_model": True,
}
visualizer = ClassificationReport(wrapped_model, **report_options)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Write the figure straight to a PNG on disk, then release it.
visualizer.fig.savefig(test1_png, dpi=150, bbox_inches='tight')
plt.close(visualizer.fig)
plt.clf()

print(f"Saved: {test1_png}")
display(Image(filename=test1_png))

In [None]:
# =============================================================================
# TEST 2: functions.py approach (BytesIO with dpi=150, bbox_inches='tight')
# =============================================================================
print("TEST 2: functions.py approach (BytesIO + dpi + bbox_inches)")
print("="*50)

# Fresh ClassificationReport around the pre-fitted wrapper, built with the same
# arguments as TEST 1 so that only the image-capture path differs.
visualizer = ClassificationReport(
    wrapped_model,
    classes=class_names,
    support=True,
    is_fitted=True,
    force_model=True,
)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)

# EXACT code from functions.py generate_yellowbrick_image()
# Kept verbatim on purpose: this sequence is what the test is checking, so do
# not restyle it. It renders with show(), saves the figure into an in-memory
# buffer as PNG (dpi=150, tight bounding box) and base64-encodes the bytes.
visualizer.show()
buf = io.BytesIO()
visualizer.fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)  # rewind so read() returns the PNG from its first byte
image_base64 = base64.b64encode(buf.read()).decode('utf-8')
buf.close()
plt.close(visualizer.fig)
# NOTE(review): plt.clf() acts on pyplot's current figure, which is no longer
# visualizer.fig after the close above - presumably pyplot creates a new empty
# figure here. Confirm whether functions.py actually needs this call.
plt.clf()

# Decode the base64 text back to bytes and write it to disk so the result can be
# displayed and compared with the other tests' PNG files.
image_bytes = base64.b64decode(image_base64)
with open("test2_functions.png", "wb") as f:
    f.write(image_bytes)
print("Saved: test2_functions.png")
display(Image(filename="test2_functions.png"))

In [None]:
# =============================================================================
# TEST 3: BytesIO WITHOUT dpi and bbox_inches (user's suggestion)
# =============================================================================
print("TEST 3: BytesIO WITHOUT dpi/bbox_inches")
print("=" * 50)

test3_png = "test3_no_dpi.png"

# Same report configuration as the other test cells.
report_options = {
    "classes": class_names,
    "support": True,
    "is_fitted": True,
    "force_model": True,
}
visualizer = ClassificationReport(wrapped_model, **report_options)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)

# Same capture sequence as TEST 2; the only difference is that savefig receives
# neither dpi nor bbox_inches.
visualizer.show()
buf = io.BytesIO()
visualizer.fig.savefig(buf, format='png')
buf.seek(0)
png_payload = buf.read()
image_base64 = base64.b64encode(png_payload).decode('utf-8')
buf.close()
plt.close(visualizer.fig)
plt.clf()

# Round-trip through base64, then write the PNG to disk and show it.
image_bytes = base64.b64decode(image_base64)
with open(test3_png, "wb") as png_file:
    png_file.write(image_bytes)
print(f"Saved: {test3_png}")
display(Image(filename=test3_png))

In [None]:
# =============================================================================
# TEST 4: Use finalize() instead of show()
# =============================================================================
print("TEST 4: finalize() instead of show()")
print("=" * 50)

test4_png = "test4_finalize.png"

# Same report configuration as the other test cells.
report_options = {
    "classes": class_names,
    "support": True,
    "is_fitted": True,
    "force_model": True,
}
visualizer = ClassificationReport(wrapped_model, **report_options)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)

# Variable under test: finalize() is called where the other cells call show().
# The figure is then captured in memory without dpi / bbox_inches.
visualizer.finalize()
buf = io.BytesIO()
visualizer.fig.savefig(buf, format='png')
buf.seek(0)
png_payload = buf.read()
image_base64 = base64.b64encode(png_payload).decode('utf-8')
buf.close()
plt.close(visualizer.fig)
plt.clf()

# Round-trip through base64, then write the PNG to disk and show it.
image_bytes = base64.b64decode(image_base64)
with open(test4_png, "wb") as png_file:
    png_file.write(image_bytes)
print(f"Saved: {test4_png}")
display(Image(filename=test4_png))

In [None]:
# =============================================================================
# COMPARISON: Print the size of each test's output image
# =============================================================================
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here.
RESULT_IMAGES = [
    "test1_notebook.png",
    "test2_functions.png",
    "test3_no_dpi.png",
    "test4_finalize.png",
]

print("\nFile sizes:")
for image_path in RESULT_IMAGES:
    if os.path.exists(image_path):
        size = os.path.getsize(image_path)
        print(f"  {image_path}: {size:,} bytes")
    else:
        # Report the gap instead of skipping silently, so a test cell that
        # failed or was never run is visible in the comparison.
        print(f"  {image_path}: MISSING (test cell not run or failed)")