In [None]:
## Interactive ML playground for the Forest Covertype dataset (scikit-learn "covtype")
# Single-cell Jupyter code using ipywidgets
# Requirements: scikit-learn, ipywidgets, pandas, numpy, matplotlib, seaborn
# If widgets do not render, ensure: pip install ipywidgets && jupyter nbextension enable --py widgetsnbextension

import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
)
import ipywidgets as widgets
from IPython.display import display, clear_output

#########################################################################################################################
####################################### DATASET #########################################################################
#########################################################################################################################
# -----------------------------
# Load dataset once
# -----------------------------
# as_frame=False: work with plain arrays (covtype.data / covtype.target)
# rather than pandas objects.
covtype = fetch_covtype(as_frame=False)
X_all, y_all = covtype.data, covtype.target
# Cast the feature matrix to 32-bit floats.
X_all = X_all.astype(np.float32)

# Keep a 6% sample of the full dataset as the working set (X, y); the rest is
# discarded. The split is stratified on the labels to preserve the class
# distribution, and random_state makes the sample reproducible.
X, _, y, _ = train_test_split(
    X_all, y_all, train_size=0.06, stratify=y_all, random_state=42
)

# -----------------------------
# Widgets
# -----------------------------
# Named class subsets offered in the dropdown. The values are the target
# labels that run_dataset keeps via np.isin.
class_groups = {
    "Subset A": [1, 2, 5],
    "Subset B": [4, 5, 7],
    "Subset C": [3, 6, 7],
}

# Subset selector; changing it re-runs run_dataset (see the observe call).
group_dropdown = widgets.Dropdown(
    options=list(class_groups.keys()),
    value="Subset A",
    description="Subset:",
)

# Fixed point budget for the pairplot. It is only referenced by the
# commented-out visualisation code in run_dataset; the slider that used to
# provide it is kept below for reference.
viz_points = 150 #widgets.IntSlider(
#    value=600, min=200, max=3000, step=100, description="Max points (pairplot)"
#)

# Output area embedded in data_ui.
# NOTE: the module-level name `output` is assigned again in the
# "Output and run controls" section further down.
output = widgets.Output()

# Title plus subset selector shown above the dataset output area.
controls = widgets.VBox([
    widgets.HTML("<h2>Forest Covertype Subset Loading and Visualization</h2>"),
    widgets.HBox([group_dropdown]),
    #widgets.HBox([viz_points, show_pairplot]),
])

# -----------------------------
# Helper: balanced subsample per class
# -----------------------------
def balanced_subsample(Xs, ys, max_total=600, seed=42):
    """Draw a class-balanced random subset of at most ``max_total`` rows.

    The budget is divided evenly between the distinct labels in ``ys``
    (integer division); a class with fewer rows than its share contributes
    all of them. Rows are drawn without replacement and the result is
    shuffled.

    Args:
        Xs: Feature array indexed by row.
        ys: Label array aligned with ``Xs``.
        max_total: Upper bound on the number of returned rows.
        seed: Seed for the NumPy random generator.

    Returns:
        ``(X_subset, y_subset)``; both are empty slices of the inputs when
        nothing could be selected.
    """
    rng = np.random.default_rng(seed)
    labels = np.unique(ys)
    quota = max_total // len(labels) if len(labels) else max_total

    picks = []
    for label in labels:
        members = np.nonzero(ys == label)[0]
        n_take = min(len(members), quota)
        if n_take <= 0:
            continue
        picks.append(rng.choice(members, size=n_take, replace=False))

    if not picks:
        return Xs[:0], ys[:0]

    order = np.concatenate(picks)
    # Shuffle so the classes are interleaved rather than grouped.
    rng.shuffle(order)
    return Xs[order], ys[order]

# -----------------------------
# Main runner
# -----------------------------
def run_dataset(*args, _output=output):
    """Filter the sample to the selected class subset and create the split.

    Serves as the ``group_dropdown`` observer and is also called directly, so
    positional arguments (the change notification, a label, ...) are accepted
    and ignored; the dropdown's current value decides which subset is used.

    Side effects: rebinds the module-level ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` and prints a short summary of the subset. When
    the subset is empty the globals are left untouched and only a message is
    printed.

    ``_output`` is bound when the function is defined. The module-level name
    ``output`` is rebound further down to the model section's Output widget,
    so looking it up at call time would send the summary to the wrong panel
    and leave the area inside ``data_ui`` permanently empty.
    """
    global X_train, X_test, y_train, y_test

    with _output:
        clear_output(wait=True)

        subset_name = group_dropdown.value
        selected = class_groups[subset_name]
        mask = np.isin(y, selected)
        X_sub = X[mask]
        y_sub = y[mask]

        # Split only when there is something to split: train_test_split
        # rejects empty input, which would make the message below unreachable.
        if len(X_sub) > 0:
            if subset_name == "Subset C":
                # Subset C is first cut to a stratified half and then split
                # 50/50 instead of the 80/20 used for the other subsets.
                X_sub, _, y_sub, _ = train_test_split(
                    X_sub, y_sub, train_size=0.5, stratify=y_sub, random_state=42
                )
                test_fraction = 0.5
            else:
                test_fraction = 0.2
            X_train, X_test, y_train, y_test = train_test_split(
                X_sub, y_sub, test_size=test_fraction, stratify=y_sub, random_state=42
            )

        present_classes = np.unique(y_sub)
        print("\n=== Dataset Subset Information ===")
        print(f"Selected subset: {group_dropdown.value}")
        print(f"Requested classes: {selected}")
        print(f"Present classes in dataset: {present_classes.tolist()}")
        if len(X_sub) == 0:
            print("No samples found for the selected class group. Try a different selection.")
            return

        # Summary
        counts = {int(c): int((y_sub == c).sum()) for c in present_classes}
        print(f"Subset size: {len(X_sub)} samples")
        print(f"Class counts: {counts}")

        # Correlation heatmap for first 10 features (across the subset)
        #cols = [f"f{i}" for i in range(10)]
        #cols = [
        #    'Elevation',
        #    'Aspect',
        #    'Slope',
        #    'HDist_Hydro',
        #    'VDist_Hydro',
        #    'HDist_Road',
        #    'Hillsh_9am',
        #    'Hillsh_Noon',
        #    'Hillsh_3pm',
        #    'HDist_FirePts'
        #]
        #df10 = pd.DataFrame(X_sub[:, :10], columns=cols)
        #corr = df10.corr(method="pearson")

        #X_vis, y_vis = balanced_subsample(X_sub, y_sub, max_total=viz_points, seed=42)
        #if len(X_vis) == 0:
        #    print("Not enough samples to create pairplot.")
        #    return
        #df_vis = pd.DataFrame(X_vis[:, :10], columns=cols)
        #df_vis["class"] = y_vis.astype(int)

        # Use corner=True to reduce complexity of the grid
        #g = sns.pairplot(
        #    df_vis, vars=cols, hue="class", corner=True,
        #    plot_kws={"s": 10, "alpha": 0.6},
        #    diag_kind="kde"
        #)
        #g.fig.set_size_inches(12, 12)
        #g.fig.suptitle("Pairplot (first 10 features, colored by class, subsampled)", y=1.02)
        #plt.show()

# Re-run the subset selection and split whenever the dropdown value changes.
group_dropdown.observe(run_dataset, names='value')


# Dataset panel: title and subset selector on top, output area below.
data_ui = widgets.VBox([controls, output])


#########################################################################################################################
####################################### MODEL ###########################################################################
#########################################################################################################################

# -----------------------------
# Utility: build preprocessor
# -----------------------------
def make_preprocessor(choice):
    """Translate a preprocessing label into an unfitted transformer.

    Args:
        choice: One of "None", "Center", "Scale", "Center & Scale" or
            "Normalize (L2)".

    Returns:
        ``None`` for "None", otherwise a new ``StandardScaler`` or
        ``Normalizer`` configured for the requested option.

    Raises:
        ValueError: If ``choice`` is not one of the known labels.
    """
    if choice == "None":
        return None

    # Factories are lazy so only the requested transformer is instantiated.
    recipes = (
        ("Center", lambda: StandardScaler(with_mean=True, with_std=False)),
        ("Scale", lambda: StandardScaler(with_mean=False, with_std=True)),
        ("Center & Scale", lambda: StandardScaler()),
        ("Normalize (L2)", lambda: Normalizer(norm="l2")),
    )
    for label, factory in recipes:
        if choice == label:
            return factory()

    raise ValueError(f"Unknown preprocessing option: {choice}")

# -----------------------------
# Utility: balance training set by random resampling to a common class count
# -----------------------------
def balance_training(Xt, yt, minmax, seed=42):
    """Resample the training set so every class has the same number of rows.

    Args:
        Xt: Feature array indexed by row.
        yt: Label array aligned with ``Xt``.
        minmax: ``"min"`` undersamples every class to the size of the
            smallest one; ``"max"`` oversamples every class to the size of
            the largest one.
        seed: Seed for the NumPy random generator.

    Returns:
        ``(X_balanced, y_balanced)`` in shuffled order. Empty inputs are
        returned as empty slices.

    Raises:
        ValueError: If ``minmax`` is neither ``"min"`` nor ``"max"``.
    """
    # Reject unknown modes up front; otherwise the target size would never be
    # assigned and the failure would surface as an UnboundLocalError.
    if minmax not in ("min", "max"):
        raise ValueError(f"minmax must be 'min' or 'max', got {minmax!r}")

    rng = np.random.default_rng(seed)
    classes, counts = np.unique(yt, return_counts=True)
    if len(classes) == 0:
        # Nothing to balance; min()/max() of an empty array would raise.
        return Xt[:0], yt[:0]

    n = counts.min() if minmax == "min" else counts.max()
    idx_list = []
    for c in classes:
        idx_c = np.where(yt == c)[0]
        # Draw with replacement only when the class cannot supply n distinct
        # rows (the oversampling case).
        chosen = rng.choice(idx_c, size=n, replace=len(idx_c) < n)
        idx_list.append(chosen)
    idx_all = np.concatenate(idx_list)
    # Shuffle so the classes are interleaved rather than grouped.
    rng.shuffle(idx_all)
    return Xt[idx_all], yt[idx_all]

# -----------------------------
# Widgets: Data Preparation
# -----------------------------
# Options must match the labels handled by make_preprocessor.
preproc_dropdown = widgets.Dropdown(
    options=["None", "Center", "Scale", "Center & Scale", "Normalize (L2)"],
    value="Center & Scale",
    description="Preprocess:",
)

# Earlier on/off control, superseded by balance_dropdown below:
#balance_checkbox = widgets.Checkbox(
#    value=False, description="Balance training samples"
#)
# run_experiment maps "Oversample minority class" to balance_training with
# minmax="max" and "Undersample majority class" to minmax="min".
balance_dropdown = widgets.Dropdown(
    options=["None", "Oversample minority class", "Undersample majority class"],
    value="None",
    description="Balancing:"
)


# Content of the "Data Preparation" accordion pane.
data_prep_box = widgets.VBox([preproc_dropdown, balance_dropdown])

# -----------------------------
# Widgets: Feature Selection (columns 0..9 individually, plus two column groups)
# -----------------------------
#feature_checkboxes = [widgets.Checkbox(value=True, description=f"Col {i}") for i in range(10)]
# One checkbox per feature for the first ten columns, labelled with the
# dataset's own feature names. The position in this list is the column index
# used by run_experiment.
feature_checkboxes = [widgets.Checkbox(value=True, description=f"{i}") for i in covtype.feature_names[:10]]
# Group toggles: run_experiment maps these to columns 10-13 and 14-53.
feature_wild = widgets.Checkbox(value=True, description="Wilderness Area")
feature_soil = widgets.Checkbox(value=True, description="Soil Type")
# Arrange in four rows of three for readability; the last row holds the tenth
# feature and the two group toggles.
feature_selection_box = widgets.VBox([
    widgets.HBox(feature_checkboxes[:3]),
    widgets.HBox(feature_checkboxes[3:6]),
    widgets.HBox(feature_checkboxes[6:9]),
    widgets.HBox(feature_checkboxes[9:] + [feature_wild, feature_soil]),
])

# -----------------------------
# Widgets: Model Selection
# -----------------------------
# Labels must match the branches in build_classifier and in the
# update_*_controls callbacks.
model_dropdown = widgets.Dropdown(
    options=["Logistic Regression", "MLP", "Decision Tree", "Random Forest"],
    value="Logistic Regression",
    description="Model:",
)

# Shared look for the hyperparameter controls: a wider description column so
# the longer labels are not cut off, and a fixed overall width.
slider_style_props = {'style': {'description_width': '140px'}, 'layout': widgets.Layout(width='360px')}

# Model-dependent complexity controls
# logreg_degree feeds PolynomialFeatures in run_experiment; mlp_neurons is the
# width used for each hidden layer in build_classifier.
logreg_degree = widgets.IntSlider(value=1, min=1, max=6, step=1, description="Polynomial Degree", **slider_style_props)
mlp_neurons = widgets.IntSlider(value=64, min=8, max=256, step=8, description="# Neurons", **slider_style_props)
dt_max_depth = widgets.IntSlider(value=20, min=2, max=100, step=2, description="Max depth", **slider_style_props)
rf_n_estimators = widgets.IntSlider(value=100, min=10, max=300, step=10, description="# Trees", **slider_style_props)
rf_max_depth = widgets.IntSlider(value=20, min=2, max=50, step=1, description="Max depth", **slider_style_props)
# Placeholder whose children are swapped by update_complexity_controls.
complexity_box = widgets.VBox([])

def update_complexity_controls(*args):
    """Show the complexity slider(s) that belong to the selected model.

    Used as a ``model_dropdown`` observer and called once directly; any
    arguments are ignored. An unrecognised model leaves the box unchanged.
    """
    sliders_by_model = {
        "Logistic Regression": [logreg_degree],
        "MLP": [mlp_neurons],
        "Decision Tree": [dt_max_depth],
        "Random Forest": [rf_n_estimators, rf_max_depth],
    }
    sliders = sliders_by_model.get(model_dropdown.value)
    if sliders is not None:
        complexity_box.children = sliders

# Keep the complexity controls in sync with the model choice, and populate
# them once for the initial selection.
model_dropdown.observe(update_complexity_controls, names="value")
update_complexity_controls()

# Model dropdown plus its complexity controls. In this file it is only
# referenced by a commented-out layout further down.
model_selection_box = widgets.VBox([model_dropdown, complexity_box])

# -----------------------------
# Widgets: Model Training & Hyperparameter Tuning (regularization)
# -----------------------------
# Logistic Regression regularization
# (a slider for the regularization strength C is disabled)
#lr_C = widgets.FloatLogSlider(
#    value=1.0, base=10, min=-2, max=2, step=0.1, description="LR C (1/λ)"
#)
# The string "None" is translated to penalty=None in build_classifier.
lr_penalty = widgets.Dropdown(
    #options=["None", 'l2', 'l1', 'elasticnet'],
    options=["None", 'l2'],
    value="None",
    description="Penalty:",
    **slider_style_props
)

# MLP regularization
# Log-scale slider: min/max are base-10 exponents, i.e. alpha in [1e-8, 1].
mlp_alpha = widgets.FloatLogSlider(
    value=1e-2, base=10, min=-8, max=0, step=0.1, description="MLP α (L2)", **slider_style_props
)

# Decision Tree regularization (cost-complexity pruning alpha)
dt_ccp_alpha = widgets.FloatSlider(
    value=0.0, min=0.0, max=0.02, step=0.001, description="Node Pruning (α)", readout_format='.3f', **slider_style_props
)

# Random Forest regularization-ish
rf_min_samples_leaf = widgets.IntSlider(
    value=1, min=1, max=20, step=1, description="min_samples_leaf", **slider_style_props
)

# Placeholder whose children are swapped by update_tuning_controls.
tuning_box = widgets.VBox([])

def update_tuning_controls(*args):
    """Show the regularization control that belongs to the selected model.

    Used as a ``model_dropdown`` observer and called once directly; any
    arguments are ignored. An unrecognised model leaves the box unchanged.
    """
    control_by_model = {
        "Logistic Regression": [lr_penalty],
        "MLP": [mlp_alpha],
        "Decision Tree": [dt_ccp_alpha],
        "Random Forest": [rf_min_samples_leaf],
    }
    control = control_by_model.get(model_dropdown.value)
    if control is not None:
        tuning_box.children = control

# Keep the regularization control in sync with the model choice, and populate
# it once for the initial selection.
model_dropdown.observe(update_tuning_controls, names="value")
update_tuning_controls()

# -----------------------------
# Containers with block titles
# -----------------------------
# Titled variants of the first two panes. The accordion below uses the
# untitled boxes directly (its pane titles serve as headings), so these two
# are not referenced by it.
data_prep_section = widgets.VBox([widgets.HTML("<h3>Data Preparation</h3>"), data_prep_box])
feature_selection_section = widgets.VBox([widgets.HTML("<h3>Feature Selection</h3>"), feature_selection_box])
# Model choice plus both model-dependent control groups in a single pane.
model_full_box = widgets.VBox([
    widgets.HTML("<h4>Model</h4>"),
    model_dropdown,
    widgets.HTML("<h4>Tuning / Regularization</h4>"),
    complexity_box,
    tuning_box
])

# Collapsible layout: one pane per configuration step.
accordion = widgets.Accordion(children=[
    data_prep_box, feature_selection_box, model_full_box
])
accordion.set_title(0, "Data Preparation")
accordion.set_title(1, "Feature Selection")
accordion.set_title(2, "Model Selection, Parameters & Regularization")
# Alternative flat (non-collapsible) layout, currently disabled:
#accordion = widgets.VBox([
#    data_prep_box, feature_selection_box, model_selection_box, tuning_box
#])

# -----------------------------
# Output and run controls
# -----------------------------
run_button = widgets.Button(description="Train", button_style="success")
# Output area for training results; it is the one placed in `ui`.
# NOTE: this rebinds the module-level name `output`. An earlier Output widget
# was assigned to the same name in the dataset section, and that earlier
# widget is the one embedded in `data_ui`.
output = widgets.Output()

# -----------------------------
# Build model pipeline based on UI
# -----------------------------
def build_classifier():
    """Create an unfitted estimator from the current widget values.

    Returns:
        A ``LogisticRegression``, ``MLPClassifier``,
        ``DecisionTreeClassifier`` or ``RandomForestClassifier`` matching
        ``model_dropdown``, each seeded with ``random_state=42``.

    Raises:
        ValueError: If the dropdown holds an unrecognised model name.
    """
    model_name = model_dropdown.value

    if model_name == "Logistic Regression":
        penalty_choice = lr_penalty.value
        # The dropdown uses the string "None" for "no penalty".
        return LogisticRegression(
            penalty=None if penalty_choice == 'None' else penalty_choice,
            solver="lbfgs",
            max_iter=100,
            random_state=42,
        )

    if model_name == "MLP":
        width = mlp_neurons.value
        # Three hidden layers, all of the same width.
        return MLPClassifier(
            hidden_layer_sizes=(width, width, width),
            activation="relu",
            solver="adam",
            alpha=mlp_alpha.value,
            max_iter=100,
            early_stopping=False,
            random_state=42,
        )

    if model_name == "Decision Tree":
        return DecisionTreeClassifier(
            max_depth=dt_max_depth.value,
            ccp_alpha=dt_ccp_alpha.value,
            random_state=42,
        )

    if model_name == "Random Forest":
        return RandomForestClassifier(
            n_estimators=rf_n_estimators.value,
            max_depth=rf_max_depth.value,
            min_samples_leaf=rf_min_samples_leaf.value,
            n_jobs=-1,
            random_state=42,
        )

    raise ValueError("Unknown model")

# -----------------------------
# Run experiment on button click
# -----------------------------
def run_experiment(*args):
    """Train the configured pipeline on the current split and report results.

    Click handler for ``run_button``; any arguments are ignored. Reads the
    module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test`` produced
    by ``run_dataset`` together with the configuration widgets, and renders
    the configuration, train/test metrics, timings and a row-normalised
    confusion matrix into ``output``.
    """
    with output:
        clear_output(wait=True)

        # Feature selection: the position of each checkbox is its column
        # index (0..9); the two group toggles add whole column ranges.
        selected_cols = [i for i, cb in enumerate(feature_checkboxes) if cb.value]
        if feature_wild.value:
            selected_cols += list(range(10, 14))
        if feature_soil.value:
            selected_cols += list(range(14, 54))

        if not selected_cols:
            print("Please select at least one feature.")
            return

        # Slice features
        Xtr = X_train[:, selected_cols]
        Xte = X_test[:, selected_cols]

        # Balance the training set if requested; the test set is never resampled.
        if balance_dropdown.value == "Oversample minority class":
            Xtr_bal, ytr_bal = balance_training(Xtr, y_train, minmax="max", seed=42)
        elif balance_dropdown.value == "Undersample majority class":
            Xtr_bal, ytr_bal = balance_training(Xtr, y_train, minmax="min", seed=42)
        else:
            Xtr_bal, ytr_bal = Xtr, y_train

        # Build pipeline: optional polynomial expansion (logistic regression
        # only), then the optional preprocessor, then the classifier.
        preproc = make_preprocessor(preproc_dropdown.value)
        clf = build_classifier()
        steps = []
        if model_dropdown.value == "Logistic Regression":
            steps.append(("poly", PolynomialFeatures(degree=logreg_degree.value, include_bias=False)))
        if preproc is not None:
            steps.append(("preprocess", preproc))
        steps.append(("clf", clf))
        pipe = Pipeline(steps)

        # Train
        t0 = time.perf_counter()
        pipe.fit(Xtr_bal, ytr_bal)
        t_train = time.perf_counter() - t0

        # Predict; only the test-set prediction is timed.
        ytr_pred = pipe.predict(Xtr_bal)
        t1 = time.perf_counter()
        y_pred = pipe.predict(Xte)
        t_pred = time.perf_counter() - t1

        # Metrics (train accuracy is measured on the possibly resampled set)
        tr_acc = accuracy_score(ytr_bal, ytr_pred)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
        rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
        f1m = f1_score(y_test, y_pred, average="macro", zero_division=0)

        # Report
        print("\n=== Configuration ===")
        active_feat_names = [cb.description for cb in feature_checkboxes if cb.value]
        if feature_wild.value:
            active_feat_names.append("Wilderness Area")
        if feature_soil.value:
            active_feat_names.append("Soil Type")
        print(f"Selected features: {active_feat_names} (count={len(selected_cols)})")
        print(f"Preprocessing: {preproc_dropdown.value}")
        print(f"Balanced training: {balance_dropdown.value}")
        print(f"Model: {model_dropdown.value}")
        if model_dropdown.value == "MLP":
            print(f" - Neurons: {mlp_neurons.value}, alpha={mlp_alpha.value:g}")
        elif model_dropdown.value == "Decision Tree":
            print(f" - Max depth: {dt_max_depth.value}, ccp_alpha={dt_ccp_alpha.value:g}")
        elif model_dropdown.value == "Random Forest":
            print(f" - Trees: {rf_n_estimators.value}, Max depth: {rf_max_depth.value}, min_samples_leaf={rf_min_samples_leaf.value}")
        elif model_dropdown.value == "Logistic Regression":
            # Report the complexity setting too, as the other models do.
            print(f" - Polynomial degree: {logreg_degree.value}, Penalty: {lr_penalty.value}")

        print("\n=== Performance (Train set) ===")
        print(f"Train Accuracy: {tr_acc:.4f}")

        print("\n=== Performance (Test set) ===")
        print(f"Accuracy: {acc:.4f}")
        print(f"Precision (macro): {prec:.4f}")
        print(f"Recall (macro): {rec:.4f}")
        print(f"F1-score (macro): {f1m:.4f}")
        print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred):.4f}")
        print(f"\nTraining time: {t_train:.3f} s")
        print(f"Inference time (predict): {t_pred:.3f} s")

        # normalize='true' scales each row (true class) to sum to 1, so the
        # diagonal shows per-class recall.
        print("\nConfusion matrix:")
        fig, ax = plt.subplots(figsize=(5, 5), dpi=120)
        ConfusionMatrixDisplay.from_predictions(
            y_test, y_pred, cmap="Blues", normalize='true', values_format='.3f', colorbar=True, ax=ax
        )
        ax.set_title("Confusion Matrix")
        plt.tight_layout()
        plt.show()

# Train the configured model when the "Train" button is clicked.
run_button.on_click(run_experiment)

# -----------------------------
# Defaults for reset
# -----------------------------
# Values restored by reset_to_defaults; they mirror the initial `value=` of
# the corresponding widgets defined above.
DEFAULTS = {
    "preproc": "Center & Scale",
    "balancing": "None",

    "feature_cols": [True] * 10,  # first 10 features checked
    "feature_wild": True,
    "feature_soil": True,

    "model": "Logistic Regression",
    "logreg_degree": 1,
    "lr_penalty": "None",

    "mlp_neurons": 64,
    "mlp_alpha": 1e-2,

    "dt_max_depth": 20,
    "dt_ccp_alpha": 0.0,

    "rf_n_estimators": 100,
    "rf_max_depth": 20,
    "rf_min_samples_leaf": 1,
}

reset_button = widgets.Button(
    description="Reset to defaults",
    button_style="warning",
    icon="refresh"
)

def reset_to_defaults(_):
    """Restore every configuration widget to its entry in ``DEFAULTS``.

    Click handler for ``reset_button``; the button argument is ignored.
    """
    # Data preparation
    for widget, key in ((preproc_dropdown, "preproc"), (balance_dropdown, "balancing")):
        widget.value = DEFAULTS[key]

    # Feature selection
    for checkbox, checked in zip(feature_checkboxes, DEFAULTS["feature_cols"]):
        checkbox.value = checked
    feature_wild.value = DEFAULTS["feature_wild"]
    feature_soil.value = DEFAULTS["feature_soil"]

    # Model selection first, then the model-specific hyperparameters.
    remaining = (
        (model_dropdown, "model"),
        (logreg_degree, "logreg_degree"),
        (lr_penalty, "lr_penalty"),
        (mlp_neurons, "mlp_neurons"),
        (mlp_alpha, "mlp_alpha"),
        (dt_max_depth, "dt_max_depth"),
        (dt_ccp_alpha, "dt_ccp_alpha"),
        (rf_n_estimators, "rf_n_estimators"),
        (rf_max_depth, "rf_max_depth"),
        (rf_min_samples_leaf, "rf_min_samples_leaf"),
    )
    for widget, key in remaining:
        widget.value = DEFAULTS[key]

# Restore the default configuration when "Reset to defaults" is clicked.
reset_button.on_click(reset_to_defaults)

# -----------------------------
# Display UI
# -----------------------------
# Model playground panel: configuration accordion, action buttons, results.
ui = widgets.VBox([
    widgets.HTML("<h2>Forest Covertype ML Playground</h2>"),
    accordion,
    widgets.HBox([run_button, reset_button]),
    output
])

# Dataset panel on top, model playground below.
display(widgets.VBox([data_ui, ui]))

# Load the initial subset so X_train/X_test/y_train/y_test exist before the
# first "Train" click. The argument is ignored by run_dataset, which reads
# the dropdown's current value.
run_dataset("Subset A")