In [1]:
# dt_rf_task.py
"""
Decision Trees & Random Forests - end-to-end example
Save as: dt_rf_task.py
Run: python dt_rf_task.py
"""

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Directory that receives every artifact written by this script
# (plots, serialized models, and the text summary).
OUTPUT_DIR = "outputs"
# Created as soon as the module is loaded; the path is relative, so it lands in
# the current working directory. exist_ok=True keeps repeated runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_data(local_path="data/heart.csv"):
    """
    Return ``(df, target_col)`` for the classification task.

    If a file exists at ``local_path`` it is read with ``pd.read_csv``. The
    target column is the first of a few well-known label names that appears
    among the columns; when none of them is present, the last column is used.
    If no file exists, the scikit-learn breast cancer dataset is returned with
    its labels stored in a ``"target"`` column.
    """
    if not os.path.exists(local_path):
        print(f"[WARN] No local CSV at {local_path}. Falling back to sklearn breast_cancer dataset.")
        bunch = load_breast_cancer()
        frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        frame["target"] = bunch.target
        return frame, "target"

    print(f"[INFO] Loading dataset from {local_path}")
    frame = pd.read_csv(local_path)

    # Candidate names are checked in priority order; the first hit wins.
    known_targets = ("target", "Outcome", "HeartDisease", "heart_disease", "has_disease")
    matches = [name for name in known_targets if name in frame.columns]
    # No known name present: fall back to the last column.
    label_col = matches[0] if matches else frame.columns[-1]
    return frame, label_col


def preprocess(df, target_col):
    """
    Split a raw frame into a feature matrix ``X`` and a label series ``y``.

    Duplicate rows and rows whose target is missing are removed first. Any
    feature column of ``object`` or ``category`` dtype is then one-hot encoded
    with the first level dropped. The input frame is not modified.
    """
    # Both calls return new frames, so the caller's DataFrame stays untouched.
    cleaned = df.drop_duplicates().dropna(subset=[target_col])

    labels = cleaned[target_col]
    features = cleaned.drop(columns=[target_col])

    text_like = list(features.select_dtypes(include=["object", "category"]).columns)
    if not text_like:
        return features, labels

    encoded = pd.get_dummies(features, columns=text_like, drop_first=True)
    return encoded, labels


def train_decision_tree(X_train, y_train, X_test, y_test, max_depth=None, random_state=42):
    """
    Fit one decision tree on the training split and score it on the test split.

    Prints a ``[DT]`` line with the depth limit and the test accuracy.

    Returns:
        (fitted classifier, test accuracy, predictions for ``X_test``)
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    tree.fit(X_train, y_train)

    predictions = tree.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print(f"[DT] max_depth={max_depth}  test_acc={test_accuracy:.4f}")
    return tree, test_accuracy, predictions


def analyze_overfitting(X_train, X_test, y_train, y_test, depths=range(1, 16)):
    """
    Fit one decision tree per ``max_depth`` and plot train vs test accuracy.

    Every tree is built with ``random_state=42``. The resulting curve is saved
    as ``dt_overfitting_depths.png`` inside ``OUTPUT_DIR``.

    Args:
        X_train, y_train: data each tree is fitted on.
        X_test, y_test: held-out data each tree is scored on.
        depths: iterable of ``max_depth`` values to try. Any iterable is
            accepted, including one-shot iterators such as generators.

    Returns:
        (depth_values, train_scores, test_scores): three lists of equal length,
        where ``depth_values`` is ``depths`` materialized as a list.
    """
    # Materialize once: the depths drive both the fitting loop and the plot's
    # x-axis, so a one-shot iterable would be exhausted by the loop and leave
    # the x values empty while the score lists are full.
    depth_values = list(depths)

    train_scores = []
    test_scores = []
    for d in depth_values:
        clf = DecisionTreeClassifier(max_depth=d, random_state=42)
        clf.fit(X_train, y_train)
        train_scores.append(clf.score(X_train, y_train))
        test_scores.append(clf.score(X_test, y_test))

    out_path = os.path.join(OUTPUT_DIR, "dt_overfitting_depths.png")
    fig = plt.figure(figsize=(8, 5))
    try:
        plt.plot(depth_values, train_scores, marker='o', label='train_accuracy')
        plt.plot(depth_values, test_scores, marker='o', label='test_accuracy')
        plt.xlabel("max_depth")
        plt.ylabel("Accuracy")
        plt.title("Decision Tree: Train vs Test accuracy by max_depth")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(out_path)
        print(f"[INFO] Saved overfitting plot to {out_path}")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

    return depth_values, train_scores, test_scores


def visualize_tree(clf, feature_names, class_names=None, max_display_depth=3, out_fname="dt_tree.png"):
    """
    Draw the top levels of a fitted tree and save the figure under ``OUTPUT_DIR``.

    ``class_names`` entries are converted to strings before being passed to
    ``plot_tree``; ``None`` is passed through unchanged. Only the first
    ``max_display_depth`` levels are drawn.
    """
    plt.figure(figsize=(20, 10))

    if class_names is None:
        labels = None
    else:
        labels = [str(name) for name in class_names]

    plot_tree(
        clf,
        feature_names=feature_names,
        class_names=labels,
        filled=True,
        max_depth=max_display_depth,
        fontsize=8,
    )
    plt.title("Decision Tree (top levels)")
    plt.tight_layout()

    destination = os.path.join(OUTPUT_DIR, out_fname)
    plt.savefig(destination)
    print(f"[INFO] Saved tree visualization to {destination}")
    plt.close()


def train_random_forest(X_train, y_train, X_test, y_test, n_estimators=100, random_state=42):
    """
    Fit a random forest on the training split and score it on the test split.

    The forest is built with ``n_jobs=-1``. Prints an ``[RF]`` line with the
    number of trees and the test accuracy.

    Returns:
        (fitted forest, test accuracy, predictions for ``X_test``)
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print(f"[RF] n_estimators={n_estimators}  test_acc={test_accuracy:.4f}")
    return forest, test_accuracy, predictions


def show_feature_importances(model, feature_names, top_k=15, prefix="model"):
    """
    Print and plot the largest feature importances of a fitted model.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        feature_names: names indexed in the same order as the importances.
        top_k: maximum number of features to report. Values larger than the
            number of available features are reduced to that number, so the
            heading, plot title and figure height describe what is shown.
        prefix: label used in the printed heading, the plot title and the
            output file name (``{prefix}_feature_importances.png``).

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted from most
        to least important.
    """
    importances = model.feature_importances_
    # Never advertise more features than the model actually has.
    shown = min(top_k, len(importances))
    idx = np.argsort(importances)[::-1][:shown]
    top_feats = [(feature_names[i], importances[i]) for i in idx]

    df_imp = pd.DataFrame(top_feats, columns=["feature", "importance"])
    print(f"\nTop {shown} feature importances for {prefix}:")
    print(df_imp.to_string(index=False))

    out_path = os.path.join(OUTPUT_DIR, f"{prefix}_feature_importances.png")
    fig = plt.figure(figsize=(8, max(3, 0.25 * shown)))
    try:
        # Reverse so the most important feature ends up at the top of the chart.
        plt.barh(df_imp["feature"][::-1], df_imp["importance"][::-1])
        plt.xlabel("Importance")
        plt.title(f"Top {shown} feature importances ({prefix})")
        plt.tight_layout()
        plt.savefig(out_path)
        print(f"[INFO] Saved feature importance plot to {out_path}")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
    return df_imp


def evaluate_and_report(y_test, y_pred, label="Model"):
    """
    Print accuracy, the classification report and the confusion matrix.

    ``label`` prefixes the accuracy line so several models can be told apart
    in the console output. Nothing is returned.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{label} - Accuracy: {accuracy:.4f}")

    print("Classification report:")
    report_text = classification_report(y_test, y_pred)
    print(report_text)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:")
    print(matrix)


def cross_val_estimate(model, X, y, cv=5):
    """
    Score ``model`` with ``cross_val_score`` and print the mean and std.

    Folds are evaluated with ``n_jobs=-1``. The per-fold scores are returned
    unchanged.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
    mean_score = fold_scores.mean()
    spread = fold_scores.std()
    print(f"[CROSS-VAL] mean_acc={mean_score:.4f}  std={spread:.4f}")
    return fold_scores


def main():
    """
    Run the whole workflow end to end.

    Steps, in order: load and preprocess the data, make an 80/20 split, train
    and report an unrestricted decision tree, draw its top levels, sweep
    ``max_depth`` from 1 to 15, cross-validate the best depth, train, report
    and cross-validate a 200-tree random forest, plot feature importances for
    both models, then save both models and a short text summary under
    ``OUTPUT_DIR``.
    """
    df, target_col = load_data(local_path="data/heart.csv")
    print(f"[INFO] Data shape: {df.shape}; target_col: {target_col}")

    X, y = preprocess(df, target_col)
    feature_names = X.columns.tolist()
    class_labels = np.unique(y)

    # Stratify only when there is more than one class to balance.
    stratify_on = y if len(class_labels) > 1 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=stratify_on
    )
    print(f"[INFO] Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # --- Decision tree without a depth limit -------------------------------
    dt_clf, dt_acc, dt_pred = train_decision_tree(X_train, y_train, X_test, y_test, max_depth=None)
    evaluate_and_report(y_test, dt_pred, label="Decision Tree (unrestricted)")
    visualize_tree(
        dt_clf,
        feature_names=feature_names,
        class_names=class_labels,
        max_display_depth=3,
        out_fname="dt_tree_top3.png",
    )

    # --- Depth sweep ---------------------------------------------------------
    depths, train_scores, test_scores = analyze_overfitting(
        X_train, X_test, y_train, y_test, depths=range(1, 16)
    )

    # NOTE(review): the depth is chosen by its test-set accuracy, so that test
    # score is optimistic for the chosen depth; consider selecting on
    # training-only cross-validation instead.
    peak_position = int(np.argmax(test_scores))
    best_depth = int(list(depths)[peak_position])
    print(f"[INFO] Best max_depth by test set within sampled range: {best_depth}")
    cross_val_estimate(DecisionTreeClassifier(max_depth=best_depth, random_state=42), X, y, cv=5)

    # --- Random forest -------------------------------------------------------
    rf_clf, rf_acc, rf_pred = train_random_forest(X_train, y_train, X_test, y_test, n_estimators=200)
    evaluate_and_report(y_test, rf_pred, label="Random Forest")
    cross_val_estimate(RandomForestClassifier(n_estimators=200, random_state=42), X, y, cv=5)

    # --- Feature importances -------------------------------------------------
    n_shown = min(15, len(feature_names))
    show_feature_importances(dt_clf, feature_names, top_k=n_shown, prefix="decision_tree")
    show_feature_importances(rf_clf, feature_names, top_k=n_shown, prefix="random_forest")

    # --- Persist models and summary -----------------------------------------
    joblib.dump(dt_clf, os.path.join(OUTPUT_DIR, "decision_tree_model.joblib"))
    joblib.dump(rf_clf, os.path.join(OUTPUT_DIR, "random_forest_model.joblib"))
    print(f"[INFO] Saved models to {OUTPUT_DIR}")

    summary_path = os.path.join(OUTPUT_DIR, "summary.txt")
    summary_lines = [
        f"Decision Tree test accuracy: {dt_acc:.4f}\n",
        f"Random Forest test accuracy: {rf_acc:.4f}\n",
        f"Best max_depth (from sampled range): {best_depth}\n",
    ]
    with open(summary_path, "w") as f:
        f.writelines(summary_lines)
    print(f"[INFO] Summary saved to {summary_path}")


# Run the workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[WARN] No local CSV at data/heart.csv. Falling back to sklearn breast_cancer dataset.
[INFO] Data shape: (569, 31); target_col: target
[INFO] Train shape: (455, 30), Test shape: (114, 30)
[DT] max_depth=None  test_acc=0.9123

Decision Tree (unrestricted) - Accuracy: 0.9123
Classification report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89        42
           1       0.96      0.90      0.93        72

    accuracy                           0.91       114
   macro avg       0.90      0.92      0.91       114
weighted avg       0.92      0.91      0.91       114

Confusion matrix:
[[39  3]
 [ 7 65]]
[INFO] Saved tree visualization to outputs/dt_tree_top3.png
[INFO] Saved overfitting plot to outputs/dt_overfitting_depths.png
[INFO] Best max_depth by test set within sampled range: 3
[CROSS-VAL] mean_acc=0.9191  std=0.0246
[RF] n_estimators=200  test_acc=0.9561

Random Forest - Accuracy: 0.9561
Classification report:
              prec