In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)

# Load dataset
insurance = pd.read_csv("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv")

# Encode categorical variables
insurance["sex"] = insurance["sex"].map({"male": 0, "female": 1})
insurance["smoker"] = insurance["smoker"].map({"yes": 1, "no": 0})
insurance["region"] = insurance["region"].astype("category").cat.codes

# Split features and targets
X_classification = insurance.drop(columns=["smoker"])
y_classification = insurance["smoker"]

X_regression = insurance.drop(columns=["charges"])
y_regression = insurance["charges"]

# Normalize features for regression
scaler = StandardScaler()
X_regression_scaled = scaler.fit_transform(X_regression)

# Split data
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_regression_scaled, y_regression, test_size=0.2, random_state=42)

# Train classifier
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(Xc_train, yc_train)

# Train regressor
reg = DecisionTreeRegressor(max_depth=5, random_state=42)
reg.fit(Xr_train, yr_train)

# Classification predictions and metrics
yc_pred = clf.predict(Xc_test)
cm = confusion_matrix(yc_test, yc_pred)
acc = accuracy_score(yc_test, yc_pred)
report = classification_report(yc_test, yc_pred, target_names=["non-smoker", "smoker"])
precision = precision_score(yc_test, yc_pred)

# Regression predictions and metrics
yr_pred = reg.predict(Xr_test)
mse = mean_squared_error(yr_test, yr_pred)
mae = mean_absolute_error(yr_test, yr_pred)
r2 = r2_score(yr_test, yr_pred)

# Save figures
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=X_classification.columns, class_names=["non-smoker", "smoker"], filled=True)
plt.title("Decision Tree for Smoker Classification")
plt.tight_layout()
plt.savefig("/mnt/data/classification_tree.png")

plt.figure(figsize=(16, 8))
plot_tree(reg, feature_names=X_classification.columns, filled=True)
plt.title("Regression Tree for Insurance Charges")
plt.tight_layout()
plt.savefig("/mnt/data/regression_tree.png")

# Confusion matrix heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["non-smoker", "smoker"], yticklabels=["non-smoker", "smoker"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - Smoker Classifier")
plt.tight_layout()
plt.savefig("/mnt/data/confusion_matrix.png")

import ace_tools as tools; tools.display_dataframe_to_user(name="Insurance Dataset Head", dataframe=insurance.head())

{
    "classification_accuracy": acc,
    "classification_precision": precision,
    "classification_report": report,
    "regression_mse": mse,
    "regression_mae": mae,
    "regression_r2": r2
}


InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['age', 'sex', 'bmi', 'children', 'region', 'charges'], dtype='object') instead.

<Figure size 1600x800 with 0 Axes>