In [1]:
from sklearn.datasets import make_regression, load_diabetes, fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from spacetree import QuantileNormDecisionTree
from sklearn.preprocessing import StandardScaler

In [19]:
def interpret_tree(tree, ref_points, feature_names=None, depth=0):
    """
    Recursively interprets the decision tree and prints it in a human-readable format.
    
    Parameters:
        tree (dict): The decision tree structure to interpret.
        ref_points (ndarray): Array of reference points used by the tree.
        feature_names (list or None): Optional custom feature names for interpretation.
        depth (int): Current depth of the tree for indentation.
    """
    indent = "  " * depth  # Indentation for better visualization
    
    if tree['is_leaf']:
        # Print details for a leaf node
        print(f"{indent}Leaf: Predict value = {tree['value']:.4f}, "
              f"Samples = {tree['n_samples']}, Impurity = {tree['impurity']:.4f}")
        return
    
    # Generate feature description
    if feature_names is None:
        feature_description = f"distance from reference point {tree['ref_idx']}"
    else:
        feature_description = feature_names[tree['ref_idx']]
    
    # Print the current node's condition
    print(f"{indent}Node: If {feature_description} <= {tree['threshold']:.4f}:")
    
    # Print the left subtree
    print(f"{indent}  Left subtree:")
    interpret_tree(tree['left'], ref_points, feature_names, depth + 2)
    
    # Print the right subtree
    print(f"{indent}  Else (distance > {tree['threshold']:.4f}):")
    print(f"{indent}  Right subtree:")
    interpret_tree(tree['right'], ref_points, feature_names, depth + 2)


In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

# Load dataset
diamonds = sns.load_dataset('diamonds')

# Features and target
X = diamonds.drop('price', axis=1)
y = diamonds['price']

# Categorical columns to encode
categorical_columns = ['cut', 'color', 'clarity']

# One-hot encode categorical variables
encoder = OneHotEncoder()
encoded_categories = encoder.fit_transform(X[categorical_columns]).toarray()  # Convert to dense array

# Create a DataFrame with encoded categorical features
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_columns))

# Combine encoded features with numerical features
numerical_features = X.drop(categorical_columns, axis=1).reset_index(drop=True)
X = pd.concat([numerical_features, encoded_df], axis=1)



In [4]:
X, y = load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
tree = QuantileNormDecisionTree(max_depth=3, n_quantiles=500, min_samples_split=2, n_ref_points=200, impurity_method="se")
tree.fit(X_train, y_train)

# Tahmin yap
predictions = tree.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"MSE: {mse:.2f}")

predictions = tree.predict(X_train)
mse = mean_squared_error(y_train, predictions)
print(f"MSE: {mse:.2f}")

MSE: 3464.74
MSE: 2554.94


In [21]:
interpret_tree(tree.tree, tree.ref_points)

Node: If distance from reference point 75 <= 6.5787:
  Left subtree:
    Node: If distance from reference point 170 <= 5.7590:
      Left subtree:
        Node: If distance from reference point 75 <= 5.9359:
          Left subtree:
            Leaf: Predict value = 204.1765, Samples = 34, Impurity = 81088.9412
          Else (distance > 5.9359):
          Right subtree:
            Leaf: Predict value = 157.9070, Samples = 43, Impurity = 120089.6279
      Else (distance > 5.7590):
      Right subtree:
        Node: If distance from reference point 2 <= 4.3650:
          Left subtree:
            Leaf: Predict value = 211.0345, Samples = 29, Impurity = 87196.9655
          Else (distance > 4.3650):
          Right subtree:
            Leaf: Predict value = 271.8276, Samples = 29, Impurity = 32202.1379
  Else (distance > 6.5787):
  Right subtree:
    Node: If distance from reference point 104 <= 5.8267:
      Left subtree:
        Node: If distance from reference point 80 <= 8.1158:
    

In [18]:
# Modeli oluştur ve eğit
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X_train, y_train)

# Tahmin yap
predictions = tree.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"MSE: {mse:.2f}")

predictions = tree.predict(X_train)
mse = mean_squared_error(y_train, predictions)
print(f"MSE: {mse:.2f}")

MSE: 4440.95
MSE: 3233.95
