## Plot Node

In [3]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tree_algorithms
import numpy as np
import helper_functions

import importlib
importlib.reload(tree_algorithms)
importlib.reload(helper_functions)

df, X, y = helper_functions.load_auto_data_set()

NodePlot = helper_functions.NodePlot(
                                    X_parent = X, 
                                    y_parent = y, 
                                    threshold = 100, 
                                    selected_feature = "horsepower"
                                    )

NodePlot.plot_split()

In [7]:
import helper_functions
import importlib
importlib.reload(helper_functions)

selected_feature = "horsepower"

list_of_mse_childs = []
list_of_mse_parent = []
thresholds = X.sort_values(by=["horsepower"])["horsepower"].unique()

for threshold in thresholds:

    NodePlot = helper_functions.NodePlot(
                                    X_parent = X, 
                                    y_parent = y, 
                                    threshold = threshold, 
                                    selected_feature = "horsepower"
                                    )

    list_of_mse_childs.append(NodePlot.child_mse)
    list_of_mse_parent.append(NodePlot.parent_mse)

fig = helper_functions.plot_threshold_evaluation(
                            thresholds = thresholds, 
                            mse_parent_list = list_of_mse_parent, 
                            mse_list = list_of_mse_childs, 
                            threshold = 100
                        )

## Practical decision tree building using scikit-learn

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, X, y, cv=10)

### Load and perpare data set

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def load_auto_data_set():

    # Load the automobile data set from UCI.edu 
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
    df = pd.read_csv(url, header=None)

    # Name columns 
    df.columns = [
                    'symboling', 'normalized_losses', 'make', 
                    'fuel_type', 'aspiration', 'num_doors', 
                    'body_style', 'drive_wheels', 'engine_location',
                    'wheel_base','length','width','height',
                    'curb_weight','engine_type','num_cylinders',
                    'engine_size','fuel_system','bore','stroke',
                    'compression_ratio','horsepower','peak_rpm',
                    'city_mpg','highway_mpg','price'
                ]

    # Filter for lines where power and price are available
    df = df[(df.horsepower != '?')]
    df = df[(df.price != '?')]
    #df = df.reset_index()
    
    # Filter for lines where power and price are available
    df['horsepower'] = df['horsepower'].astype(int)
    df['price'] = df['price'].astype(int)

    # Define the last column of the data frame as y and the rest as X
    y = df.iloc[:, -1]
    X = df.iloc[:, :-1]

    return df, X, y

df, X, y = load_auto_data_set()

from sklearn.preprocessing import OneHotEncoder

X_selected = X[["wheel_base", "length", "width", "height"]]

# define and fit the OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(df[['make']])

# transform the data
make_one_hot_sklearn = pd.DataFrame(ohe.transform(df[["make"]]).toarray(), columns=ohe.categories_[0])

X_selected = X_selected.join(make_one_hot_sklearn)
X = np.array(X)
y = np.array(y)

### Build a model using the decision tree implemented from scratch

In [8]:
import tree_algorithms
from sklearn.metrics import mean_squared_error, mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

regr = tree_algorithms.RegressionTree(min_samples_split=5, max_depth=20)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# calculate mean squared error
print(f"Mean Squared Error: {round(mean_squared_error(y_test, y_pred), 1)}")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean Squared Error: 25392158.7


### Build a model using the scikit-learn decision tree library

In [9]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

regr = DecisionTreeRegressor(min_samples_split=5, max_depth=20)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# calculate mean squared error
print(f"Mean Squared Error: {round(mean_squared_error(y_test, y_pred), 1)}")

Mean Squared Error: 25392158.7
