# 10 - Classification and Regression Trees(CART)

**Classification and Regression Trees or CART for short is a term introduced by Leo Breiman to refer to Decision Tree algorithms that can be used for classification or regression predictive modeling problems.**

# Get Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("Hitters.csv")
data = df.copy()
# Clear null values
data = data.dropna()
print(data.isnull().values.any())
data.head()

False


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


# Create Dummy Variable

In [3]:
dummies = pd.get_dummies(data[["League","Division","NewLeague"]])
dummies.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [4]:
y = data["Salary"]

X_pre = data.drop(["Salary","League","Division","NewLeague"],axis=1).astype("float64")
X = pd.concat([X_pre,dummies[["League_N","Division_W","NewLeague_N"]]],axis=1)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


# Split train and test

(210, 19)
(210,)
(53, 19)
(53,)


# Import model

In [6]:
from sklearn.tree import DecisionTreeRegressor

cart_model = DecisionTreeRegressor().fit(X_train,y_train)
cart_model

DecisionTreeRegressor()

In [14]:
from skompiler import skompile

print(skompile(cart_model.predict))

(if (x[11] <= 307.5) then (if (x[5] <= 0.5) then 2127.333 else (if (x[7] <= 1322.0) then (if (x[7] <= 100.0) then 920.0 else (if (x[8] <= 182.0) then (if (x[11] <= 72.5) then (if (x[0] <= 153.5) then 190.0 else (if (x[7] <= 341.5) then (if (x[18] <= 0.5) then (if (x[15] <= 6.0) then (if (x[12] <= 28.5) then 68.0 else 67.5) else 70.0) else (if (x[0] <= 244.0) then (if (x[14] <= 3.0) then 86.5 else 90.0) else 75.0)) else (if (x[16] <= 0.5) then (if (x[7] <= 511.5) then (if (x[12] <= 25.0) then (if (x[8] <= 110.5) then (if (x[6] <= 2.0) then 97.5 else 100.0) else 90.0) else (if (x[11] <= 41.0) then (if (x[4] <= 32.5) then 100.0 else 105.0) else (if (x[13] <= 294.0) then 110.0 else (if (x[3] <= 40.5) then 120.0 else 115.0)))) else (if (x[11] <= 35.5) then 87.5 else (if (x[4] <= 28.0) then 75.0 else 80.0))) else (if (x[13] <= 348.5) then (if (x[12] <= 37.5) then (if (x[11] <= 45.0) then (if (x[14] <= 164.0) then 125.0 else 130.0) else 110.0) else (if (x[9] <= 4.0) then 150.0 else 140.0)) el

# Model Tuning

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

cart = DecisionTreeRegressor()
cart_params = {"min_samples_split":range(2,10),
              "max_leaf_nodes":range(2,10)}

cart_cv_model = GridSearchCV(cart,cart_params,cv=10)
cart_cv_model.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_leaf_nodes': range(2, 10),
                         'min_samples_split': range(2, 10)})

In [20]:
cart_cv_model.best_params_

{'max_leaf_nodes': 5, 'min_samples_split': 3}

# Tuned Model

In [23]:
cart_tuned = DecisionTreeRegressor(max_leaf_nodes=5,min_samples_leaf=8).fit(X_train,y_train)

# Prediction with tuned model
y_pred_tuned = cart_tuned.predict(X_test)
mse_tuned = mean_squared_error(y_test,y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)

print(f"MSE Tuned Loss = {mse_tuned}")
print(f"RMSE Tuned Loss = {rmse_tuned}")

MSE Tuned Loss = 60821.80906524021
RMSE Tuned Loss = 246.620779873149
