# Does shallow tree always has lower variance and higher bias?

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
LRmodel = LinearRegression()
X = np.linspace(-5, 5, 100).reshape(-1, 1)
W = 3.0

## Formulas
$Variance = E(\hat{y} - E(\hat{y}))^2$

$Bias = (E(\hat{y}) - y_{Truth})^2$

In [7]:
# Train linear model 5000 times with different instance sampling from true model
np.random.seed(231)
for i in range(5000):
    Y = W * X + np.random.normal(0, 10, X.shape)  ## true model Y = 3*X + error
    LRmodel.fit(X, Y)
    y_pred = LRmodel.predict(X)
    if i == 0:
        result1 = y_pred
    else:
        result1 = np.concatenate([result1, y_pred], axis=1)
LRmodelVariance = result1.var(axis=1).mean()
LRmodelBias = ((result1.mean(axis=1, keepdims=True) - W * X)**2).mean()
print(f'Linear model has variance = {LRmodelVariance:.3f}, and bias = {LRmodelBias:.5f}')

Linear model has variance = 2.007, and bias = 0.00029


# 1. If the true model is linear $Y = 3*X + error$
1. Deeper tree = higher variance. (Can we prove it in theory?)
2. You may have smaller bias in lower tree.

## 1.1 Tree with max depth = 2-20

In [8]:
## Train tree model with depth=2-20 5000 times with different instance sampling from true model
for max_depth in range(2, 21):
    np.random.seed(231)  
    for i in range(5000):
        Y = W * X + np.random.normal(0, 10, X.shape)  ## true model Y = 3*X + error
        DeepDtree = DecisionTreeRegressor(max_depth=max_depth)
        DeepDtree.fit(X, Y)
        y_pred = DeepDtree.predict(X).reshape(-1, 1)
        if i == 0:
            result3 = y_pred
        else:
            result3 = np.concatenate([result3, y_pred], axis=1)
    DeepDtreeVariance = result3.var(axis=1).mean()
    DeepDtreeBias = ((result3.mean(axis=1, keepdims=True) - W * X)**2).mean()
    print(f'Tree model with depth = {max_depth} has variance = {DeepDtreeVariance:.3f}, and bias = {DeepDtreeBias:.5f}')

Tree model with depth = 2 has variance = 14.010, and bias = 0.01602
Tree model with depth = 3 has variance = 24.410, and bias = 0.01659
Tree model with depth = 4 has variance = 37.493, and bias = 0.01013
Tree model with depth = 5 has variance = 51.497, and bias = 0.00947
Tree model with depth = 6 has variance = 64.778, and bias = 0.01351
Tree model with depth = 7 has variance = 76.093, and bias = 0.01393
Tree model with depth = 8 has variance = 84.740, and bias = 0.01570
Tree model with depth = 9 has variance = 90.737, and bias = 0.01687
Tree model with depth = 10 has variance = 94.645, and bias = 0.01861
Tree model with depth = 11 has variance = 96.983, and bias = 0.01924
Tree model with depth = 12 has variance = 98.351, and bias = 0.01896
Tree model with depth = 13 has variance = 99.069, and bias = 0.01921
Tree model with depth = 14 has variance = 99.433, and bias = 0.01935
Tree model with depth = 15 has variance = 99.596, and bias = 0.01938
Tree model with depth = 16 has variance = 

## 1.2 Change random seed

In [9]:
## Train tree model with depth=2-20 5000 times with different instance sampling from true model

for max_depth in range(2, 21):
    np.random.seed(459)  # change seed
    for i in range(5000):
        Y = W * X + np.random.normal(0, 10, X.shape)  ## true model Y = 3*X + error
        DeepDtree = DecisionTreeRegressor(max_depth=max_depth)
        DeepDtree.fit(X, Y)
        y_pred = DeepDtree.predict(X).reshape(-1, 1)
        if i == 0:
            result3 = y_pred
        else:
            result3 = np.concatenate([result3, y_pred], axis=1)
    DeepDtreeVariance = result3.var(axis=1).mean()
    DeepDtreeBias = ((result3.mean(axis=1, keepdims=True) - W * X)**2).mean()
    print(f'Tree model with depth = {max_depth} has variance = {DeepDtreeVariance:.3f}, and bias = {DeepDtreeBias:.5f}')

Tree model with depth = 2 has variance = 14.047, and bias = 0.02425
Tree model with depth = 3 has variance = 24.397, and bias = 0.02820
Tree model with depth = 4 has variance = 37.515, and bias = 0.00633
Tree model with depth = 5 has variance = 51.844, and bias = 0.01059
Tree model with depth = 6 has variance = 65.216, and bias = 0.01111
Tree model with depth = 7 has variance = 76.377, and bias = 0.01489
Tree model with depth = 8 has variance = 84.987, and bias = 0.01501
Tree model with depth = 9 has variance = 90.935, and bias = 0.01645
Tree model with depth = 10 has variance = 94.761, and bias = 0.01747
Tree model with depth = 11 has variance = 97.051, and bias = 0.01791
Tree model with depth = 12 has variance = 98.366, and bias = 0.01840
Tree model with depth = 13 has variance = 99.062, and bias = 0.01885
Tree model with depth = 14 has variance = 99.423, and bias = 0.01908
Tree model with depth = 15 has variance = 99.599, and bias = 0.01917
Tree model with depth = 16 has variance = 

# 2. If the true model is $Y = 3*X^3 + error$
1. You may have bigger variance in lower tree.
2. You may have smaller bias in lower tree.

In [10]:
## Train tree model with depth=2-20 5000 times with different instance sampling from true model
for max_depth in range(2, 21):
    np.random.seed(231)  
    for i in range(5000):
        Y = W * X**3 + np.random.normal(0, 10, X.shape)  ## true model Y = 3*X^3 + error
        DeepDtree = DecisionTreeRegressor(max_depth=max_depth)
        DeepDtree.fit(X, Y)
        y_pred = DeepDtree.predict(X).reshape(-1, 1)
        if i == 0:
            result3 = y_pred
        else:
            result3 = np.concatenate([result3, y_pred], axis=1)
    DeepDtreeVariance = result3.var(axis=1).mean()
    DeepDtreeBias = ((result3.mean(axis=1, keepdims=True) - W * X)**2).mean()
    print(f'Tree model with depth = {max_depth} has variance = {DeepDtreeVariance:.3f}, and bias = {DeepDtreeBias:.5f}')

Tree model with depth = 2 has variance = 1284.582, and bias = 15889.47078
Tree model with depth = 3 has variance = 537.204, and bias = 18066.27765
Tree model with depth = 4 has variance = 187.971, and bias = 18825.48490
Tree model with depth = 5 has variance = 84.715, and bias = 19019.86842
Tree model with depth = 6 has variance = 65.019, and bias = 19058.02719
Tree model with depth = 7 has variance = 76.660, and bias = 19057.72296
Tree model with depth = 8 has variance = 83.387, and bias = 19058.78293
Tree model with depth = 9 has variance = 89.082, and bias = 19058.75462
Tree model with depth = 10 has variance = 93.168, and bias = 19058.84203
Tree model with depth = 11 has variance = 95.986, and bias = 19058.83547
Tree model with depth = 12 has variance = 97.760, and bias = 19058.84496
Tree model with depth = 13 has variance = 98.775, and bias = 19058.84300
Tree model with depth = 14 has variance = 99.303, and bias = 19058.84544
Tree model with depth = 15 has variance = 99.548, and b