# Application of Bootstrap samples in Random Forest

In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import statistics
import math

 <li> Load the boston house dataset </li>

In [2]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable

In [3]:
x.shape

(506, 13)

### Task: 1
<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replciate 4 points from [4, 5, 7, 8, 9, 3], consder they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have                different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

### Task: 2
<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intravels of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence intravel</li>
</ol>
</pre>
### Task: 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

## Task: 1

### Step 1 Creating samples: Randomly create 30 samples from the whole boston data points.

In [4]:
import random

def random_sample(a, b, num):
    rand_sample=[]
    for i in range(num):
        rand_sample.append(random.randint(a, b))
    return rand_sample

### Step 2: Build a DecisionTreeRegressor on each of the sample

In [5]:
# https://www.quantstart.com/articles/bootstrap-aggregation-random-forests-and-boosted-trees/
from sklearn.tree import DecisionTreeRegressor

# computed the predicted values of each data point(506 data points) in your corpus
def pred_price(x, y, X):
    indices = []
    col = []
    Y_pred = []
    Y_sample=[]
    
    # applying regression trees on each of 30 samples
    for i in range(30):
        initial_sample = np.random.choice(506,303,replace=False)
        X_sample = x[initial_sample]
        Y_sample=list(y[initial_sample])
        indices.append(initial_sample)
        
        #again replicating 203 data points from the sampled points
        rep_sample = np.random.choice(initial_sample,203,replace=False)
        for j in rep_sample:
            X_sample = np.append(X_sample, x[j].reshape(1, -1), axis = 0)
            Y_sample.append(y[j])
        column = np.random.choice(13, random_sample(3, 13, 1), replace = False)
        X_sample = X_sample[:, column]
        col.append(column)
        
        dt_regressor = DecisionTreeRegressor()
        dt_regressor.fit(X_sample, Y_sample)
        
        pred = dt_regressor.predict(x[:, column])
        Y_pred.append(pred)
    return indices, col, Y_pred

In [6]:
# computed the predicted values of each data point(506 data points) in your corpus
indices, col, Y_pred = pred_price(x, y, x)
print("Predicted house price:",Y_pred[0][:10])

Predicted house price: [22.8 21.6 37.3 37.3 36.2 28.7 22.9 27.1 16.5 18.9]


In [10]:
# calculating mse
# code copied from kaggle
def mse_error(x, y):
    indices, col, pred = pred_price(x, y, x)
    pred = list(map(list, zip(*pred))) 
    y_pred = np.zeros(len(pred))
    for i in range(len(pred)):
        val = 0
        for j in range(len(pred[0])):
            val += pred[i][j]
        val/=len(pred[0])
        y_pred[i] = val
    err = 0
    for i in range(len(y)):
        #calculating mse and updating
        err += (y[i] - y_pred[i])**2
    err/=len(y)
    return err
MSE = mse_error(x, y)
print(MSE)

2.5471248433349603


In [68]:
# calculate the oob error
# code copied from kaggle
def out_of_bag(x, y):
    indices, col, pred = pred_price(x, y, x)
    pred = list(map(list, zip(*pred))) 
    y_pred = np.zeros(len(pred))
    for i in range(len(pred)):
        val = 0
        c = 0
        for j in range(len(pred[0])):
            # for model which was buit on samples not included 𝑥𝑖
            if i not in indices[j]:
                val += pred[i][j]
                c += 1
        val/=c
        y_pred[i] = val
    err = 0
    for i in range(len(y)):
        err += (y[i] - y_pred[i])**2
    err/=len(y)
    return err

oob = out_of_bag(x, y)
print("OOB Score:",oob)
print("-"*50)

OOB Score: 14.57801645911499
--------------------------------------------------


## Task 2

In [48]:
train_mse = []
train_oob = []

# Repeating Task 1 for 35 times, and for each iteration store the Train MSE and OOB score 
for i in range(35):
    train_mse.append(mse_error(y, Y_pred))
    train_oob.append(out_of_bag(x, y))
print(train_mse[0:5])
print(train_oob[0:5])

[17.176363980502558, 17.176363980502558, 17.176363980502558, 17.176363980502558, 17.176363980502558]
[15.487580318341857, 15.356531681410592, 14.998232766290162, 15.47985874544868, 13.85395603027453]


In [58]:
# Computing CI of OOB Score and Train MSE
#https://machinelearningmastery.com/confidence-intervals-for-machine-learning/

def conf_interval(x):
    std = np.std(x)
    standard_error = std/math.sqrt(len(x))
    #upper and lower confidence level 
    upper_cl = np.mean(x) + 1.96*standard_error
    lower_cl = np.mean(x) - 1.96*standard_error
    return upper_cl, lower_cl

In [60]:
upper_cl, lower_cl = conf_interval(train_mse)
print("confidence interval for mse score is: ",upper_cl, lower_cl)

confidence interval for mse score is:  17.176363980502554 17.176363980502554


In [61]:
upper_cl, lower_cl = conf_interval(train_oob)
print("confidence interval for oob score is: ",upper_cl, lower_cl)

confidence interval for oob score is:  14.599291568275044 13.815820910292127


## Task 3
    -Given a single query point predict the price of house

In [85]:
# Predict the house price for this point as mentioned in the step 2 of Task 1
xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]
#converting input to array
xq = np.asarray(xq).reshape(1, -1)
indices, col, Yq = pred_price(x, y, xq)
pred_yq = sum(Yq)/len(Yq)
print("The predicted house price for Xq is:", pred_yq)

The predicted house price for Xq is: [21.735]
