https://towardsdatascience.com/end-your-bias-about-bias-and-variance-67b16f0eb1e6

In [1]:
## The following steps calculate bias and variance for a linear model and a decision-tree model.
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

In [2]:
## Defining Real population
def real_population(x1, x2, x3, x4, x5, size=5000, random_state=1234):
    """Build the "real population": a known function of x1..x5 plus random noise.

    The noiseless signal is
        1.1 + 2.2*x1 + 3.3*x2**2 + 4.4*x3*x4 + 5.5*x4 + 6.6*x5
    and the noise is drawn from a normal distribution with mean -5 and
    standard deviation 10.

    Parameters
    ----------
    x1, x2, x3, x4, x5 : scalar or array-like
        Feature values. They are combined elementwise with a noise vector of
        length ``size``.
    size : int, default 5000
        Number of noise samples to draw.
    random_state : int, default 1234
        Seed for the private ``RandomState`` that generates the noise.

    Returns
    -------
    pandas.DataFrame
        Columns ``target``, ``X1`` .. ``X5``.
    """
    intercept = 1.1
    w1, w2, w3, w4, w5 = 2.2, 3.3, 4.4, 5.5, 6.6

    # Deterministic part of the target (note the squared and interaction terms).
    signal = intercept + w1*x1 + w2*(x2**2) + w3*(x3*x4) + w4*x4 + w5*x5

    # The noise comes from a generator owned by this call, so it depends only
    # on `random_state` and `size`, not on the global NumPy seed.
    rng = np.random.RandomState(random_state)
    noise = rng.normal(-5, 10, size)

    columns = {
        'target': signal + noise,
        'X1': x1,
        'X2': x2,
        'X3': x3,
        'X4': x4,
        'X5': x5,
    }
    return pd.DataFrame(columns)

In [3]:
## Function to simulate the data as per the real population
def simulation_data(size=5000, random_seed=101):
    """Simulate one dataset drawn from the real population.

    Seeds the global NumPy RNG with ``random_seed``, draws five uniform
    feature vectors of length ``size`` and passes them to ``real_population``.

    Note that ``real_population`` is called without a ``random_state``, so the
    noise term always uses that function's default seed; ``random_seed`` only
    controls the features.

    Parameters
    ----------
    size : int, default 5000
        Number of rows to simulate.
    random_seed : int, default 101
        Seed applied to the global NumPy random generator.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``real_population``.
    """
    np.random.seed(random_seed)
    # Drawn strictly in order x1..x5 so the random stream is consumed the same way.
    x1, x2, x3, x4, x5 = (np.random.rand(size) for _ in range(5))
    return real_population(x1, x2, x3, x4, x5, size)

### Test the function

In [4]:
# Generate one dataset with the defaults (size=5000, random_seed=101).
df1=simulation_data()

In [5]:
# Preview the first rows of the simulated dataset.
df1.head()

Unnamed: 0,target,X1,X2,X3,X4,X5
0,12.292279,0.516399,0.253181,0.430758,0.716954,0.731545
1,-3.887578,0.570668,0.112091,0.697556,0.688819,0.715543
2,15.798936,0.028474,0.24284,0.171483,0.377985,0.416743
3,-1.354406,0.171522,0.006595,0.110987,0.641994,0.219718
4,-1.139959,0.685277,0.969489,0.616311,0.029448,0.774969


In [6]:
# Optional: export the data to Excel to inspect it.
# First-time users need to install openpyxl:
#   conda install -c anaconda openpyxl
# The file is written relative to the current working directory so the
# notebook runs on any machine, not only one with a specific user's Desktop.
df1.to_excel("jjj.xlsx")

In [7]:
# Seed the global NumPy RNG so the test point is reproducible, then draw
# five uniform values: one per feature (x1..x5) of a single test observation.
np.random.seed(22)
X_test = np.random.rand(5)

In [8]:
# Show the five feature values of the test point.
X_test

array([0.20846054, 0.48168106, 0.42053804, 0.859182  , 0.17116155])

In [9]:
# Evaluate the real population at the single test point: the five entries of
# X_test are unpacked into x1..x5, and size=1 draws exactly one noise value.
real_population(*X_test, size=1)

Unnamed: 0,target,X1,X2,X3,X4,X5
0,9.483589,0.208461,0.481681,0.420538,0.859182,0.171162


In [10]:
# Same call as above, reduced to the scalar target value of its single row.
real_population(*X_test, size=1)['target'][0]

9.483589316978552

In [11]:
## function to compute mean square error
def get_mse(mydf, model='Lin'):
    """Mean squared error of the simulated predictions at the test point.

    The reference value is the ``target`` that ``real_population`` returns for
    the module-level ``X_test`` with ``size=1`` and the default noise seed, so
    it includes one noise draw.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results; column ``1`` holds the linear-model predictions
        and column ``2`` the tree-model predictions.
    model : str, default 'Lin'
        ``'Lin'`` selects column ``1``; any other value selects column ``2``.

    Returns
    -------
    Mean of the squared differences between the predictions and the reference.
    """
    truth = real_population(X_test[0],
                            X_test[1],
                            X_test[2],
                            X_test[3],
                            X_test[4], size=1)['target'][0]
    estimate = mydf[1] if model == 'Lin' else mydf[2]
    # Subtract the scalar directly: it broadcasts over however many rows
    # `mydf` has, so the result no longer depends on a global simulation
    # count matching the length of the frame.
    return np.mean((estimate - truth)**2)

In [12]:
# First feature value (x1) of the test point.
X_test[0]

0.20846053735884262

In [13]:
# Reference target at the test point: one row (size=1), default noise seed.
real_population(X_test[0], X_test[1],X_test[2],X_test[3],X_test[4], size=1)["target"][0]

9.483589316978552

In [14]:
## function to compute bias
def get_bias(mydf, model='Lin'):
    """Signed bias of the simulated predictions at the test point.

    Computed as the mean prediction minus the reference value, where the
    reference is the ``target`` returned by ``real_population`` for the
    module-level ``X_test`` (``size=1``, default noise seed).

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results; column ``1`` holds the linear-model predictions
        and column ``2`` the tree-model predictions.
    model : str, default 'Lin'
        ``'Lin'`` selects column ``1``; any other value selects column ``2``.

    Returns
    -------
    Mean prediction minus the reference value (not squared).
    """
    point = [X_test[k] for k in range(5)]
    reference = real_population(*point, size=1)['target'][0]
    column = 1 if model == 'Lin' else 2
    predictions = mydf[column]
    return np.mean(predictions) - reference

In [15]:
## function to compute variance
def get_var(mydf, model='Lin'):
    """Variance of the simulated predictions across datasets.

    This is the population variance: the mean squared deviation from the mean,
    with no degrees-of-freedom correction.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results; column ``1`` holds the linear-model predictions
        and column ``2`` the tree-model predictions.
    model : str, default 'Lin'
        ``'Lin'`` selects column ``1``; any other value selects column ``2``.

    Returns
    -------
    Mean of the squared deviations of the predictions from their mean.
    """
    column = 1 if model == 'Lin' else 2
    predictions = mydf[column]
    centred = predictions - np.mean(predictions)
    return np.mean(centred**2)

In [16]:
## function to run the simulation on 100 different datasets (i.e. 100 fitted models of each type)
def run_simulation(lin_model, tree_model, sims = 100):
    """Fit both models on `sims` simulated datasets and predict the test point.

    For each ``i`` in ``range(sims)`` a dataset of 5000 rows is generated with
    ``simulation_data(5000, i)``, both models are refitted on it, and each
    model predicts the module-level ``X_test`` observation.

    Parameters
    ----------
    lin_model : estimator with ``fit`` / ``predict``
        Model whose predictions go into column ``1`` of the result.
    tree_model : estimator with ``fit`` / ``predict``
        Model whose predictions go into column ``2`` of the result.
    sims : int, default 100
        Number of datasets to simulate.

    Returns
    -------
    pandas.DataFrame
        One row per simulation; column ``0`` is the simulation index and
        columns ``1`` and ``2`` hold the arrays returned by
        ``lin_model.predict`` and ``tree_model.predict``.
    """
    feature_cols = ['X1', 'X2', 'X3', 'X4', 'X5']
    # One-row query frame, built once. It carries the same column names as the
    # training data so the estimators see matching feature names at predict time.
    x_query = pd.DataFrame([X_test], columns=feature_cols)

    predicted = []
    for i in range(sims):
        D = simulation_data(5000, i)
        X = D[feature_cols]
        Y = D['target']
        lin_model.fit(X, Y)
        tree_model.fit(X, Y)
        # Predict with the models passed in, not with same-named globals.
        predicted.append((i, lin_model.predict(x_query), tree_model.predict(x_query)))

    # Build the frame once after the loop; this also works when sims == 0.
    return pd.DataFrame(predicted)

In [17]:
## function to evaluate the different metrics of simulation
def evaluate_simulation(prediction_df):
    """Print squared bias, variance and MSE for the linear and tree models.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Simulation results with the linear-model predictions in column ``1``
        and the tree-model predictions in column ``2``.

    Returns
    -------
    tuple
        Always the empty tuple; the metrics are only printed.
    """
    # get_bias returns the signed bias. It is squared here, so the label says
    # so, and the printed value is directly comparable with variance and MSE.
    print("Bias^2 for Lin model is: ", get_bias(prediction_df, 'Lin')**2)
    print("Bias^2 for Tree model is: ", get_bias(prediction_df, 'tree')**2)
    print("Var for Lin model is:", get_var(prediction_df, 'Lin'))
    print("var for Tree model is:", get_var(prediction_df, 'tree'))
    print("MSE for Lin model is:", get_mse(prediction_df, 'Lin'))
    print("MSE for Tree model is:", get_mse(prediction_df, 'tree'))
    return ()

In [18]:
## Invoking the functions defined above
reg = linear_model.LinearRegression()
# Number of simulated datasets per tree depth. It is passed to run_simulation
# explicitly so the run length always matches this setting.
simulations = 100
# Re-create the same test point as earlier (same seed, same draw).
np.random.seed(22)
X_test = np.random.rand(5)
# Sweep the tree depth; the linear model is refitted on the same datasets each time.
for depth in [3,4,6,8,9,10]:
    tree = DecisionTreeRegressor(max_depth=depth)
    results = run_simulation(reg, tree, sims=simulations)
    evaluate_simulation(results)
    print("\n end of iter for depth", depth)
    print('\n')

Bias for Lin model is:  [16.54340396]
Bias for Tree model is:  [12.9530333]
Var for Lin model is: [0.07393302]
var for Tree model is: [1.45579073]
MSE for Lin model is: [16.61733698]
MSE for Tree model is: [14.40882403]

 end of iter for depth 3


Bias for Lin model is:  [16.54340396]
Bias for Tree model is:  [15.77474218]
Var for Lin model is: [0.07393302]
var for Tree model is: [2.06940007]
MSE for Lin model is: [16.61733698]
MSE for Tree model is: [17.84414226]

 end of iter for depth 4


Bias for Lin model is:  [16.54340396]
Bias for Tree model is:  [17.46141236]
Var for Lin model is: [0.07393302]
var for Tree model is: [4.08511866]
MSE for Lin model is: [16.61733698]
MSE for Tree model is: [21.54653102]

 end of iter for depth 6


Bias for Lin model is:  [16.54340396]
Bias for Tree model is:  [21.3386738]
Var for Lin model is: [0.07393302]
var for Tree model is: [6.94711687]
MSE for Lin model is: [16.61733698]
MSE for Tree model is: [28.28579066]

 end of iter for depth 8


Bias f

In [19]:
# `results` holds the frame from the last depth in the loop above.
type(results)

pandas.core.frame.DataFrame

In [20]:
# Columns: 0 = simulation index, 1 = linear-model prediction, 2 = tree-model prediction.
results

Unnamed: 0,0,1,2
0,0,[5.145779181232388],[5.06020335530296]
1,1,[5.743915831430977],[4.202100228876505]
2,2,[5.484633212871911],[6.793794774016674]
3,3,[5.5183281989162225],[21.23435848922528]
4,4,[5.22370706908932],[-11.49180056129655]
5,5,[5.6397593941420805],[4.032585864106407]
6,6,[5.439593465210981],[6.456495686319148]
7,7,[5.470817640881684],[9.800246405795821]
8,8,[5.264011739828982],[-2.055727727132623]
9,9,[5.354571105117517],[5.497111302359568]
