In [1]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np

In [125]:
def simulation_model01(n_sim,
                       n_obs_train,
                       n_obs_validation,
                       a,
                       b1,
                       b2,
                       b3):
    
    '''
    model: f(x)=a+bx
    
    n_obs:
    Training and validation size = 10
    Training and validation size = 100
    Training and validation size = 1000
    Training and validation size = 10000
    '''

    bias_list=[]
    variance_list=[]
    mse_list=[]
    
    n_obs=n_obs_train+n_obs_validation
    
    for i in range(0, n_sim, 1):
        
        # simulate under the ground truth function: y=a+b1x+b2x^2+b3x^3+e
        error=np.random.normal(0, 1, n_obs)
        x=np.random.normal(0, 1, n_obs)
        y=a+(b1*x)+(b2*(x**2))+(b3*(x**3))+error

        # training
        Y = y[0:n_obs_train]
        X = x[0:n_obs_train]
        X = sm.add_constant(X)

        model = sm.OLS(endog=Y, exog=X)
        results = model.fit()
        
        # validation
        Y_val = y[n_obs_train:n_obs_train+n_obs_validation]
        X_val = x[n_obs_train:n_obs_train+n_obs_validation]
        Y_pred = results.params[0]+results.params[1]*X_val
    
        bias=np.mean(Y_pred-Y_val)
        variance=np.mean((Y_pred-np.mean(Y_pred))**2)
        mse=bias**2+variance
        
        bias_list.append(bias)
        variance_list.append(variance)
        mse_list.append(mse)
    
    print("Model: f(x)=a+bx")
    print("Ground truth function: a={}, b1={}, b2={}, b3={}".format(a, b1, b2, b3))
    print("Number of Simulations: {}".format(n_sim))
    print("Training size: {}, validation size: {}".format(n_obs_train, n_obs_validation))
    print("MSE of the model with all the simulation: {}".format(np.mean(mse_list)))
    print("Bias of the model with all the simulation: {}".format(np.mean(bias_list)))
    print("Variance of the model with all the simulation: {}".format(np.mean(variance_list)))
    
    print("===================================================")

In [126]:
a=1
b1=2
b2=3
b3=4

n_obs_train=10
n_obs_validation=10
n_sim=1000

simulation_model01(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=100
n_obs_validation=100
n_sim=1000

simulation_model01(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=1000
n_obs_validation=1000
n_sim=1000

simulation_model01(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=10000
n_obs_validation=10000
n_sim=1000

simulation_model01(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

Model: f(x)=a+bx
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 10, validation size: 10
MSE of the model with all the simulation: 174.5560707038301
Bias of the model with all the simulation: -0.6996239283238349
Variance of the model with all the simulation: 153.20513978269096
Model: f(x)=a+bx
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 100, validation size: 100
MSE of the model with all the simulation: 190.57521906866302
Bias of the model with all the simulation: -0.13820170786492572
Variance of the model with all the simulation: 188.19702882372997
Model: f(x)=a+bx
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 1000, validation size: 1000
MSE of the model with all the simulation: 195.85085660943568
Bias of the model with all the simulation: -0.0031101458084644307
Variance of the model with all the simulation: 195.63337388218469
Model: f(x)=a+bx
Ground truth func

In [127]:
def simulation_model02(n_sim,
                       n_obs_train,
                       n_obs_validation,
                       a,
                       b1,
                       b2,
                       b3):
    
    '''
    model: f(x)=a+b1x+b2x^2
    
    n_obs:
    Training and validation size = 10
    Training and validation size = 100
    Training and validation size = 1000
    Training and validation size = 10000
    '''

    bias_list=[]
    variance_list=[]
    mse_list=[]
    
    n_obs=n_obs_train+n_obs_validation
    
    for i in range(0, n_sim, 1):
        
        # simulate under the ground truth function: y=a+b1x+b2x^2+b3x^3+e
        error=np.random.normal(0, 1, n_obs)
        x=np.random.normal(0, 1, n_obs)
        y=a+(b1*x)+(b2*(x**2))+(b3*(x**3))+error

        # training
        Y = y[0:n_obs_train]
        X = np.column_stack((x[0:n_obs_train], (x[0:n_obs_train])**2))
        X = sm.add_constant(X)
        
        model = sm.OLS(endog=Y, exog=X)
        results = model.fit()
        
#         print(results.params)
#         print(results.summary())
        
        # validation
        Y_val = y[n_obs_train:n_obs_train+n_obs_validation]
        X_val = x[n_obs_train:n_obs_train+n_obs_validation]
        Y_pred = results.params[0]+results.params[1]*(X_val)+results.params[2]*(X_val**2)
    
        bias=np.mean(Y_pred-Y_val)
        variance=np.mean((Y_pred-np.mean(Y_pred))**2)
        mse=bias**2+variance
        
        bias_list.append(bias)
        variance_list.append(variance)
        mse_list.append(mse)
    
    print("model: f(x)=a+b1x+b2x^2")
    print("Ground truth function: a={}, b1={}, b2={}, b3={}".format(a, b1, b2, b3))
    print("Number of Simulations: {}".format(n_sim))
    print("Training size: {}, validation size: {}".format(n_obs_train, n_obs_validation))
    print("MSE of the model with all the simulation: {}".format(np.mean(mse_list)))
    print("Bias of the model with all the simulation: {}".format(np.mean(bias_list)))
    print("Variance of the model with all the simulation: {}".format(np.mean(variance_list)))
    
    print("===================================================")

In [128]:
a=1
b1=2
b2=3
b3=4

n_obs_train=10
n_obs_validation=10
n_sim=1000

simulation_model02(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=100
n_obs_validation=100
n_sim=1000

simulation_model02(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=1000
n_obs_validation=1000
n_sim=1000

simulation_model02(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=10000
n_obs_validation=10000
n_sim=1000

simulation_model02(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

model: f(x)=a+b1x+b2x^2
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 10, validation size: 10
MSE of the model with all the simulation: 185.2727168446809
Bias of the model with all the simulation: 0.018442661849270393
Variance of the model with all the simulation: 150.2969274346286
model: f(x)=a+b1x+b2x^2
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 100, validation size: 100
MSE of the model with all the simulation: 206.26960789918945
Bias of the model with all the simulation: -0.010829057708361572
Variance of the model with all the simulation: 203.9990139597735
model: f(x)=a+b1x+b2x^2
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 1000, validation size: 1000
MSE of the model with all the simulation: 214.92266553549183
Bias of the model with all the simulation: -0.0017887848761833282
Variance of the model with all the simulation: 214.7202601833003
model: f(x)=a+

In [129]:
def simulation_model03(n_sim,
                       n_obs_train,
                       n_obs_validation,
                       a,
                       b1,
                       b2,
                       b3):
    
    '''
    model: f(x)=a+b1x+b2x^2+b3x^3+b4x^4+b5x^5+b6x^6
    
    n_obs:
    Training and validation size = 10
    Training and validation size = 100
    Training and validation size = 1000
    Training and validation size = 10000
    '''

    bias_list=[]
    variance_list=[]
    mse_list=[]
    
    n_obs=n_obs_train+n_obs_validation
    
    for i in range(0, n_sim, 1):
        
        # simulate under the ground truth function: y=a+b1x+b2x^2+b3x^3+e
        error=np.random.normal(0, 1, n_obs)
        x=np.random.normal(0, 1, n_obs)
        y=a+(b1*x)+(b2*(x**2))+(b3*(x**3))+error

        # training
        Y = y[0:n_obs_train]
        X = np.column_stack(((x[0:n_obs_train]),
                             (x[0:n_obs_train])**2,
                             (x[0:n_obs_train])**3,
                             (x[0:n_obs_train])**4,
                             (x[0:n_obs_train])**5,
                             (x[0:n_obs_train])**6))
        X = sm.add_constant(X)
        
        model = sm.OLS(endog=Y, exog=X)
        results = model.fit()
        
        # validation
        Y_val = y[n_obs_train:n_obs_train+n_obs_validation]
        X_val = x[n_obs_train:n_obs_train+n_obs_validation]
        Y_pred = (results.params[0]+\
                  results.params[1]*(X_val)+\
                  results.params[2]*(X_val**2)+\
                  results.params[3]*(X_val**3)+\
                  results.params[4]*(X_val**4)+\
                  results.params[5]*(X_val**5)+\
                  results.params[6]*(X_val**6))
    
        bias=np.mean(Y_pred-Y_val)
        variance=np.mean((Y_pred-np.mean(Y_pred))**2)
        mse=bias**2+variance
        
        bias_list.append(bias)
        variance_list.append(variance)
        mse_list.append(mse)
    
    print("model: f(x)=a+b1x+b2x^2+b3x^3+b4x^4+b5x^5+b6x^6")
    print("Ground truth function: a={}, b1={}, b2={}, b3={}".format(a, b1, b2, b3))
    print("Number of Simulations: {}".format(n_sim))
    print("Training size: {}, validation size: {}".format(n_obs_train, n_obs_validation))
    print("MSE of the model with all the simulation: {}".format(np.mean(mse_list)))
    print("Bias of the model with all the simulation: {}".format(np.mean(bias_list)))
    print("Variance of the model with all the simulation: {}".format(np.mean(variance_list)))
    
    print("===================================================")

In [130]:
a=1
b1=2
b2=3
b3=4

n_obs_train=10
n_obs_validation=10
n_sim=1000

simulation_model03(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=100
n_obs_validation=100
n_sim=1000

simulation_model03(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=1000
n_obs_validation=1000
n_sim=1000

simulation_model03(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

n_obs_train=10000
n_obs_validation=10000
n_sim=1000

simulation_model03(n_sim, n_obs_train, n_obs_validation, a, b1, b2, b3)

model: f(x)=a+b1x+b2x^2+b3x^3+b4x^4+b5x^5+b6x^6
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 10, validation size: 10
MSE of the model with all the simulation: 33074543.63319149
Bias of the model with all the simulation: 76.09741923381374
Variance of the model with all the simulation: 29287513.92816305
model: f(x)=a+b1x+b2x^2+b3x^3+b4x^4+b5x^5+b6x^6
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 100, validation size: 100
MSE of the model with all the simulation: 312.73021951482997
Bias of the model with all the simulation: -0.007036007994938097
Variance of the model with all the simulation: 312.5440870867035
model: f(x)=a+b1x+b2x^2+b3x^3+b4x^4+b5x^5+b6x^6
Ground truth function: a=1, b1=2, b2=3, b3=4
Number of Simulations: 1000
Training size: 1000, validation size: 1000
MSE of the model with all the simulation: 309.20398643399506
Bias of the model with all the simulation: 2.5514628554296758e-06
Variance o