In [32]:
import numpy as np
import pandas as pd
from scipy import stats

In [805]:
np.random.seed(2018)
X_MEAN = 20
X_STD_ERROR = 3
BETA_0 = 2
BETA_1 = -2.5

ERROR_FACTORY = {
    'normal': lambda size: stats.norm.rvs(0, 2, size=size),
    'uniform': lambda size: stats.uniform.rvs(-2, 4, size=size),
    'skew_normal': lambda size: stats.skewnorm.rvs(
        5, -5/np.sqrt(1 + 5**2)*np.sqrt(2/np.pi), 1, size=size),
    't': lambda size: stats.t.rvs(2, 0, 1, size=size),
    'levy_stable': lambda size: stats.levy_stable.rvs(3/2, 0, size=size)
}

In [804]:
np.var(ERROR_FACTORY['levy_stable'](1000000))

325.45419621461235

In [806]:
experiments = pd.DataFrame(index=pd.MultiIndex.from_product([
    pd.Series([15, 30], dtype=np.int32),
    pd.Series(['normal','uniform', 'skew_normal', 'levy_stable'], dtype=np.object),
], names=['n', 'error_type'])).reset_index()
experiments

Unnamed: 0,n,error_type
0,15,normal
1,15,uniform
2,15,skew_normal
3,15,levy_stable
4,30,normal
5,30,uniform
6,30,skew_normal
7,30,levy_stable


In [809]:
from collections import OrderedDict

from stat570.linear_model.linear_regression import LinearRegression

def simulate(n, error_factory, num_trials, confidence_level=0.95, seed=2018):
    np.random.seed(seed)    
    X = stats.norm.rvs(x_mean, x_std_error, size=(n, 1))
    
    def simulate_trial():
        errors = error_factory(n)
        y = BETA_0 + np.squeeze(X*BETA_1) + errors
        linear_model = LinearRegression().fit(X, y)
        
        coefficients = linear_model.coefficients_
        
        return OrderedDict([
            ('beta_0_estimate', coefficients['estimate'][0]),
            ('beta_0_variance', np.square(coefficients['std_error'][0])),
            ('beta_1_estimate', coefficients['estimate'][1]),
            ('beta_1_variance', coefficients['std_error'][1]),
        ])
    
    trial_results = pd.DataFrame([simulate_trial() for _ in range(num_trials)])    
    t_conf = stats.t.isf((1 - confidence_level)/2, df=n-2)

    beta_0_error = trial_results['beta_0_estimate'] - BETA_0
    beta_0_pct_conf_interval = np.sum(
        np.abs(beta_0_error) <= t_conf*np.sqrt(trial_results['beta_0_variance']))/num_trials
    print(beta_0_pct_conf_interval)
    print(np.var(trial_results['beta_0_estimate'] ,ddof=1))
    
    
    return trial_results.mean()

In [811]:
for _, experiment in experiments.iterrows():
    print(experiment)
    print(simulate(experiment['n'], ERROR_FACTORY[experiment['error_type']], 1000))

n                 15
error_type    normal
Name: 0, dtype: object
0.952
16.409417148699834
beta_0_estimate     2.156427
beta_0_variance    16.379581
beta_1_estimate    -2.507613
beta_1_variance     0.191126
dtype: float64
n                  15
error_type    uniform
Name: 1, dtype: object
0.951
5.587439202117705
beta_0_estimate    2.111719
beta_0_variance    5.473722
beta_1_estimate   -2.505180
beta_1_variance    0.111606
dtype: float64
n                      15
error_type    skew_normal
Name: 2, dtype: object
0.945
1.6450778137432107
beta_0_estimate    2.008698
beta_0_variance    1.631703
beta_1_estimate   -2.500502
beta_1_variance    0.059968
dtype: float64
n                      15
error_type    levy_stable
Name: 3, dtype: object
0.947
201.47903106500527
beta_0_estimate      2.378370
beta_0_variance    502.542165
beta_1_estimate     -2.520855
beta_1_variance      0.311045
dtype: float64
n                 30
error_type    normal
Name: 4, dtype: object
0.953
6.133336566856355
beta_0_est

In [607]:
np.mean(stats.t.rvs(1.1,0, 1, 15000000))

0.21114095706975328