In [1]:
import collections
import functools
import lzma
import multiprocessing
import pickle
import time

import numpy as np
import pandas as pd
from scipy import linalg
from scipy import special
from scipy import stats

# Print floats in fixed-point form rather than scientific notation.
np.set_printoptions(suppress=True)

def make_data(beta, n, random_state=None):
    """Simulates a design matrix and binary outcomes from a logistic model.

    The design matrix has three columns: a column of ones, a Bernoulli(0.5)
    covariate, and a standard normal covariate. Each outcome is a Bernoulli
    draw whose success probability is ``expit`` of the matching row of
    ``X`` multiplied by ``beta``.

    Args:
        beta: Coefficient vector with one entry per column of the design
            matrix (three entries).
        n: Number of observations to draw.
        random_state: Optional integer seed or random generator object that
            is forwarded to every scipy ``rvs`` call. ``None`` (the default)
            keeps the original behavior of drawing from the global NumPy
            random state.

    Returns:
        A tuple ``(X, y)`` with ``X`` of shape ``(n, 3)`` and ``y`` an array
        of ``n`` zeros and ones.
    """
    if isinstance(random_state, (int, np.integer)):
        # Build a single generator up front. Passing the same integer seed to
        # each rvs call would restart the stream every time, so the three
        # draws would no longer be independent of each other.
        random_state = np.random.RandomState(random_state)
    x = stats.bernoulli.rvs(0.5, size=n, random_state=random_state)
    z = stats.norm.rvs(size=n, random_state=random_state)
    X = np.column_stack((np.ones_like(x), x, z))
    success_probability = special.expit(np.matmul(X, beta))
    y = stats.bernoulli.rvs(success_probability, random_state=random_state)
    return X, y

def score(X, y, beta):
    """Evaluates the logistic-regression score vector at ``beta``.

    Args:
        X: Design matrix with one row per observation.
        y: Observed outcomes, one per row of ``X``.
        beta: Coefficient vector with one entry per column of ``X``.

    Returns:
        ``X.T`` multiplied by the residuals ``y - expit(X beta)``.
    """
    fitted_probability = special.expit(np.matmul(X, beta))
    residual = y - fitted_probability
    return np.matmul(X.T, residual)

def fit_logistic_regression(X, y, tol=1e-6, max_iter=100):
    """Fits a logistic regression model with Newton-Raphson updates.

    Starting from the zero vector, each iteration solves the Newton system
    (via a Cholesky factorization of the weighted Gram matrix) and adds the
    step to ``beta``. Iteration stops once the sum of absolute values of the
    score is at most ``tol``.

    Args:
        X: Design matrix with one row per observation.
        y: Binary outcomes, one per row of ``X``.
        tol: Convergence threshold on the sum of absolute score entries.
        max_iter: Maximum number of Newton updates before giving up.

    Returns:
        The fitted coefficient vector, one entry per column of ``X``.

    Raises:
        RuntimeError: If the score is still above ``tol`` after ``max_iter``
            updates.
    """
    beta = np.zeros(X.shape[1])
    for iteration in range(max_iter + 1):
        # The fitted probabilities feed both the score and the Hessian, so
        # evaluate them once per iteration.
        p = special.expit(np.matmul(X, beta))
        gradient = np.matmul(X.T, y - p)
        # Written as "<= tol" so that a NaN score is never mistaken for
        # convergence (every comparison with NaN is False).
        if np.sum(np.abs(gradient)) <= tol:
            return beta
        if iteration == max_iter:
            break
        weight = p*(1-p)
        hessian = (weight*X.T).dot(X)
        beta += linalg.cho_solve(linalg.cho_factor(hessian), gradient)
    raise RuntimeError(
        'Newton-Raphson did not converge within {} updates.'.format(max_iter))

def simulate(beta, n=1000, adjusted=False):
    """Runs one simulation: draws a data set and fits a logistic regression.

    Args:
        beta: Coefficient vector passed to ``make_data``.
        n: Number of observations to simulate.
        adjusted: If true, fit on every column of the simulated design
            matrix; otherwise drop its last column before fitting.

    Returns:
        The coefficient vector returned by ``fit_logistic_regression``.
    """
    X, y = make_data(beta, n)
    if adjusted:
        design = X
    else:
        # Leave the last covariate out of the fitted model.
        design = X[:, :-1]
    return fit_logistic_regression(design, y)

def _reseed_worker():
    """Gives a pool worker its own global NumPy random state.

    With the fork start method every worker begins with a copy of the
    parent's global random state, so without reseeding the workers would
    replay the same random draws.
    """
    np.random.seed()


def experiment(parameters, num_simulations, num_parallel_calls=4):
    """Runs repeated simulations for every parameter combination.

    Args:
        parameters: A ``pd.MultiIndex`` whose levels are named ``'$n$'``,
            ``'$\\beta_0$'``, ``'$\\beta_1$'``, ``'$\\beta_2$'`` and
            ``'Adjusted'``; each entry describes one simulation setting.
        num_simulations: Number of calls to ``simulate`` per setting.
        num_parallel_calls: Number of worker processes in the pool.

    Returns:
        A dict mapping each entry of ``parameters`` to the list of
        coefficient vectors returned by ``simulate`` for that setting.
    """
    simulation_results = {}
    # Reseed each worker on start-up so simulations run in different
    # processes do not share a random stream.
    with multiprocessing.Pool(
            num_parallel_calls, initializer=_reseed_worker) as pool:
        for i, p in parameters.to_frame().iterrows():
            beta = p[['$\\beta_0$', '$\\beta_1$', '$\\beta_2$']].values.astype(np.float64)
            # Row values come back from a mixed-dtype Series; cast them so
            # the workers always receive a plain int and bool.
            n = int(p['$n$'])
            adjusted = bool(p['Adjusted'])
            simulation_results[i] = pool.map(
                functools.partial(simulate, beta, n),
                [adjusted] * num_simulations)
    return simulation_results

In [2]:
# Full factorial grid of simulation settings: one index level per name, and
# one index entry per combination of the listed values.
parameters = pd.MultiIndex.from_product(
    names=['$n$', '$\\beta_0$', '$\\beta_1$', '$\\beta_2$', 'Adjusted'],
    iterables=[
        [1000],                # '$n$'
        [-2.0],                # '$\\beta_0$'
        [0.5, 1.0],            # '$\\beta_1$'
        [0.5, 1.0, 2.0, 3.0],  # '$\\beta_2$'
        [False, True],         # 'Adjusted'
    ],
)

In [3]:
# Load the cached simulation results if they exist; otherwise run the full
# (slow) experiment and cache it in the same lzma-compressed pickle that this
# cell reads, so the notebook can regenerate its own input.
results_path = 'logistic_regression_simulations.pickle.lzma'
try:
    with lzma.open(results_path, 'rb') as f:
        # NOTE: unpickling executes arbitrary code; only load a cache file
        # that this notebook produced.
        simulation_results = pickle.load(f)
except FileNotFoundError:
    simulation_results = experiment(parameters, 1024*1024)
    with lzma.open(results_path, 'wb') as f:
        pickle.dump(simulation_results, f)

In [4]:
# For every setting, summarize the coefficient at position 1 of each fitted
# vector: its mean and its sample standard deviation across simulations.
simulation_summary = pd.DataFrame(
    collections.OrderedDict((
        (
            '$\\mathbb{E}\\left[\\hat\\beta_1\\right]$',
            [np.mean(simulation_results[setting], axis=0)[1]
             for setting in parameters],
        ),
        (
            '$\\hat\\sigma_{\\hat\\beta_1}$',
            [np.std(simulation_results[setting], axis=0, ddof=1)[1]
             for setting in parameters],
        ),
    )),
    index=parameters,
)

In [5]:
def _beta_1_coverage(setting):
    """Share of simulations whose position-1 estimate lies within the 95% normal band.

    The band is centered on ``setting[2]`` and its width uses the sample
    variance of the estimates across simulations for this setting.
    """
    estimates = np.array(simulation_results[setting])
    squared_error = np.square(estimates[:, 1] - setting[2])
    variance = np.var(estimates, 0, ddof=1)[1]
    threshold = np.square(stats.norm.ppf(0.975))*variance
    return np.mean(squared_error <= threshold)


simulation_summary['Coverage of 95\\% CI'] = [
    _beta_1_coverage(setting) for setting in parameters]

In [6]:
def _wald_test_power(setting):
    """Share of simulations whose squared position-1 estimate exceeds the chi-square cutoff.

    The cutoff is the 0.95 quantile of a chi-square distribution with one
    degree of freedom, scaled by the sample variance of the estimates across
    simulations for this setting.
    """
    estimates = np.array(simulation_results[setting])
    variance = np.var(estimates, 0, ddof=1)[1]
    cutoff = stats.chi2.ppf(0.95, df=1)*variance
    return np.mean(np.square(estimates[:, 1]) > cutoff)


simulation_summary['Wald test power'] = [
    _wald_test_power(setting) for setting in parameters]

In [7]:
# Export the summary as a LaTeX table. escape=False leaves the LaTeX markup in
# the row and column labels untouched.
with open('p2_summary.tex', mode='w') as f:
    latex_table = simulation_summary.to_latex(escape=False)
    f.write(latex_table)

simulation_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,$\mathbb{E}\left[\hat\beta_1\right]$,$\hat\sigma_{\hat\beta_1}$,Coverage of 95\% CI,Wald test power
$n$,$\beta_0$,$\beta_1$,$\beta_2$,Adjusted,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000,-2.0,0.5,0.5,False,0.486574,0.176321,0.94912,0.787659
1000,-2.0,0.5,0.5,True,0.503676,0.179792,0.949654,0.800568
1000,-2.0,0.5,1.0,False,0.437434,0.165271,0.933632,0.753868
1000,-2.0,0.5,1.0,True,0.503888,0.177877,0.950001,0.80862
1000,-2.0,0.5,2.0,False,0.319081,0.146253,0.763893,0.586224
1000,-2.0,0.5,2.0,True,0.503771,0.18585,0.9496,0.773979
1000,-2.0,0.5,3.0,False,0.237999,0.137772,0.52235,0.407654
1000,-2.0,0.5,3.0,True,0.503356,0.203177,0.950344,0.695671
1000,-2.0,1.0,0.5,False,0.966688,0.168091,0.946236,0.999924
1000,-2.0,1.0,0.5,True,1.005611,0.17158,0.949684,0.999981
