/
cluster4.py
93 lines (65 loc) · 3.32 KB
/
cluster4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p
# read data
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
def read_data(file):
return pd.read_stata("https://github.com/scunning1975/mixtape/raw/master/" + file)
def gen_cluster(param = (.1, .5), n = 1000, n_cluster = 50, rho = .5):
# Function to generate clustered data
# individual level
Sigma_i = np.array((1, 0, 0, 1 - rho)).reshape(2,2)
values_i = np.random.multivariate_normal(np.zeros(2), Sigma_i, size = n)
# cluster level
cluster_name = np.repeat(np.arange(1, n_cluster+1), repeats = n / n_cluster)
Sigma_cl = np.array((1, 0, 0, rho)).reshape(2,2)
values_cl = np.random.multivariate_normal(np.zeros(2),Sigma_cl, size = n_cluster)
# predictor var consists of individual- and cluster-level components
x = values_i[: , 0] + np.repeat(values_cl[: , 0], repeats = n / n_cluster)
# error consists of individual- and cluster-level components
error = values_i[: , 1] + np.repeat(values_cl[: , 1], repeats = n / n_cluster)
# data generating process
y = param[0] + param[1]*x + error
df = pd.DataFrame({'x':x, 'y':y, 'cluster': cluster_name})
return df
def cluster_sim(param = (.1, .5), n = 1000, n_cluster = 50,
rho = .5, cluster_robust = False):
df = gen_cluster(param = param, n = n , n_cluster = n_cluster, rho = rho)
if not cluster_robust:
fit = sm.OLS.from_formula('y ~ x', data = df).fit()
else: # cluster-robust SE
fit = sm.OLS.from_formula('y ~ x', data = df).fit(cov_type='cluster', cov_kwds={'groups': df['cluster']})
b1 = fit.params[1]
Sigma = fit.cov_params()
se = np.sqrt(np.diag(Sigma)[1])
ci95 = se*1.96
b1_ci95 = (b1-ci95, b1+ci95)
return (b1, se, *b1_ci95)
def run_cluster_sim(n_sims = 1000, param = (.1, .5), n = 1000,
n_cluster = 50, rho = .5, cluster_robust = False):
res = [cluster_sim(param = param, n = n, rho = rho,
n_cluster = n_cluster,
cluster_robust = cluster_robust) for x in range(n_sims)]
df = pd.DataFrame(res)
df.columns = ('b1', 'se_b1', 'ci95_lower', 'ci95_upper')
df['param_caught'] = (df['ci95_lower'] <= param[1]) & (param[1] <= df['ci95_upper'])
df['id'] = df.index
return df
# Simulation clustered SE
sim_params = [.4, 0] # beta1 = 0: no effect of x on y
sim_nocluster = run_cluster_sim(n_sims=1000, param = sim_params, cluster_robust = True)
p.ggplot(sim_nocluster, p.aes('b1')) + p.geom_histogram(color = 'black') + p.geom_vline(xintercept = sim_params[1], color = 'red')
p.ggplot(sim_nocluster.sample(100).sort_values('b1'),
p.aes(x = 'factor(id)', y = 'b1',
ymin = 'ci95_lower', ymax = 'ci95_upper',
color = 'param_caught')) +\
p.geom_hline(yintercept = sim_params[1], linetype = 'dashed') +\
p.geom_pointrange() +\
p.labs(x = 'sim ID', y = 'b1', title = 'Randomly Chosen 100 95% CIs') +\
p.scale_color_discrete(name = 'True param value', labels = ('missed', 'hit')) +\
p.coord_flip()
1 - sum(sim_nocluster.param_caught)/sim_nocluster.shape[0]