In [2]:
# using Random
# using BayesianExperiments
using ProgressMeter: @showprogress
using DataFrames

using Random
using Revise

using BayesianExperiments
Revise.track(BayesianExperiments)

# number of columns in a dataframe to show 
ENV["COLUMNS"] = 200

┌ Info: Precompiling BayesianExperiments [0200d6b2-355c-4cc6-8967-0ef505c43a9c]
└ @ Base loading.jl:1278


200

We will use the `NormalEffectSize` model in the package. You can check the definition of `NormalEffectSize` model from the docstring by typing `?NormalEffectSize`.

In [4]:
function simulate(δ, n, σ0; r=0.707, thresh=9, minsample=20)
    i = 0
    rule = TwoSidedBFThresh(thresh)
    model = StudentTEffectSize(r=r)
    experiment = ExperimentBF(model=model, rule=rule)
    xs = rand(Normal(δ, 1), n)
    while (i < n) & (experiment.winner === nothing)
        i += 1
        if i < minsample
            continue
        end
        stats = NormalStatistics(xs[1:i])
        experiment.stats = stats
        decide!(experiment)
    end
    experiment
end


simulate (generic function with 1 method)

## Case when alternative $\delta = 0$

When alternative $\delta > 0$, the error rate relates to the false positive rate. 

In [6]:
#deltas = collect(range(0, 1.5, step=0.2));
delta = 0.0
rs = [0.707, 1.0, 1.414];
threshs = [3, 5, 7, 10];
totalnum = length(rs)*length(threshs);

paramsgrid = reshape(collect(Base.Iterators.product(rs, threshs)), (totalnum, 1));
paramsgrid = [(r=r, thresh=thresh) for (r, thresh) in paramsgrid];
@show length(paramsgrid);

length(paramsgrid) = 12


In [7]:
n =  1000
ns = 5000
minsample = 20

sim_result1 = DataFrame(
    delta=Float64[], 
    r=Float64[], 
    thresh=Float64[], 
    num_sim=Int64[], 
    num_null=Int64[], 
    num_alt=Int64[],
    err_rate=Float64[], 
    avg_sample_size=Int64[])

@showprogress for params in paramsgrid
    delta = 0
    r = params.r
    thresh = params.thresh
    winners = []
    samplesizes = []
    for _ in 1:ns
        experiment = simulate(delta, n, r, thresh=thresh, minsample=minsample)
        push!(winners, experiment.winner)
        push!(samplesizes, experiment.stats.n)
    end
    
    num_null = sum(winners .== "null")
    num_alt = sum(winners .== "alternative")
    
    err_rate = num_alt/ns
    avg_sample_size = mean(samplesizes)
    push!(sim_result1, (delta, r, thresh, ns, num_null, num_alt, err_rate, convert(Int64, round(avg_sample_size))))
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:58[39m


In [8]:
sim_result1

Unnamed: 0_level_0,delta,r,thresh,num_sim,num_null,num_alt,err_rate,avg_sample_size
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Int64,Int64,Float64,Int64
1,0.0,0.707,3.0,5000,4679,321,0.0642,24
2,0.0,1.0,3.0,5000,4692,308,0.0616,23
3,0.0,1.414,3.0,5000,4698,302,0.0604,24
4,0.0,0.707,5.0,5000,4718,282,0.0564,48
5,0.0,1.0,5.0,5000,4707,293,0.0586,49
6,0.0,1.414,5.0,5000,4734,262,0.0524,49
7,0.0,0.707,7.0,5000,4771,218,0.0436,101
8,0.0,1.0,7.0,5000,4765,228,0.0456,99
9,0.0,1.414,7.0,5000,4744,250,0.05,102
10,0.0,0.707,10.0,5000,4767,165,0.033,209


## Case when alternative $\delta > 0$

We create a grid of combinations of all parameters.

In [24]:
deltas = collect(range(0.1, 1.0, step=0.2));
rs = [0.707, 1.0, 1.414];
threshs = [3, 5, 7, 10];
totalnum = length(deltas)*length(rs)*length(threshs);

paramsgrid = reshape(collect(Base.Iterators.product(deltas, rs, threshs)), (totalnum, 1));
paramsgrid = [(delta=delta, r=r, thresh=thresh) for (delta, r, thresh) in paramsgrid]
@show length(paramsgrid);
@show paramsgrid[1:5];

length(paramsgrid) = 60
paramsgrid[1:5] = NamedTuple{(:delta, :r, :thresh),Tuple{Float64,Float64,Int64}}[(delta = 0.1, r = 0.707, thresh = 3), (delta = 0.3, r = 0.707, thresh = 3), (delta = 0.5, r = 0.707, thresh = 3), (delta = 0.7, r = 0.707, thresh = 3), (delta = 0.9, r = 0.707, thresh = 3)]


The simulation is similar to the $\delta=0$ case. When alternative $\delta > 0$, the error rate relates to the false negative evidence.

In [10]:
n =  1000
ns = 5000
minsample = 20

sim_result2 = DataFrame(
    delta=Float64[], 
    r=Float64[], 
    thresh=Float64[], 
    num_sim=Int64[], 
    num_null=Int64[], 
    num_alt=Int64[],
    err_rate=Float64[], 
    avg_sample_size=Int64[])

@showprogress for params in paramsgrid
    delta=params.delta
    r = params.r
    thresh = params.thresh
    winners = []
    samplesizes = []
    for _ in 1:ns
        experiment = simulate(delta, n, r, thresh=thresh, minsample=minsample)
        push!(winners, experiment.winner)
        push!(samplesizes, experiment.stats.n)
    end
    
    num_null = sum(winners .== "null")
    num_alt = sum(winners .== "alternative")
    err_rate = 1-num_alt/ns
    avg_sample_size = mean(samplesizes)
    push!(sim_result2, (delta, r, thresh, ns, num_null, num_alt, 
            err_rate, convert(Int64, round(avg_sample_size))))
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:03:01[39m


Simulation result when $\delta=0.5$

In [13]:
sim_result2 |>
    df -> filter(x->x.delta==0.5, df)|>
    df -> sort(df, [:delta, :r])

Unnamed: 0_level_0,delta,r,thresh,num_sim,num_null,num_alt,err_rate,avg_sample_size
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Int64,Int64,Float64,Int64
1,0.5,0.707,3.0,5000,723,4277,0.1446,25
2,0.5,0.707,5.0,5000,81,4919,0.0162,33
3,0.5,0.707,7.0,5000,0,5000,0.0,37
4,0.5,0.707,10.0,5000,0,5000,0.0,39
5,0.5,1.0,3.0,5000,723,4277,0.1446,26
6,0.5,1.0,5.0,5000,83,4917,0.0166,33
7,0.5,1.0,7.0,5000,1,4999,0.0002,37
8,0.5,1.0,10.0,5000,0,5000,0.0,40
9,0.5,1.414,3.0,5000,716,4284,0.1432,26
10,0.5,1.414,5.0,5000,100,4900,0.02,33


## Evaluate the simulation result with Type I, Type I Error and FDR

As pointed out by [2], we can evaluate the simulation result from the perspective of false discovery rate. Here we assume there is a 50-50 chance that the data is from either the null model or alternative model. 

We can merge the two simulations results by the prior standard deviation $r$ and threshold of bayes factor. In the merged dataframe, each row represents a simulation with the 5000 samples from the null model and 5000 samples from the alternative model with the corresponding parameters ($r$, $thresh$, $\delta_1$).

In [38]:
sim_result = leftjoin(sim_result1, sim_result2, 
    on=[:r, :thresh, :num_sim],
    renamecols= "_0" => "_1"
);

sim_result.num_dis = sim_result.num_alt_0 + sim_result.num_alt_1;
sim_result.num_false_dis = sim_result.num_alt_0;
sim_result.fdr = sim_result.num_false_dis ./ sim_result.num_dis;

sim_result.type1_error = sim_result.num_alt_0 ./ sim_result.num_sim;
#sim_result.type2_error = 1 .- sim_result.num_alt_1 ./ sim_result.num_sim;
sim_result.power = sim_result.num_alt_1 ./ sim_result.num_sim;

sim_result = sim_result |>
    df -> select(df, [:delta_1, :r, :thresh, :num_sim, :num_null_0, :num_alt_0, 
        :num_null_1, :num_alt_1, :type1_error, :power, :fdr]);

In [39]:
sim_result.num_dis = sim_result.num_alt_0 + sim_result.num_alt_1;
sim_result.num_false_dis = sim_result.num_alt_0;
sim_result.fdr = sim_result.num_false_dis ./ sim_result.num_dis;

sim_result.type1_error = sim_result.num_alt_0 ./ sim_result.num_sim;
sim_result.type2_error = 1 .- sim_result.num_alt_1 ./ sim_result.num_sim;

In [40]:
sim_result = sim_result |>
    df -> select(df, [:delta_1, :r, :thresh, :num_sim, :num_null_0, :num_alt_0, 
        :num_null_1, :num_alt_1, :type1_error, :power, :fdr]);

Examples from merged dataframe:

In [41]:
sim_result |>
    df -> filter(
        x -> ((x.delta_1 == 0.1) .& (x.r == 0.707)) .|
             ((x.delta_1 == 0.1) .& (x.r == 1.0)) .|
             ((x.delta_1 == 0.3) .& (x.r == 1.0))
            , df) |>
    df -> sort(df, [:delta_1, :r, :thresh])

Unnamed: 0_level_0,delta_1,r,thresh,num_sim,num_null_0,num_alt_0,num_null_1,num_alt_1,type1_error,power,fdr
Unnamed: 0_level_1,Float64?,Float64,Float64,Int64,Int64,Int64,Int64?,Int64?,Float64,Float64,Float64
1,0.1,0.707,3.0,5000,4679,321,4470,530,0.0642,0.106,0.377203
2,0.1,0.707,5.0,5000,4718,282,4187,813,0.0564,0.1626,0.257534
3,0.1,0.707,7.0,5000,4771,218,3559,1441,0.0436,0.2882,0.131404
4,0.1,0.707,10.0,5000,4767,165,2533,2467,0.033,0.4934,0.06269
5,0.1,1.0,3.0,5000,4692,308,4454,546,0.0616,0.1092,0.360656
6,0.1,1.0,5.0,5000,4707,293,4169,831,0.0586,0.1662,0.260676
7,0.1,1.0,7.0,5000,4765,228,3617,1383,0.0456,0.2766,0.141527
8,0.1,1.0,10.0,5000,4726,206,2506,2494,0.0412,0.4988,0.0762963
9,0.3,1.0,3.0,5000,4692,308,2547,2453,0.0616,0.4906,0.111554
10,0.3,1.0,5.0,5000,4707,293,1125,3875,0.0586,0.775,0.0702975


## References

1. Schönbrodt, Felix D., Eric-Jan Wagenmakers, Michael Zehetleitner, and Marco Perugini. "Sequential hypothesis testing with Bayes factors: Efficiently testing mean differences." Psychological methods 22, no. 2 (2017): 322.
2. Deng, Alex, Jiannan Lu, and Shouyuan Chen. "Continuous monitoring of A/B tests without pain: Optional stopping in Bayesian testing." In 2016 IEEE international conference on data science and advanced analytics (DSAA), pp. 243-252. IEEE, 2016.
3. Rouder, Jeffrey N. "Optional stopping: No problem for Bayesians." Psychonomic bulletin & review 21, no. 2 (2014): 301-308.