In [9]:
import spotify_confidence as conf
import pandas as pd

# Using the SampleSizeCalculator class 

### Two success metrics 

In [22]:
# Two success metrics: each row carries a relative MDE and no non-inferiority margin.
df = pd.DataFrame(
    data=[
        ["share_bananas_1d", True, 0.7, 0.21, 0.00617, None, "increase"],
        ["bananas_per_user_7d", False, 4.56, 2.13, 0.01, None, "increase"],
    ],
    columns=["metric_name", "binary", "avg", "var", "mde", "nim", "preference"],
)

# Three weights, i.e. three groups; the accompanying text assumes the two
# treatment groups are each compared against control.
treatment_weights = [5000, 2000, 3000]

# Tell the calculator which columns of `df` hold which inputs.
ssc = conf.SampleSizeCalculator(
    data_frame=df,
    metric_column="metric_name",
    is_binary_column="binary",
    point_estimate_column="avg",
    var_column="var",
    power=0.8,
    interval_size=0.99,
    correction_method="bonferroni",
)

ss = ssc.sample_size(
    preferred_direction_column="preference",
    mde_column="mde",
    nim_column="nim",
    treatment_weights=treatment_weights,
)

# Display only the columns discussed in the text that follows.
ss[
    [
        "metric_name",
        "adjusted_alpha_power_sample_size",
        "null_hypothesis",
        "alternative_hypothesis",
        "required_sample_size_for_metric",
    ]
]

Unnamed: 0,metric_name,adjusted_alpha_power_sample_size,null_hypothesis,alternative_hypothesis,required_sample_size_for_metric
0,share_bananas_1d,0.0025,0.0,0.004319,1042871.0
1,bananas_per_user_7d,0.0025,0.0,0.0456,95460.0


The alpha has been divided by 4 because we have two metrics times two comparisons (based on the three values in `treatment_weights` and the assumption that we want to compare two treatment groups to control). The mde (minimum detectable effect size) is on a relative scale, so the alternative hypothesis is `diff = avg * mde`.

### One success metric, one guardrail (with non-inferiority margin) 

In [26]:
# One guardrail metric (only `nim` set) and one success metric (only `mde` set).
df = pd.DataFrame(
    data=[
        ["share_bananas_1d", True, 0.7, 0.21, None, 0.00617, "increase"],
        ["bananas_per_user_7d", False, 4.56, 2.13, 0.01, None, "increase"],
    ],
    columns=["metric_name", "binary", "avg", "var", "mde", "nim", "preference"],
)

# Two weights, i.e. two groups this time.
treatment_weights = [5000, 2000]

# Tell the calculator which columns of `df` hold which inputs.
ssc = conf.SampleSizeCalculator(
    data_frame=df,
    metric_column="metric_name",
    is_binary_column="binary",
    point_estimate_column="avg",
    var_column="var",
    power=0.8,
    interval_size=0.99,
    correction_method="bonferroni",
)

ss = ssc.sample_size(
    preferred_direction_column="preference",
    mde_column="mde",
    nim_column="nim",
    treatment_weights=treatment_weights,
)

# Display only the columns discussed in the text that follows.
ss[
    [
        "metric_name",
        "adjusted_alpha_power_sample_size",
        "null_hypothesis",
        "alternative_hypothesis",
        "required_sample_size_for_metric",
    ]
]

Unnamed: 0,metric_name,adjusted_alpha_power_sample_size,null_hypothesis,alternative_hypothesis,required_sample_size_for_metric
0,share_bananas_1d,0.005,-0.004319,0.0,644252.0
1,bananas_per_user_7d,0.005,0.0,0.0456,58622.0


For the guardrail metric the null hypothesis is not 0, but rather `-avg * nim`. The sign is negative because our preferred direction for the metric is "increase", so we want to guard against a decrease.

# Interactive sample size calculation

There's also an interactive sample size calculator you can use to see how required sample sizes change when you change the input parameters.

In [2]:
# Launch the interactive sample size calculator; the saved output shows it
# rendering as a widget (VBox). Presumably the variant for binary metrics,
# going by the method name -- confirm against the library docs.
conf.SampleSize.binomial_interactive()

VBox(children=(HTML(value='<h4>Target metric</h4>'), VBox(children=(HTML(value='<strong>Minimal Detectable Eff…

Output()

In [3]:
# Launch the interactive sample size calculator; the saved output shows it
# rendering as a widget (VBox). Presumably the variant for continuous metrics,
# going by the method name -- confirm against the library docs.
conf.SampleSize.continuous_interactive()

VBox(children=(HTML(value='<h4>Target metric</h4>'), VBox(children=(HTML(value='<strong>Minimal Detectable Eff…

Output()

In [4]:
# Non-interactive call with explicit inputs. The displayed result is a 3-tuple;
# judging by the saved output it looks like (total sample size, per-group
# sizes, allocation proportions) -- TODO confirm against the library docs.
conf.SampleSize.binomial(
    # effect size and baseline
    baseline_proportion=0.1,
    absolute_percentage_mde=0.01,
    # error rates
    alpha=0.05,
    power=0.85,
    # experiment design
    treatments=2,
    treatment_allocations=[0.9, 0.1],
    comparisons="control_vs_all",
    bonferroni_correction=False,
)

(89784, array([80806,  8979]), array([0.9, 0.1]))

In [5]:
# Non-interactive call with explicit inputs. The displayed result is a 3-tuple;
# judging by the saved output it looks like (total sample size, per-group
# sizes, allocation proportions) -- TODO confirm against the library docs.
conf.SampleSize.continuous(
    # effect size and baseline
    baseline_variance=250,
    average_absolute_mde=0.5,
    # error rates
    alpha=0.05,
    power=0.85,
    # experiment design
    treatments=2,
    treatment_allocations=[0.9, 0.1],
    comparisons="control_vs_all",
    bonferroni_correction=False,
)

(99760, array([89784,  9976]), array([0.9, 0.1]))