In [1]:
import numpy as np
from scipy.stats import norm
import pandas as pd
import warnings

# Silence every FutureWarning for the rest of the notebook.
# NOTE(review): presumably this hides the pandas warning raised when rows are
# concatenated onto an empty DataFrame in the sample-size sweep — confirm.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seed NumPy's global random generator so that a top-to-bottom run of the
# notebook reproduces the same simulated draws.
np.random.seed(667)

In [2]:
# Inputs shared by the simulation cells below.
# NOTE(review): m, sigma and c are not explained anywhere in this notebook;
# presumably margin, per-participant revenue std. dev. and per-participant
# cost — confirm before relying on that reading.
num_tests = 200_000   # how many independent tests are simulated
N = 100_000           # participants per test
sigma = 75
m = 0.50
c = 0.14

In [3]:
# Two-point prior on the true ROI: H0 (0%) versus H1 (20%), equally likely
H0 = 0.0
H1 = 0.2
P_H0 = 0.5
P_H1 = 1 - P_H0

# Draw the true ROI of every simulated test from that prior
hypothesis_values = [H0, H1]
hypothesis_probs = [P_H0, P_H1]
ROIs = np.random.choice(hypothesis_values, size=num_tests, p=hypothesis_probs)

# Standard deviation of the ROI estimate. Despite the plural name this is a
# single scalar shared by all tests, since it only depends on m, sigma, c and N.
std_devs = m * sigma * 2**0.5 / (c * N**0.5)

print(std_devs)

1.1978935593748872


In [4]:
# Observed ROI of each test: its true ROI plus Gaussian noise with standard
# deviation std_devs (one draw per entry of ROIs)
ROI_ests = np.random.normal(loc=ROIs, scale=std_devs)

print(ROI_ests)

[ 0.59958236 -2.60876981  0.96632107 ...  1.23393386 -1.44816736
  0.59406705]


In [5]:
# Standardised distance of each estimate from the null value H0.
# std_devs is a known constant here, so these are z-scores even though the
# variable keeps the name t_stats.
t_stats = (ROI_ests - H0) / std_devs

# One-sided (upper-tail) p-values under the standard normal distribution.
# norm.sf is the survival function (1 - cdf) evaluated directly, which avoids
# the cancellation in `1 - norm.cdf(t)` that rounds small tail probabilities
# to exactly 0 for large t.
p_values = norm.sf(t_stats)

print(p_values)

[0.30835076 0.98528945 0.2099244  ... 0.15148473 0.88665479 0.30997317]


In [6]:
# Mask of the tests whose true ROI equals H1 (20%)
ROI_20 = (ROIs == H1)

# Share of those tests rejected at the one-sided 5% level, i.e. the empirical
# power of the test: P(p-value < 0.05 | ROI = H1)
prob = np.mean(p_values[ROI_20] < 0.05)

print(prob)

0.06816705475550404


# Let's try increasing the number of participants in our tests

In [7]:
# Given parameters
m = 0.50
sigma = 75
c = 0.14
num_tests = 200000

# Hypotheses
H0 = 0.0
H1 = 0.2
P_H0 = 0.5
P_H1 = 1 - P_H0

# N values
N_values = [100000, 500000, 1000000, 2000000, 3000000]

# One summary record per sample size. The DataFrame is built once after the
# loop: growing a frame with pd.concat on every iteration copies all previous
# rows each time and relies on deprecated empty-frame concatenation behaviour.
records = []

# For each N value
for N in N_values:

    # Simulate true ROIs for each test based on hypothesis probabilities
    ROIs = np.random.choice([H0, H1], size=num_tests, p=[P_H0, P_H1])

    # Standard deviation of the ROI estimate (a scalar; shrinks as 1/sqrt(N))
    std_devs = m * sigma * 2**0.5 / (c * N**0.5)

    # Simulate ROI_est for each test
    ROI_ests = np.random.normal(ROIs, std_devs)

    # Standardised distance from H0 (z-scores, since std_devs is known)
    t_stats = (ROI_ests - H0) / std_devs

    # One-sided upper-tail p-values; norm.sf avoids the precision loss of
    # 1 - norm.cdf(t) for large t
    p_values = norm.sf(t_stats)

    # Find the occurrences where ROI = 20%
    ROI_20 = ROIs == H1

    # Empirical power: P(p-value < 0.05 | ROI = H1)
    prob = np.sum(p_values[ROI_20] < 0.05) / np.sum(ROI_20)

    # Compute total cost
    total_cost = N * c

    records.append({'N': N, 'std_dev': std_devs, 'prob': prob, 'total_cost': total_cost})

# Explicit column list keeps the column order fixed and the frame well-formed
# even if N_values were empty
df = pd.DataFrame(records, columns=['N', 'std_dev', 'prob', 'total_cost'])

df

Unnamed: 0,N,std_dev,prob,total_cost
0,100000,1.197894,0.068979,14000.0
1,500000,0.535714,0.101879,70000.0
2,1000000,0.378807,0.134863,140000.0
3,2000000,0.267857,0.186078,280000.0
4,3000000,0.218704,0.232642,420000.0


In [8]:
# Now assume H0: ROI = -100%, and that under H1 the ROI is uniformly distributed
# on the [-100%, 200%] interval, with Prob(H0) = 0.5

In [9]:
# Inputs for the uniform-alternative scenario
num_tests = 200_000
N = 100_000
m = 0.30
sigma = 75
c = 0.14

# Null hypothesis: ROI = -100%, with prior probability 0.5
H0 = -1.0
P_H0 = 0.5

# True ROI per test: H0 with probability P_H0, otherwise a draw from the
# uniform distribution on [-1, 2]. The null indicator is drawn first, then a
# uniform value for every test (only the non-null ones end up being used).
is_null = np.random.rand(num_tests) < P_H0
alternative_draws = np.random.uniform(-1, 2, num_tests)
ROIs = np.where(is_null, H0, alternative_draws)

# Standard deviation of the ROI estimate (a single scalar for all tests)
std_devs = m * sigma * 2**0.5 / (c * N**0.5)

# Observed ROI of each test: true ROI plus Gaussian noise
ROI_ests = np.random.normal(loc=ROIs, scale=std_devs)

# Show the first few estimates as a sanity check
print(ROI_ests[:10])


[ 0.50952025 -0.94451875 -1.69942794 -0.27425999  0.68917242  0.0796217
 -1.41156126  0.07803269 -2.14311174 -1.59901345]


In [10]:
# Mask of the tests whose estimated ROI lies strictly between 24% and 26%
ROI_est_25 = (0.24 < ROI_ests) & (ROI_ests < 0.26)

# Among those tests, the share whose true ROI exceeds 25%:
# P(ROI > 0.25 | 0.24 < ROI_est < 0.26)
prob = np.mean(ROIs[ROI_est_25] > 0.25)

print(prob)


0.37840785169029445


In [14]:
# At what estimate can we be reasonably sure (probability ~0.9) that ROI > 25%?

# Mask of the tests whose estimated ROI lies strictly between 125% and 126%.
# NOTE(review): this window is (1.25, 1.26), not centred on 1.25 the way the
# (0.24, 0.26) window is centred on 0.25 — confirm this is intended.
ROI_est_125 = (1.25 < ROI_ests) & (ROI_ests < 1.26)

# Among those tests, the share whose true ROI exceeds 25%:
# P(ROI > 0.25 | 1.25 < ROI_est < 1.26)
prob = np.mean(ROIs[ROI_est_125] > 0.25)

print(prob)


0.89568345323741


In [15]:
# Go back to finding the occurrences where ROI_est is between 24% and 26% but increase the sample size


In [16]:
# Inputs for the uniform-alternative scenario
num_tests = 200_000
m = 0.30
sigma = 75
c = 0.14

# Null hypothesis: ROI = -100%, with prior probability 0.5
H0 = -1.0
P_H0 = 0.5

# Sample sizes to sweep over
N_values = [100_000, 500_000, 1_000_000, 2_000_000, 8_000_000]

# Part of the standard-deviation formula that does not depend on N
noise_numerator = m * sigma * 2**0.5

for N in N_values:

    # True ROI per test: H0 with probability P_H0, otherwise uniform on [-1, 2].
    # The null indicator is drawn before the uniform values.
    is_null = np.random.rand(num_tests) < P_H0
    alternative_draws = np.random.uniform(-1, 2, num_tests)
    ROIs = np.where(is_null, H0, alternative_draws)

    # Standard deviation of the ROI estimate at this sample size
    std_devs = noise_numerator / (c * N**0.5)

    # Observed ROI of each test: true ROI plus Gaussian noise
    ROI_ests = np.random.normal(loc=ROIs, scale=std_devs)

    # Mask of the tests whose estimated ROI lies strictly between 24% and 26%
    ROI_est_25 = (0.24 < ROI_ests) & (ROI_ests < 0.26)

    # Among those tests, the share whose true ROI exceeds 20%.
    # NOTE(review): the threshold here is 0.20, whereas the single-N cell that
    # uses the same 24%-26% window conditions on ROI > 0.25 — confirm which
    # threshold is intended before comparing the two results.
    prob = np.mean(ROIs[ROI_est_25] > 0.20)

    print(f"For N={N}, probability: {prob}")


For N=100000, probability: 0.43625730994152045
For N=500000, probability: 0.5457227138643068
For N=1000000, probability: 0.5643879173290938
For N=2000000, probability: 0.6523031203566122
For N=8000000, probability: 0.7229832572298326
