Motivation: We use a constant-scaling method to estimate the upper endpoint of beta (and possibly gamma) distributions.
Basic steps are as follows:
1. Simulate a bunch of points and take some upper proportion p (upper thirds, upper quartile, etc.)
2. Calculate the standard deviation $\hat{\sigma}$ of that upper portion of the sample
3. Find the constant $c$ such that the sample maximum plus $c \cdot \hat{\sigma}$ equals 1 (the true upper endpoint of a Beta distribution)
4. Repeat steps 1-3 and take the average of the resulting values of $c$.
5. Test on different Beta distributions (and maybe Gamma distributions). Is there a general rule?

In [1]:
import numpy as np

In [2]:
# Single trial on Beta(2, 5): express the gap between the sample maximum and
# the upper endpoint (1) in units of the upper-tail standard deviation.
rng = np.random.default_rng(2024)
sample = rng.beta(a=2, b=5, size=1000)  # 1000 draws from Beta(2, 5)
cutoff = np.percentile(sample, 66.7)  # 66.7th percentile = threshold for the top third
test_sample = sample[sample > cutoff]  # keep only draws strictly above the threshold
sd = test_sample.std()  # NumPy default ddof=0 (population SD)
c = (1 - test_sample.max()) / sd  # the constant: (1 - max) / sd
print(c)

1.4094303685479377


In [5]:
# Repeat steps 1-3 on the same Beta(2,5) distribution, once per seed, and
# report the constant c obtained in each trial.
for seed in np.array([2,34,567,2021,0]):
    rng = np.random.default_rng(seed=seed)
    sample = rng.beta(2,5,1000) #sample 1000 points from distribution
    cutoff = np.percentile(sample, 66.7) #find cutoff for top third
    test_sample = sample[sample > cutoff] #get top third
    sd = np.std(test_sample) #get standard deviation (NumPy default ddof=0)
    c = (1-np.max(test_sample)) / sd #get constant
    print(c) #one value per seed; without this the cell produces no output

1.7387708266295254
1.5347307766516332
1.368164842801356
1.4332018829413757
1.3064344030175945


In [7]:
# Average the constant c over five seeded Beta(2, 5) trials of 1000 draws each.
cs = []
for seed in np.array([2, 34, 567, 2021, 0]):
    rng = np.random.default_rng(seed)
    sample = rng.beta(a=2, b=5, size=1000)
    cutoff = np.percentile(sample, 66.7)  # threshold for the top third
    test_sample = sample[sample > cutoff]  # draws strictly above the threshold
    sd = test_sample.std()  # ddof=0
    c = (1 - test_sample.max()) / sd
    cs.append(c)
print(np.average(cs))

1.476260546408297


In [10]:
# For each sample size, average the constant c over the five seeds.
# avg_cs ends up with one entry per sample size, in the order listed.
avg_cs = []
for sample_sizes in np.array([200, 500, 1000, 5000]):
    cs = []  # per-seed constants for the current sample size
    for seed in np.array([2, 34, 567, 2021, 0]):
        rng = np.random.default_rng(seed)
        sample = rng.beta(a=2, b=5, size=sample_sizes)
        cutoff = np.percentile(sample, 66.7)  # threshold for the top third
        test_sample = sample[sample > cutoff]
        sd = test_sample.std()  # ddof=0
        c = (1 - test_sample.max()) / sd
        cs.append(c)
    avg_cs.append(np.average(cs))
print(avg_cs)

[2.5413629722180695, 1.8449258635121517, 1.476260546408297, 1.2829521053560058]


In [12]:
# Sweep the second shape parameter b of Beta(2, b) over ten evenly spaced
# values from 0.5 to 5. For each b, c is averaged over the five seeds for every
# sample size, and those per-size averages are then averaged again, so
# parameter_cs holds one value per b.
parameter_cs = []
for b in np.linspace(0.5, 5, 10):
    sample_avg_cs = []  # one seed-averaged c per sample size
    for sample_sizes in np.array([200, 500, 1000, 5000]):
        cs = []  # per-seed constants for this (b, sample size) pair
        for seed in np.array([2, 34, 567, 2021, 0]):
            rng = np.random.default_rng(seed)
            sample = rng.beta(a=2, b=b, size=sample_sizes)
            cutoff = np.percentile(sample, 66.7)  # threshold for the top third
            test_sample = sample[sample > cutoff]
            sd = test_sample.std()  # ddof=0
            c = (1 - test_sample.max()) / sd
            cs.append(c)
        sample_avg_cs.append(np.average(cs))
    parameter_cs.append(np.average(sample_avg_cs))
print(parameter_cs)

[0.0007143143689919895, 0.01182623376629679, 0.12740538018372408, 0.24249472650593012, 0.4345173669513564, 0.6256447408936548, 0.8664725621306946, 1.149091351673057, 1.4463983562538267, 1.786375371873631]
