motive: We are attempting to use a constant-scaling method to perform endpoint estimations for beta and (possibly) gamma distributions. 
Basic steps are as follows:
1. Simulate a bunch of points and take some upper proportion p (upper thirds, upper quartile, etc.)
2. Calculate the standard deviation of the sample
3. Figure out what constant $c * \hat{\sigma}$ added to the sample maximum would get close to 1
4. Repeat 1-3 and obtain average of $c$ s.
5. Test on different Beta distributions (and maybe Gamma distributions). Is there a general rule?

In [1]:
import numpy as np

In [2]:
#Start with one instance of Beta(2,5) distribution
rng = np.random.default_rng(seed=2024)
sample = rng.beta(2,5,1000) #sample 1000 points from distribution
cutoff = np.percentile(sample, 66.7) #find cutoff for top third
test_sample = sample[sample > cutoff] #get top third
sd = np.std(test_sample) #get standard deviation
c = (1-np.max(test_sample)) / sd #get constant
print(c)

1.4094303685479377


In [3]:
#Repeating steps 1-3 on the same Beta(2,5) distribution
for seed in np.array([2,34,567,2021,0]):
    rng = np.random.default_rng(seed=seed)
    sample = rng.beta(2,5,1000) #sample 1000 points from distribution
    cutoff = np.percentile(sample, 66.7) #find cutoff for top third
    test_sample = sample[sample > cutoff] #get top third
    sd = np.std(test_sample) #get standard deviation
    c = (1-np.max(test_sample)) / sd #get constant
    

In [4]:
cs = []
for seed in np.array([2,34,567,2021,0]):
    rng = np.random.default_rng(seed=seed)
    sample = rng.beta(2,5,1000) 
    cutoff = np.percentile(sample, 66.7) 
    test_sample = sample[sample > cutoff] 
    sd = np.std(test_sample) 
    c = (1-np.max(test_sample)) / sd 
    cs.append(c)
print(np.average(cs))

1.476260546408297


In [5]:
avg_cs = []
for sample_sizes in np.array([200,500,1000,5000]):
    cs = []
    for seed in np.array([2,34,567,2021,0]):
        rng = np.random.default_rng(seed=seed)
        sample = rng.beta(2,5,sample_sizes) 
        cutoff = np.percentile(sample, 66.7) 
        test_sample = sample[sample > cutoff] 
        sd = np.std(test_sample) 
        c = (1-np.max(test_sample)) / sd 
        cs.append(c)
    avg_cs.append(np.average(cs))
print(avg_cs)

[2.5413629722180695, 1.8449258635121517, 1.476260546408297, 1.2829521053560058]


In [6]:
parameter_cs = []
for b in np.linspace(0.5,5,10):
    sample_avg_cs = []
    for sample_sizes in np.array([200,500,1000,5000]):
        cs = []
        for seed in np.array([2,34,567,2021,0]):
            rng = np.random.default_rng(seed=seed)
            sample = rng.beta(2,b,sample_sizes) 
            cutoff = np.percentile(sample, 66.7) 
            test_sample = sample[sample > cutoff] 
            sd = np.std(test_sample) 
            c = (1-np.max(test_sample)) / sd 
            cs.append(c)
        sample_avg_cs.append(np.average(cs))
    parameter_cs.append(np.average(sample_avg_cs))
print(parameter_cs)

[0.0007143143689919895, 0.01182623376629679, 0.12740538018372408, 0.24249472650593012, 0.4345173669513564, 0.6256447408936548, 0.8664725621306946, 1.149091351673057, 1.4463983562538267, 1.786375371873631]


In [17]:
#I am paranoid now so I will generate different values of a and b first
parameter_pairs = []
for a in np.linspace(0.5,10):
    for b in np.linspace(0.5,10):
        parameter_pairs.append((a,b))

In [18]:
for pair in parameter_pairs:
    sample_avg_cs = []
    for sample_sizes in np.array([200,500,1000,5000]):
        cs = []
        for seed in np.array([2,34,567,2021,0]):
            rng = np.random.default_rng(seed=seed)
            sample = rng.beta(pair[0],pair[1],sample_sizes) 
            cutoff = np.median(sample)
            test_sample = np.argwhere(sample >= cutoff) #testing things with median, hopefully will change
            sd = np.std(test_sample) 
            c = (1-np.max(test_sample)) / sd 
            cs.append(c)
        sample_avg_cs.append(np.average(cs))
    parameter_cs.append(np.average(sample_avg_cs))

print(parameter_cs)

[0.0007143143689919895, 0.01182623376629679, 0.12740538018372408, 0.24249472650593012, 0.4345173669513564, 0.6256447408936548, 0.8664725621306946, 1.149091351673057, 1.4463983562538267, 1.786375371873631, 0.0, 0.00010934439604366393, 0.026312131362915946, 0.1622180143729139, 0.6820264956563069, 2.511935042480869, 4.2499903832276065, 5.246698586529939, 8.639947202602253, 10.698150932075313, 12.777865558624361, 11.485555960380129, 13.243909132274515, 15.177120931202928, 11.892969376058106, 13.420975385051303, 14.999205801482836, 18.473257293936822, 20.213649833300355, 22.333303520324076, 24.10736624534619, 26.655591780267283, 28.53259071727517, 23.70908926316697, 25.248384743821525, 26.42417366514838, 27.944660047563524, 29.50232171614448, 31.001786303082227, 32.53378313861203, 34.21246851330412, 35.821151951193734, 41.44543140073388, 43.15113625767836, 44.848614216309386, 46.5849107648548, 47.266707916117944, 48.96186818453168, 50.67986786129727, 52.33542013286196, 54.04438845397321, 55