motive: We are attempting to use a constant-scaling method to perform endpoint estimations for beta and (possibly) gamma distributions. 
Basic steps are as follows:
1. Simulate a bunch of points and take some upper proportion p (upper thirds, upper quartile, etc.)
2. Calculate the standard deviation of the sample
3. Figure out what constant $c * \hat{\sigma}$ added to the sample maximum would get close to 1
4. Repeat 1-3 and obtain average of $c$ s.
5. Test on different Beta distributions (and maybe Gamma distributions). Is there a general rule?

In [1]:
#Import libraries here
import numpy as np

In [2]:
#Start with one instance of Beta(2,5) distribution
rng = np.random.default_rng(seed=2024)
sample = rng.beta(2,5,1000) #sample 1000 points from distribution
cutoff = np.percentile(sample, 66.7) #find cutoff for top third
test_sample = sample[sample > cutoff] #get top third
sd = np.std(test_sample) #get standard deviation
c = (1-np.max(test_sample)) / sd #get constant
print(c)

1.4094303685479377


In [3]:
#Use different seeds (obtain different samples) to get average value of c
cs = []
for seed in [1356,1023,1748,489,1265,31,1776,888,1901,1437,619,1975,509,1342,745,1623,1288,1755,94,1389]: #seed numbers denoting different sampling schemes 
    #(I would collect 20 different random samples)
    rng = np.random.default_rng(seed)#repeat of method above:
    sample = rng.beta(2,5,1000)
    cutoff = np.percentile(sample, 66.7) 
    test_sample = sample[sample > cutoff]
    sd = np.std(test_sample)
    c = (1-np.max(test_sample)) / sd
    cs.append(c)
print(np.average(cs)) #Take the average of all n constants generated by different sampling schemes

1.4923027364652388


In [4]:
#Test for variation in different sample sizes
cs = []
for size in [200,500,1000,5000,10000,20000,50000,int(1e5),int(1e6)]:
    for seed in [1356,1023,1748,489,1265,31,1776,888,1901,1437,619,1975,509,1342,745,1623,1288,1755,94,1389]:
        rng = np.random.default_rng(seed)
        sample = rng.beta(2,5,size)
        cutoff = np.percentile(sample, 66.7) 
        test_sample = sample[sample > cutoff]
        sd = np.std(test_sample)
        c = (1-np.max(test_sample)) / sd
        cs.append(c)
    print(np.average(cs)) #c changes for some reason, but overall decreasing trend (tends to 1)

2.50212143430033
2.3043426925835746
2.033662707210796
1.8216338777491488
1.6590379454747983
1.5263353050848971
1.4080705812165621
1.3101256252370712
1.2128665765525808


In [5]:
#Test and graph for different alpha and beta parameters
cs = []
avg_cs = []
c1s = []
for a in [1,2,3,4,5,6,7,8,9,10]:
    for b in [1,2,3,4,5,6,7,8,9,10]: #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000,int(1e5)]:
            for seed in [1356,1023,1748,489,1265,31,1776,888,1901,1437,619,1975,509,1342,745,1623,1288,1755,94,1389]: #generate 20 different random samples based on list of schemes
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 20 random samples of n points
                cutoff = np.percentile(sample, 66.7) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #get standard deviation
                c = (1-np.max(test_sample)) / sd #calculate constant c
                cs.append(c)
            avg_c = np.average(cs) #calculate average constant c for sample size n
            avg_cs.append(avg_c)
        c1 = np.average(avg_cs) #calculate average constant c for Beta distribution with alpha = a and beta = b
        c1s.append((a,b,c1))        

In [5]:
import plotly.graph_objects as go

def graph_objects(arr):
    x, y, z = zip(*arr)
    z = list(map(float, z))
    fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,
                                       mode='markers', 
                                       marker=dict(
                                       size=12,
                                       color=z,                # set color to an array/list of desired values
                                       colorscale='Viridis',   # choose a colorscale
                                       opacity=0.8
        ))])
    fig.update_layout(scene = dict(
                        xaxis_title='alpha',
                        yaxis_title='beta',
                        zaxis_title='scaling constant'),
                        margin=dict(r=20, b=10, l=10, t=10))
    fig.show()

In [7]:
graph_objects(c1s)

The constant increases very quickly as alpha increases and quickly as beta increases. both plateau off to a value close to 1.8. 

In [8]:
#Adjust sample cutoff by skew again?
cs = []
avg_cs = []
c2s = []
for a in [1,2,3,4,5,6,7,8,9,10]:
    for b in [1,2,3,4,5,6,7,8,9,10]: #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000,int(1e5)]: #sample size n 
            for seed in [1356,1023,1748,489,1265,31,1776,888,1901,1437,619,1975,509,1342,745,1623,1288,1755,94,1389]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 20 random samples of specific size from seed(scheme)
                skew = (2*(b-a)*np.sqrt(a+b+1))/((a+b+2)*np.sqrt(a*b))
                cutoff = np.percentile(sample, 66.7+skew) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c2 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c2s.append((a,b,c2)) 

In [9]:
#graph new constant generation method
graph_objects(c2s)

There's no change this time?

In [7]:
#Adjust sample cutoff?
cs = []
avg_cs = []
c2s = []
for a in np.linspace(0.1,5,100):
    for b in np.linspace(0.1,5,100): #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000]: #sample size n 
            for seed in [1356,489,1265,31,1776,888,619,1975,509,94]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 10 random samples of specific size from seed(scheme)
                cutoff = np.percentile(sample, 90) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c2 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c2s.append((a,b,c2)) 

In [8]:
graph_objects(c2s)

In [12]:
#behavior of cs when cutoff is consistently less than 99%:
#Adjust sample cutoff by skew again?
cs = []
avg_cs = []
c2s = []
for a in np.linspace(0.1,20):
    for b in np.linspace(0.1,20): #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000]: #sample size n 
            for seed in [1356,489,1265,31,1776,888,619,1975,509,94]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 10 random samples of specific size from seed(scheme)
                cutoff = np.percentile(sample, 95) #select cutoff
                test_sample = np.argwhere(sample >= cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c2 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c2s.append((a,b,c2)) 

In [13]:
graph_objects(c2s)

Changing cutoffs seems to mostly shift the values down. Though the slope of the graph gets smaller and smaller as p approaches the 99th percentile, the slope increases drastically at the 99th percentile and above. It is most likely caused by a very small resulting sample size. (Will test this theory later) We will move forward with the new cutoff at the 90th percentile.

In [16]:
#Adjust sample cutoff by skew?
#Add/subtract by skew
cs = []
avg_cs = []
c3s = []
for a in np.linspace(0.1,10,75):
    for b in np.linspace(0.1,10,75): #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000]: #sample size n 
            for seed in [1356,489,1265,31,1776,888,619,1975,509,94]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 10 random samples of specific size from seed(scheme)
                skew = (2*(b-a)*np.sqrt(a+b+1))/((a+b+2)*np.sqrt(a*b))
                cutoff = np.percentile(sample, 90+skew) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c3 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c3s.append((a,b,c3))

In [17]:
graph_objects(c3s)

We see the curve of the graph slightly flattening, but the curve still exists.

In [18]:
#correcting factor using skew as exponent:
cs = []
avg_cs = []
c3s = []
for a in np.linspace(0.1,10,75):
    for b in np.linspace(0.1,10,75): #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000]: #sample size n 
            for seed in [1356,489,1265,31,1776,888,619,1975,509,94]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 10 random samples of specific size from seed(scheme)
                skew = (2*(b-a)*np.sqrt(a+b+1))/((a+b+2)*np.sqrt(a*b))
                cutoff = np.percentile(sample, 90*(1.01**skew)) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c3 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c3s.append((a,b,c3))

In [19]:
graph_objects(c3s) #curve got curvier, moving on

In [22]:
#Change value of standard deviation?
cs = []
avg_cs = []
c4s = []
for a in np.linspace(0.1,10,75):
    for b in np.linspace(0.1,10,75): #test for a variety of alphas and betas
        for size in [200,500,1000,5000,10000]: #sample size n 
            for seed in [1356,489,1265,31,1776,888,619,1975,509,94]: #seed list denoting random samples
                rng = np.random.default_rng(seed)
                sample = rng.beta(a,b,size) #generate 10 random samples of specific size from seed(scheme)
                skew = (2*(b-a)*np.sqrt(a+b+1))/((a+b+2)*np.sqrt(a*b))
                cutoff = np.percentile(sample, 90) #select cutoff
                test_sample = np.argwhere(sample > cutoff) #select top (1-cutoff) percentile points
                sd = np.std(test_sample) - skew * 0.75 #calculate standard deviation
                c = (1-np.max(test_sample))/sd #obtain c
                cs.append(c)
            avg_c = np.average(cs) #take average of constant cs for sample size n
            avg_cs.append(avg_c)
        c4 = np.average(avg_cs) #get average constant c for Beta distribution with alpha = a and beta = b
        c4s.append((a,b,c4))

In [23]:
graph_objects(c4s) #the curve is flipped and although it is much flatter for the most part, there's still a part that's logarithmically increasing?