In [1]:
import math

import numpy as np
import polars as pl
import powerlawrs

In [2]:
# Path to the reference data file, relative to the notebook's working directory.
file = "../reference_data/blackouts.txt"

# Header auto-detection in polars (and pandas) is unreliable, so state
# explicitly that this file has no header row instead of relying on it.
df = pl.read_csv(file, has_header=False)
# Work with the values as a polars Series in the cells below.
data = df.to_series()

# API
## Fitting procedure

In [3]:
# 1. Generate an alpha parameter via MLE for every x_min in the data
x_mins, alphas = powerlawrs.estimation.find_alphas_fast(data)
# Report how many (x_min, alpha) candidates were produced relative to the sample size.
print(f"n: {len(data)}, n_x_mins: {len(x_mins)}, n_alphas: {len(alphas)}")

n: 211, n_x_mins: 210, n_alphas: 210


In [4]:
# 2. Find the (x_min, alpha) pair with the lowest KS statistic.
#    This pair is the estimated best fit.
pareto_fit = powerlawrs.gof.gof(data, alphas=alphas, x_mins=x_mins)
print(f"{pareto_fit}")

ParetoFit(x_min=230000, alpha=1.2726372198302858, D=0.06067379629443781, len_tail=59)


Steps 1 and 2 above are abstracted away via ```powerlawrs.fit()```

## Numerical stability 
Comparison of ```find_alphas_exhaustive()``` and ```find_alphas_fast()```. The former is technically more accurate than the latter, but depending on the context the loss of precision is negligible given the significant performance improvement of ```find_alphas_fast()```.

In [5]:
# 1. Generate an alpha parameter via MLE for every x_min in the data,
#    this time via find_alphas_exhaustive()
x_mins_ex, alphas_ex = powerlawrs.estimation.find_alphas_exhaustive(data)

# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
pareto_fit_ex = powerlawrs.gof.gof(data, alphas=alphas_ex, x_mins=x_mins_ex)

# Compare the best-fit alpha of both estimators; `pareto_fit` is the result
# obtained from find_alphas_fast() in the fitting-procedure cells.
print(f"find_alphas_exhaustive() alpha: {pareto_fit_ex.alpha}")
print(f"find_alphas_fast() alpha:\t{pareto_fit.alpha}")
print(f"Difference: {pareto_fit.alpha - pareto_fit_ex.alpha}")

find_alphas_exhaustive() alpha: 1.2726372198302882
find_alphas_fast() alpha:	1.2726372198302858
Difference: -2.4424906541753444e-15


## Parameter uncertainty

In [6]:
# Sample standard deviations of the x_min and alpha estimates.
# m=1000 is presumably the number of resampled datasets used -- TODO confirm.
xm_std, a_std = powerlawrs.estimation.param_est(data, m=1000)
print(f"stdev (sample) x_min: {xm_std}, stdev (sample) alpha: {a_std}")

stdev (sample) x_min: 79680.78585549034, stdev (sample) alpha: 0.2518366705720702


## Hypothesis test 

In [7]:
# Run the experiment
# Set the minimum precision of the p-value of the KS test.
precision = 0.01 # p-value should be accurate to within 0.01
# Test the fit found above: pass its alpha, x_min and KS statistic D.
H0 = powerlawrs.hypothesis.hypothesis_test(data, precision, pareto_fit.alpha, pareto_fit.x_min, pareto_fit.D)

Generating M = 2500 simulated datasets of length n = 211 with tail size 59 and probability of the tail P(tail|data) = 0.2796208530805687


In [8]:
# hypothesis_test() calls powerlawrs.util.sim.calculate_sim_params() to determine
# the number of simulated datasets required for the desired precision.
simparams_dict = powerlawrs.util.sim.calculate_sim_params(precision, data, pareto_fit.x_min)

In [9]:
# This requires 2500 synthetic datasets of length 211. 59 of the 211 samples
# will be drawn from a Pareto Type I with the parameters found above.
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

## Stats module

In [10]:
# Mean of the data
powerlawrs.stats.descriptive.mean(data)

253868.68246445496

In [11]:
# Variance of the data. The second argument (1) is presumably the delta
# degrees of freedom, i.e. the sample variance -- TODO confirm.
powerlawrs.stats.descriptive.variance(data, 1)

372476564023.59814

In [12]:
# Draw 3 values at random from the data
powerlawrs.stats.random.random_choice(data, 3)

[160000.0, 464000.0, 1646.0]

In [13]:
# Generate 3 uniform random numbers
powerlawrs.stats.random.random_uniform(3)

[0.2594305552535239, 0.48975471009024707, 0.38457219303300305]

In [14]:
# Define a standard normal CDF in Python.
# (`math` is imported in the notebook's import cell.)
def norm_cdf(x):
    """Return the standard normal CDF at x: 0.5 * (1 + erf(x / sqrt(2)))."""
    return 0.5 * (1 + math.erf(x / math.sqrt(2.0)))

# Sample values, listed in ascending order for the *_sorted KS routine
sorted_data = [-1.1, -0.5, 0.1, 0.2, 1.5]

# Call the Rust function, passing the Python function as an argument
(d_plus, d_minus, d_max) = powerlawrs.stats.ks.ks_1sam_sorted(sorted_data, norm_cdf)

print(f"D+: {d_plus}")
print(f"D-: {d_minus}")
print(f"D max: {d_max}")

D+: 0.22074029056089706
D-: 0.13982783727702897
D max: 0.22074029056089706


## Util module

In [15]:
# 5 evenly spaced values from 0 to 10, both endpoints included (see output)
powerlawrs.util.linspace(0,10,5)

[0.0, 2.5, 5.0, 7.5, 10.0]

In [16]:
# Same call as in the hypothesis-test section, here with a literal precision
# (0.01) and a literal x_min (230000).
simparams_dict = powerlawrs.util.sim.calculate_sim_params(0.01, data, 230000)
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

In [17]:
# Convert the simparams dict into the Rust-backed PySimParams struct
simparams_struct = powerlawrs.util.sim.PySimParams(**simparams_dict)

# Use the struct as an argument. 230000 and 1.27 presumably are the x_min and
# (rounded) alpha of the fit above -- TODO confirm the argument order.
sim_data = powerlawrs.util.sim.generate_synthetic_datasets(data, 230000, simparams_struct, 1.27)

In [18]:
# Note: the library does not yet implement the zeta distribution for discrete data.
# Show the simulated datasets as a polars DataFrame.
pl.from_numpy(np.array(sim_data))

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_174,column_175,column_176,column_177,column_178,column_179,column_180,column_181,column_182,column_183,column_184,column_185,column_186,column_187,column_188,column_189,column_190,column_191,column_192,column_193,column_194,column_195,column_196,column_197,column_198,column_199,column_200,column_201,column_202,column_203,column_204,column_205,column_206,column_207,column_208,column_209,column_210
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
166000.0,2.3363e6,100000.0,207200.0,48000.0,74000.0,130000.0,43696.0,243140.327504,245621.81445,32000.0,92000.0,88000.0,70000.0,90000.0,357032.51301,351939.050115,1.1133e6,207200.0,124000.0,51000.0,901503.419786,70000.0,46000.0,289945.757997,59000.0,813349.628464,51000.0,512115.442866,278121.69709,25000.0,59000.0,234714.694695,330649.959451,18351.0,60000.0,40000.0,…,418755.832105,158000.0,92000.0,75000.0,588866.223007,520713.075472,206000.0,1.1013e6,403176.289937,100000.0,393939.644003,71000.0,92000.0,60000.0,258674.93946,273814.767503,1000.0,45000.0,4150.0,10000.0,51000.0,106850.0,120000.0,2.5627e7,147000.0,290341.583048,25000.0,282273.787963,40000.0,461483.849766,372461.497156,729378.538235,90000.0,1800.0,91000.0,90000.0,440882.533839
1.2762e6,94285.0,242104.037627,33000.0,358490.295088,684510.810474,133000.0,50000.0,307413.500455,94285.0,300605.603655,63500.0,755040.410793,75000.0,608896.47093,60000.0,416633.519319,426266.074358,70000.0,113200.0,50000.0,267993.951839,60000.0,33000.0,142000.0,75000.0,436571.546754,25000.0,207200.0,147000.0,40000.0,56000.0,6.6155e6,430454.131777,50000.0,70000.0,29900.0,…,2.4298e6,50000.0,240064.03016,235788.686218,50462.0,60000.0,50000.0,340814.430303,671960.715105,813773.531435,266401.866859,15000.0,29900.0,19000.0,18351.0,40911.0,145000.0,361286.330099,10000.0,50462.0,50000.0,18351.0,8000.0,163000.0,445918.506106,30001.0,206000.0,376804.560684,431837.46832,25000.0,30500.0,130000.0,100000.0,750940.335484,35000.0,50000.0,60000.0
39500.0,50000.0,124000.0,2.5356e6,20000.0,113200.0,50000.0,285036.082416,145000.0,318409.951697,37000.0,71000.0,160000.0,65000.0,460599.708374,18819.0,56000.0,80000.0,1.3369e6,71000.0,256469.536985,71000.0,11529.0,70000.0,564454.720771,238642.135713,24000.0,293460.024903,60000.0,2000.0,331949.158057,130000.0,50000.0,100000.0,240599.151024,160000.0,1.4786e6,…,48000.0,10300.0,50000.0,25000.0,248524.71721,45000.0,50000.0,672865.442779,51000.0,1.6362e6,35000.0,120000.0,191000.0,90000.0,25000.0,30500.0,399212.352881,30001.0,70000.0,60000.0,240335.32384,374292.516834,94285.0,81000.0,25000.0,210882.0,1800.0,124000.0,60000.0,245296.89873,18000.0,18000.0,50000.0,80000.0,46000.0,10300.0,402522.462202
284046.945991,56000.0,142000.0,100000.0,40000.0,36073.0,100000.0,2900.0,15000.0,278911.703308,1.7234e6,100000.0,19000.0,32000.0,160000.0,56000.0,126000.0,58000.0,74000.0,5300.0,877170.392578,88000.0,50000.0,74000.0,17000.0,50000.0,50000.0,40000.0,210882.0,51000.0,1.7929e6,4150.0,80000.0,166000.0,831653.980045,2000.0,9000.0,…,124000.0,160000.0,50000.0,4.9982e6,1.8005e6,51000.0,394303.554407,55000.0,3.5188e6,95630.0,1.6395e6,51000.0,190000.0,190000.0,32000.0,581889.360321,18000.0,25000.0,71000.0,50000.0,36073.0,30000.0,32000.0,2.6358e6,736285.74375,17000.0,1.9556e6,18351.0,1646.0,50000.0,40911.0,18000.0,37000.0,11000.0,1.3312e6,1.9465e6,65000.0
15000.0,570681.060098,24000.0,113200.0,206000.0,267723.050595,2.3200e6,662901.094227,50000.0,691560.84149,200000.0,25000.0,245447.519847,623543.092418,113200.0,32000.0,258565.175478,273216.559885,18000.0,18000.0,115000.0,25000.0,11529.0,51000.0,60000.0,638587.202943,272533.72174,252176.551114,14273.0,160000.0,206000.0,29900.0,70000.0,145000.0,160000.0,40000.0,535792.297776,…,62000.0,112000.0,300733.881739,95630.0,283698.720508,124000.0,852379.080484,65000.0,71000.0,37000.0,100000.0,277340.362797,340152.872064,71000.0,45000.0,163000.0,406889.803391,395609.081065,2.7527e6,2.9873e6,868573.864489,247699.619076,126000.0,534269.028463,60000.0,145000.0,62000.0,106850.0,60000.0,18000.0,142000.0,115000.0,29000.0,325281.397287,12000.0,551106.386857,56000.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
100000.0,75000.0,1.2393e6,58000.0,70000.0,25000.0,160000.0,60000.0,8000.0,158000.0,206000.0,342881.079814,100000.0,18351.0,318023.052976,60000.0,66005.0,10300.0,232220.165005,65000.0,272718.32156,56000.0,574648.017903,70000.0,60000.0,145000.0,50000.0,18351.0,18000.0,59000.0,60000.0,237920.37518,320515.604156,479124.6829,325938.382358,92000.0,7500.0,…,120000.0,158000.0,25000.0,38500.0,859112.820823,358000.016205,210882.0,10000.0,10000.0,2900.0,4150.0,547629.001535,71000.0,341151.536785,50462.0,190000.0,480415.12176,130000.0,210882.0,33000.0,122000.0,45000.0,65000.0,173000.0,113200.0,5300.0,359436.903344,556247.506037,71000.0,50000.0,309116.344645,50000.0,285779.780694,90000.0,25000.0,190000.0,329070.554259
51000.0,46000.0,71000.0,361121.73372,14273.0,236816.031429,298238.169923,1.5408e6,318534.364684,92000.0,191000.0,148000.0,50000.0,50000.0,50000.0,80000.0,50000.0,260804.147149,146000.0,1.1916e6,75000.0,91000.0,1.0209e6,1.4196e6,18000.0,126000.0,240423.750982,36073.0,20000.0,71000.0,113200.0,114500.0,33000.0,5.9190e6,24506.0,270604.430925,431044.22813,…,258622.850305,191000.0,18819.0,40000.0,33000.0,906244.19439,58000.0,233011.701693,75000.0,164500.0,191000.0,1646.0,90000.0,19000.0,20000.0,25000.0,707764.464234,1.4506e6,353374.8321,100000.0,40000.0,249191.956722,547427.102171,249276.705439,5300.0,1.0519e7,972272.96796,378598.186487,50000.0,9000.0,56000.0,2.2327e6,30000.0,2900.0,115000.0,265976.20053,65000.0
10300.0,2.0336e6,2900.0,2900.0,50000.0,10000.0,37000.0,71000.0,219000.0,24506.0,268983.236374,58000.0,207200.0,33000.0,5300.0,361995.853228,330666.308746,40000.0,60000.0,33000.0,268199.555349,50000.0,7500.0,94285.0,113200.0,70000.0,146000.0,62000.0,340435.253185,147000.0,1800.0,248516.78511,210882.0,142000.0,50000.0,40000.0,38500.0,…,70000.0,9000.0,56000.0,314043.51881,230905.100971,32000.0,164500.0,4.7627e6,294979.748205,8000.0,2.3089e7,421158.226052,163000.0,18819.0,11000.0,349901.395147,302399.626299,122000.0,100000.0,351711.693078,56000.0,25000.0,173000.0,58000.0,200000.0,90000.0,10300.0,50000.0,485189.36403,36073.0,60000.0,404865.700036,3.1390e6,33000.0,60000.0,160000.0,203000.0
715586.370562,50462.0,124000.0,391180.301722,310270.078983,146000.0,40911.0,380327.270911,541762.504716,1.1964e6,7500.0,80000.0,95000.0,148000.0,43000.0,274857.163253,133000.0,114500.0,25000.0,145000.0,91000.0,39500.0,43000.0,24000.0,664390.593404,115000.0,746572.146279,43696.0,124000.0,270098.038839,3.8203e6,55000.0,133000.0,112000.0,50000.0,100000.0,120000.0,…,39500.0,408202.757896,55000.0,133000.0,70000.0,24000.0,145000.0,203000.0,25000.0,2000.0,2.0909e6,10000.0,70000.0,124000.0,114500.0,303196.780282,51000.0,308608.682595,25000.0,100000.0,2.6983e6,5300.0,29000.0,29000.0,53000.0,19000.0,145000.0,25000.0,32000.0,753082.300904,25000.0,25000.0,29000.0,95000.0,51000.0,395546.805656,243553.192533


# Distributions
## Generic Power-Law
$p(x) = C f(x)$ s.t. $f(x) = x^{-a}$, where $C$ is the normalizing constant.

In [19]:
# Instantiate the class (arguments presumably the exponent a and x_min -- TODO confirm).
# Note: 2.2726 = 1.2726 + 1; the pdf/cdf/ccdf outputs below match those of
# Pareto(1.2726, 230000) in the Pareto Type I section.
pl_class = powerlawrs.dist.powerlaw.Powerlaw(2.2726, 230000)

In [20]:
# pdf: probability density function evaluated at x = 500000
pl_class.pdf(500000)

9.47430869971139e-07

In [21]:
# cdf: cumulative distribution function evaluated at x = 500000
pl_class.cdf(500000)

0.627757791147596

In [22]:
# ccdf: complementary CDF at x = 500000 (the cdf and ccdf outputs sum to 1)
pl_class.ccdf(500000)

0.372242208852404

In [23]:
# rv: random variate computed from a uniform draw
# Generate a random u ~ U(0,1); no seed is set, so the result differs per run.
u = np.random.rand()
pl_class.rv(u)

243140.57739326556

In [24]:
# Log-likelihood of each of the first 10 data points
pl_class.loglikelihood(data[:10])

[-14.167286692248991,
 -11.907555004554373,
 -11.670580405021845,
 -8.447163936291041,
 -6.18495414433208,
 -13.122953520503547,
 -9.52761182944572,
 -6.437725522683576,
 -13.680018818629309,
 -9.232906008194748]

## Pareto Type I

In [25]:
# Instantiate the class (arguments presumably alpha and x_min, using the
# rounded values of the fit found earlier -- TODO confirm the order).
pareto_class = powerlawrs.dist.pareto.Pareto(1.2726, 230000)

In [26]:
# pdf: probability density function evaluated at x = 500000
pareto_class.pdf(500000)

9.474308699711417e-07

In [27]:
# cdf: cumulative distribution function evaluated at x = 500000
pareto_class.cdf(500000)

0.6277577911475959

In [28]:
# ccdf: complementary CDF at x = 500000 (the cdf and ccdf outputs sum to 1)
pareto_class.ccdf(500000)

0.3722422088524041

In [29]:
# rv: random variate computed from a uniform draw
# Generate a random u ~ U(0,1); no seed is set, so the result differs per run.
u = np.random.rand()
pareto_class.rv(u)

796546.7687338194

In [30]:
# Log-likelihood of each of the first 10 data points.
# Note the -inf entries wherever x < pareto_class.x_min.
pareto_class.loglikelihood(data[:10])

[-14.167286692248988,
 -inf,
 -inf,
 -inf,
 -inf,
 -13.122953520503545,
 -inf,
 -inf,
 -13.680018818629307,
 -inf]

In [31]:
# The first 10 data points, for comparison with the log-likelihoods above
data[:10]

column_1
i64
570000
210882
190000
46000
17000
360000
74000
19000
460000
65000


## Shifted Exponential
$f(x; \lambda, x_{min}) = \lambda e^{-\lambda (x - x_{min})}$

In [32]:
# Instantiate the class with lambda = 1.5 and x_min = 0.0
# (argument order assumed from the formula above -- TODO confirm).
expo_class = powerlawrs.dist.exponential.Exponential(1.5, 0.0)

In [33]:
# pdf: probability density function evaluated at x = 2
expo_class.pdf(2)

0.07468060255179593

In [34]:
# cdf: cumulative distribution function evaluated at x = 2
expo_class.cdf(2)

0.950212931632136

In [35]:
# ccdf: complementary CDF at x = 2 (the cdf and ccdf outputs sum to 1)
expo_class.ccdf(2)

0.04978706836786395

In [36]:
# rv: random variate computed from a uniform draw
# Generate a random u ~ U(0,1); no seed is set, so the result differs per run.
u = np.random.rand()
expo_class.rv(u)

0.02915563809405971

In [37]:
# Log-likelihood of 10 random variates, each generated from a fresh U(0,1) draw
X = [expo_class.rv(np.random.rand()) for _ in range(10)]
expo_class.loglikelihood(X)

[-4.757748440477651,
 0.36748655267537583,
 0.34152999219370983,
 0.3593940221924733,
 -0.15855049194404514,
 0.15342434867124405,
 -0.1206203448145686,
 -0.461076587718936,
 0.11931708394925528,
 -1.3475374228457226]

## Log-Normal

In [38]:
# Instantiate the class. Arguments are presumably mu = 5 and sigma = 1, which
# is consistent with the pdf/cdf outputs below -- TODO confirm.
lognormal_class = powerlawrs.dist.lognormal.Lognormal(5,1)

In [39]:
# pdf: probability density function evaluated at x = 50
lognormal_class.pdf(50)

0.004414730410382431

In [40]:
# cdf: cumulative distribution function evaluated at x = 50
lognormal_class.cdf(50)

0.1383026725766271

In [41]:
# ccdf: complementary CDF at x = 50 (the cdf and ccdf outputs sum to 1)
lognormal_class.ccdf(50)

0.8616973274233729

In [42]:
# rv: random variate computed from a uniform draw
# Generate a random u ~ U(0,1); no seed is set, so the result differs per run.
u = np.random.rand()
lognormal_class.rv(u)

1701.977917182561

In [43]:
# Log-likelihood of each of the first 10 data points
lognormal_class.loglikelihood(data[:10])

[-48.23156695306425,
 -39.52492513891642,
 -38.669151666115724,
 -28.108458617854605,
 -21.898298898560117,
 -44.08491931904303,
 -31.42411507354539,
 -22.543027350758262,
 -46.270534238528505,
 -30.497310074517163]