In [1]:
import math

import numpy as np
import polars as pl
import powerlawrs

In [2]:
# Path to the blackout data set, relative to this notebook.
file = "../reference_data/blackouts.txt"

# Header auto-detection in polars and pandas is unreliable, so state explicitly
# that the first line of the file is data rather than column names.
df = pl.read_csv(source=file, has_header=False)
# Take the first column of the frame as a Series.
data = df.to_series(index=0)

# API
## Fitting procedure

In [3]:
# 1. Generate an alpha parameter via MLE for every x_min in the data.
# The call returns two sequences of equal length (candidate x_min values and
# the alpha fitted for each); the print below reports their sizes next to
# the number of observations.
x_mins, alphas = powerlawrs.estimation.find_alphas_fast(data)
print(f"n: {len(data)}, n_x_mins: {len(x_mins)}, n_alphas: {len(alphas)}")

n: 211, n_x_mins: 210, n_alphas: 210


In [4]:
# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
# The printed result exposes x_min, alpha, D and len_tail
# (D is presumably the KS distance of the chosen pair — confirm).
best_fit = powerlawrs.gof.gof(data, alphas=alphas, x_mins=x_mins)
print(f"{best_fit}")

Fitment(x_min=230000, alpha=1.2726372198302858, D=0.06067379629443781, len_tail=59)


Steps 1 and 2 above are abstracted away via ```powerlawrs.fit()```

## Numerical stability 
Comparison of ```find_alphas_exhaustive()``` and ```find_alphas_fast()```. The former is technically more accurate than the latter, but depending on the context the difference in precision is negligible compared with the significant performance improvement of the latter.

In [5]:
# 1. Generate an alpha parameter via MLE for every x_min in the data via find_alphas_exhaustive()
x_mins_ex, alphas_ex = powerlawrs.estimation.find_alphas_exhaustive(data)

# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
best_fit_ex = powerlawrs.gof.gof(data, alphas=alphas_ex, x_mins=x_mins_ex)

# Print both alpha estimates and their difference (fast minus exhaustive);
# best_fit comes from the find_alphas_fast() cells earlier in the notebook.
print(f"find_alphas_exhaustive() alpha: {best_fit_ex.alpha}")
print(f"find_alphas_fast() alpha:\t{best_fit.alpha}")
print(f"Difference: {best_fit.alpha - best_fit_ex.alpha}")

find_alphas_exhaustive() alpha: 1.2726372198302882
find_alphas_fast() alpha:	1.2726372198302858
Difference: -2.4424906541753444e-15


## Parameter uncertainty

In [6]:
# Estimate the uncertainty of the fitted parameters; the two return values
# are reported below as sample standard deviations of x_min and alpha.
# NOTE(review): m=1000 is presumably the number of resamples — confirm
# against the library documentation.
xm_std, a_std = powerlawrs.estimation.param_est(data, m=1000)
print(f"stdev (sample) x_min: {xm_std}, stdev (sample) alpha: {a_std}")

stdev (sample) x_min: 80369.39663728651, stdev (sample) alpha: 0.25521307649482294


## Hypothesis test 

In [7]:
# Run the experiment
# Set the minimum precision of the p-value of the KS test.
precision = 0.01 # p-value should be accurate to within 0.01
# Test the best fit found earlier, passing its alpha, x_min and D.
H0 = powerlawrs.hypothesis.hypothesis_test(data, precision, best_fit.alpha, best_fit.x_min, best_fit.D)

Generating M = 2500 simulated datasets of length n = 211 with tail size 59 and probability of the tail P(tail|data) = 0.2796208530805687


In [8]:
# hypothesis_test() calls powerlawrs.util.sim.calculate_sim_params() to determine
# the number of simulated datasets required given the desired precision.
# Calling it directly with the same precision and x_min shows those parameters.
simparams_dict = powerlawrs.util.sim.calculate_sim_params(precision, data, best_fit.x_min)

In [9]:
# Which will require 2500 synthetic datasets of length 211. 59 of the 211 samples will be drawn from a Pareto Type I with the parameters found above
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

## Stats module

In [10]:
# Mean of the blackout data.
powerlawrs.stats.descriptive.mean(data)

253868.68246445496

In [11]:
# Variance of the blackout data.
# NOTE(review): the second argument (1) is presumably the delta degrees of
# freedom, i.e. this would be the sample variance — confirm.
powerlawrs.stats.descriptive.variance(data, 1)

372476564023.59814

In [12]:
# Randomly pick 3 values from the data (output differs between runs).
# NOTE(review): whether sampling is with replacement is not visible here — confirm.
powerlawrs.stats.random.random_choice(data, 3)

[115000.0, 207200.0, 240000.0]

In [13]:
# Generate 3 random floats (output differs between runs).
# NOTE(review): presumably draws from U(0,1) — confirm.
powerlawrs.stats.random.random_uniform(3)

[0.1264625194892548, 0.2884762360301478, 0.009410010175556982]

In [14]:
# Define a standard normal CDF in Python.
# A named function (rather than a lambda bound to a name) gives it a proper
# __name__ and a docstring; `math` is imported in the notebook's import cell.
def norm_cdf(x):
    """Return the standard normal CDF at x: 0.5 * (1 + erf(x / sqrt(2)))."""
    return 0.5 * (1 + math.erf(x / math.sqrt(2.0)))


# Small sample, listed in ascending order.
sorted_data = [-1.1, -0.5, 0.1, 0.2, 1.5]

# Call the Rust function, passing the Python function as an argument.
# It returns three values, unpacked here as D+, D- and the maximum distance.
(d_plus, d_minus, d_max) = powerlawrs.stats.ks.ks_1sam_sorted(sorted_data, norm_cdf)

print(f"D+: {d_plus}")
print(f"D-: {d_minus}")
print(f"D max: {d_max}")

D+: 0.22074029056089706
D-: 0.13982783727702897
D max: 0.22074029056089706


## Util module

In [15]:
# 5 evenly spaced values from 0 to 10, both endpoints included (see output).
powerlawrs.util.linspace(0,10,5)

[0.0, 2.5, 5.0, 7.5, 10.0]

In [16]:
# Same call as in the hypothesis-test section, this time with literal values:
# precision 0.01 and x_min 230000 (the x_min of the best fit printed earlier).
simparams_dict = powerlawrs.util.sim.calculate_sim_params(0.01, data, 230000)
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

In [17]:
# Convert the simparams dict to the Rust struct by unpacking its keys as
# keyword arguments.
simparams_struct = powerlawrs.util.sim.PySimParams(**simparams_dict)

# Use the struct as an argument.
# NOTE(review): 230000 and 1.27 are presumably x_min and a rounded alpha
# from the best fit above — confirm the argument order against the library docs.
sim_data = powerlawrs.util.sim.generate_synthetic_datasets(data, 230000, simparams_struct, 1.27)

In [18]:
# Note: the library does not yet implement the zeta distribution for discrete data.
# Wrap the simulated datasets in a polars DataFrame for display; polars
# truncates the rendered rows and columns.
pl.from_numpy(np.array(sim_data))

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_174,column_175,column_176,column_177,column_178,column_179,column_180,column_181,column_182,column_183,column_184,column_185,column_186,column_187,column_188,column_189,column_190,column_191,column_192,column_193,column_194,column_195,column_196,column_197,column_198,column_199,column_200,column_201,column_202,column_203,column_204,column_205,column_206,column_207,column_208,column_209,column_210
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
277160.615496,81000.0,445533.302839,45000.0,30001.0,63500.0,145000.0,40911.0,422168.752351,126000.0,473886.810829,18000.0,479603.140199,65000.0,11529.0,361627.25092,32000.0,190000.0,14273.0,56000.0,240308.937267,319241.438826,36073.0,344729.30755,115000.0,1.7991e6,36073.0,25000.0,60000.0,19000.0,71000.0,62000.0,63500.0,39500.0,81000.0,71000.0,113200.0,…,5300.0,25000.0,11000.0,336568.974514,59000.0,25000.0,30000.0,231803.930812,60000.0,305663.503156,270337.39343,114000.0,40000.0,364253.327969,142000.0,1.5211e6,278311.627937,60000.0,114500.0,173000.0,65000.0,316781.20194,163000.0,70000.0,243882.531897,50000.0,32000.0,173000.0,219000.0,50000.0,81000.0,71000.0,130000.0,90000.0,40000.0,1.2399e6,147000.0
82500.0,1.1877e6,1.8570e6,576622.028956,40000.0,50000.0,345134.103334,317407.116674,957041.537026,60000.0,393822.865932,122000.0,50000.0,70000.0,2000.0,65000.0,71000.0,386142.594492,219000.0,173000.0,43000.0,302600.216741,112000.0,20000.0,18000.0,25000.0,234408.668333,345971.862978,60000.0,265200.525494,30000.0,191000.0,1000.0,25000.0,130000.0,53000.0,126000.0,…,2.3172e6,53000.0,50000.0,949204.533993,43000.0,2900.0,190000.0,391076.54023,114000.0,788610.882714,30001.0,190000.0,2000.0,50000.0,60000.0,422615.368205,17000.0,11000.0,398351.755121,53000.0,350834.826045,304751.596524,2000.0,203000.0,19000.0,70000.0,158000.0,1000.0,2900.0,277993.069092,158000.0,75000.0,1.0285e6,45000.0,423639.867894,32000.0,32000.0
206000.0,264637.852356,92000.0,32000.0,11000.0,50000.0,40911.0,59000.0,32000.0,71000.0,25000.0,7.2625e6,1.0579e6,74000.0,45000.0,298150.76603,158000.0,2.3016e6,82500.0,74000.0,375286.267574,59000.0,535814.773245,285518.007736,92000.0,308149.585279,62000.0,1646.0,1.7058e6,12000.0,470182.561498,306469.346376,259136.555247,511684.154762,5300.0,82500.0,145000.0,…,2.0492e6,9000.0,207200.0,70000.0,5300.0,50000.0,92000.0,191000.0,5300.0,94285.0,300031.733818,510216.76317,115000.0,274723.362007,112000.0,92000.0,130000.0,25000.0,20000.0,164500.0,626486.098597,24000.0,32000.0,18000.0,1.1788e6,166000.0,1646.0,10000.0,71000.0,128000.0,55000.0,25000.0,38500.0,80000.0,35000.0,120000.0,65000.0
50000.0,15000.0,11000.0,146000.0,853654.817723,24506.0,442500.251567,148000.0,523527.023309,15000.0,3.0666e6,80000.0,50000.0,203000.0,38500.0,1.4832e6,572285.463176,342231.458805,3.0507e6,728504.891114,263887.67455,348791.403494,263225.38729,70000.0,92000.0,203000.0,5300.0,30001.0,142000.0,1.7615e6,114500.0,10000.0,50000.0,160000.0,40000.0,414575.413334,90000.0,…,115000.0,92000.0,284125.450976,120000.0,128000.0,775340.133317,45000.0,100000.0,43696.0,300166.805923,380987.8098,8000.0,1.4816e6,350171.872038,32000.0,65000.0,40000.0,92000.0,324171.508755,246086.551245,239293.037547,464994.551228,51000.0,51000.0,60000.0,71000.0,203000.0,30000.0,569500.170829,18351.0,74000.0,843633.964661,826708.837874,15000.0,10000.0,50000.0,206000.0
263493.02997,160000.0,397339.406172,38500.0,173000.0,164500.0,240377.453746,160000.0,340054.312494,173000.0,50000.0,62000.0,147000.0,50000.0,81000.0,29900.0,1.1847e6,10000.0,32000.0,55000.0,243007.056475,246701.297204,12000.0,173000.0,90000.0,264936.59955,203000.0,244256.137233,5300.0,18000.0,708512.631748,71000.0,362845.997628,130000.0,65000.0,46000.0,50000.0,…,1.0393e6,66005.0,81000.0,345632.794312,200000.0,346181.671778,60000.0,10000.0,18000.0,5300.0,130000.0,92000.0,130000.0,160000.0,158000.0,36073.0,457925.069597,24506.0,247257.618859,158000.0,18819.0,53000.0,82500.0,491683.412452,48000.0,330863.183666,291141.330144,11529.0,94285.0,70000.0,100000.0,329842.320192,55000.0,25000.0,100000.0,9000.0,80000.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
435745.543968,437342.720629,17000.0,70000.0,100000.0,50000.0,51000.0,40000.0,160000.0,361500.18621,290954.279797,145000.0,206000.0,50462.0,25000.0,128000.0,45000.0,2000.0,658521.299325,1.2153e6,465448.044011,56000.0,8000.0,490554.99528,10300.0,160000.0,51000.0,113200.0,173000.0,65000.0,460663.393146,40911.0,63500.0,65000.0,81000.0,270061.762327,18819.0,…,148000.0,37000.0,285795.321146,18351.0,331134.221017,25000.0,112000.0,55000.0,440828.130581,142000.0,71000.0,70000.0,56000.0,37000.0,50000.0,130000.0,45000.0,70000.0,60000.0,25000.0,359890.414285,33000.0,238015.771731,18819.0,762413.024247,65000.0,30001.0,12000.0,24506.0,71000.0,65000.0,74000.0,60000.0,130000.0,286764.104127,277625.293849,259208.382506
29000.0,81000.0,95000.0,18351.0,301012.812693,206000.0,283548.861671,289359.008047,70000.0,554882.60207,60000.0,80000.0,40000.0,637160.830876,55000.0,10000.0,2900.0,160000.0,29900.0,18819.0,106850.0,120000.0,4150.0,907511.043363,304187.173677,15000.0,55000.0,130000.0,91000.0,18351.0,1.1501e6,309954.459798,38500.0,4.5437e6,18000.0,147000.0,231081.638237,…,12000.0,1800.0,120000.0,387051.568315,349786.337002,56000.0,26334.0,25000.0,100000.0,15000.0,91000.0,2900.0,266775.845751,518571.978617,122000.0,2.1624e6,25000.0,11529.0,30001.0,410900.505399,91000.0,332904.367387,285121.847744,37000.0,1646.0,82500.0,191000.0,452328.004998,25000.0,25000.0,160000.0,51000.0,326892.216107,354766.431375,282676.958609,60000.0,173000.0
7500.0,130000.0,210882.0,25000.0,1.6843e6,510667.370976,51000.0,305865.398824,262152.146267,246583.209087,82500.0,100000.0,95000.0,15000.0,43696.0,834757.983629,66005.0,598907.29735,290422.760031,686286.516791,25000.0,891532.899351,25000.0,231683.527866,40000.0,92000.0,100000.0,100000.0,65000.0,43000.0,70000.0,324805.767907,71000.0,46000.0,2.8550e6,60000.0,39500.0,…,865684.18998,256941.011019,130000.0,65000.0,40000.0,81000.0,81000.0,29000.0,59000.0,267782.740471,113200.0,405892.448063,62000.0,43696.0,71000.0,51000.0,10000.0,146000.0,1.8033e6,327760.184989,50000.0,393057.716976,2.7449e6,120000.0,50000.0,146000.0,37000.0,539068.799685,55000.0,524529.300247,262414.606926,80000.0,10000.0,24506.0,18351.0,50000.0,431592.639271
397454.429393,450874.802322,145000.0,126000.0,95630.0,92000.0,95000.0,30000.0,25000.0,33000.0,40000.0,92000.0,18000.0,50000.0,100000.0,25000.0,39500.0,365507.504567,145000.0,51000.0,126000.0,19000.0,3.8289e6,60000.0,219000.0,53000.0,282963.395943,428618.373778,25000.0,283678.544637,60000.0,244773.045092,388511.615405,95630.0,12000.0,142000.0,55000.0,…,160000.0,18000.0,26334.0,367049.485481,60000.0,75000.0,18000.0,51000.0,244151.37164,207200.0,333153.928344,71000.0,534048.164804,60000.0,25000.0,234335.85938,30000.0,25000.0,219000.0,50000.0,207200.0,1.4791e6,32000.0,219000.0,314015.645474,53000.0,130000.0,80000.0,859207.924406,48000.0,36073.0,1800.0,2900.0,203000.0,247445.273683,100000.0,56000.0


# Distributions
## Generic Power-Law

In [19]:
# Instantiate the class.
# NOTE(review): the pdf/cdf/ccdf values recorded for this object agree (up to
# floating-point rounding) with those of Pareto(1.2726, 230000) further down,
# which suggests this alpha is the Pareto shape plus one — confirm.
pl_class = powerlawrs.dist.powerlaw.Powerlaw(2.2726, 230000)

In [20]:
# pdf evaluated at x = 500000
pl_class.pdf(500000)

9.47430869971139e-07

In [21]:
# cdf evaluated at x = 500000
pl_class.cdf(500000)

0.627757791147596

In [22]:
# ccdf evaluated at x = 500000; the recorded value equals 1 - cdf(500000)
pl_class.ccdf(500000)

0.372242208852404

In [23]:
# rv: turn a uniform draw into a random variate
# (NOTE(review): presumably via inverse-transform sampling — confirm).
# Generate a random U(0,1) value; no seed is set, so the output differs between runs.
u = np.random.rand()
pl_class.rv(u)

537113.5927357532

In [24]:
# Log-likelihood of the first 10 observations; one value is returned per observation.
pl_class.loglikelihood(data[:10])

[-14.167286692248991,
 -11.907555004554373,
 -11.670580405021845,
 -8.447163936291041,
 -6.18495414433208,
 -13.122953520503547,
 -9.52761182944572,
 -6.437725522683576,
 -13.680018818629309,
 -9.232906008194748]

## Pareto Type I

In [25]:
# Instantiate the class.
# NOTE(review): the arguments are presumably (alpha, x_min); 1.2726 and 230000
# are rounded values from the best fit earlier in the notebook — confirm.
pareto_class = powerlawrs.dist.pareto.Pareto(1.2726, 230000)

In [26]:
# pdf evaluated at x = 500000
pareto_class.pdf(500000)

9.474308699711417e-07

In [27]:
# cdf evaluated at x = 500000
pareto_class.cdf(500000)

0.6277577911475959

In [28]:
# ccdf evaluated at x = 500000; the recorded value equals 1 - cdf(500000)
pareto_class.ccdf(500000)

0.3722422088524041

In [29]:
# rv: turn a uniform draw into a random variate
# (NOTE(review): presumably via inverse-transform sampling — confirm).
# Generate a random U(0,1) value; no seed is set, so the output differs between runs.
u = np.random.rand()
pareto_class.rv(u)

51451015.27777986

In [30]:
# Log-likelihood of the first 10 observations.
# Note the -inf entries: they occur where x < pareto_class.x_min.
pareto_class.loglikelihood(data[:10])

[-14.167286692248988,
 -inf,
 -inf,
 -inf,
 -inf,
 -13.122953520503545,
 -inf,
 -inf,
 -13.680018818629307,
 -inf]

In [31]:
# Show the first 10 observations for comparison with the log-likelihoods
# above: the -inf entries line up with the values below 230000.
data[:10]

column_1
i64
570000
210882
190000
46000
17000
360000
74000
19000
460000
65000


## Exponential

In [32]:
# Instantiate the class.
# NOTE(review): the recorded pdf/cdf values at x = 2 are consistent with 1.5
# being the rate parameter lambda (e.g. cdf = 1 - exp(-1.5 * 2)) — confirm.
expo_class = powerlawrs.dist.exponential.Exponential(1.5)

In [33]:
# pdf evaluated at x = 2
expo_class.pdf(2)

0.07468060255179593

In [34]:
# cdf evaluated at x = 2
expo_class.cdf(2)

0.950212931632136

In [35]:
# ccdf evaluated at x = 2; the recorded value equals 1 - cdf(2)
expo_class.ccdf(2)

0.04978706836786395

In [36]:
# rv: turn a uniform draw into a random variate
# (NOTE(review): presumably via inverse-transform sampling — confirm).
# Generate a random U(0,1) value; no seed is set, so the output differs between runs.
u = np.random.rand()
expo_class.rv(u)

0.09378764755647294

In [37]:
# Log-likelihood of 10 random variates.
# Each variate comes from passing a fresh U(0,1) draw to rv(); the loop
# index itself is not needed, hence the throwaway name.
X = [expo_class.rv(np.random.rand()) for _ in range(10)]
expo_class.loglikelihood(X)

[-0.6617424038689317,
 0.2012269903579113,
 0.27288404800078336,
 -0.17270558322804436,
 0.39063858163902804,
 0.06537008296445235,
 -0.9702015767530623,
 -3.255122581468005,
 0.0012518142296774428,
 -0.7600362443808244]

## Log-Normal

In [38]:
# Instantiate the class.
# NOTE(review): the recorded cdf at x = 50 is consistent with the arguments
# being (mu=5, sigma=1) on the log scale — confirm.
lognormal_class = powerlawrs.dist.lognormal.Lognormal(5,1)

In [39]:
# pdf evaluated at x = 50
lognormal_class.pdf(50)

0.004414730410382431

In [40]:
# cdf evaluated at x = 50
lognormal_class.cdf(50)

0.1383026725766271

In [41]:
# ccdf evaluated at x = 50; the recorded value equals 1 - cdf(50)
lognormal_class.ccdf(50)

0.8616973274233729

In [42]:
# rv
# Generate a random U(0,1) value; no seed is set, so the output differs between runs.
u = np.random.rand()
# Turn the uniform draw into a random variate
# (NOTE(review): presumably via inverse-transform sampling — confirm).
lognormal_class.rv(u)

58.62619744059047

In [43]:
# Log-likelihood of the first 10 observations; one value is returned per observation.
lognormal_class.loglikelihood(data[:10])

[-48.23156695306425,
 -39.52492513891642,
 -38.669151666115724,
 -28.108458617854605,
 -21.898298898560117,
 -44.08491931904303,
 -31.42411507354539,
 -22.543027350758262,
 -46.270534238528505,
 -30.497310074517163]