In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import scipy.stats as sps
from scipy.special import gammaln, betaln
plt.style.use('ggplot')
from sympy import *
from matplotlib import cm
%matplotlib inline
def scan_from_csv(filename):
    return pd.read_csv(filename,na_values='None')

def write_answer_to_file(answer,file):
    with open(file, 'w') as answer_file:  
        answer_file.write(answer)

$\Omega^2_n = \int_{\mathbb{R}} \frac{(F^*_n(x) - F_0(x))^2}{F_0(x)(1 - F_0(x)} dF_0(x)$

Так как статистика критерия не зависит от распределения $F_0$, то будем использовать $F_0 \sim Uniform(0,1)$

Критерий: $\{ n\Omega^2_n > u_{1-\alpha}\}$

In [51]:
def calc_Omega_n(X,F):
    N = len(X)
    Y = np.sort(X)
    stat = -2* np.sum((2*np.arange(1,N+1)-1)/(2*N)*np.log([F(x) for x in Y]) + 
                      (1-(2*np.arange(1,N+1)-1)/(2*N))*(1-np.log([F(x) for x in Y])))
    return stat - N

In [52]:
N = [25, 100, 500, 2000]
alpha = [0.01 ,0.05, 0.1]
quant = np.zeros((4,3))

In [53]:
def get_quant(N,alpha,gen,genF,params,repeat=500):
    ks = np.zeros(repeat)
    for i in range(repeat):
        X = gen(**params)
        ks[i] = calc_Omega_n(X,genF)
    ks = np.sort(ks)
    return ks[(1-alpha)*(repeat-1)]

In [54]:
%%time
R = 30
for i in range(4):
    for j in range(3):
        params = {'size':N[i]}
        gen = sps.uniform.rvs
        genF = sps.uniform.cdf
        quant[i][j] = get_quant(N[i],alpha[j],gen,genF,params,R)
        print('for N = ' + str(N[i]) + ', alpha = ' + str(alpha[j]) + ', criteria Omega^2_N > ' + str(quant[i][j]))



for N = 25, alpha = 0.01, criteria Omega^2_N > -66.3417951145
for N = 25, alpha = 0.05, criteria Omega^2_N > -67.0631310748
for N = 25, alpha = 0.1, criteria Omega^2_N > -68.1514810623
for N = 100, alpha = 0.01, criteria Omega^2_N > -276.170760138
for N = 100, alpha = 0.05, criteria Omega^2_N > -279.453704859
for N = 100, alpha = 0.1, criteria Omega^2_N > -287.206039992
for N = 500, alpha = 0.01, criteria Omega^2_N > -1459.08835941
for N = 500, alpha = 0.05, criteria Omega^2_N > -1463.09179226
for N = 500, alpha = 0.1, criteria Omega^2_N > -1474.9060218
for N = 2000, alpha = 0.01, criteria Omega^2_N > -5917.0172203
for N = 2000, alpha = 0.05, criteria Omega^2_N > -5907.38314138
for N = 2000, alpha = 0.1, criteria Omega^2_N > -5949.89841096
CPU times: user 31.1 s, sys: 277 ms, total: 31.4 s
Wall time: 31.5 s


Сравним мощность критерия с мощностью ks-test на  сравнении гипотез, H_0: X:

In [61]:
%%time
R = 100
D=100
# ps = sps.norm.rvs(size=R/2)
# ls = sps.t.rvs(size=R/2,df=10)
for i in range(4):
    for j in range(3):
        ans = np.zeros((2,2),dtype=int)
        for t in range(R):
            k = 0
            if t < R/2:
                X = sps.norm.rvs(size=D/2)
                k = calc_Omega_n(X,sps.norm.cdf)
            else:
                X = sps.t.rvs(size=R/2,df=10)
                k = calc_Omega_n(X,sps.norm.cdf)
            if k > quant[i][j]:
                if t < R/2:
                    ans[1][1] += 1
                else:
                    ans[0][1] += 1
            else:
                if t < R/2:
                    ans[0][0] += 1
                else:
                    ans[1][0] += 1
        print('for N = ' + str(N[i]) + ', alpha = ' + str(alpha[j]) + ':')
        print('TP: ' + str(ans[1][1]))
        print('FP: ' + str(ans[0][1]))
        print('TN: ' + str(ans[1][0]))
        print('FN: ' + str(ans[0][0]))
        print('err1: ' + str(ans[0][0]/(ans[0][0] + ans[1][1])))

  return self._random_state.standard_normal(self._size)
  return self._random_state.standard_t(df, size=self._size)


for N = 25, alpha = 0.01:
TP: 0
FP: 0
TN: 50
FN: 50
err1: 1.0
for N = 25, alpha = 0.05:
TP: 0
FP: 0
TN: 50
FN: 50
err1: 1.0
for N = 25, alpha = 0.1:
TP: 0
FP: 0
TN: 50
FN: 50
err1: 1.0
for N = 100, alpha = 0.01:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 100, alpha = 0.05:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 100, alpha = 0.1:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 500, alpha = 0.01:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 500, alpha = 0.05:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 500, alpha = 0.1:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 2000, alpha = 0.01:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 2000, alpha = 0.05:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
for N = 2000, alpha = 0.1:
TP: 50
FP: 50
TN: 0
FN: 0
err1: 0.0
CPU times: user 10.5 s, sys: 201 ms, total: 10.7 s
Wall time: 10.9 s


Как видно критерий работает хорошо: ошибка 1-го рода совпадает с заявленной, при этом ошибка 2 го рода тоже небольшая

[3 1 2]
