In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import math
import scipy.stats as sps
from sympy import *
from matplotlib import cm
plt.style.use('ggplot')
%matplotlib inline

In [4]:
data = pd.read_csv('hw7t4v0.txt', delimiter='\\s+', header=-1)

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,966.779,946.308,950.472,960.653,955.635,967.399,952.516,947.648,953.183,957.431
1,969.802,943.855,950.85,960.727,957.679,967.203,953.587,948.68,951.859,959.116
2,965.144,943.902,951.904,961.541,958.057,966.074,952.162,948.333,952.853,957.064
3,965.781,946.646,953.755,961.203,957.303,966.833,952.829,949.485,951.089,957.979
4,966.701,944.852,949.146,959.864,953.777,965.27,952.181,945.852,950.558,957.599


In [6]:
samples = np.array([[data[c][i] for c in data.columns ] for i in range(len(data[0]))])

In [8]:
print(samples.shape)

(10, 10)


Сначала проверим общие условия применимости критерия Фишера:

$\forall j  \quad  D(X_{ij} - X_{kj})$ не зависит от $j$ (номер выборки)

Проверим это с помощью критериев Левина и Бартлетта для проверки равенства дисперсий.
Также проверим выборки на равенство эффектов обработки критериями Фридмана и Пэйджа(моделированием). Также проверим выборки на нормальность для обоснованности приминения критериев, которые работают только для нормальных выборок.
Проверку проведем низходящим методом Холма при $FWER \leq 0.05$. 

In [10]:
fisher_check = []
k = len(samples[0])
for j in range(len(samples)):
    cur = []
    for i in range(k-1):
        cur.append(samples[j][i] - samples[j][k-1])
    fisher_check.append(cur)
fisher_check = np.array(fisher_check)
print(fisher_check)

[[  9.348 -11.123  -6.959   3.222  -1.796   9.968  -4.915  -9.783  -4.248]
 [ 10.686 -15.261  -8.266   1.611  -1.437   8.087  -5.529 -10.436  -7.257]
 [  8.08  -13.162  -5.16    4.477   0.993   9.01   -4.902  -8.731  -4.211]
 [  7.802 -11.333  -4.224   3.224  -0.676   8.854  -5.15   -8.494  -6.89 ]
 [  9.102 -12.747  -8.453   2.265  -3.822   7.671  -5.418 -11.747  -7.041]
 [ 10.172  -9.603  -6.05    3.979   0.871   9.994  -4.249  -6.705  -3.341]
 [  9.527  -3.925  -6.485   4.639   3.055   9.532  -8.567 -10.     -5.143]
 [  7.766 -11.065  -8.782   3.407  -0.147   9.02   -4.444  -6.905  -5.141]
 [ 11.242  -9.151  -5.687   6.474   0.472  12.237  -2.251  -5.476  -1.823]
 [  9.405 -10.549  -8.543   2.561   2.111   9.853  -4.479  -8.342  -4.401]]


In [17]:
from statsmodels.sandbox.stats.multicomp import multipletests  # Holm method
from scipy.stats import shapiro  # Shapiro-Wilk test
from scipy.stats import normaltest  # D'Agostino test
from scipy.stats import jarque_bera  # Jarque-Bera test
from scipy.stats import f_oneway  # Fisher test
from scipy.stats import kruskal  # Kruskal-Wallis test
from scipy.stats import levene  # Levene test
from scipy.stats import bartlett  # Bartlett test
from scipy.stats import friedmanchisquare  # Friedman test

In [65]:
def get_stats(statf,n,k=100):
    stats = np.zeros(k)
    gens = sps.norm.rvs(size=(k,n))
    for i in range(k):
        stats[i] = statf(gens[i])[0]
    stats = np.sort(stats)
    return np.array(stats)

def get_mult_stats(statf, n, l, k=100):
    stats = np.zeros(k)
    gens = []
    for i in range(n):
        gens.append(sps.norm.rvs(size=(k,l)))
    for i in range(k):
        cur = []
        for j in range(n):
              cur.append(gens[j][i])
        stats[i] = statf(*cur)[0]
    stats = np.sort(stats)
    return np.array(stats)

def run_stats(x,statf,stats):
    st = statf(x)[0]
    p = np.sum((st > stats))/len(stats)
    return 2*min(p,1-p)  # two-tailed p-value

def run_stats_pointer(x,statf,stats):
    st = statf(*x)[0]
    p = np.sum((st > stats))/len(stats)
    return 2*min(p,1-p)  # two-tailed p-value

In [60]:
def my_page_stat(*args):
    X = [x for x in args]
    Y = []
    for i in range(len(X[0])):
        Z = []
        for x in X:
            Z.append(x[i])
        Z = sorted(Z)
        Y.append(np.array(Z))
    R = np.zeros((len(X),len(X[0])))
    for i in range(len(X)):
        for j in range(len(X[0])):
            R[i][j] = np.sum(Z[i] < X[i][j])
    Rg = np.mean(R,axis=1)
    return np.sum(np.arange(len(X))*Rg),1  
# можно все считать в средних, так как при моделировании это не будет важно

def my_2_way_fisher(*args):
    X = np.array([x for x in args])
    Xm = np.mean(X)
    X_j = np.mean(X,axis=1)
    X_i = np.mean(X,axis=0)
    k = len(X)
    n = len(X[0])
    return (n*(n-1)*np.sum((X_j - Xm)**2)
            /(np.sum((np.ravel(X) - Xm)**2) - k * np.sum((X_i - Xm)**2))), 1

In [18]:
%%time
stats_sw = get_stats(shapiro,k,1000)  # Shapiro-Wilk stats
stats_da = get_stats(normaltest,k,1000)  # D'Agostino stats
stats_jb = get_stats(jarque_bera,k,1000)  # Jarkue-Bera stats

  "anyway, n=%i" % int(n))


CPU times: user 1.15 s, sys: 26.2 ms, total: 1.18 s
Wall time: 1.21 s


In [62]:
%%time
stats_friedman = get_mult_stats(friedmanchisquare, len(samples), len(samples[0]), 10000)
stats_page = get_mult_stats(my_page_stat,  len(samples), len(samples[0]), 10000)
stats_fisher= get_mult_stats(my_2_way_fisher,  len(samples), len(samples[0]), 10000)

CPU times: user 18.3 s, sys: 174 ms, total: 18.5 s
Wall time: 18.7 s


In [66]:
# adaptive criteries
run_criteries = [
    (lambda x: run_stats(x, shapiro, stats_sw))
    , (lambda x: run_stats(x, normaltest, stats_da))
    , (lambda x: run_stats(x, jarque_bera, stats_jb))
    , (lambda s: run_stats_pointer(s, friedmanchisquare, stats_friedman))
    , (lambda s: run_stats_pointer(s, my_page_stat, stats_page))
    , (lambda s: run_stats_pointer(s, my_2_way_fisher, stats_fisher))
    , (lambda s: f_oneway(*s)[1])
    , (lambda s: kruskal(*s)[1])
    , (lambda s: levene(*s)[1])
    , (lambda s: bartlett(*s)[1])
]

In [67]:
alphas = [0.05]
for a in alphas:
    p_values = []
    for x in samples:
        p_values.append(run_criteries[0](x))
        p_values.append(run_criteries[1](x))
        p_values.append(run_criteries[2](x))
    for ex in run_criteries[3:]:
        p_values.append(ex(samples))
    reject = multipletests(pvals=p_values, method='holm',alpha=a)[0]
    print('for alpha = ' + str(a) + ' : ' + str(np.sum(reject)) + ' simple hypotheses are rejected:')
    print('p_values: ' + str(p_values))
    print('reject_bits: ' + str(reject))
    print()

for alpha = 0.05 : 0 simple hypotheses are rejected:
p_values: [0.96199999999999997, 0.71399999999999997, 0.93399999999999994, 0.21199999999999997, 0.23000000000000001, 0.55200000000000005, 0.37200000000000011, 0.22600000000000001, 0.53600000000000003, 0.54800000000000004, 0.61799999999999999, 0.91400000000000003, 0.84800000000000009, 0.79200000000000004, 0.90199999999999991, 0.90000000000000002, 0.78600000000000003, 0.83600000000000008, 0.81000000000000005, 0.77200000000000002, 0.67599999999999993, 0.65999999999999992, 0.76400000000000001, 0.98999999999999999, 0.80600000000000005, 0.80600000000000005, 0.81200000000000006, 0.81000000000000005, 0.84999999999999998, 0.89599999999999991, 0.016199999999999992, 0.64840000000000009, 0.1472, 0.99989309029479412, 0.99939435998802173, 0.99987588945987049, 0.99996892301991724]
reject_bits: [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False

  "anyway, n=%i" % int(n))


Как видно, ничего не отвергается, 

следовательно считаем, что 

1)выборки распределены нормально на уровне 0.05, 

2)нужные дисперсии совпадают(то есть фишера можно применять),

3)остальные критерии говорят, что в рамках задачи двухфакторного дисперсионного анализа совпадение средних/дисперсий/эффектов обработки не отвергается