# Normality test

In [2]:
from scipy.stats import anderson, shapiro, kstest, norm, tstd
import numpy as np
from numba import jit

def percentil(x):
    return np.array([((i+1)-3/8)/(len(x)+1/4) for i in range(len(x))])

class NormalityTest:
    def __init__(self):
        pass

    def anderson_darling(self, data):
        ad = anderson(data, dist='norm')
        return ad.statistic
    
    def shapiro_wilk(self, data):
        sw =  shapiro(data)
        return sw.statistic, sw.pvalue
    
    def kolmogorov_smirnov(self, data):
        ks = kstest(rvs=data, cdf=norm.cdf)
        return ks.statistic, ks.pvalue
    
    def ryan_joiner(self, data):
        data = np.sort(data)
        z = norm.ppf(percentil(data))

        # critical value
        cv = lambda n: 1.0063 - 0.1288/np.sqrt(n) - 0.6118/n + 0.13505/n**2

        # test statistic
        r = np.sum(data*z)/np.sqrt( tstd(data)**2 * (len(data)-1) * np.sum(z**2) )
        r = 1/r
            
        return r
    

def p_value(test, data, N=10_000):
    R = np.zeros(N)
    r = test(data)

    for i in range(N):
        R[i] = test(
            np.random.normal(
                loc= np.mean(data),
                scale = np.std(data),
                size= len(data)
            )
        )

    p = len(R[R>r])/len(R)
    return p, R, r


Anderson-Darling,
Shapiro-Wilk,
Ryan-Joiner,
Kolmogorov-Smirnov.

# Data

In [3]:
import pandas as pd

data_excel = pd.ExcelFile('/home/ppiper/Dropbox/local/github/explain/data/DADOS DOE 1.xlsx')
data_excel.sheet_names

['GRÁFICO EFEITO',
 'GRÁFICO INTERAÇÕES',
 'TESTE NORMALIDADE',
 'TESTE t  AMSOTRA ÚNICA',
 'TESTE t DUAS AMOSTRAS',
 'TESTE t PAREADO']

In [4]:
data = data_excel.parse('TESTE NORMALIDADE', usecols='A').dropna()
#data = data_excel.parse('TESTE t  AMSOTRA ÚNICA', usecols='A').dropna()
data

Unnamed: 0,Dados
0,148.0
1,154.0
2,158.0
3,160.0
4,161.0
5,162.0
6,166.0
7,170.0
8,182.0
9,195.0


In [5]:
nt = NormalityTest()


In [6]:

nt.anderson_darling(data.values.flatten())

0.946771879598888

In [7]:
p_value(nt.anderson_darling, data.values.flatten())

(0.0096,
 array([0.323961  , 0.19446256, 0.22698405, ..., 0.58129747, 0.35870835,
        0.35040328]),
 0.946771879598888)

In [8]:
nt.kolmogorov_smirnov(data.values.flatten())

(1.0, 0.0)

In [70]:
from scipy.stats import t
kstest(data.values.flatten(),
       #cdf = norm.cdf, 
       #cdf = np.random.normal(
       #         loc= 0,
       #         scale = 1,
       #         size= len(data.values.flatten()))
       cdf = norm.cdf(data.values.flatten(), loc=np.mean(data.values.flatten()), scale=np.std(data.values.flatten()))
       #args=(np.mean(data.values.flatten()), np.std(data.values.flatten())), 
       #args = (
       #    np.mean(data.values.flatten()),
       #    np.std( data.values.flatten(),         
       #       #ddof=len(data.values.flatten())-1
       #    ),
       #),
       #cdf = norm.cdf, #args=(np.mean(data.values.flatten()), np.std(data.values.flatten())),
       #"t",
       #(len(data.values.flatten()),),
      )

KstestResult(statistic=1.0, pvalue=2.835142154027603e-06, statistic_location=0.996428690478421, statistic_sign=-1)

In [66]:
t.cdf(data.values.flatten(), len(data.values.flatten())-1, loc=np.mean(data.values.flatten()), scale=np.std(data.values.flatten()))

array([1.79346798e-10, 2.99797404e-09, 3.38617650e-08, 1.46070441e-07,
       3.29615568e-07, 7.94776588e-07, 6.60544302e-05, 3.66940174e-02,
       9.99999205e-01, 1.00000000e+00, 1.00000000e+00])

In [38]:
# pvalue=0.006703833118081093
nt.shapiro_wilk(data.values.flatten())


(0.7888147830963135, 0.006703833118081093)

In [39]:
rj = nt.ryan_joiner(data.values.flatten())
rj

1.1385844985498723

In [40]:
p_value(nt.ryan_joiner, data.values.flatten())

(0.0074,
 array([1.03722052, 1.04432793, 1.02443816, ..., 1.06013863, 1.05463379,
        1.01571637]),
 1.1385844985498723)