# Tests of Variability

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.graphics.api as smg

# Apply seaborn's 'whitegrid' style to plots drawn after this point.
sns.set_style('whitegrid')

In [2]:
def chi2_1samp_var_stats(s2, sigma2, n, alternative='two-sided'):
    '''
    Run the one-sample chi-squared test for a variance from summary
    statistics. Use chi2_1samp_var_data if you have raw data rather
    than summary statistics.

    The test statistic is (n-1) * s2 / sigma2, compared against a
    chi-squared distribution with n-1 degrees of freedom.

    Inputs:
    s2     : the observed variance
    sigma2 : the null hypothesized variance
    n      : the sample size
    alternative : the alternative hypothesis ('two-sided', 'greater', 'less')

    Returns:
    test   : the chi-squared test statistic
    pval   : the p-value for the test

    Raises:
    ValueError : if alternative is not 'two-sided', 'greater', or 'less'
    '''
    import scipy.stats as stats

    # Reject typos such as 'two_sided' instead of silently running the
    # two-sided test, which would report a p-value for the wrong hypothesis.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'two-sided', 'greater', or 'less'; "
            f"got {alternative!r}"
        )

    dist = stats.chi2(df=n-1)
    test = (n-1) * s2 / sigma2
    if alternative == 'less':
        pval = dist.cdf(test)
    elif alternative == 'greater':
        pval = dist.sf(test)
    else:
        # Two-sided: double the smaller of the two tail probabilities.
        pval = 2 * min(dist.cdf(test), dist.sf(test))
    return test, pval

# $\chi^2$ Test

In [3]:
def chi2_1samp_var_data(data, sigma2, alternative='two-sided'):
    '''
    Run the one-sample chi-squared test for a variance from raw data.
    Use chi2_1samp_var_stats if you know the summary values but do not
    have the raw data.

    The sample variance is computed with ddof=1 and the sample size is
    len(data); the statistic (n-1) * s2 / sigma2 is compared against a
    chi-squared distribution with n-1 degrees of freedom.

    Inputs:
    data   : the observed data
    sigma2 : the null hypothesized variance
    alternative : the alternative hypothesis ('two-sided', 'greater', 'less')

    Returns:
    test   : the chi-squared test statistic
    pval   : the p-value for the test

    Raises:
    ValueError : if alternative is not 'two-sided', 'greater', or 'less',
                 or if data has fewer than two observations
    '''
    import scipy.stats as stats
    import numpy as np

    # Reject typos such as 'two_sided' instead of silently running the
    # two-sided test, which would report a p-value for the wrong hypothesis.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'two-sided', 'greater', or 'less'; "
            f"got {alternative!r}"
        )

    n = len(data)
    # With fewer than two observations the ddof=1 variance is undefined
    # and the test would have zero (or negative) degrees of freedom.
    if n < 2:
        raise ValueError(f"data must contain at least 2 observations; got {n}")

    s2 = np.var(data, ddof=1)
    dist = stats.chi2(df=n-1)
    test = (n-1) * s2 / sigma2
    if alternative == 'less':
        pval = dist.cdf(test)
    elif alternative == 'greater':
        pval = dist.sf(test)
    else:
        # Two-sided: double the smaller of the two tail probabilities.
        pval = 2 * min(dist.cdf(test), dist.sf(test))
    return test, pval

In [18]:
# Worked example from summary statistics: null sd 1.3, observed sd 1.4,
# sample size 31, upper-tailed alternative.
sigma2, s2, n = 1.3**2, 1.4**2, 31
alternative = 'greater'
chi2_1samp_var_stats(s2=s2, sigma2=sigma2, n=n, alternative=alternative)

(34.792899408284015, 0.2502540602893593)

In [10]:
# Upper-tail critical value at the 5% level for a chi-squared distribution
# with n-1 degrees of freedom (n comes from the preceding example cell).
# NOTE(review): the recorded output (about 123.2) does not look like the
# df=30 critical value (about 43.8); it appears to come from an earlier run
# with a different n. Re-run the notebook top to bottom to refresh it.
dist = stats.chi2(df=n-1)
dist.isf(q=0.05)

123.22522145336181

In [14]:
# Simulate 31 normal observations with sd 1.4 and test them against the
# null variance sigma2 set in the summary-statistics example cell.
# A seeded generator makes the draw (and the printed results) reproducible
# under Restart & Run All.
rng = np.random.default_rng(14)
data = stats.norm(scale=1.4).rvs(size=31, random_state=rng)
print(np.std(data,ddof=1))
print(chi2_1samp_var_data(data=data, sigma2=sigma2, alternative='greater'))

1.5359959667527208
(41.88077413989276, 0.07327248826746881)


# F Test

In [19]:
def F_2samp_var_stats(var1, var2, n1, n2, alternative='two-sided'):
    '''
    Run the two-sample F test for variances from summary statistics.
    Use F_2samp_var_data if you have raw data.

    The test statistic is var1 / var2, compared against an F
    distribution with n1-1 numerator and n2-1 denominator degrees of
    freedom.

    Inputs:
    var1     : the observed variance for sample 1
    var2     : the observed variance for sample 2
    n1       : the sample size for sample 1
    n2       : the sample size for sample 2
    alternative : the alternative hypothesis ('two-sided', 'greater', 'less')

    Returns:
    test   : the F test statistic
    pval   : the p-value for the test

    Raises:
    ValueError : if alternative is not 'two-sided', 'greater', or 'less'
    '''
    import scipy.stats as stats

    # Reject typos such as 'two_sided' instead of silently running the
    # two-sided test, which would report a p-value for the wrong hypothesis.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'two-sided', 'greater', or 'less'; "
            f"got {alternative!r}"
        )

    dist = stats.f(dfn=n1-1, dfd=n2-1)
    test = var1/var2
    if alternative == 'less':
        pval = dist.cdf(test)
    elif alternative == 'greater':
        pval = dist.sf(test)
    else:
        # Two-sided: double the smaller of the two tail probabilities.
        pval = 2 * min(dist.cdf(test), dist.sf(test))
    return test, pval

In [20]:
def F_2samp_var_data(data1, data2, alternative='two-sided'):
    '''
    Run the two-sample F test for variances from raw data. Use
    F_2samp_var_stats if you have summary statistics.

    Each sample variance is computed with ddof=1 and each sample size
    is the len() of the sample; the statistic var1 / var2 is compared
    against an F distribution with n1-1 numerator and n2-1 denominator
    degrees of freedom.

    Inputs:
    data1   : sample 1
    data2   : sample 2
    alternative : the alternative hypothesis ('two-sided', 'greater', 'less')

    Returns:
    test   : the F test statistic
    pval   : the p-value for the test

    Raises:
    ValueError : if alternative is not 'two-sided', 'greater', or 'less',
                 or if either sample has fewer than two observations
    '''
    import scipy.stats as stats
    import numpy as np

    # Reject typos such as 'two_sided' instead of silently running the
    # two-sided test, which would report a p-value for the wrong hypothesis.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'two-sided', 'greater', or 'less'; "
            f"got {alternative!r}"
        )

    n1 = len(data1)
    n2 = len(data2)
    # With fewer than two observations the ddof=1 variance is undefined
    # and the F distribution would have non-positive degrees of freedom.
    if n1 < 2 or n2 < 2:
        raise ValueError(
            "each sample must contain at least 2 observations; "
            f"got {n1} and {n2}"
        )

    var1 = np.var(data1, ddof=1)
    var2 = np.var(data2, ddof=1)
    dist = stats.f(dfn=n1-1, dfd=n2-1)
    test = var1/var2
    if alternative == 'less':
        pval = dist.cdf(test)
    elif alternative == 'greater':
        pval = dist.sf(test)
    else:
        # Two-sided: double the smaller of the two tail probabilities.
        pval = 2 * min(dist.cdf(test), dist.sf(test))
    return test, pval

In [36]:
# Summary statistics for the two-sample F test example:
# sample variances, sample sizes, and the alternative hypothesis.
var1, var2 = 5, 8
n1, n2 = 15, 18
alternative = 'two-sided'

In [37]:
# Run the F test on the summary statistics set in the preceding cell;
# the cell displays the (test statistic, p-value) tuple.
F_2samp_var_stats(var1=var1, var2=var2, n1=n1, n2=n2, alternative=alternative)

(0.625, 0.37950927903309983)

In [23]:
# Lower critical value: the 2.5th percentile of the F(n1-1, n2-1) distribution.
stats.f(dfn=n1-1, dfd=n2-1).ppf(0.025)

0.34479685483195366

In [24]:
# Upper critical value: the point with 2.5% of the F(n1-1, n2-1) distribution above it.
stats.f(dfn=n1-1, dfd=n2-1).isf(0.025)

2.7526407074274255

In [25]:
# Simulate two normal samples with variances 5 and 8 and compare them.
# A single seeded generator is shared by both draws so the cell is
# reproducible under Restart & Run All while the two samples stay
# independent (reusing the same integer seed for each draw would not).
rng = np.random.default_rng(25)
data1 = stats.norm(loc=65, scale=math.sqrt(5)).rvs(size=15, random_state=rng)
data2 = stats.norm(loc=75, scale=math.sqrt(8)).rvs(size=18, random_state=rng)
F_2samp_var_data(data1, data2, alternative='two-sided')

(0.3293450651488108, 0.041342236341834857)

---
# Progress Check

In [42]:
# Load the whitespace-delimited anorexia data set. The separator is a
# regular expression, so it is written as a raw string: '\s' in a normal
# string literal is an invalid escape sequence and triggers a warning.
ano = pd.read_csv('Anorexia-1.dat', sep=r'\s+')
ano

Unnamed: 0,subject,therapy,before,after
0,1,cb,80.5,82.2
1,2,cb,84.9,85.6
2,3,cb,81.5,81.4
3,4,cb,82.6,81.9
4,5,cb,79.9,76.4
...,...,...,...,...
67,68,c,84.4,84.7
68,69,c,79.6,81.4
69,70,c,77.5,81.2
70,71,c,72.3,88.2


In [44]:
# Weight change per subject, then the change scores for the 'cb' and 'c'
# therapy groups.
ano['difference'] = ano['after'].sub(ano['before'])
cbg = ano.loc[ano['therapy'] == 'cb', 'difference']
cg = ano.loc[ano['therapy'] == 'c', 'difference']

# Group sizes and sample variances (pandas .var() uses ddof=1 by default)
# for the two-sample F test.
n1, n2 = len(cbg), len(cg)
var1, var2 = cbg.var(), cg.var()
alternative = 'two-sided'

In [45]:
# F test on the group summary statistics computed in the preceding cell.
F_2samp_var_stats(var1=var1, var2=var2, n1=n1, n2=n2, alternative=alternative)

(0.836959238933647, 0.6448601708323876)

In [46]:
# Same F test run directly on the two groups' raw change scores.
F_2samp_var_data(data1=cbg, data2=cg, alternative=alternative)

(0.836959238933647, 0.6448601708323876)

In [53]:
# One-sample chi-squared test on the weight changes of all subjects:
# null standard deviation 6.5, upper-tailed alternative.
# Requires the 'difference' column added to ano in an earlier cell.
n = len(ano)
sigma = 6.5
s = ano['difference'].std()
sigma2 = sigma**2
s2 = s**2
# NOTE(review): alpha is assigned here but not used by the call below.
alpha = 0.05
alternative = 'greater'

chi2_1samp_var_stats(s2=s2, sigma2=sigma2, n=n, alternative=alternative)

(107.10973044049969, 0.003639258387666616)