# statistics notes

Ryan Reece <ryan@cerebras.net>        
created: 2018-10-25

In [1]:
import math
import numpy as np
from scipy import stats

## What is &chi;2?

$$
\chi^2 = \sum_i \frac{(x_i -\mu_i)^2}{\sigma_i}
$$

## A/B testing example I worked at Insight

This example is taken from [here](https://github.com/rreece/insight-workarea/blob/master/data-challenges/a-b-testing/confidence-intervals-and-a-b-testing.ipynb).

In [2]:
N_A1 = 976.  # not converted, sample A
N_A2 =  43.  # converted, sample A
N_B1 = 992.  # not converted, sample B
N_B2 =  34.  # converted, sample B

obs = [[N_A1, N_A2], [N_B1, N_B2]]
chi2, p, dof, ex = stats.chi2_contingency(obs, correction=False)
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))
print('dof  = %.5g' % (dof))
print('expected = %s' % (ex))

chi2 = 1.1581
p    = 0.28186
dof  = 1
expected = [[980.63178484  38.36821516]
 [987.36821516  38.63178484]]


In [3]:
#______________________________________________________________________________
def a_b_test_chi2(N_A1, N_A2, N_B1, N_B2):
    N_A = N_A1 + N_A2
    N_B = N_B1 + N_B2
    N_1 = N_A1 + N_B1
    N_2 = N_A2 + N_B2
    rho = N_2/(N_1+N_2)
    chi2 = ((N_A1 - N_A*(1-rho))**2)/(N_A*(1-rho)) + \
           ((N_A2 - N_A*rho)**2)/(N_A*rho) + \
           ((N_B1 - N_B*(1-rho))**2)/(N_B*(1-rho)) + \
           ((N_B2 - N_B*rho)**2)/(N_B*rho)
    ## checking the chi2 term-by-term:
    #print '%.5g' % (((N_A1 - N_A*(1-rho))**2)/(N_A*(1-rho)))
    #print '%.5g' % (((N_A2 - N_A*rho)**2)/(N_A*rho))
    #print '%.5g' % (((N_B1 - N_B*(1-rho))**2)/(N_B*(1-rho)))
    #print '%.5g' % (((N_B2 - N_B*rho)**2)/(N_B*rho))
    return chi2
        
chi2 = a_b_test_chi2(N_A1, N_A2, N_B1, N_B2)
print('chi2 = %.5g' % (chi2))

chi2 = 1.1581


## scipy.stats

Trying [`scipy.stats.chisquare`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html) and [`scipy.stats.chi2_contingency`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html).

In [4]:
chi2, p = stats.chisquare([N_A1, N_A2], [N_B1, N_B2], ddof=0)
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))

chi2 = 2.6404
p    = 0.10418


In [5]:
def my_chi2(xs, ms, ss=None):
    assert isinstance(xs, list)
    assert isinstance(ms, list)
    assert len(xs) == len(ms)
    if ss is None:
        ss = [ math.sqrt(m) for m in ms ]
    assert isinstance(ss, list)
    chi2 = sum([ (x-m)*(x-m)/(s*s) for x, m, s in zip(xs, ms, ss) ])
    return chi2

In [6]:
chi2 = my_chi2([N_A1, N_A2], [N_B1, N_B2])
print('chi2 = %.5g' % (chi2))

chi2 = 2.6404


Note that `stats.chi2_contingency` gives a different $\chi^2$.  This is because it calculates $\mu_i$, the expected shape in each bin, from the combined data of the two given distributions (instead of taking the later one directly as $\mu_i$).

In [9]:
obs = np.array([[N_A1, N_A2], [N_B1, N_B2]])

In [11]:
chi2, p, dof, ex = stats.chi2_contingency(obs, correction=False)
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))
print('dof  = %.5g' % (dof))
print('expected = %s' % (ex))

chi2 = 1.1581
p    = 0.28186
dof  = 1
expected = [[980.63178484  38.36821516]
 [987.36821516  38.63178484]]


In [19]:
def my_chi2b(xs, ms, ss=None):
    assert isinstance(xs, list)
    assert isinstance(ms, list)
    assert len(xs) == len(ms)
    nx = sum(xs)
    nm = sum(ms)
    n = nm + nx
    es1  = [ nx*(x+m)/n for x, m in zip(xs, ms) ]
    es2  = [ nm*(x+m)/n for x, m in zip(xs, ms) ]
    es = list(es1)
    es.extend(es2)
    bs = list(xs)
    bs.extend(ms)
    if ss is None:
        ss = [ math.sqrt(m) for m in es ]
    assert isinstance(ss, list)
    chi2 = sum([ (x-m)*(x-m)/(s*s) for x, m, s in zip(bs, es, ss) ])
    p = 0 # TODO
    dof = len(xs) - 1
    ex = es # HACK
    return chi2, p, dof, ex

In [20]:
chi2, p, dof, ex = my_chi2b([N_A1, N_A2], [N_B1, N_B2])
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))
print('dof  = %.5g' % (dof))
print('expected = %s' % (ex))

chi2 = 1.1581
p    = 0
dof  = 1
expected = [980.6317848410758, 38.36821515892421, 987.3682151589242, 38.63178484107579]


In [22]:
chi2, p = stats.chisquare([12, 22], [11, 25])
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))

chi2 = 0.45091
p    = 0.5019


In [23]:
chi2, p = stats.chisquare([12, 22], [25, 2])
print('chi2 = %.5g' % (chi2))
print('p    = %.5g' % (p))

chi2 = 206.76
p    = 6.9947e-47


## See also

-   [en.wikipedia.org/wiki/Chi-squared_test](https://en.wikipedia.org/wiki/Chi-squared_test)
-   [en.wikipedia.org/wiki/Chi-squared_distribution](https://en.wikipedia.org/wiki/Chi-squared_distribution)
-   [physics.ucsc.edu/~drip/133/ch4.pdf](http://physics.ucsc.edu/~drip/133/ch4.pdf)
-   [scipy.stats.chisquare](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html)
-   [scipy.stats.chi2_contingency](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)
-   [home.ubalt.edu/ntsbarsh/Business-stat/StatistialTables.pdf](https://home.ubalt.edu/ntsbarsh/Business-stat/StatistialTables.pdf)