Skip to content

Commit

Permalink
Python chi-squared test: chi2(), chi2p(), gammai(), erfc().
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom De Smedt committed May 28, 2012
1 parent 375de6b commit a57c89e
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 5 deletions.
157 changes: 153 additions & 4 deletions pattern/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://www.clips.ua.ac.be/pages/pattern

from time import time
from math import sqrt, floor, modf
from math import sqrt, floor, modf, exp, pi, log

#### PROFILER ######################################################################################

Expand Down Expand Up @@ -404,9 +404,158 @@ def C(n, k):

fisher_test = fisher_exact_test

FISHER = "fisher"
FISHER, CHI2 = "fisher", "chi2"
def significance(*args, **kwargs):
""" Returns the significance for the given test (FISHER).
""" Returns the significance for the given test (FISHER, CHI2).
"""
if kwargs.get("test", FISHER) == FISHER:
test = kwargs.pop("test", FISHER)
if test == FISHER:
return fisher_exact_test(*args, **kwargs)
if test == CHI2:
return pearson_chi_squared_test(*args, **kwargs)[1]

#--- PEARSON'S CHI-SQUARED TEST --------------------------------------------------------------------

LOWER = "lower"
UPPER = "upper"

def pearson_chi_squared_test(observed=[], expected=[], df=None, tail=UPPER):
""" Returns (X2, P) for the given observed and expected data, containing absolute frequencies.
If expected is None, an equal distribution over all classes is assumed.
P < 0.05: significant
"""
# The P-value (upper tail area) is obtained from the incomplete gamma integral:
# P(X2 | v) = igf(v/2, x/2) with v degrees of freedom.
# See: Cephes, https://github.com/scipy/scipy/blob/master/scipy/special/cephes/chdtr.c
O = observed
E = expected
if not df:
df = len(O) - 1
if not E:
s = float(sum(O))
E = [s / (len(O) or 1)] * len(O)
X2 = sum((O[i] - E[i]) ** 2.0 / E[i] for i in range(len(O)))
P = gammai(df * 0.5, X2 * 0.5, tail)
return (X2, P)

chi2 = chi_squared = pearson_chi_squared_test

def chi2p(X2, df=1, tail=UPPER):
""" Returns P-value for given X2 and degrees of freedom.
"""
return gammai(df * 0.5, X2 * 0.5, tail)

#o1, e1 = [44,56], [50,50]
#o2, e2 = [20,20,0,0], [10,10,10,10]
#assert round(chi_squared(o1, e1)[0], 4) == 1.4400
#assert round(chi_squared(o1, e1)[1], 4) == 0.2301
#assert round(chi_squared(o2, e2)[0], 4) == 40.0000
#assert round(chi_squared(o2, e2)[1], 12) == 1.0655e-08

#### SPECIAL FUNCTIONS #############################################################################

#--- INCOMPLETE GAMMA FUNCTION ---------------------------------------------------------------------
# Based on: http://www.johnkerl.org/python/sp_funcs_m.py.txt, Tom Loredo (2009)
# See also: http://www.mhtl.uwaterloo.ca/courses/me755/web_chap1.pdf

def gammaln(x):
""" Returns the logarithm of the gamma function.
"""
x = x - 1.0
y = x + 5.5
y = (x + 0.5) * log(y) - y
n = 1.0
for i in range(6):
x += 1
n += (76.18009173, -86.50532033, 24.01409822, -1.231739516e0, 0.120858003e-2, -0.536382e-5)[i] / x
return y + log(2.50662827465 * n)

def gamma(x):
return exp(gammaln(x))

def gammai(a, x, tail=UPPER):
""" Returns the incomplete gamma function for LOWER or UPPER tail.
"""

# Series approximation to the incomplete gamma function.
def gs(a, x, epsilon=3.e-7, iterations=700):
ln = gammaln(a)
s = 1.0 / a
d = 1.0 / a
for i in range(1, iterations):
d = d * x / (a + i)
s = s + d
if abs(d) < abs(s) * epsilon:
return (s * exp(-x + a * log(x) - ln), ln)
raise StopIteration, (abs(d), abs(s) * epsilon)

# Continued fraction approximation of the incomplete gamma function.
def gcf(a, x, epsilon=3.e-7, iterations=200):
ln = gammaln(a)
g0 = 0.0
a0 = 1.0
b0 = 0.0
a1 = x
b1 = 1.0
fac = 1.0
for i in range(1, iterations):
a0 = (a1 + a0 * (i - a)) * fac
b0 = (b1 + b0 * (i - a)) * fac
a1 = x * a0 + a1 * i * fac
b1 = x * b0 + b1 * i * fac
if a1 != 0.0:
fac = 1.0 / a1
g = b1 * fac
if abs((g - g0) / g) < epsilon:
return (g * exp(-x + a * log(x) - ln), ln)
g0 = g
raise StopIteration, (abs((g-g0) / g))

if x < 0.0:
raise ValueError, "x < 0.0"
if a <= 0.0:
raise ValueError, "a <= 0.0"
if x < a + 1:
if tail == LOWER:
return gs(a, x)[0]
return 1 - gs(a, x)[0]
else:
if tail == UPPER:
return gcf(a, x)[0]
return 1 - gcf(a, x)[0]

#--- COMPLEMENTARY ERROR FUNCTION ------------------------------------------------------------------

def erfc(x):
""" Complementary error function.
"""
z = abs(x)
t = 1 / (1 + 0.5 * z)
r = 0
for y in [
0.17087277,
-0.82215223,
1.48851587,
-1.13520398,
0.27886807,
-0.18628806,
0.09678418,
0.37409196,
1.00002368,
-1.26551223]:
r = y + t * r
r = t * exp(-z*z + r)
if x >= 0:
return r
return 2 - r

def cdf(x, mu=0, sigma=1):
""" Cumulative distribution function.
"""
return min(1.0, 0.5 * erfc((-x + mu) / (sigma * 2**0.5)))

def pdf(x, mu=0, sigma=1):
""" Probability density function.
"""
u = (x - mu) / abs(sigma)
return (1 / sqrt(2*pi) * abs(sigma)) * exp(-u*u / 2)
49 changes: 48 additions & 1 deletion test/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,53 @@ def test_fisher_test(self):
self.assertAlmostEqual(v, 0.0028, places=4)
v = metrics.fisher_exact_test(a=45, b=15, c=75, d=45)
self.assertAlmostEqual(v, 0.1307, places=4)
print "pattern.mettrics.fisher_test()"
print "pattern.metrics.fisher_test()"

def test_chi_squared(self):
# Assert chi-squared test (upper tail).
o1, e1 = [44,56], [50,50]
o2, e2 = [20,20,0,0], [10,10,10,10]
v1 = metrics.chi2(o1, e1)
v2 = metrics.chi2(o2, e2)
self.assertAlmostEqual(v1[0], 1.4400, places=4)
self.assertAlmostEqual(v1[1], 0.2301, places=4) # Tests gammai.gs().
self.assertAlmostEqual(v2[0], 40.0000, places=4)
self.assertAlmostEqual(v2[1], 1.0655e-08, places=4) # Tests gammai.gcf().
print "pattern.metrics.chi2()"

def test_chi_squared_p(self):
# Assert chi-squared P-value (upper tail).
for df, X2 in [
(1, ( 3.85, 5.05, 6.65, 7.90)),
(2, ( 6.00, 7.40, 9.25, 10.65)),
(3, ( 7.85, 9.40, 11.35, 12.85)),
(4, ( 9.50, 11.15, 13.30, 14.90)),
(5, (11.10, 12.85, 15.10, 16.80))]:
for i, x2 in enumerate(X2):
v = metrics.chi2p(x2, df, tail=metrics.UPPER)
self.assertTrue(v < (0.05, 0.025, 0.01, 0.005)[i])
print "pattern.metrics.chi2p()"

class TestSpecialFunctions(unittest.TestCase):

def setUp(self):
pass

def test_erfc(self):
# Assert complementary error function.
for x, y in [
(-3.00, 2.000),
(-2.00, 1.995),
(-1.00, 1.843),
(-0.50, 1.520),
(-0.25, 1.276),
( 0.00, 1.000),
( 0.25, 0.724),
( 0.50, 0.480),
( 1.00, 0.157),
( 2.00, 0.005),
( 3.00, 0.000)]:
self.assertAlmostEqual(metrics.erfc(x), y, places=3)

#---------------------------------------------------------------------------------------------------

Expand All @@ -254,6 +300,7 @@ def suite():
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestProfiling))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestStringFunctions))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestStatistics))
suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestSpecialFunctions))
return suite

if __name__ == "__main__":
Expand Down

0 comments on commit a57c89e

Please sign in to comment.