# scipy/scipy

### Subversion checkout URL

You can clone with
or
.

BUG: stats: Merge pull request #87 (handle array_like in kruskal, imp…

`…rove code in kruskal and f_oneway)`
commit ea3d301172f6cdf1f6703ec1ae1ffcedec7124ec 2 parents 0496569 + 9807ac1
Warren Weckesser authored
Showing with 92 additions and 32 deletions.
1. +32 −32 scipy/stats/stats.py
2. +60 −0 scipy/stats/tests/test_stats.py
64 scipy/stats/stats.py
 @@ -2152,26 +2152,25 @@ def f_oneway(*args): .. [2] Heiman, G.W. Research Methods in Statistics. 2002. """ - na = len(args) # ANOVA on 'na' groups, each in it's own array - tmp = map(np.array,args) + args = map(np.asarray, args) # convert to an numpy array + na = len(args) # ANOVA on 'na' groups, each in it's own array alldata = np.concatenate(args) bign = len(alldata) - sstot = ss(alldata)-(square_of_sums(alldata)/float(bign)) + sstot = ss(alldata) - (square_of_sums(alldata) / float(bign)) ssbn = 0 for a in args: - ssbn = ssbn + square_of_sums(array(a))/float(len(a)) - ssbn = ssbn - (square_of_sums(alldata)/float(bign)) - sswn = sstot-ssbn - dfbn = na-1 + ssbn += square_of_sums(a) / float(len(a)) + ssbn -= (square_of_sums(alldata) / float(bign)) + sswn = sstot - ssbn + dfbn = na - 1 dfwn = bign - na - msb = ssbn/float(dfbn) - msw = sswn/float(dfwn) - f = msb/msw - prob = fprob(dfbn,dfwn,f) + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + f = msb / msw + prob = fprob(dfbn, dfwn, f) return f, prob - def pearsonr(x, y): """Calculates a Pearson correlation coefficient and the p-value for testing non-correlation. @@ -3548,30 +3547,31 @@ def kruskal(*args): .. [1] http://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance """ - if len(args) < 2: + args = map(np.asarray, args) # convert to a numpy array + na = len(args) # Kruskal-Wallis on 'na' groups, each in it's own array + if na < 2: raise ValueError("Need at least two groups in stats.kruskal()") - n = map(len,args) - all = [] - for i in range(len(args)): - all.extend(args[i].tolist()) - ranked = list(rankdata(all)) - T = tiecorrect(ranked) - args = list(args) - for i in range(len(args)): - args[i] = ranked[0:n[i]] - del ranked[0:n[i]] - rsums = [] - for i in range(len(args)): - rsums.append(np.sum(args[i],axis=0)**2) - rsums[i] = rsums[i] / float(n[i]) - ssbn = np.sum(rsums,axis=0) - totaln = np.sum(n,axis=0) - h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1) - df = len(args) - 1 + n = np.asarray(map(len, args)) + + alldata = np.concatenate(args) + + ranked = rankdata(alldata) # Rank the data + T = tiecorrect(ranked) # Correct for ties if T == 0: raise ValueError('All numbers are identical in kruskal') + + # Compute sum^2/n for each group and sum + j = np.insert(np.cumsum(n), 0, 0) + ssbn = 0 + for i in range(na): + ssbn += square_of_sums(ranked[j[i]:j[i+1]]) / float(n[i]) + + totaln = np.sum(n) + h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) + df = na - 1 h = h / float(T) - return h, chisqprob(h,df) + return h, chisqprob(h, df) + def friedmanchisquare(*args):
60 scipy/stats/tests/test_stats.py
 @@ -1873,5 +1873,65 @@ def test_basic(self): # result in F being exactly 2.0. assert_equal(F, 2.0) + +class TestKruskal(TestCase): + + def test_simple(self): + """A really simple case for stats.kruskal""" + x = [1] + y = [2] + h, p = stats.kruskal(x, y) + assert_equal(h, 1.0) + assert_approx_equal(p, stats.chisqprob(h, 1)) + h, p = stats.kruskal(np.array(x), np.array(y)) + assert_equal(h, 1.0) + assert_approx_equal(p, stats.chisqprob(h, 1)) + + def test_basic(self): + """A basic test, with no ties.""" + x = [1, 3, 5, 7, 9] + y = [2, 4, 6, 8, 10] + h, p = stats.kruskal(x, y) + assert_approx_equal(h, 3./11, significant=10) + assert_approx_equal(p, stats.chisqprob(3./11, 1)) + h, p = stats.kruskal(np.array(x), np.array(y)) + assert_approx_equal(h, 3./11, significant=10) + assert_approx_equal(p, stats.chisqprob(3./11, 1)) + + def test_simple_tie(self): + """A simple case with a tie.""" + x = [1] + y = [1, 2] + h_uncorr = 1.5**2 + 2*2.25**2 - 12 + corr = 0.75 + expected = h_uncorr / corr # 0.5 + h, p = stats.kruskal(x, y) + # Since the expression is simple and the exact answer is 0.5, it + # should be safe to use assert_equal(). + assert_equal(h, expected) + + def test_another_tie(self): + """Another test of stats.kruskal with a tie.""" + x = [1, 1, 1, 2] + y = [2, 2, 2, 2] + h_uncorr = (12. / 8. / 9.) * 4 * (3**2 + 6**2) - 3 * 9 + corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8) + expected = h_uncorr / corr + h, p = stats.kruskal(x, y) + assert_approx_equal(h, expected) + + def test_three_groups(self): + """A test of stats.kruskal with three groups, with ties.""" + x = [1, 1, 1] + y = [2, 2, 2] + z = [2, 2] + h_uncorr = (12. / 8. / 9.) * (3*2**2 + 3*6**2 + 2*6**2) - 3 * 9 # 5.0 + corr = 1 - float(3**3 - 3 + 5**3 - 5) / (8**3 - 8) + expected = h_uncorr / corr # 7.0 + h, p = stats.kruskal(x, y, z) + assert_approx_equal(h, expected) + assert_approx_equal(p, stats.chisqprob(h, 2)) + + if __name__ == "__main__": run_module_suite()