In [1]:
import numpy as np
from scipy import stats
import pandas as pd

# Permutation Test

* is nonparametric statistics
* you can test anything with it


## In this example we test if two groups have different means

In [2]:
s1 = np.array([63.8, 56.4, 55.2, 58.5, 64.0, 51.6, 54.6, 71.0])
s2 = np.array([75.5, 83.9, 75.7, 72.5, 56.2, 73.4, 67.7, 87.9])

# obsorved difference
delta = abs(s1.mean() - s2.mean())

pooled = np.hstack([s1, s2])

n_permutations = 1000000
extreme_count = 0

for _ in range(n_permutations):
    # just to get the same result
    #np.random.seed(1)
    pooled = np.random.permutation(pooled)
    diff = abs(pooled[:len(s1)].mean() - pooled[len(s2):].mean())
    # count how many are more extreme then the obsorved difference
    if diff >= delta:
        extreme_count += 1

# calculate the p-value
pvalue = extreme_count / n_permutations
pvalue

0.00366

## Vectorized version
This method is faster. The calculation of permutation is split on memory chunks.

In [89]:
import math

s1 = np.array([63.8, 56.4, 55.2, 58.5, 64.0, 51.6, 54.6, 71.0])
s2 = np.array([75.5, 83.9, 75.7, 72.5, 56.2, 73.4, 67.7, 87.9])

n_permutations = 10000000

block_bytes = 298435456 # quarter of a gigabyte


# obsorved difference
delta = abs(s1.mean() - s2.mean())

pooled = np.hstack([s1, s2]).astype(np.float32)


bytes_per_float = np.float64(0).nbytes
permutations_per_block = block_bytes // (bytes_per_float * len(pooled))
if permutations_per_block > n_permutations:
    permutations_per_block = n_permutations

n_blocks = (bytes_per_float * len(pooled) * n_permutations) / block_bytes
n_blocks = math.ceil(n_blocks)

extreme_count = 0
n_permutations = 0
for _ in range(n_blocks):
    matrix = pd.DataFrame(np.random.rand(permutations_per_block, len(pooled)))
    matrix = matrix.rank(axis=1, method='first').astype(np.int64) - 1
    matrix = pooled[matrix]

    # the calculations
    # this could be replaced with median, std or other
    diff = matrix[:,:len(s1)].mean(axis=1) - matrix[:,len(s1):].mean(axis=1)
    extreme_count += (np.abs(diff) > delta).sum()
    n_permutations += permutations_per_block
# calculate the p-value
pvalue = extreme_count / n_permutations
pvalue

0.0038158683129125249

Same with t-test

In [88]:
tstat, pvalue = stats.ttest_ind(s1, s2)
print('test statisic: ', tstat)
print('pvalue:', pvalue)
print('The probability that the two distributions have equal means is ', pvalue)

test statisic:  -3.58752154254
pvalue: 0.00297115145528
The probability that the two distributions have equal means is  0.00297115145528
