In [9]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy import stats

In [6]:
# Load the tab-separated AUC table; the first column becomes the row index.
data = pd.read_csv(
    'data/AUCs.txt',
    sep='\t',
    index_col=0,
)

In [7]:
# Show the loaded AUC table so it can be inspected before the pairwise tests.
data

Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
adult (sample),0.763,0.768,0.771,0.798
breast cancer,0.599,0.591,0.59,0.569
breast cancer wisconsin,0.954,0.971,0.968,0.967
cmc,0.628,0.661,0.654,0.657
ionosphere,0.882,0.888,0.886,0.898
iris,0.936,0.931,0.916,0.931
liver disorders,0.661,0.668,0.609,0.685
lung cancer,0.583,0.583,0.563,0.625
lymphography,0.775,0.838,0.866,0.875
mushroom,1.0,1.0,1.0,1.0


In [49]:
# Pairwise Wilcoxon signed-rank tests between every unordered pair of columns
# in `data`. Each entry of `wilcoxon` is [col_i, col_j, p-value, statistic].
wilcoxon = []
column_names = list(data.columns)
for i, col_i in enumerate(column_names):
    # Start at i + 1 so each unordered pair is tested exactly once and a
    # column is never compared with itself.
    for col_j in column_names[i + 1:]:
        # Run the test once and read both fields from the same result object
        # instead of repeating the identical computation for each field.
        result = stats.wilcoxon(data[col_i], data[col_j])
        wilcoxon.append([col_i, col_j, result.pvalue, result.statistic])

In [52]:
# Tabulate the pairwise test results, then order rows by ascending p-value.
df_wilc = pd.DataFrame(
    wilcoxon,
    columns=['col_i', 'col_j', 'pv', 'stat'],
)
df_wilc = df_wilc.sort_values(by='pv', ascending=True)
df_wilc

Unnamed: 0,col_i,col_j,pv,stat
0,C4.5,C4.5+m,0.010757,6.5
2,C4.5,C4.5+m+cf,0.015906,11.0
5,C4.5+cf,C4.5+m+cf,0.022909,10.0
3,C4.5+m,C4.5+cf,0.046333,17.0
4,C4.5+m,C4.5+m+cf,0.327826,22.0
1,C4.5,C4.5+cf,0.861262,43.0


### Метод Холма

In [53]:
# Holm correction of the pairwise p-values at alpha = 0.05.
# multipletests returns (reject flags, corrected p-values, Sidak-corrected
# alpha, Bonferroni-corrected alpha); the last two are bound to a1 and a2.
reject, p_corrected, a1, a2 = multipletests(df_wilc['pv'], alpha=0.05, method='holm')

In [55]:
# Same Holm correction, left as the cell's last expression so the whole
# result tuple is displayed.
multipletests(df_wilc['pv'], method='holm', alpha=0.05)

(array([False, False, False, False, False, False], dtype=bool),
 array([ 0.0645428 ,  0.07953222,  0.0916364 ,  0.13899819,  0.65565135,
         0.86126233]),
 0.008512444610847103,
 0.008333333333333333)

### Метод Бенджамини-Хохберга

In [56]:
# Benjamini-Hochberg (FDR) correction of the same p-values at alpha = 0.05;
# the result tuple is displayed as the cell output.
multipletests(df_wilc['pv'], method='fdr_bh', alpha=0.05)

(array([ True,  True,  True, False, False, False], dtype=bool),
 array([ 0.0458182 ,  0.0458182 ,  0.0458182 ,  0.06949909,  0.39339081,
         0.86126233]),
 0.008512444610847103,
 0.008333333333333333)

## Вариант с альтернативным перебором пар

In [57]:
from itertools import combinations

In [69]:
# Same pairwise Wilcoxon tests, with itertools.combinations enumerating the
# unordered column pairs. Each row is [(col_a, col_b), p-value, statistic].
wilc = []
for pair in combinations(data.columns, 2):
    # Evaluate the test once per pair and reuse the result for both fields,
    # rather than calling stats.wilcoxon twice with identical arguments.
    result = stats.wilcoxon(data[pair[0]], data[pair[1]])
    wilc.append([pair, result.pvalue, result.statistic])

pd.DataFrame(wilc, columns=['pair', 'pv', 'stat']).sort_values(by='pv', ascending=True)

Unnamed: 0,pair,pv,stat
0,"(C4.5, C4.5+m)",0.010757,6.5
2,"(C4.5, C4.5+m+cf)",0.015906,11.0
5,"(C4.5+cf, C4.5+m+cf)",0.022909,10.0
3,"(C4.5+m, C4.5+cf)",0.046333,17.0
4,"(C4.5+m, C4.5+m+cf)",0.327826,22.0
1,"(C4.5, C4.5+cf)",0.861262,43.0
