In [1]:
import numpy as np
import pandas as pd
import time
import permutationTest as ppt
from scipy.stats import ttest_ind, mannwhitneyu

In [None]:
%load_ext cython
%load_ext rpy2.ipython

In [None]:
%%R

# Install the R dependencies only when they are missing, so that re-running
# the notebook does not download and rebuild every package each time.
required_pkgs <- c("coin", "ggplot2", "perm", "exactRankTests")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
}

# Attach ggplot2 and coin; coin supplies oneway_test(), approximate() and
# pvalue(), which runMcCoin relies on.
library(ggplot2)
library(coin)

In [None]:
%%R
    # Run coin's approximate (Monte Carlo) two-sample permutation test once
    # per sample pair and collect the p-values.
    #
    # x, y      : containers indexed with [[i]]; element i is the i-th A / B sample.
    # batches   : passed as the first argument to approximate()
    #             (presumably the number of Monte Carlo resamples -- confirm
    #             against the installed coin version).
    # output    : pre-allocated container; element i is overwritten with the
    #             p-value of the i-th pair.
    # n_samples : number of sample pairs to process.
    #
    # Returns `output` with the first n_samples elements filled in.
    runMcCoin <- function(x,y,batches,output,n_samples) {
        # seq_len() gives an empty sequence for n_samples == 0, whereas
        # 1:n_samples would iterate over c(1, 0) and index out of range.
        for (i in seq_len(n_samples)) {
            # Pool both samples into one response vector ...
            DV <- c(x[[i]], y[[i]])
            # ... and label each observation with its group of origin.
            IV <- factor(rep(c("A", "B"), c(length(x[[i]]), length(y[[i]]))))
            output[[i]] <- pvalue(oneway_test(DV ~ IV, distribution=approximate(batches)))
        }

        return(output)
    }

In [None]:
def McCoin(A,B, batches):
    """Compute Monte Carlo permutation-test p-values with R's ``coin`` via rpy2.

    Parameters
    ----------
    A, B : 2-D array-like exposing ``.shape``
        One sample per row (``A.shape[1]`` / ``B.shape[1]`` observations each).
    batches : int
        Forwarded unchanged to the R function ``runMcCoin``.

    Returns
    -------
    list
        The values of the ``"output"`` entry of the object returned by
        ``runMcCoin``, one per row of ``A``.
    """
    # Transpose so that every sample becomes a column; runMcCoin selects the
    # i-th sample with x[[i]] / y[[i]].
    x = pd.DataFrame(A, columns=list(range(A.shape[1]))).T
    y = pd.DataFrame(B, columns=list(range(B.shape[1]))).T
    # After the transpose the column count equals the number of rows of A.
    n_samples = x.shape[1]
    # Single row labelled "output" with one zero-filled column per sample;
    # runMcCoin overwrites column i with the i-th p-value.
    outputDf = pd.DataFrame(n_samples * [0], columns=["output"]).T
    
    # Push the Python objects into the embedded R session.
    %R -i x
    %R -i y
    %R -i batches
    %R -i outputDf
    %R -i n_samples
    
    
    # Run the permutation tests on the R side.
    %R out <- runMcCoin(x,y, batches, outputDf, n_samples)
    
    
    # Pull the result back into Python.
    # NOTE(review): `out` is never assigned locally, so this relies on the
    # magic publishing it to the notebook namespace -- confirm.
    %R -o out
    # Transpose back so that "output" is addressable as a column
    # (assumes rpy2 hands `out` back as a pandas DataFrame -- TODO confirm).
    return list(out.T["output"].values)



In [None]:
def exactTest(A,B, batches, bins=10, one_side=False):
    """Run ``GreenFloatCuda`` from the ``permutationTest`` package on A and B.

    Parameters
    ----------
    A, B :
        Sample collections, forwarded unchanged to ``GreenFloatCuda``.
    batches : int
        Forwarded as the ``batch_size`` keyword argument.
    bins : int, default 10
        Forwarded as the third positional argument.
    one_side : bool, default False
        Accepted so the signature matches ``MWU`` and ``ttests``; it is not
        forwarded to ``GreenFloatCuda``.

    Returns
    -------
    Whatever ``GreenFloatCuda`` returns (presumably one p-value per sample
    pair -- confirm against the package).
    """
    # The package is imported as ``ppt``; the previous ``pt`` alias is never
    # defined, so the call raised NameError on a fresh kernel.
    return ppt.GreenFloatCuda(A,B,bins, batch_size=batches)
    
    
def MWU(A, B, one_side=False):
    """Return the Mann-Whitney U p-value for every (a, b) pair of zip(A, B).

    With ``one_side`` set, the alternative hypothesis is "less"; otherwise the
    test is two-sided.
    """
    hypothesis = "less" if one_side else "two-sided"
    return [
        mannwhitneyu(sample_a, sample_b, alternative=hypothesis)[1]
        for sample_a, sample_b in zip(A, B)
    ]

def ttests(A,B, one_side=False):
    """Return the independent two-sample t-test p-value for every pair of zip(A, B).

    The statistic is computed as ``ttest_ind(y, x)`` (B sample first). With
    ``one_side`` set, the two-sided p-value is halved and, when the statistic
    is negative, replaced by its complement.
    """
    def _pair_pvalue(x, y):
        t, p = ttest_ind(y, x)
        if not one_side:
            return p
        half = p/2
        return 1-half if t<0 else half

    return [_pair_pvalue(x, y) for x, y in zip(A, B)]

In [None]:
def getPrediction(A,B):
    """Run every test on the sample pairs and collect the p-values by test name.

    Returns a dict with the keys ``eList`` (exactTest), ``MC_1_000``,
    ``MC_10_000``, ``MC_100_000``, ``MC_200_000`` (McCoin with that many
    batches), ``mwuList`` (MWU) and ``ttList`` (ttests). All tests are run
    with ``one_side=False``.
    """
    samples_a = np.array(A)
    samples_b = np.array(B)

    # exactTest receives one five-hundredth of the sample count as its
    # ``batches`` argument and 100 bins.
    exact_batches = int(samples_a.shape[0]/500)
    results = {"eList": exactTest(samples_a, samples_b, exact_batches, 100, False)}

    # Monte Carlo runs, in increasing order of resampling effort.
    monte_carlo_runs = (
        ("MC_1_000", 1_000),
        ("MC_10_000", 10_000),
        ("MC_100_000", 100_000),
        ("MC_200_000", 200_000),
    )
    for key, batches in monte_carlo_runs:
        results[key] = McCoin(samples_a, samples_b, batches)

    results["mwuList"] = MWU(samples_a, samples_b, False)
    results["ttList"] = ttests(samples_a, samples_b, False)
    return results

    

    

# $A \sim N(0,1)$ and $B \sim N(0,1)$. $|A|=20$ and $|B|=20$ with $200\,000$ samples.

In [None]:
def getSynteticData(func, setN=20, sampleN=2_000, mean=0, std=1,seed=1):
    """Draw ``sampleN`` A-sets and ``sampleN`` B-sets of size ``setN`` using ``func``.

    ``func`` is called as ``func(location, std, setN)``. A-sets use ``mean``
    as the location and B-sets always use 0. NumPy's global generator is
    seeded with ``seed`` first. Returns the tuple ``(AN, BN)`` of two lists.
    """
    np.random.seed(seed)

    def _draw_sets(location):
        return [func(location, std, setN) for _ in range(sampleN)]

    # Every A-set is drawn before the first B-set, so the random stream is
    # consumed in a fixed order.
    sets_a = _draw_sets(mean)
    sets_b = _draw_sets(0)
    return sets_a, sets_b

In [None]:
# Number of A/B sample pairs per experiment; passed as sampleN to
# getSynteticData and as num_examples to getdf.
S = 200_000

In [None]:
# Null experiment on normal data: A is drawn with mean=0 and B always uses
# location 0, so there is no true difference between the groups.
AN, BN = getSynteticData(np.random.normal,mean=0,sampleN=S)
# %time reports how long the whole battery of tests takes.
%time DataNormNoDiff = getPrediction(AN, BN)

In [None]:
def getdf(P, num_examples, test=None):
    """Pair sorted observed p-values with evenly spaced theoretical quantiles.

    Parameters
    ----------
    P : sequence of float
        Observed p-values. The input is left untouched; a sorted copy is used.
    num_examples : int
        Number of theoretical quantiles to generate; must equal ``len(P)``.
    test : str, optional
        Label stored in a ``"Test"`` column on every row. When omitted (or
        falsy) the column is left out.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Observed p-value'`` (ascending) and
        ``'Theoretical p-value'`` (``num_examples`` points evenly spaced from
        ``1/num_examples`` to ``1 - 1/num_examples``), plus ``'Test'`` when a
        label is given.
    """
    # np.sort returns a sorted copy, so the caller's list is no longer
    # reordered as a side effect.
    p_arr = np.sort(np.asarray(P))
    offset = 1.0/float(num_examples)
    ideal_arr = np.linspace(offset,1.0-offset,num_examples)
    columns = {'Observed p-value':p_arr,'Theoretical p-value':ideal_arr}
    if test:
        columns["Test"] = [test]*ideal_arr.shape[0]
    # Built unconditionally: previously a missing label left the result
    # unassigned and the return raised UnboundLocalError.
    return pd.DataFrame(columns)

In [None]:
# One calibration table per test for the normal-data experiment.
normMC_1_000_Df = getdf(DataNormNoDiff["MC_1_000"], S, "Coin Monte Carlo, b=1000")
normMC_10_000_Df = getdf(DataNormNoDiff["MC_10_000"], S, "Coin Monte Carlo, b=10000")
normMC_100_000_Df = getdf(DataNormNoDiff["MC_100_000"], S, "Coin Monte Carlo, b=100000")
# Reads the 200 000-batch results; this previously re-used the "MC_100_000"
# key under the b=200000 label.
normMC_200_000_Df = getdf(DataNormNoDiff["MC_200_000"], S, "Coin Monte Carlo, b=200000")
normexactDf = getdf(DataNormNoDiff["eList"], S, "Parallel Green")
# Raw strings keep the label text identical while avoiding the invalid "\i"
# escape-sequence warning.
normttDf = getdf(DataNormNoDiff["ttList"], S, r"$\it{T}$ test")
normmwuDf = getdf(DataNormNoDiff["mwuList"], S, r"Mann–Whitney $\it{U}$ test")

In [None]:
def multiple_plot(df, save_name):
    """Draw a log-log plot of observed vs. theoretical p-values and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'Theoretical p-value'``, ``'Observed p-value'``
        and ``'Test'``; each distinct ``'Test'`` value gets its own colour.
    save_name : str
        Path handed to the grid's ``savefig``.
    """
    sns.set(style="white")
    sns.set_context("talk")

    # Both axes share the same limits: from half the smallest theoretical
    # p-value up to the largest one.
    low = df["Theoretical p-value"].min()/2
    hi = df["Theoretical p-value"].max()

    g=sns.lmplot(x='Theoretical p-value', y ='Observed p-value', data=df,
                  fit_reg=False, height=7, truncate=True, scatter_kws={"s": 15}, hue="Test")

    g.set(xscale="log", yscale="log")
    # Drop the figure-level legend; an in-axes legend is added further down.
    g._legend.remove()

    plt.xlabel(r'Theoretical $\it{p}$ value', fontsize=24)
    plt.ylabel(r'Observed $\it{p}$ value', fontsize=24)

    g.set(ylim=(low,hi), xlim=(low,hi))
    # Solid diagonal: observed == theoretical.
    plt.plot([low,hi],[low,hi], "k", linewidth=1)
    # Dashed guides a factor of two on either side of the diagonal.
    plt.plot([2 * low,2 * hi],[low,hi], "--k", linewidth=1)
    plt.plot([low / 2,hi / 2],[low,hi], "--k", linewidth=1)
    sns.set_style("ticks")
    sns.despine()
    g.fig.tight_layout()

    # A single legend call is enough: a second plt.legend() replaces the
    # first, so the earlier title-clearing call had no lasting effect.
    plt.legend(loc=2,prop={'size': 24})
    plt.xticks(size = 24)
    plt.yticks(size = 24)

    g.savefig(save_name)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Overlay the calibration curves of the tests on normal data in one figure.
# NOTE(review): normMC_200_000_Df is built in an earlier cell but is not part
# of this figure, and the output directory is spelled "experiemnt5" --
# confirm both are intentional.
multiple_plot(pd.concat((normMC_1_000_Df, normMC_10_000_Df, normMC_100_000_Df,normexactDf, normttDf, normmwuDf)), "./figures/experiemnt5/allTests_0_1_50")
              
              

# $A \sim \mathrm{logN}(0,1)$ and $B \sim \mathrm{logN}(0,1)$. $|A|=20$ and $|B|=20$ with $200\,000$ samples.

In [None]:
# Null experiment on log-normal data: A is drawn with mean=0 and B always
# uses location 0. This rebinds AN and BN from the normal-data experiment.
AN, BN = getSynteticData(np.random.lognormal, mean=0, sampleN=S)
DataLogNormNonDiff = getPrediction(AN, BN)

In [None]:
logfpDf = getdf(DataLogNormNonDiff["fpList"], S, "FastPerm")
logexactDf = getdf(DataLogNormNonDiff["eList"], S, "Parallel Green")
logttDf = getdf(DataLogNormNonDiff["ttList"], S, "$\it{T}$ test")
logmwuDf = getdf(DataLogNormNonDiff["mwuList"], S, "Mann–Whitney $\it{U}$ test")

In [None]:
multiple_plot(pd.concat((logfpDf, logexactDf, logttDf, logmwuDf)), "figures/calibration/allTestsLog_0_1_50")
