In [1]:
from significance_of_mean_cuda import significance_of_mean_cuda
from utils import significance_of_mean, getdf, my_scatter_plot
import numpy as np
import time
import multiprocessing
import concurrent.futures as cf
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
from IPython.display import Image

import time
from scipy.stats import ttest_ind, ttest_rel, chisquare, ks_2samp
import pandas as pd
import seaborn as sns

import matplotlib as mpl

mpl.rcParams['text.usetex'] = False  # not really needed

from scipy import stats
from scipy.stats import mannwhitneyu

In [2]:
def p_value_calc(args):
    """Return the first element of ``significance_of_mean`` for packed arguments.

    Parameters
    ----------
    args : sequence of length 3
        ``(a, b, bins)``; the three entries are forwarded positionally to
        ``significance_of_mean``.  Packing them in one argument keeps the
        function usable with single-argument ``map``-style callers.

    Returns
    -------
    Element ``[0]`` of the value returned by ``significance_of_mean``
    (presumably the p-value -- confirm against ``utils``).
    """
    sample_a, sample_b, n_bins = args
    result = significance_of_mean(sample_a, sample_b, n_bins)
    return result[0]

### Load data from MC approximation of permutation test

In [3]:
def loadMcResult(sampleShape, mean, data_dir="dataFastPerm/uniform"):
    """Load the stored samples, p-values and timings of the MC approximation.

    For every sample size ``S`` in ``sampleShape`` four files are read from
    ``{data_dir}/data_{mean}/``: ``X_{S}.csv``, ``y_{S}.csv``,
    ``error_{S}.csv`` and ``Time_{S}.csv``.  Only the first column of each
    file is used.  Every entry of that column is split on single spaces and
    the first token is discarded (presumably a row label written by the tool
    that produced the files -- confirm against that tool).

    Parameters
    ----------
    sampleShape : iterable
        Sample sizes; one set of four files is read per entry.
    mean : object
        Formatted into the ``data_{mean}`` directory name.
    data_dir : str, optional
        Directory containing the ``data_{mean}`` folders.  Defaults to the
        location that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    X_list, y_list : list of list of numpy.ndarray
        Per sample size, one float array per row of the X / y file.
    p_val_list : list of numpy.ndarray
        Per sample size, the second token of every row of the error file.
    time_listMc : list of list of float
        Per sample size, the second token of every row of the Time file.
    """
    def _rows(kind, size):
        # First column of one result file, as raw row strings.
        csv_path = "{}/data_{}/{}_{}.csv".format(data_dir, mean, kind, size)
        return pd.read_csv(csv_path).iloc[:, 0].values

    def _vector(row):
        # All tokens after the leading one, as a float array.
        return np.array(row.split(" ")[1:]).astype(float)

    def _scalar(row):
        # The single value that follows the leading token.
        return float(row.split(" ")[1])

    X_list, y_list, p_val_list, time_listMc = [], [], [], []
    for S in sampleShape:
        X_list.append([_vector(row) for row in _rows("X", S)])
        y_list.append([_vector(row) for row in _rows("y", S)])
        p_val_list.append(np.array([_scalar(row) for row in _rows("error", S)]))
        time_listMc.append([_scalar(row) for row in _rows("Time", S)])

    return X_list, y_list, p_val_list, time_listMc

### Get p-value for the parallelized permutation test

In [4]:
def run_test(X,Y,bins, parallel=True, midP=False):
    """Compute one p-value for the pair of samples ``X`` and ``Y``.

    Parameters
    ----------
    X, Y : array-like
        The two samples.  With ``parallel=True`` they must offer ``.reshape``
        (e.g. NumPy arrays), because each is passed on as a single-row 2-D
        array.
    bins : object
        Forwarded unchanged to the underlying implementation.
    parallel : bool, optional
        Truthy: use ``significance_of_mean_cuda``.  Falsy: use
        ``p_value_calc`` on plain lists.
    midP : bool, optional
        Forwarded to ``significance_of_mean_cuda(...).run``; ignored when
        ``parallel`` is falsy.

    Returns
    -------
    The first entry of ``p_values`` of the CUDA object, or the value returned
    by ``p_value_calc``.
    """
    if not parallel:
        return p_value_calc([list(X), list(Y), bins])

    # Exact test
    exact_test = significance_of_mean_cuda(bins, dtype_v=np.uint32, dtype_A=np.float64)
    exact_test.run(X.reshape(1, -1), Y.reshape(1, -1), midP)
    return exact_test.p_values[0]

### Get all p-values from the parallelized exact test on the same data as the MC version

In [5]:
def shiftMethod(X_list, y_list, bins, parallel=True, midP=False):
    """Run ``run_test`` on every sample pair and record how long each call takes.

    Parameters
    ----------
    X_list, y_list : sequence
        One entry per sample size.  Each entry is converted with
        ``np.asarray(...).T`` and then iterated row by row, so every row of
        the transposed array is treated as one sample.
    bins, parallel, midP
        Forwarded unchanged to ``run_test``.

    Returns
    -------
    pe_list : list of list
        Per entry of ``X_list``, the p-values returned by ``run_test``.
    TIME : list of list of float
        Per entry of ``X_list``, the elapsed seconds of each ``run_test`` call.
    """
    pe_list = []
    TIME = []

    for Xp, yp in zip(X_list, y_list):
        Xp = np.asarray(Xp).T
        yp = np.asarray(yp).T

        p_e = []
        time_list = []

        for x, y in zip(Xp, yp):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump with system clock adjustments and distort short timings.
            start = time.perf_counter()
            # The y sample is deliberately passed first, matching the argument
            # order used by `ttest` (ttest_ind(y, x)) in this notebook.
            p_e.append(run_test(y, x, bins, parallel, midP))
            time_list.append(time.perf_counter() - start)

        pe_list.append(p_e)
        TIME.append(time_list)

    return pe_list, TIME
    

In [6]:
def ttest(X_list, y_list):
    """One-sided two-sample t-test p-values for every sample pair.

    Each entry of ``X_list`` / ``y_list`` is converted with
    ``np.asarray(...).T`` and its rows are paired up.  For every pair
    ``ttest_ind(y, x)`` is evaluated and its two-sided p-value is turned into
    a one-sided one: ``p/2`` when the statistic is not negative and
    ``1 - p/2`` when it is negative.

    Returns
    -------
    list of list
        Outer index follows ``X_list``; inner index follows the rows of the
        transposed arrays.
    """
    def _one_sided_p(first, second):
        statistic, two_sided = ttest_ind(first, second)
        halved = two_sided / 2
        return 1 - halved if statistic < 0 else halved

    return [
        [_one_sided_p(y, x) for x, y in zip(np.asarray(Xp).T, np.asarray(yp).T)]
        for Xp, yp in zip(X_list, y_list)
    ]
    

In [7]:
def mannWhitney(X_list, y_list, alternative="less"):
    """Mann-Whitney U test p-values for every sample pair.

    Parameters
    ----------
    X_list, y_list : sequence
        One entry per sample size.  Each entry is converted with
        ``np.asarray(...).T`` and its rows are paired up; every pair is passed
        to ``mannwhitneyu(x, y, ...)``.
    alternative : {"less", "greater", "two-sided", "two-tail"}, optional
        Alternative hypothesis.  ``"two-tail"`` is kept for backward
        compatibility and is translated to SciPy's ``"two-sided"``;
        previously it passed the check here but was rejected by SciPy, while
        ``"two-sided"`` was rejected here.

    Returns
    -------
    list of list
        Outer index follows ``X_list``; inner index follows the rows of the
        transposed arrays.
    """
    allowed = ["less", "greater", "two-sided", "two-tail"]
    assert alternative in allowed, "alternative must be one of {}".format(allowed)
    # SciPy only knows "two-sided"; map the legacy spelling onto it.
    scipy_alternative = "two-sided" if alternative == "two-tail" else alternative

    pmw_list = []
    for Xp, yp in zip(X_list, y_list):
        Xp = np.asarray(Xp).T
        yp = np.asarray(yp).T
        pmw_list.append(
            [mannwhitneyu(x, y, alternative=scipy_alternative)[1] for x, y in zip(Xp, yp)]
        )

    return pmw_list

### Compare parallelized and MC version

### Calibration-plot

In [8]:
from scipy import stats
def r2(x, y):
    """Return the square of the Pearson correlation coefficient of ``x`` and ``y``."""
    pearson_r = stats.pearsonr(x, y)[0]
    return pearson_r ** 2

def my_scatter_plot(df,save_name):
    """Draw a log-log scatter of observed vs. theoretical p-values and save it.

    This definition replaces the ``my_scatter_plot`` imported from ``utils``
    in the first cell of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame or similar
        Must provide the columns ``"Theoretical p-value"`` and
        ``"Observed p-value"``.
    save_name : str
        Path handed to ``Figure.savefig``.
    """
    sns.set(style="white")
    sns.set_context("talk")
    theoretical = df["Theoretical p-value"]
    low = min(theoretical)
    hi = max(theoretical)
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(xscale="log", yscale="log")

    # Use the frame that was passed in.  The previous code read the
    # notebook-global `pDf` here, so the result silently depended on the
    # caller's variable name.
    err2 = r2(df["Observed p-value"], theoretical)

    g=sns.regplot(x='Theoretical p-value', y ='Observed p-value', data=df,  ax=ax, fit_reg=False,
                  scatter_kws={"s": 5})
    # Diagonal y = x as the reference for perfect calibration.
    g.plot([low,hi], [low,hi], 'k-', linewidth=.5)

    sns.despine()
    # NOTE(review): the number shown is r2(...), i.e. the *squared* Pearson
    # coefficient, although the title text says "coefficient".
    plt.title("Pearson correlation coefficient : {}".format(np.around(err2,6)))
    f.tight_layout()
    f.savefig(save_name)

In [9]:
def getPATH(path, suffix, prefix):
    """Join ``path``, ``suffix`` and ``prefix`` (in that order) with ``/``."""
    return "/".join((path, suffix, prefix))

In [10]:
# Base directory string for figures. NOTE(review): no cell in this part of the
# notebook reads it -- the plotting cells pass literal file names instead.
path = "figures/paralellVsFastApprox"

## Sample sizes of X and Y

In [11]:
# Sample sizes; loadMcResult reads one set of result files per entry, and the
# resulting lists are indexed in this order (index 0 -> 10, ..., index 6 -> 300).
sampleShape = [10,50,100,150,200,250,300]

# Calibration: with $X \sim N(5.0,1)$ and $Y\sim N(5.0,1)$

## Load data

In [12]:
# Selects the `data_{mean}` directory that loadMcResult reads from.
mean=5

In [13]:
# Load the stored X / y samples plus the values from the error_* and Time_*
# files for every sample size; each returned list has one entry per sample size.
X_list_, y_list_, p_val_list_, time_listMc_ = loadMcResult(sampleShape, mean)

In [None]:
# Pass the per-sample-size lists directly: shiftMethod already converts each
# entry with np.asarray. Wrapping the whole list in np.asarray builds a ragged
# array when the entries differ in shape (presumably the case here, one entry
# per sample size), which raises ValueError on NumPy >= 1.24.
pe_list_, TIME_ = shiftMethod(X_list_, y_list_, 40)


# $|n|=|m|=10$

## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 10.
pDf = getdf(pe_list_[0], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n10")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 10.
pDf = getdf(p_val_list_[0], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n10")

## Mann Whitney

In [None]:
# Pass the per-sample-size lists directly: mannWhitney already converts each
# entry with np.asarray. Wrapping the whole list in np.asarray builds a ragged
# array when the entries differ in shape (presumably the case here, one entry
# per sample size), which raises ValueError on NumPy >= 1.24.
pmw_list5_2 = mannWhitney(X_list_, y_list_)

In [None]:
# Calibration plot: Mann-Whitney test, sample size 10.
pDf = getdf(pmw_list5_2[0], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n10")

## t-test

In [None]:
# Pass the per-sample-size lists directly: ttest already converts each entry
# with np.asarray. Wrapping the whole list in np.asarray builds a ragged array
# when the entries differ in shape (presumably the case here, one entry per
# sample size), which raises ValueError on NumPy >= 1.24.
pt_list5_2 = ttest(X_list_, y_list_)

In [None]:
# Calibration plot: t-test, sample size 10.
pDf = getdf(pt_list5_2[0], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n10")

# $|n|=|m|=50$

## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 50.
pDf = getdf(pe_list_[1], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n50")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 50.
pDf = getdf(p_val_list_[1], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n50")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 50.
pDf = getdf(pmw_list5_2[1], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n50")

## t-test

In [None]:
# Calibration plot: t-test, sample size 50.
pDf = getdf(pt_list5_2[1], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n50")

# $|n|=|m|=100$

## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 100.
pDf = getdf(pe_list_[2], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n100")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 100.
pDf = getdf(p_val_list_[2], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n100")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 100.
pDf = getdf(pmw_list5_2[2], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n100")

## t-test

In [None]:
# Calibration plot: t-test, sample size 100.
pDf = getdf(pt_list5_2[2], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n100")

# $|n|=|m|=150$

## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 150.
pDf = getdf(pe_list_[3], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n150")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 150.
pDf = getdf(p_val_list_[3], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n150")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 150.
pDf = getdf(pmw_list5_2[3], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n150")

## t-test

In [None]:
# Calibration plot: t-test, sample size 150.
pDf = getdf(pt_list5_2[3], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n150")

# $|n|=|m|=200$

## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 200.
pDf = getdf(pe_list_[4], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n200")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 200.
pDf = getdf(p_val_list_[4], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n200")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 200.
pDf = getdf(pmw_list5_2[4], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n200")

## t-test

In [None]:
# Calibration plot: t-test, sample size 200.
pDf = getdf(pt_list5_2[4], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n200")

# $|n|=|m|=250$


## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 250.
pDf = getdf(pe_list_[5], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n250")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 250.
pDf = getdf(p_val_list_[5], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n250")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 250.
pDf = getdf(pmw_list5_2[5], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n250")

## t-test

In [None]:
# Calibration plot: t-test, sample size 250.
pDf = getdf(pt_list5_2[5], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n250")

# $|n|=|m|=300$


## Parallelized exact test

In [None]:
# Calibration plot: parallelized exact test, sample size 300.
pDf = getdf(pe_list_[6], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_exact_n300")

## Fast approximation

In [None]:
# Calibration plot: fast (MC) approximation, sample size 300.
pDf = getdf(p_val_list_[6], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_fastapprox_n300")

## Mann Whitney

In [None]:
# Calibration plot: Mann-Whitney test, sample size 300.
pDf = getdf(pmw_list5_2[6], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_mannwhitney_n300")

## t-test

In [None]:
# Calibration plot: t-test, sample size 300.
pDf = getdf(pt_list5_2[6], 1000)
# Distinct file name so the other calibration cells do not overwrite this figure.
my_scatter_plot(pDf,"figures/normal_calibration_ttest_n300")