In [1]:
from significance_of_mean_cuda import significance_of_mean_cuda
from utils import getdf
import numpy as np
import time
import multiprocessing
import concurrent.futures as cf
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
from IPython.display import Image
import sklearn
import time
from scipy.stats import ttest_ind, ttest_rel, chisquare, ks_2samp
import pandas as pd
import seaborn as sns

import matplotlib as mpl

mpl.rcParams['text.usetex'] = False  # not really needed

from scipy import stats
from scipy.stats import mannwhitneyu
import pickle
import matplotlib.pyplot as plt

### Calibration-plot

In [2]:
def sns_plot(df, save_name=None):
    """Scatter the two columns of ``df`` against each other on log-log axes.

    The first column is plotted on x, the second on y. Both axes share the
    same limits (smallest and largest value found in either column) and a
    thin black diagonal y = x is drawn as a reference line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have exactly two columns (the column index is unpacked into two
        names); a different column count raises ValueError.
    save_name : str, optional
        When truthy, the figure is saved to this path via ``FacetGrid.savefig``.
    """
    sns.set(style="white")
    sns.set_context("talk")
    c1, c2 = df.columns

    # Series.min()/max() are vectorised and skip NaN by default, so one NaN
    # p-value cannot make the axis limits depend on its position in the column
    # (which is what the Python builtins min()/max() do).
    low = min(df[c1].min(), df[c2].min())
    hi = max(df[c1].max(), df[c2].max())

    g = sns.lmplot(x=c1, y=c2, data=df,
                   fit_reg=False, height=7, truncate=True, scatter_kws={"s": 15})

    g.set(xscale="log", yscale="log")
    g.set(ylim=(low, hi), xlim=(low, hi))
    # Reference diagonal drawn on the current axes (the one lmplot just created).
    plt.plot([low, hi], [low, hi], "k", linewidth=1)
    sns.set_style("ticks")
    sns.despine()
    g.fig.tight_layout()
    if save_name:
        g.savefig(save_name)

In [3]:
def multiple_plot(df, save_name):
    """Plot observed against theoretical p-values for several tests at once.

    Draws a log-log scatter of the 'Observed p-value' column against the
    'Theoretical p-value' column, coloured by the 'Test' column, adds the
    diagonal y = x and saves the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Theoretical p-value', 'Observed p-value'
        and 'Test'.
    save_name : str
        Path handed to ``FacetGrid.savefig``; the figure is always saved.
    """
    sns.set(style="white")
    sns.set_context("talk")

    # Both axes are limited to the range of the theoretical p-values only.
    # Series.min()/max() are vectorised and skip NaN, unlike the builtins.
    theoretical = df["Theoretical p-value"]
    low = theoretical.min()
    hi = theoretical.max()

    g = sns.lmplot(x='Theoretical p-value', y='Observed p-value', data=df,
                   fit_reg=False, height=7, truncate=True, scatter_kws={"s": 15}, hue="Test")

    g.set(xscale="log", yscale="log")
    g.set(ylim=(low, hi), xlim=(low, hi))
    # Reference diagonal drawn on the current axes (the one lmplot just created).
    plt.plot([low, hi], [low, hi], "k", linewidth=1)
    sns.set_style("ticks")
    sns.despine()
    g.fig.tight_layout()
    g.savefig(save_name)

In [4]:
def exactTest(A,B, bins=10, one_side=True):
    """Run ``significance_of_mean_cuda`` on the sample pairs in ``A`` and ``B``.

    Parameters
    ----------
    A, B : sequence of array-like
        Converted with ``np.asarray`` and handed to ``SGM.run``.
        # assumes row i of A is paired with row i of B — TODO confirm against the library
    bins : int
        First positional argument of ``significance_of_mean_cuda``.
    one_side : bool
        If True, return ``SGM.get_p_values()`` unchanged.
        If False, return a list of two-sided p-values derived from them.

    Returns
    -------
    Whatever ``get_p_values()`` returns when ``one_side`` is True, otherwise a
    Python list with one two-sided p-value per input pair.
    """
    SGM = significance_of_mean_cuda(bins, dtype_v=np.uint64,dtype_A=np.float64)
    SGM.run(np.asarray(A),np.asarray(B))
    p_values = SGM.get_p_values()
    if one_side:
        return p_values
    # Fold the one-sided value around 0.5 before doubling. Plain 2*p gives
    # values above 1 whenever p > 0.5, which is not a valid p-value.
    return [2 * min(p, 1 - p) for p in p_values]

def MWU(A, B, one_side=True):
    """Mann-Whitney U p-value for every pair ``(A[i], B[i])``.

    With ``one_side`` True the alternative is "less", otherwise "two-sided".
    Returns a list containing element [1] (the p-value) of each
    ``mannwhitneyu`` result, in input order.
    """
    alternative = "less" if one_side else "two-sided"
    return [
        mannwhitneyu(first, second, alternative=alternative)[1]
        for first, second in zip(A, B)
    ]

def ttests(A,B, one_side=True):
    """Independent two-sample t-test p-value for every pair ``(A[i], B[i])``.

    ``ttest_ind`` is called as ``ttest_ind(B[i], A[i])``. With ``one_side``
    False its p-value is returned as is. With ``one_side`` True the p-value is
    halved, and replaced by one minus that half when the t statistic is
    negative.
    """
    def _one_pair(x, y):
        t, p = ttest_ind(y, x)
        if not one_side:
            return p
        half = p/2
        return 1-half if t<0 else half

    return [_one_pair(x, y) for x, y in zip(A, B)]


In [5]:
def runTestInBatch(Abatch, Bbatch, sample_n, w):
    """Run the one-sided ``exactTest`` on consecutive slices of a batch.

    ``Abatch`` and ``Bbatch`` are cut into slices of at most ``sample_n``
    items (the last slice may be shorter); each pair of slices is passed to
    ``exactTest(..., w, True)``. Returns the list of per-slice results.
    """
    return [
        exactTest(Abatch[start:start + sample_n], Bbatch[start:start + sample_n], w, True)
        for start in range(0, len(Abatch), sample_n)
    ]

def getTimeSeries(setNList, sampleNList, NwList, batchsize, sampleRangeMax):
    """Time ``runTestInBatch`` on growing prefixes of random normal samples.

    For every ``(set_n, sample_n, w)`` triple, ``sampleRangeMax`` samples of
    size ``set_n`` are drawn from N(0, 1) for each of the two groups. The
    test is then timed on the prefixes of length ``batchsize``,
    ``2*batchsize``, ... (the last prefix is the full list).

    Parameters
    ----------
    setNList, sampleNList, NwList : sequences of int
        Zipped together; one experiment per position.
    batchsize : int
        Growth step of the prefix length.
    sampleRangeMax : int
        Number of samples generated per group.

    Returns
    -------
    list of list of float
        One inner list per experiment with the elapsed seconds per prefix.
    """
    ALLTimeList = list()
    for set_n, sample_n, w in zip(setNList, sampleNList, NwList):
        allTimes = list()
        # Draw all of AN before BN so the random stream is consumed in the
        # same order as before.
        AN = [np.random.normal(0,1,set_n) for _ in range(sampleRangeMax)]
        BN = [np.random.normal(0,1,set_n) for _ in range(sampleRangeMax)]
        for i in range(0, len(AN), batchsize):
            Abatch = AN[:i+batchsize]
            Bbatch = BN[:i+batchsize]
            print("Batch len {}".format(len(Abatch)))

            # perf_counter is monotonic and high-resolution. The timer is
            # stopped before printing so console I/O is not counted as
            # compute time.
            start = time.perf_counter()
            allPval = runTestInBatch(Abatch, Bbatch, sample_n, w)
            dt = time.perf_counter() - start
            print(allPval)
            print(dt)
            allTimes.append(dt)
            print("------------")
        ALLTimeList.append(allTimes)
    return ALLTimeList
        
    

    

    
    

In [6]:
def preparePandas(timeData, sampleSizes, experimentName=None):
    """Zip two sequences into rows suitable for ``pd.DataFrame``.

    Each row is ``[timeData[i], sampleSizes[i]]``; when ``experimentName`` is
    truthy, ``str(experimentName)`` is prepended as a first element.

    NOTE(review): the callers in this notebook pass the batch lengths as
    ``timeData`` and the measured times as ``sampleSizes``, i.e. the
    parameter names are the reverse of how they are used.
    """
    return [
        [str(experimentName), first, second] if experimentName else [first, second]
        for first, second in zip(timeData, sampleSizes)
    ]

In [7]:
def timePlotSNS(df,binVar=False, log=False, path=None):
    """Line plot of 'time(s)' against 'Sample size', one line per 'Experiment'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Experiment', 'Sample size' and 'time(s)'.
    binVar : bool
        Currently has no effect: the x label is ``$n$`` either way.
    log : bool
        If True, y ticks are placed at every integer between the floor of the
        smallest and the ceiling of the largest 'time(s)' value and labelled
        ``10.0**tick``.
        # assumes 'time(s)' already holds log10 values in this mode — TODO confirm with caller
    path : str, optional
        Unused; the figure is not saved here.
    """
    fig, ax = plt.subplots()

    sns.set(style="white")
    sns.set_context("talk")

    # Assign colours in order of first appearance. Iterating a set() of
    # strings has no stable order, so the colour of each experiment used to
    # vary between runs. Only three colours are provided.
    experiments = list(df["Experiment"].unique())
    palette = dict(zip(experiments, ["r", "g", "b"]))

    snsPlot = sns.lineplot(x="Sample size", y="time(s)",
                           hue="Experiment",
                           data=df, palette=palette, ax=ax)

    if log:
        values = df["time(s)"].values
        exponents = np.arange(np.floor(min(values)), np.ceil(max(values)))
        plt.yticks(exponents, 10.0**exponents)

    plt.xlabel(r"$n$")

    plt.legend(loc='upper left')
    # Rebuild the legend without its first entry.
    # NOTE(review): presumably this strips the hue title entry that older
    # seaborn versions add as the first handle; on versions that do not add
    # it, a real experiment would be dropped — confirm.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles[1:], labels=labels[1:])

    plt.setp(snsPlot.get_legend().get_texts(), fontsize='12')

    sns.despine()
    plt.tight_layout()
    plt.gcf().subplots_adjust(left=0.30)


In [8]:
def getPATH(path, suffix, prefix):
    """Return ``path``, ``suffix`` and ``prefix`` joined by '/' in that order."""
    return '/'.join((path, suffix, prefix))

In [9]:
# Figure output directory. No other cell visible in this notebook reads this
# variable (the `path` parameters of timePlotSNS/getPATH are local names).
path = "figures/paralellVsFastApprox"

# Set size 20 and sample size 10000

In [10]:
import numpy as np
from significance_of_mean_cuda import significance_of_mean_cuda
from scipy.stats import mannwhitneyu, ttest_ind
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import getdf

In [None]:
# Seed NumPy's global RNG so the np.random draws in later cells are repeatable.
np.random.seed(10)

In [None]:
# Three-experiment configuration; position k of each list describes
# experiment k. The lists are later passed to getTimeSeries as
# (setNList, sampleNList, NwList).
setN = [500,500,500]
sampleN = [2,20,4]
W = [2000,200,1000]
batchsize = 1
sampleRangeMax =24

In [None]:
# NOTE(review): this cell overwrites the configuration above with a single
# experiment. Later cells index sampleN[1] and sampleN[2], which raises
# IndexError if this cell is the one in effect — confirm which one is meant.
setN = [500]
sampleN = [2]
W = [2000]
batchsize = 1
sampleRangeMax =1

In [None]:
# NOTE(review): the batch step is passed as the literal 100, not the
# `batchsize` variable (1) defined above — confirm which is intended.
P = getTimeSeries(setN, sampleN, W, 100, sampleRangeMax)

In [None]:
# Display the timings (one inner list per experiment).
P

In [None]:
#pickle.dump(ALLTime2, open("data/memeroytest/LargeSetSize", "wb"))
# Load previously saved timings. The context manager closes the file handle,
# which the bare open(...) call left open.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("data/memeroytest/LargeSetSize", "rb") as fh:
    LargeSetSize = pickle.load(fh)

In [None]:
# Draw sampleRangeMax samples of size setN[0] per group (BN is not used in
# this cell; AN is only used for its length).
AN, BN = [np.random.normal(0,1,setN[0]) for i in range(sampleRangeMax)], [np.random.normal(0,1,setN[0]) for i in range(sampleRangeMax)]
# batchList[k] is the length of the prefix AN[:k+1], i.e. 1, 2, ..., len(AN).
batchList = list()
for i in range(0, len(AN), 1):
    Abatch = AN[:i+1]
    batchList.append(len(Abatch))

In [None]:
# Build rows [label, batch length, time]. Note that preparePandas receives the
# batch lengths as its `timeData` argument and the timings as `sampleSizes`.
# NOTE(review): indexes LargeSetSize[0..2], so the loaded pickle must hold at
# least three timing series.
data_0 = preparePandas(batchList, LargeSetSize[0], "$n_{w}=2000$")
data_1 = preparePandas(batchList, LargeSetSize[1], "$n_{w}=200$")
data_2 = preparePandas(batchList, LargeSetSize[2], "$n_{w}=1000$")

In [None]:
# Concatenate the three experiments; the column order matches the row layout
# produced above (label, batch length, time).
data = data_0 + data_1 + data_2
pdData = pd.DataFrame(data, columns=['Experiment','Sample size', 'time(s)',])

In [None]:
# NOTE(review): getScatterData is defined in a cell further down the notebook,
# so this cell raises NameError under Restart & Run All.
# The returned x* lists hold timings and the y* lists hold batch lengths.
x0, y0 = getScatterData(sampleN[0],LargeSetSize[0], sampleRangeMax, 1)
x1, y1 = getScatterData(sampleN[1],LargeSetSize[1], sampleRangeMax, 1)
x2, y2 = getScatterData(sampleN[2],LargeSetSize[2], sampleRangeMax, 1)

In [None]:
# Line plot plus markers; y* (batch lengths) go on the horizontal axis and
# x* (timings) on the vertical axis.
# NOTE(review): marker colours are hard-coded here, while timePlotSNS picks
# the line colours itself — verify that they match per experiment.
timePlotSNS(pdData, log=False)
plt.scatter(y0, x0, marker="o", color="g")
plt.scatter(y1, x1, marker="o", color="r")
plt.scatter(y2, x2, marker="o", color="b")
plt.savefig("figures/memoryPlot/largeN")

In [None]:
# Quick unstyled view of the same three timing series.
plt.plot(batchList, LargeSetSize[0])
plt.plot(batchList, LargeSetSize[1])
plt.plot(batchList, LargeSetSize[2])

In [None]:
# NOTE(review): ALLTime2 is not assigned anywhere in the visible notebook (it
# only appears in a commented-out pickle.dump), so these cells fail on a fresh
# kernel unless it is restored first.
#data_0 = preparePandas(batchList, ALLTime2[0], "nw=2000")
#data_1 = preparePandas(batchList, ALLTime2[1], "nw=200")
#data_2 = preparePandas(batchList, ALLTime2[2], "nw=1000")

# Same rows as above, but with log10 of the timings as the time column.
data_0 = preparePandas(batchList, np.log10(ALLTime2[0]), "nw=2000")
data_1 = preparePandas(batchList, np.log10(ALLTime2[1]), "nw=200")
data_2 = preparePandas(batchList, np.log10(ALLTime2[2]), "nw=1000")

In [None]:
#data = data_0 + data_1 + data_2
# Only the first and third series are kept; data_1 is left out.
data = data_0 + data_2

In [None]:
pdData = pd.DataFrame(data, columns=['Experiment','Sample size', 'time(s)',])

In [None]:
# log=True makes timePlotSNS label the y ticks as 10.0**tick.
timePlotSNS(pdData, log=True)

In [None]:
# Prefix lengths of AN when growing in steps of `batchsize`.
# NOTE(review): `batchsize` is reassigned to 100 only in the NEXT cell, so in
# top-to-bottom order this loop still uses the earlier value — confirm.
batchList = list()
for i in range(0, len(AN), batchsize):
    Abatch = AN[:i+batchsize]
    batchList.append(len(Abatch))

In [None]:
batchsize = 100



In [None]:
# Second configuration: three experiments with set size 40 and 2700 samples.
setN = [40,40,40]
sampleN = [400,2500,650]
W = [1250,200,800]
sampleRangeMax= 2700
#setN = [40]
#sampleN = [400]
#W = [1250]

In [None]:
# NOTE(review): the run that would assign ALLTimeList is commented out, yet
# later cells read ALLTimeList — it must come from an earlier kernel session.
#ALLTimeList = getTimeSeries(setN, sampleN, W, 100, sampleRangeMax)

In [None]:
# Fresh samples for the new configuration (only len(AN) is used below).
AN, BN = [np.random.normal(0,1,setN[0]) for i in range(sampleRangeMax)], [np.random.normal(0,1,setN[0]) for i in range(sampleRangeMax)]



In [None]:
# Prefix lengths of AN when growing in steps of 100: 100, 200, ..., len(AN).
batchList = list()
for i in range(0, len(AN), 100):
    Abatch = AN[:i+100]
    batchList.append(len(Abatch))

In [None]:
# NOTE(review): ALLTimeList is only assigned in a commented-out cell above, so
# this fails on a fresh kernel unless that run is re-enabled.
# Rows are [label, batch length, time], as in the earlier large-set section.
data_0 = preparePandas(batchList, ALLTimeList[0], "$n_{w}=1250$")
data_1 = preparePandas(batchList, ALLTimeList[1], "$n_{w}=200$")
data_2 = preparePandas(batchList, ALLTimeList[2], "$n_{w}=800$")

In [None]:
data = data_0 + data_1 + data_2
pdData = pd.DataFrame(data, columns=['Experiment','Sample size', 'time(s)',])

In [None]:
# NOTE(review): getScatterData is defined in a later cell (NameError on a
# fresh top-to-bottom run). x* are timings, y* are batch lengths.
x0, y0 = getScatterData(sampleN[0],ALLTimeList[0], sampleRangeMax, 100)
x1, y1 = getScatterData(sampleN[1],ALLTimeList[1], sampleRangeMax, 100)
x2, y2 = getScatterData(sampleN[2],ALLTimeList[2], sampleRangeMax, 100)

In [None]:
# Shell escape: prints the working directory (relative figure/data paths
# resolve against it).
!pwd

In [None]:

# Line plot plus markers; batch lengths on the horizontal axis, timings on
# the vertical axis.
# NOTE(review): marker colours are hard-coded and differ from the earlier
# large-set plot — verify they match the line colours chosen by timePlotSNS.
timePlotSNS(pdData, log=False)
plt.scatter(y0, x0, marker="o", color="b")
plt.scatter(y1, x1, marker="o", color="r")
plt.scatter(y2, x2, marker="o", color="g")
plt.savefig("figures/memoryPlot/smallN")

In [None]:
def getScatterData(sN,tList, maxRange, batchSize):
    """Pick the timings that fall on non-zero multiples of ``sN``.

    Walks the offsets ``0, batchSize, 2*batchSize, ...`` below ``maxRange``.
    For each non-zero offset divisible by ``sN`` it records the offset and
    ``tList[k - 1]``, where ``k`` is the position of that offset in the walk.

    Returns
    -------
    (x, y) : tuple of lists
        ``x`` holds the selected ``tList`` entries, ``y`` the matching offsets.
    """
    selected = [
        (tList[position - 1], offset)
        for position, offset in enumerate(range(0, maxRange, batchSize))
        if offset % sN == 0 and offset != 0
    ]
    x = [timing for timing, _ in selected]
    y = [offset for _, offset in selected]
    return x, y
    
    

In [None]:
# Same three calls as in the earlier cell; here they run after the cell that
# defines getScatterData.
x0, y0 = getScatterData(sampleN[0],ALLTimeList[0], sampleRangeMax, 100)
x1, y1 = getScatterData(sampleN[1],ALLTimeList[1], sampleRangeMax, 100)
x2, y2 = getScatterData(sampleN[2],ALLTimeList[2], sampleRangeMax, 100)

In [93]:
# One pair of samples, each with 515 draws from N(0, 1).
A, B = [np.random.normal(0,1,515) for i in range(1)], [np.random.normal(0,1,515) for i in range(1)]


In [107]:
# Run the CUDA test directly (first constructor argument 100) so the SGM
# object stays available for inspection in the following cells.
SGM = significance_of_mean_cuda(100, dtype_v=np.uint64,dtype_A=np.float64)
SGM.run(np.asarray(A),np.asarray(B))
SGM.get_p_values()

This data requires 244.94726400000002 MiB on the GPU.


array([0.24908181])

In [108]:
# Largest entry of SGM.numerator. The next cell shows the float maximum,
# presumably to judge how close this value is to overflow — TODO confirm.
max(SGM.numerator)

array([2.23884673e+305])

In [109]:
# `sys` is not imported anywhere else in the notebook; import it here so this
# cell does not raise NameError on a fresh kernel.
import sys
sys.float_info.max

1.7976931348623157e+308

In [103]:
#|A|=520, bins = 300, 8.77112071e+307

In [None]:

# Timings (x) and batch lengths (y) at the non-zero multiples of sampleN[0].
# Uses the getScatterData helper defined above instead of repeating its loop.
x, y = getScatterData(sampleN[0], ALLTimeList[0], sampleRangeMax, 100)
    

In [None]:
# Batch lengths selected in the previous cell.
y

In [None]:
# Timings matching those batch lengths.
x

In [None]:

# NOTE(review): `set_n` and `sample_n` are never assigned at notebook level in
# the visible cells (they only exist as locals inside getTimeSeries), so this
# cell raises NameError on a fresh kernel — define them before this point.
AN, BN = [np.random.normal(0,1,set_n) for i in range(sample_n)], [np.random.normal(0,1,set_n) for i in range(sample_n)]


# Exact test

In [None]:
# One-sided exact test with 1000 as the `bins` argument, timed with %time.
# NOTE(review): the samples are passed as (BN, AN), whereas MWU and ttests
# below receive (AN, BN) — presumably to align the test direction; confirm.
%time pExactListNorm = exactTest(BN,AN, 1000, True)
#pDfExact = getdf(pExactListNorm, sample_n, "parallelized shift")
#my_scatter_plot(pDfExact,"figures/calibration/largeSample_10000_20/e_norm_0_1_50")

In [None]:
pExactListNorm

# Mann-Whitney

In [None]:
# One-sided Mann-Whitney U p-values (alternative "less") per sample pair.
pMWUListNorm = MWU(AN, BN, True)
#pDfMW = getdf(pMWUListNorm, sample_n, "MWU")
#my_scatter_plot(pDfMW,"figures/calibration/largeSample_10000_20/mw_norm_0_1_50")

# ttest

In [None]:
# One-sided t-test p-values per sample pair.
pTtestListNorm = ttests(AN, BN, True)
#pDfTtest = getdf(pTtestListNorm, sample_n, "t test")
#my_scatter_plot(pDfTtest,"figures/calibration/largeSample_10000_20/t_norm_0_1_50")

In [None]:
#multiple_plot(pd.concat((pDfTtest, pDfMW, pDfExact)), "figures/calibration/largeSample_10000_20/allTests_0_1_50")

In [None]:
# Pair up the p-value lists of two tests at a time. sns_plot puts the first
# column on x and the second on y.
dataExactTtestNorm = pd.DataFrame(data={'parallelized': pExactListNorm, 'ttest': pTtestListNorm})
dataMWUTtestNorm = pd.DataFrame(data={'MWU': pMWUListNorm, 'ttest': pTtestListNorm})
dataEaxactMWUNorm = pd.DataFrame(data={'parallelized': pExactListNorm, 'MWU': pMWUListNorm})

In [None]:
# Exact test vs t-test (no file is saved because save_name is omitted).
sns_plot(dataExactTtestNorm)

In [None]:
# Exact test vs Mann-Whitney U.
sns_plot(dataEaxactMWUNorm)

In [None]:
# Mann-Whitney U vs t-test.
sns_plot(dataMWUTtestNorm)

# Log-normal samples: $\log X \sim N(0, \sigma = 2)$

In [None]:
# Passed as the second argument (sigma) of np.random.lognormal below.
std = 2

In [None]:
# NOTE(review): relies on `set_n` and `sample_n`, which are not assigned at
# notebook level in the visible cells.
ALn, BLn = [np.random.lognormal(0,std,set_n) for i in range(sample_n)], [np.random.lognormal(0,std,set_n) for i in range(sample_n)]


# Exact test

In [None]:
# One-sided exact test with 50 as the `bins` argument (the normal-data section
# used 1000). Samples are passed as (BLn, ALn); MWU and ttests below receive
# (ALn, BLn).
pExactListLog = exactTest(BLn,ALn, 50, True)
#pDfExact = getdf(pExactListLog, sample_n, "parallelized shift")
#my_scatter_plot(pDfExact,"figures/calibration/largeSample_10000_20/e_Lnorm_0_1_50")

# Mann Whitney

In [None]:
# One-sided Mann-Whitney U p-values (alternative "less") per sample pair.
pMWUListLog = MWU(ALn, BLn, True)
#pDfMW = getdf(pMWUListLog, sample_n, "MWU")
#my_scatter_plot(pDfMW,"figures/calibration/largeSample_10000_20/mw_Lnorm_0_1_50")

# ttest

In [None]:
# One-sided t-test p-values per sample pair.
pTtestListLog = ttests(ALn, BLn, True)
#pDfTtest = getdf(pTtestListLog, sample_n, "t test")
#my_scatter_plot(pDfTtest,"figures/calibration/largeSample_10000_20/t_Lnorm_0_1_50")

In [None]:
#multiple_plot(pd.concat((pDfTtest, pDfMW, pDfExact)), "figures/calibration/largeSample_10000_20/allTests_Lnorm_0_1_50")

In [None]:
# Pair up the p-value lists of two tests at a time. sns_plot puts the first
# column on x and the second on y.
dataExactTtestLog = pd.DataFrame(data={'parallelized': pExactListLog, 'ttest': pTtestListLog})
dataMWUTtestLog = pd.DataFrame(data={'MWU': pMWUListLog, 'ttest': pTtestListLog})
dataEaxactMWULog = pd.DataFrame(data={'parallelized': pExactListLog, 'MWU': pMWUListLog})

In [None]:
# Exact test vs t-test (no file is saved because save_name is omitted).
sns_plot(dataExactTtestLog)

In [None]:
# Exact test vs Mann-Whitney U.
sns_plot(dataEaxactMWULog)

In [None]:
# Mann-Whitney U vs t-test.
sns_plot(dataMWUTtestLog)