In [1]:
import os
import time

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu

import parallelPermutationTest as ppt

In [None]:
%load_ext cython
%load_ext rpy2.ipython

In [None]:
%%R

# Install the required CRAN packages only when they are not already available,
# so that re-running the notebook does not reinstall everything each time.
for (pkg in c("coin", "ggplot2", "perm", "exactRankTests")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
    }
}


library(ggplot2)
library(coin)

In [None]:
url = "https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip"
!mkdir "data/"
!wget -P "data/" "https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip"
!unzip "./data/41586_2016_BFnature18003_MOESM111_ESM.zip" -d "./data/"

In [None]:
path = "./data/nature18003-s2/"
T1 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable01.xlsx")
T3 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable03.xlsx")

## Extract important column headings

In [None]:
# Column labels are picked by position in T1.
# NOTE(review): assumes the first column is the patient identifier and that
# columns 5, 6 and 7 are the three phenotype status columns — confirm against the sheet.
paiteint_id = T1.columns[0]
stats_col = T1.columns[5: 5 + 3]

### Patient column  

In [None]:
paiteint_id

### Cancer phenotype columns

In [None]:
stats_col

### Check viable phenotype status

In [None]:
# Flatten every value of the status columns, row by row, into one list.
all_lab = [label for _, row in T1[stats_col].iterrows() for label in row.values]

In [None]:
list(set(all_lab))

### Let's drop patients with "Equivocal" or "Indeterminate" status

In [None]:
drop_label = ['Indeterminate', 'Equivocal']

In [None]:
T1.head()

In [None]:
# mask1[i] is True when none of row i's status columns carries a label from drop_label.
# Vectorised with isin() instead of iterating rows; kept as a plain list of bools.
mask1 = (~T1[stats_col].isin(drop_label).any(axis=1)).tolist()
# Filter and renumber in one expression rather than resetting the index in place.
T1 = T1[mask1].reset_index(drop=True)

### Let's divide the dataframe into two dataframes with triple negative (TN) and non-triple negative (NTN)

In [None]:
# mask2[i] is True when every status column of row i equals 'Negative'.
# Vectorised comparison instead of a nested comprehension over iterrows(); kept as a list of bools.
mask2 = T1[stats_col].eq('Negative').all(axis=1).tolist()
is_all_negative = np.array(mask2)

# Rows where all status columns are 'Negative'.
T1_TNP = T1[is_all_negative].reset_index(drop=True)

# All remaining rows.
T1_not_TNP = T1[~is_all_negative].reset_index(drop=True)



### Obtain patient ids for TN and NTN

In [None]:
TNPpateintIds = T1_TNP[paiteint_id].values
NotTNPpateintIds = T1_not_TNP[paiteint_id].values

### Remove genes (rows) containing NaN for either TN or NTN patients, i.e., make sure all patients have the same genes when comparing.

In [None]:
def getpatientId(ids):
    """Return the labels of the T3 columns that match the given ids.

    A column matches an id when ``id in column_label`` is true, so both the ids
    and the column labels must support that test (presumably strings — verify
    against the spreadsheet headers).

    Parameters
    ----------
    ids : iterable
        Identifiers to look for in the column labels of the global frame ``T3``.

    Returns
    -------
    list
        Matching column labels, grouped by id in the order of ``ids`` and, within
        one id, in ``T3`` column order. A label that matches several ids is
        returned once per matching id.
    """
    # Work on the labels directly; there is no need to slice T3 itself
    # (and copy its data) just to read the matching column names back.
    columns = list(T3.columns)
    return [col for i in ids for col in columns if i in col]

In [None]:
only_patientDf = T3[getpatientId(NotTNPpateintIds) + getpatientId(TNPpateintIds)]

In [None]:
# Drop every row that has a NaN in any selected column. Rebinding the name
# (instead of inplace=True) avoids mutating a frame that was derived from T3.
only_patientDf = only_patientDf.dropna()

In [None]:
NotTNP_df =  only_patientDf[getpatientId(NotTNPpateintIds)]
TNP_df =  only_patientDf[getpatientId(TNPpateintIds)]

In [None]:
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs("experiment_data/experiment6", exist_ok=True)
NotTNP_df.to_csv("experiment_data/experiment6/notTNPdf", index=False)
TNP_df.to_csv("experiment_data/experiment6/TNPdf", index=False)

In [2]:
NotTNP_df = pd.read_csv("experiment_data/experiment6/notTNPdf")
TNP_df = pd.read_csv("experiment_data/experiment6/TNPdf")

### There are 80 patients without triple negative (NTN) and 26 patients with triple negative (TN), and with a total of 8051 quantified genes.

# Experiment time

## Let's start with a rather small binsize=6.

In [3]:
n_bins = 100
batch_size = int(TNP_df.shape[0] / 4)

In [4]:
%time p_values, dperm = ppt.GreenFloatCuda(NotTNP_df.values, TNP_df.values, 100, return_dperm=True,batch_size=batch_size)

CPU times: user 4.22 s, sys: 632 ms, total: 4.86 s
Wall time: 4.87 s


In [None]:
p_values[6813]

In [None]:
p_values[6813]

In [None]:
np.argmin(p_values)

In [5]:
%time p_values, dperm = ppt.GreenFloat(NotTNP_df.values[6813], TNP_df.values[6813], 5, return_dperm=True)

CPU times: user 1.27 ms, sys: 292 µs, total: 1.56 ms
Wall time: 1.09 ms


In [6]:
p_values

array([4.87531746e-17])

In [9]:
%time p_values, dperm = ppt.CoinShiftFloat(NotTNP_df.values[6813],TNP_df.values[6813], 5, return_dperm=True)

CPU times: user 639 µs, sys: 0 ns, total: 639 µs
Wall time: 395 µs


In [10]:
p_values

array([4.87531746e-17])

In [None]:
%time p_values, dperm = ppt.GreenFloat(TNP_df.values[6813], NotTNP_df.values[6813], 100, return_dperm=True)

In [None]:
p_values

In [None]:
%time p_values, dperm = ppt.GreenFloatMultiThread(TNP_df.values[6813], NotTNP_df.values[6813], 100, return_dperm=True)

In [None]:
p_values

In [None]:
p_values

In [None]:
a ="[2 3 2 3 2 2 1 3 2 2 2 3 2 2 2 3 2 2 3 3 2 3 2 2 2 3 3 2 3 3 2 2 3 4 2 3 3 2 2 3 3 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 3 3 2 3 1 1 1 1 1 2 1 1 1 1 0 1 1 2 1 1 2 1 1 1]"
a.replace("  ",",").replace(" ",",")

In [None]:
b ="[2 2 2 3 2 3 3 2 2 3 2 2 2 3 3 3 2 3 3 1 2 1 1 2 1 0]"
b.replace("  ",",").replace(" ",",")

In [None]:
b = [2,3,2,3,2,2,1,3,2,2,2,3,2,2,2,3,2,2,3,3,2,3,2,2,2,3,3,2,3,3,2,2,3,4,2,3,3,2,2,3,3,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,3,3,2,3,1,1,1,1,1,2,1,1,1,1,0,1,1,2,1,1,2,1,1,1]

In [None]:
a = [2,2,2,3,2,3,3,2,2,3,2,2,2,3,3,3,2,3,3,1,2,1,1,2,1,0]

In [None]:
sum(a)

In [None]:
score = sum(a)

In [None]:
NN = dperm

In [None]:
NN.shape

In [None]:
# The score is the sum of the shorter sample; on equal lengths the sum of `a` is used.
score = sum(b) if len(a) > len(b) else sum(a)

In [None]:
score

In [None]:
sum(a)

In [None]:
NN.shape

In [None]:
NN[59]

In [None]:
# Mass at the observed score plus the smaller of the two tails, divided by the total.
# Built as a single expression rather than with `+=`: if NN[score] is an array
# (NN with more than one dimension) it is a view, and `+=` would silently modify
# NN — and dperm, which NN aliases — before the total below is computed.
one_side = NN[score] + min(np.sum(NN[score+1:]), np.sum(NN[:score]))
p=one_side/float(np.sum(NN))

In [None]:
one_side

In [None]:
p

In [None]:
p = 0.3567760297255399

In [None]:
p = np.sum(np.divide(dperm,np.sum(dperm))[sum(a):])

In [None]:
p

In [None]:
perm = np.divide(dperm,np.sum(dperm))

In [None]:
2*min(p,1-p)

In [None]:
p

In [None]:
dperm

In [None]:
dperm.shape

In [None]:
p

In [None]:
2*min(p,1-p)

In [None]:
dperm.shape

In [None]:
%%time
%%R
# The two integer-valued samples to compare.
x1 <- c(2,3,2,3,2,2,1,3,2,2,2,3,2,2,2,3,2,2,3,3,2,3,2,2,2,3,3,2,3,3,2,2,3,4,2,3,3,2,2,3,3,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,3,3,2,3,1,1,1,1,1,2,1,1,1,1,0,1,1,2,1,1,2,1,1,1)
y1 <- c(2,2,2,3,2,3,3,2,2,3,2,2,2,3,3,3,2,3,3,1,2,1,1,2,1,0)

# Stack both samples into one response vector (DV) with a two-level group factor (IV):
# "A" for the x1 values, "B" for the y1 values.
DV <- c(x1, y1)
IV <- factor(rep(c("A", "B"), c(length(x1), length(y1))))

# p-value of coin's oneway_test with an exact distribution;
# "shift" is presumably the algorithm choice of exact() — confirm in the coin docs.
pvalue(oneway_test(DV ~ IV, distribution=exact(c("shift"))))


In [None]:
perm[perm!=0]

In [None]:
perm[sum(a):79:]

In [None]:
Z = [ 0,  3,  6,  8, 14, 18, 18, 19, 20, 21, 25, 25, 26, 27, 33, 33, 38, 39, 39, 39, 40, 42, 42, 44,
 48, 50, 52, 54, 54, 54, 55, 58, 59, 59, 59, 59, 60, 60, 60, 61, 61, 61, 63, 63, 63, 63, 63, 63,
 64, 65, 65, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70,71, 71, 71, 71, 72, 72, 73, 73,
 73, 74, 76, 78, 78, 78, 79, 80, 80, 80, 82, 82, 82, 83, 83, 83, 84, 84, 84, 84, 85, 85, 86, 86,
 87, 89, 90, 90, 91, 92, 95, 95, 95, 99]

In [None]:
sum(Z[26:])

In [None]:
a_ = [20,0,  6, 33, 21, 27, 39, 65, 19, 25, 42, 39, 8, 14, 63, 44, 18, 25, 40, 33, 26, 39, 54, 38, 18, 42]
s = 5828

In [None]:
p = np.sum(dperm[int(sum(a_)) : (int(s) + 1)])

In [None]:
1 - np.sum(dperm)

In [None]:
ix

In [None]:
sum(dperm)

In [None]:
p_values[ix]

In [None]:
dperm

In [None]:
ix = np.argmin(p_values)

In [None]:
ix

In [None]:
# np.argmin returns a single integer here, so `ix` is used directly;
# subscripting it (ix[0]) raises "invalid index to scalar variable".
p_values[ix]

In [None]:
# np.float was only an alias of the builtin float and has been removed from NumPy;
# the builtin gives the same value.
float(1.3322676295501878e-15)


In [None]:
%%time 
p_val_list = list()
for A,B in zip(TNP_Arr,NotTNP_Arr):
    pval, pdist = GreenFloatCuda(A, B,40,return_dperm=True)
    p_val_list.append(pval)

In [None]:
# Call through the `ppt` module alias; the bare name GreenFloatCuda is never imported.
pval, pdist = ppt.GreenFloatCuda(A, B,40,return_dperm=True)

In [None]:
pdist[0][pdist[0]!=0]

In [None]:
from significance_of_mean_cuda import significance_of_mean_cuda


In [None]:
%%time

SGM = significance_of_mean_cuda(20, dtype_v=np.uint32,dtype_A=np.float64)
SGM.run(TNP_Arr, NotTNP_Arr)


In [None]:
# First row of each group.
# NOTE(review): the original read `dpermTNP_Arr[0]`, a name that is never defined;
# it looks like a paste slip for `TNP_Arr[0]` — confirm.
A,B = TNP_Arr[0], NotTNP_Arr[0]

In [None]:
A,B = A[np.newaxis,:], B[np.newaxis,:]

In [None]:
# Call through the `ppt` module alias; the bare name GreenFloatCuda is never imported.
pval, pdist = ppt.GreenFloatCuda(A, B,20,return_dperm=True)

In [None]:
pval

In [None]:
SGM = significance_of_mean_cuda(20, dtype_v=np.uint32,dtype_A=np.float64)
# The original call indexed with `A[,:]`, which is a syntax error.
# A and B were already given a leading axis (A[np.newaxis,:], B[np.newaxis,:]) in an
# earlier cell, so they are passed as they are.
# NOTE(review): confirm these are the shapes SGM.run expects.
SGM.run(A, B)


In [None]:
SGM.digitized

In [None]:
NN = SGM.numerator.ravel()

In [None]:
NN= NN[NN!=0]

In [None]:
NN / NN.sum()

In [None]:
pdist[pdist!=0]

In [None]:
np.array(P_e_list)

In [None]:
# Wrap each array in a DataFrame and transpose it, so that every row of the
# original array becomes one column of x / y.
x = pd.DataFrame(TNP_Arr, columns=list(range(TNP_Arr.shape[1]))).T
y = pd.DataFrame(NotTNP_Arr, columns=list(range(NotTNP_Arr.shape[1]))).T
# Number of columns of x, i.e. number of rows of TNP_Arr.
n_samples = x.shape[1]
# One-row frame (row label "output") with n_samples zero-filled columns.
outputDf = pd.DataFrame(n_samples * [0], columns=["output"]).T

In [None]:
%R -i x
%R -i y
%R -i outputDf
%R -i n_samples

In [None]:
%%R
# Run fastPerm (test statistic: diffMean) on each pair of columns of x and y.
#
# Arguments:
#   x, y       containers indexed with [[i]]; element i of each is one sample.
#   output     container whose i-th element receives the result for pair i.
#   n_samples  number of pairs to process.
# Returns `output`, where element i is the first value of the unlisted fastPerm result.
#
# Changes from the previous version: the unused and malformed pre-allocation
# `e <- list(mode="vector", length=n_samples)` is gone, the loop uses seq_len()
# so that n_samples == 0 runs zero iterations (1:0 would iterate over 1 and 0),
# and the per-iteration print() that flooded the cell output is removed.
runFastPerm <- function(x, y, output, n_samples) {
    for (i in seq_len(n_samples)) {
        # NOTE(review): the return value is discarded; kept so the work done per
        # pair is unchanged — confirm whether this call is needed.
        mStopDiffMean(x[[i]], y[[i]])

        valX <- fastPerm(x[[i]], y[[i]], testStat = diffMean)
        valX <- unlist(valX)

        output[[i]] <- as.numeric(valX[1])
    }

    return(output)
}


In [None]:
start = time.time()
%R out <- runFastPerm(x,y, outputDf, n_samples)
end = time.time()
print(end - start)

In [None]:
2738 / 60

In [None]:
%R -o out

In [None]:
outT = out.T

In [None]:
P_fp_list =list(outT.values.ravel())

### Parallelized exact test takes ~3s

In [None]:
start = time.time()

P_mw_list = MWU(TNP_Arr, NotTNP_Arr)

end = time.time()
print(end - start)

### Mann-Whitney exact test takes ~1s

In [None]:
start = time.time()

P_t_list = ttests(TNP_Arr, NotTNP_Arr, True)

end = time.time()
print(end - start)

In [None]:
df_e = pd.DataFrame({"p":P_e_list})
df_mw = pd.DataFrame({"p":P_mw_list})
df_tt = pd.DataFrame({"p":P_t_list})
df_fp = pd.DataFrame({"p":list(np.array(P_fp_list) )})

In [None]:
# Run qvalues (pi0=None) over each p-value table, in the same order as before,
# and rebind the four names to the results.
df_e, df_mw, df_tt, df_fp = [
    qvalues(table, pi0=None) for table in (df_e, df_mw, df_tt, df_fp)
]

In [None]:
# Number of rows of df_e with q below 0.01.
df_e.loc[df_e["q"] < 0.01, "q"].shape[0]


In [None]:
# Number of rows of df_mw with q below 0.01.
df_mw.loc[df_mw["q"] < 0.01, "q"].shape[0]

In [None]:
# Number of rows of df_tt with q below 0.01.
df_tt.loc[df_tt["q"] < 0.01, "q"].shape[0]

In [None]:
# Number of rows of df_fp with q below 0.01.
df_fp.loc[df_fp["q"] < 0.01, "q"].shape[0]