In [1]:
import numpy as np
import pandas as pd
import time
import parallelPermutationTest as ppt
from scipy.stats import ttest_ind, mannwhitneyu

In [2]:
%load_ext cython
%load_ext rpy2.ipython

In [3]:
%%capture
%%R

require(ggplot2)
install.packages("devtools")
library("devtools")
install_github("bdsegal/fastPerm")

library(fastPerm)


In [4]:
#url = "https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip"
#!mkdir "data/"
#!wget -P "data/" "https://static-content.springer.com/esm/art%3A10.1038%2Fnature18003/MediaObjects/41586_2016_BFnature18003_MOESM111_ESM.zip"
#!unzip "./data/41586_2016_BFnature18003_MOESM111_ESM.zip" -d "./data/"

In [5]:
path = "./data/nature18003-s2/"
T1 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable01.xlsx")
T3 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable03.xlsx")

## Extract important column headings

In [6]:
paiteint_id = T1.columns[0]
stats_col = T1.columns[5: 5 + 3]

### Patient column  

In [7]:
paiteint_id

'Sample ID'

### Cancer phenotype coulmns

In [8]:
stats_col

Index(['ER Status', 'PR Status', 'HER2 Status'], dtype='object')

### Check viable phenotype status

In [9]:
all_lab = list()
for r in T1[stats_col].iterrows():
    all_lab += list(r[1].values)

In [10]:
list(set(all_lab))

['Indeterminate', 'Positive', 'Negative', 'Equivocal']

### Let's drop patient with "Equivocal" and "Indeterminate" status

In [11]:
drop_label = ['Indeterminate', 'Equivocal']

In [12]:
T1.head()

Unnamed: 0,Sample ID,TCGA ID,Biospecimen Barcode Side,UIUD,PAM50,ER Status,PR Status,HER2 Status,QC Status,Proteome Cluster (see Fig. 3b),...,iTRAQReporterIon,CommonControl,GATA3 Mutation,PIK3CA Mutation,TP53 Mutation,PIK3CA missense mutation in helical domain all tumors,PIK3CA missense mutation in kinase domain all tumors,TP53 Nonsense/Frameshift all tumors,TP53 Missense mutation all tumors,TP53 missense mutations in DNA binding domain all tumors
0,A2-A0CM,TCGA-A2-A0CM,TCGA-A2-A0CM-01A-11-A21V-30,330F7598-824C-4CD6-9303-A27FE74A6695,Basal,Negative,Negative,Negative,pass,1.0,...,116,YES,,,Frame_Shift_Del|Somatic|p.E204fs,0.0,0.0,1.0,,
1,A2-A0D2,TCGA-A2-A0D2,TCGA-A2-A0D2-01A-12-A21W-30,308CCD79-C164-4397-92BC-A1CD243C8E7D,Basal,Negative,Negative,Negative,pass,1.0,...,114,NO,,,Frame_Shift_Del|Somatic|p.P318fs,0.0,0.0,1.0,,
2,A2-A0EQ,TCGA-A2-A0EQ,TCGA-A2-A0EQ-01A-41-A21W-30,04217F65-2907-478B-B0C5-EB65370198DA,Her2,Negative,Negative,Positive,pass,1.0,...,116,NO,,Missense_Mutation|Somatic|p.H1047R,In_Frame_Del|Somatic|p.IY162in_frame_delN,,1.0,,,
3,A2-A0EV,TCGA-A2-A0EV,TCGA-A2-A0EV-01A-41-A21V-30,38F98E5F-7FA4-4C89-8D81-516CF865BEEB,LumA,Positive,Positive,Negative,pass,3.0,...,114,NO,,In_Frame_Ins|Somatic|p.E469in_frame_insDK,,,,0.0,0.0,0.0
4,A2-A0EX,TCGA-A2-A0EX,TCGA-A2-A0EX-01A-41-A21V-30,78271500-147B-40C5-B789-0CF0C7CDBCE2,LumA,Positive,Positive,Negative,pass,3.0,...,116,NO,,Missense_Mutation|Somatic|p.E545K,,1.0,,0.0,0.0,0.0


In [13]:
mask1 = [True if (len(set(r[1].values) & set(drop_label)) == 0) else False for r in T1[stats_col].iterrows()]
T1 = T1[mask1]
T1.reset_index(drop=True, inplace=True)

### Let's divide the dataframe into two dataframes with triple negative (TN) and non-triple negative (NTN)

In [14]:
mask2 = [all([True if v=='Negative' else False for v in r[1].values] ) for r in T1[stats_col].iterrows()]

T1_TNP = T1[np.array(mask2)]
T1_TNP.reset_index(drop=True, inplace=True)

T1_not_TNP = T1[~np.array(mask2)]
T1_not_TNP.reset_index(drop=True, inplace=True)



### Obtain patient ids for NT and NTN

In [15]:
TNPpateintIds = T1_TNP[paiteint_id].values
NotTNPpateintIds = T1_not_TNP[paiteint_id].values

### Remove genes (rows) containing NaN for both TN and NTN i.e., make sure all patients have same the gene when comparing.

In [16]:
def getpatientId(ids):
    patitentList = list()
    for i in ids:
        p_id = T3[T3.columns[[i in c for c in T3.columns]]].columns
        patitentList += list(p_id)
    return patitentList

In [17]:
only_patientDf = T3[getpatientId(NotTNPpateintIds) + getpatientId(TNPpateintIds)]

In [18]:
only_patientDf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
NotTNP_df =  only_patientDf[getpatientId(NotTNPpateintIds)]
TNP_df =  only_patientDf[getpatientId(TNPpateintIds)]

In [20]:
NotTNP_df.to_csv("experiment_data/experiment6/notTNPdf", index=False)
TNP_df.to_csv("experiment_data/experiment6/TNPdf", index=False)

In [21]:
NotTNP_df = pd.read_csv("experiment_data/experiment6/notTNPdf")
TNP_df = pd.read_csv("experiment_data/experiment6/TNPdf")

In [22]:
TNP_Arr = TNP_df.values
NotTNP_Arr = NotTNP_df.values

In [23]:
%%R
runFastPerm <- function(x,y,output, n_samples) {
    e <- list(mode="vector",length=n_samples)
        for (i in 1:n_samples) {
      
            mStopDiffMean(x[[i]], y[[i]])
            valX = fastPerm(x[[i]], y[[i]], testStat = diffMean)
            
            print(valX)
            valX <-unlist(valX)

      
            output[[i]] <- as.numeric(valX[1])
      
            }
    

    return(output)


}


In [24]:
def run_fastperm(A,B):
    
    x = pd.DataFrame(A, columns=list(range(TNP_Arr.shape[1]))).T
    y = pd.DataFrame(B, columns=list(range(NotTNP_Arr.shape[1]))).T
    n_samples = x.shape[1]
    outputDf = pd.DataFrame(n_samples * [0], columns=["output"]).T
    
    %R -i x
    %R -i y
    %R -i outputDf
    %R -i n_samples

    start = time.time()
    %R out <- runFastPerm(x,y, outputDf, n_samples)
    end = time.time()
    run_time = end - start
    return run_time
    
    

In [25]:
%%capture
run_time = run_fastperm(TNP_Arr, NotTNP_Arr)

In [26]:
round(run_time / 60, 2)

41.64

### There are 80 patients without triple negative (NTN) and 26 patients with triple negtive (TN), and with a total of 8051 quantified genes.

# Experiment time

In [27]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_8 = ppt.GreenFloatCuda(a,b, 8)

CPU times: user 5.06 s, sys: 704 ms, total: 5.76 s
Wall time: 5.83 s


In [28]:
%%time
p_val_bin_8 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 8)

CPU times: user 1.13 s, sys: 80 ms, total: 1.21 s
Wall time: 1.21 s


In [29]:
%%time 
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_16 = ppt.GreenFloatCuda(a,b, 16)

CPU times: user 5.02 s, sys: 1e+03 ms, total: 6.02 s
Wall time: 6.05 s


In [30]:
%%time
p_val_bin_16 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 16)

CPU times: user 1.34 s, sys: 200 ms, total: 1.54 s
Wall time: 1.53 s


In [31]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_32 = ppt.GreenFloatCuda(a,b, 32)

CPU times: user 6.06 s, sys: 708 ms, total: 6.77 s
Wall time: 6.81 s


In [32]:
%%time 
p_val_bin_32 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 32)

CPU times: user 1.86 s, sys: 288 ms, total: 2.14 s
Wall time: 2.14 s


In [33]:
%%time 
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_64 = ppt.GreenFloatCuda(a,b, 64)

CPU times: user 6.29 s, sys: 880 ms, total: 7.17 s
Wall time: 7.21 s


In [34]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_64 = ppt.GreenFloatCuda(a,b, 64)

CPU times: user 6.26 s, sys: 784 ms, total: 7.04 s
Wall time: 7.08 s


In [35]:
%%time 
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_128 = ppt.GreenFloatCuda(a,b, 128)

CPU times: user 7.32 s, sys: 896 ms, total: 8.21 s
Wall time: 8.26 s


In [36]:
%%time 
p_val_bin_128 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 128, batch_size = TNP_Arr.shape[0] / 2)

CPU times: user 5.2 s, sys: 668 ms, total: 5.87 s
Wall time: 5.85 s


In [37]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_256 = ppt.GreenFloatCuda(a,b, 256)

CPU times: user 9.53 s, sys: 1.27 s, total: 10.8 s
Wall time: 10.9 s


In [38]:
%%time
p_val_bin_256 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 256, batch_size = TNP_Arr.shape[0] / 3)

CPU times: user 9.54 s, sys: 1.17 s, total: 10.7 s
Wall time: 10.7 s


In [39]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_512 = ppt.GreenFloatCuda(a,b, 512)

CPU times: user 15.2 s, sys: 2.37 s, total: 17.6 s
Wall time: 17.6 s


In [40]:
%%time
p_val_bin_512 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 512, batch_size = TNP_Arr.shape[0] / 6)

CPU times: user 18.2 s, sys: 2.15 s, total: 20.4 s
Wall time: 20.4 s


In [41]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_1280 = ppt.GreenFloatCuda(a,b, 1028)

CPU times: user 33 s, sys: 3.96 s, total: 37 s
Wall time: 37.1 s


In [42]:
%%time
p_val_bin_1280 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 1028, batch_size = TNP_Arr.shape[0] / 12)

CPU times: user 35.6 s, sys: 4.08 s, total: 39.7 s
Wall time: 39.7 s


In [43]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_val_bin_2056 = ppt.GreenFloatCuda(a,b, 2056)

CPU times: user 1min, sys: 6.91 s, total: 1min 7s
Wall time: 1min 7s


In [44]:
%time p_val_bin_2056 = ppt.GreenFloatCuda(TNP_Arr, NotTNP_Arr, 2056, batch_size = TNP_Arr.shape[0] / 24)

CPU times: user 1min 10s, sys: 8.13 s, total: 1min 18s
Wall time: 1min 18s


### Parallelized exact test takes ~3s

In [45]:
def MWU(A, B, one_side=False):
    p_mw = list()
    for a,b in zip(A, B):
        if one_side:
            p_mw.append(mannwhitneyu(a,b, alternative="less")[1])
        else:
            p_mw.append(mannwhitneyu(a,b, alternative="two-sided")[1])
    return p_mw

In [46]:
%time p_mwu = MWU(TNP_Arr, NotTNP_Arr, one_side=False)

CPU times: user 1.4 s, sys: 17 µs, total: 1.41 s
Wall time: 1.4 s


### Mann-Whitney exact test takes ~1s

In [47]:
%%time
for a, b in zip(TNP_Arr, NotTNP_Arr):
    p_ttest = ttest_ind(a, b)[1]

CPU times: user 1.31 s, sys: 12.3 ms, total: 1.32 s
Wall time: 1.29 s
