In [67]:
import pandas as pd
import numpy as np
from significance_of_mean_cuda import significance_of_mean_cuda
import time
from utils import MW, ttests
import sklearn

In [2]:
!ls "./data/nature18003-s2/"

CPTAC_BC_SupplementaryTable01.xlsx  CPTAC_BC_SupplementaryTable03.xlsx


In [3]:
path = "./data/nature18003-s2/"

In [4]:
T1 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable01.xlsx")
T3 = pd.read_excel(path + "CPTAC_BC_SupplementaryTable03.xlsx")

## Extract important column headings

In [5]:
paiteint_id = T1.columns[0]
stats_col = T1.columns[5: 5 + 3]

### Patient column  

In [6]:
paiteint_id

'Sample ID'

### Cancer phenotype coulmns

In [7]:
stats_col

Index(['ER Status', 'PR Status', 'HER2 Status'], dtype='object')

### Check viable phenotype status

In [8]:
all_lab = list()
for r in T1[stats_col].iterrows():
    all_lab += list(r[1].values)

In [9]:
list(set(all_lab))

['Equivocal', 'Positive', 'Indeterminate', 'Negative']

### Let's drop patient with "Equivocal" and "Indeterminate" status

In [10]:
drop_label = ['Indeterminate', 'Equivocal']

In [11]:
T1.head()

Unnamed: 0,Sample ID,TCGA ID,Biospecimen Barcode Side,UIUD,PAM50,ER Status,PR Status,HER2 Status,QC Status,Proteome Cluster (see Fig. 3b),...,iTRAQReporterIon,CommonControl,GATA3 Mutation,PIK3CA Mutation,TP53 Mutation,PIK3CA missense mutation in helical domain all tumors,PIK3CA missense mutation in kinase domain all tumors,TP53 Nonsense/Frameshift all tumors,TP53 Missense mutation all tumors,TP53 missense mutations in DNA binding domain all tumors
0,A2-A0CM,TCGA-A2-A0CM,TCGA-A2-A0CM-01A-11-A21V-30,330F7598-824C-4CD6-9303-A27FE74A6695,Basal,Negative,Negative,Negative,pass,1.0,...,116,YES,,,Frame_Shift_Del|Somatic|p.E204fs,0.0,0.0,1.0,,
1,A2-A0D2,TCGA-A2-A0D2,TCGA-A2-A0D2-01A-12-A21W-30,308CCD79-C164-4397-92BC-A1CD243C8E7D,Basal,Negative,Negative,Negative,pass,1.0,...,114,NO,,,Frame_Shift_Del|Somatic|p.P318fs,0.0,0.0,1.0,,
2,A2-A0EQ,TCGA-A2-A0EQ,TCGA-A2-A0EQ-01A-41-A21W-30,04217F65-2907-478B-B0C5-EB65370198DA,Her2,Negative,Negative,Positive,pass,1.0,...,116,NO,,Missense_Mutation|Somatic|p.H1047R,In_Frame_Del|Somatic|p.IY162in_frame_delN,,1.0,,,
3,A2-A0EV,TCGA-A2-A0EV,TCGA-A2-A0EV-01A-41-A21V-30,38F98E5F-7FA4-4C89-8D81-516CF865BEEB,LumA,Positive,Positive,Negative,pass,3.0,...,114,NO,,In_Frame_Ins|Somatic|p.E469in_frame_insDK,,,,0.0,0.0,0.0
4,A2-A0EX,TCGA-A2-A0EX,TCGA-A2-A0EX-01A-41-A21V-30,78271500-147B-40C5-B789-0CF0C7CDBCE2,LumA,Positive,Positive,Negative,pass,3.0,...,116,NO,,Missense_Mutation|Somatic|p.E545K,,1.0,,0.0,0.0,0.0


In [12]:
mask1 = [True if (len(set(r[1].values) & set(drop_label)) == 0) else False for r in T1[stats_col].iterrows()]
T1 = T1[mask1]
T1.reset_index(drop=True, inplace=True)

### Let's divide the dataframe into two dataframes with triple negative (TN) and non-triple negative (NTN)

In [13]:
mask2 = [all([True if v=='Negative' else False for v in r[1].values] ) for r in T1[stats_col].iterrows()]

T1_TNP = T1[np.array(mask2)]
T1_TNP.reset_index(drop=True, inplace=True)

T1_not_TNP = T1[~np.array(mask2)]
T1_not_TNP.reset_index(drop=True, inplace=True)



### Obtain patient ids for NT and NTN

In [14]:
TNPpateintIds = T1_TNP[paiteint_id].values
NotTNPpateintIds = T1_not_TNP[paiteint_id].values

### Remove genes (rows) containing NaN for both TN and NTN i.e., make sure all patients have same the gene when comparing.

In [15]:
def getpatientId(ids):
    patitentList = list()
    for i in ids:
        p_id = T3[T3.columns[[i in c for c in T3.columns]]].columns
        patitentList += list(p_id)
    return patitentList

In [16]:
only_patientDf = T3[getpatientId(NotTNPpateintIds) + getpatientId(TNPpateintIds)]

In [17]:
only_patientDf.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
NotTNP_df =  only_patientDf[getpatientId(NotTNPpateintIds)]
TNP_df =  only_patientDf[getpatientId(TNPpateintIds)]

In [19]:
NotTNP_Arr, TNP_Arr =NotTNP_df.values, TNP_df.values

In [20]:
NotTNP_Arr.shape

(8051, 80)

In [21]:
TNP_Arr.shape

(8051, 26)

### There are 80 patients without triple negative (NTN) and 26 patients with triple negtive (TN), and with a total of 8051 quantified genes.

# Experiment time

## Let's start with a rather small binsize=8.

In [152]:
start = time.time()

SGM = significance_of_mean_cuda(8, dtype_v=np.uint32,dtype_A=np.float64)
SGM.run(TNP_Arr, NotTNP_Arr)
P_e_list = list(1 - SGM.p_values)

end = time.time()
print(end - start)

3.1616718769073486


### Parallelized exact test takes ~3s

In [153]:
start = time.time()

P_mw_list = MW(TNP_Arr, NotTNP_Arr)

end = time.time()
print(end - start)

1.2535901069641113


### Mann-Whitney exact test takes ~1s

In [154]:
start = time.time()

P_t_list = ttests(TNP_Arr, NotTNP_Arr)

end = time.time()
print(end - start)

1.131826639175415


### Let's compare how close MannWhitney and exact test are compared to the t-test

In [155]:
def diffAbs(a,b):
    return np.abs(a - b)
    

def MAE(a,b):
    return np.mean(np.abs(a - b))

def relative_abolute_error(a,b):
    return np.abs((a-b)/a)

def mean_relative_abolute_error(a,b):
    return np.mean(relative_abolute_error(a,b))

In [156]:
P_mw, P_t, P_e = np.array(P_mw_list), np.array(P_t_list), np.array(P_e_list) 

In [157]:
np.around(MAE(P_e, P_t), 3)

0.039

In [158]:
np.around(MAE(P_mw, P_t),3)

0.055

# It seems that the exact test has slightly lower Mean Absolute Error (MAE) ,i.e, $MAE_{exact\ test}\approx 0.039$ and $MAE_{Mann-Whitney}\approx 0.055$

In [159]:
np.round(mean_relative_abolute_error(P_t, P_mw),3)

389698.768

In [160]:
np.round(mean_relative_abolute_error(P_t, P_e),3)

177.342

# The exact test seems to have much lower mean relative error (missing small $p$-values penalizes as much as missing high p-values).

## Let's check what type of $p$-values that Mann-Whitney test misses

In [161]:
rel_err = relative_abolute_error(P_t, P_mw)

### t-test

In [162]:
P_t[rel_err>1000]

array([1.18452058e-14, 1.01958742e-14, 1.55910462e-22, 1.72003584e-18])

### Mann-Whitney

In [163]:
P_mw[rel_err>1000]

array([3.95953722e-11, 3.09980303e-11, 4.88820808e-13, 3.76426971e-12])

### Exact test

In [164]:
P_e[rel_err>1000]

array([ 1.82964754e-13,  3.95905531e-13, -2.22044605e-16,  2.99760217e-15])

### It seems that the exact is pretty close to two $p$-values which MW-test struggles with.

## Let's check what type of $p$-values that Exact test misses

In [165]:
rel_err2 = relative_abolute_error(P_t, P_e)

### t-test

In [166]:
P_t[rel_err2>1000]

array([1.55910462e-22, 1.72003584e-18])

### mw-test

In [167]:
P_mw[rel_err2>1000]

array([4.88820808e-13, 3.76426971e-12])

### exact test

In [168]:
P_e[rel_err2>1000]

array([-2.22044605e-16,  2.99760217e-15])

### It's the same $p$-values demonstrated above.

# Let's see if the residual between the exact test and the t-test by increasing the number of bins to $n_{bin}=16$.

In [185]:
start = time.time()

SGM = significance_of_mean_cuda(16, dtype_v=np.uint32,dtype_A=np.float64)
SGM.run(TNP_Arr, NotTNP_Arr)
P_e2_list = list(1 - SGM.p_values)

end = time.time()
print(end - start)

9.539236545562744


In [186]:
P_e_2 =  np.array(P_e2_list) 

In [193]:
np.around(MAE(P_t, P_e_2),3)

0.019

In [194]:
np.around(mean_relative_abolute_error(P_t, P_e_2),3)

88.616

In [189]:
rel_err3 = relative_abolute_error(P_t, P_e_2)

In [190]:
P_mw[rel_err3>1000]

array([4.88820808e-13])

In [191]:
P_e[rel_err3>1000]

array([-2.22044605e-16])

In [192]:
P_t[rel_err3>1000]

array([1.55910462e-22])

## The runtime has increased considerbly ~9s, but the error has also decreased considerbly.