<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Proto-SAIL---Analysis:-CD-x-nonIBD-and-UC-x-nonIBD-at-baseline" data-toc-modified-id="Proto-SAIL---Analysis:-CD-x-nonIBD-and-UC-x-nonIBD-at-baseline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Proto SAIL - Analysis: CD x nonIBD and UC x nonIBD at baseline</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Supporting-functions" data-toc-modified-id="Supporting-functions-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Supporting functions</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Subset-relevant-samples" data-toc-modified-id="Subset-relevant-samples-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Subset relevant samples</a></span></li><li><span><a href="#Test-UC-x-nonIBD" data-toc-modified-id="Test-UC-x-nonIBD-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Test UC x nonIBD</a></span></li><li><span><a href="#Test-CD-x-nonIBD" data-toc-modified-id="Test-CD-x-nonIBD-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Test CD x nonIBD</a></span></li></ul></li></ul></div>

# Proto SAIL - Analysis: CD x nonIBD and UC x nonIBD at baseline

## Imports

In [117]:
import os 
import numpy as np
import scipy
from scipy import stats
import pandas 
import platform

In [118]:
## Get versions
print(f"python\t{platform.python_version()}")
print(f"numpy\t{np.__version__}")
print(f"scipy\t{scipy.__version__}")
print(f"pandas\t{pandas.__version__}")
ipython_version = !ipython --version
print(f"ipython\t{ipython_version[0]}")

python	3.7.3
numpy	1.16.2
scipy	1.2.1
pandas	0.24.2
ipython	7.4.0


## Supporting functions

In [119]:
def Wilcoxon_DP(x,y,eps = 0.1):
    
    x, y = map(np.asarray, (x, y))
    n1 = len(x)
    n2 = len(y)
    alldata = np.concatenate((x, y))
    ranked = stats.rankdata(alldata)
    x = ranked[:n1]
    noise = np.random.laplace(0,(2*n1)/eps,1)
    s = np.sum(x, axis=0) + noise 
    
    return (s,n1)

def Wilcoxon_DP_Pratt (x,y,eps = 0.1):
    x, y = map(np.asarray, (x, y))
    D = np.subtract(x,y)
    S = np.sign(D)
    D = np.absolute(D)
    
    ranked = stats.rankdata(D, method = 'max')
    w = np.sum([D[i]*S[i] for i in range(0,len(D))])
    noise = np.random.laplace(0,(2*2)/eps,1)
    s = w + noise 
    
    return (s,len(x))

In [120]:
# return p-value for a DP test statistic 
def DP_test_statistic(w,n,c,eps = 0.1): 
    l = 0
    for k in range(0,c):
        Wk = np.random.normal(0,(n*(n+1)*(2*n+1))/6,1)[0] + np.random.laplace(0,(2*n)/eps,1)[0]
        if Wk > w[0]:
            #print(Wk,w)
            l+=1
    return(l/c)

In [121]:
def dfrow_testranksums(s, index_subset1, index_subset2):
    x1 = s[index_subset1]
    x2 = s[index_subset2]
    
    Nnotnull1 = sum(x1 != 0)
    Nnotnull2 = sum(x2 != 0)
    w,n = Wilcoxon_DP(x1,x2)
    
    return pandas.Series([Nnotnull1, Nnotnull2,w,DP_test_statistic(w,n,c=1000000,eps = 0.1)])

def dfrow_testranksums_Pratt(s, index_subset1, index_subset2):
    x1 = s[index_subset1]
    x2 = s[index_subset2]
    
    if len(x1) > len(x2): 
        x1 = x1[0:len(x2)]
    if len(x2) > len(x1): 
        x2 = x2[0:len(x1)]
    
    w,n = Wilcoxon_DP_Pratt(x1,x2)
    
    return pandas.Series([w,DP_test_statistic(w,n=0,c=1000000,eps = 0.1)])

## Load data

In [56]:
datadir = "~/Novartis/201903_testcase"

In [65]:
iHMPall_MSP_df = pandas.read_csv(
    f"{datadir}/iHMPall_MSP_df.tsv.gz",
    index_col=[0,1], 
    sep = "\t")

iHMPall_metadata_df = pandas.read_csv(
    f"{datadir}/iHMPall_metadata_df.tsv.gz",
    index_col=0,
    sep = "\t")

iHMPall_norm_df = pandas.read_csv(
    f"{datadir}/iHMPall_norm_df.tsv.gz",
    index_col=0,
    sep = "\t")

## Subset relevant samples

In [122]:
i_nonIBD_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "nonIBD", ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist() #all sample test code for diagnosis nonIBD without duplicates

i_UC_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "UC", ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist() #all sample test code for diagnosis UC without dupli

i_CD_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "CD",     ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist() #all sample test code for diagnosis CD without duplicates

## Test UC x nonIBD

In [None]:
iHMPall_MSP_test_UCxnonIBD_df = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_UC_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_UC_baseline].iloc[0,:].values, axis=1)\
    .apply(lambda x: dfrow_testranksums(x, i_nonIBD_baseline, i_UC_baseline), axis=1)

iHMPall_MSP_test_UCxnonIBD_df.columns = ["statistic", "pvalue"]

iHMPall_MSP_test_UCxnonIBD_df

In [None]:
iHMPall_MSP_test_UCxnonIBD_df = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_UC_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_UC_baseline].iloc[0,:].values, axis=1)\
    .apply(lambda x: dfrow_testranksums_Pratt(x, i_nonIBD_baseline, i_UC_baseline), axis=1)


In [None]:
Test = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_UC_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_UC_baseline].iloc[0,:].values, axis=1) 

In [None]:
k = pandas.Series(Test.iloc[0,:])
dfrow_testranksums_Pratt(k,i_nonIBD_baseline, i_UC_baseline)
#len(i_CD_baseline), len(i_UC_baseline)


## Test CD x nonIBD

In [14]:
iHMPall_MSP_test_CDxnonIBD_df = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_CD_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_CD_baseline].iloc[0,:].values, axis=1)\
    .apply(lambda x: dfrow_testranksums(x, i_nonIBD_baseline, i_CD_baseline), axis=1)

iHMPall_MSP_test_CDxnonIBD_df.columns = ["Nnotnull1", "Nnotnull2", "statistic", "pvalue"]

In [15]:
iHMPall_MSP_test_CDxnonIBD_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Nnotnull1,Nnotnull2,statistic,pvalue
msp_name,module_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
msp_0001,core,8,14,[1371.0034305092406],0.4304
msp_0002,core,2,8,[978.2648672232723],0.4507
msp_0003,core,28,56,[1296.669403622949],0.4397
msp_0004,core,3,7,[-507.9455311226234],0.5193
msp_0005,core,28,57,[-265.6121477726001],0.5136
msp_0006,core,1,1,[1692.3746147759457],0.4064
msp_0007,core,28,54,[1194.5941361816615],0.4323
msp_0008,core,28,57,[969.4754404957446],0.4604
msp_0009,core,28,57,[1544.1952935856475],0.4117
msp_0010,core,28,57,[1243.670388270729],0.4309


In [40]:
#pensez à la troncature de la donnée afin d'appliquer Wilcox
X = [1,9,0,0,8,6]
Y = stats.rankdata(X)

In [103]:
k = pandas.Series(Test.iloc[0,:])
#dfrow_testranksums_Pratt(k,i_nonIBD_baseline, i_UC_baseline)
#len(i_CD_baseline), len(i_UC_baseline)
x1 = k[i_nonIBD_baseline]
x2 = k[i_UC_baseline]
if len(x1) > len(x2): 
    x1 = x1[0:len(x2)]
if len(x2) > len(x1): 
    x2 = x2[0:len(x1)]
x1,x2 = map(np.asarray,(x1,x2))
np.subtract(x1,x2)
ranked = stats.rankdata(np.abs(np.subtract(x1,x2)),method = 'ordinal')
ranked

array([ 1,  2, 20, 16,  3,  4, 17,  5, 15,  6,  7,  8, 18, 26, 28,  9, 21,
       19, 22, 25, 10, 27, 23, 24, 14, 13, 11, 12])