<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Proto-SAIL---Analysis:-CD-x-nonIBD-and-UC-x-nonIBD-at-baseline" data-toc-modified-id="Proto-SAIL---Analysis:-CD-x-nonIBD-and-UC-x-nonIBD-at-baseline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Proto SAIL - Analysis: CD x nonIBD and UC x nonIBD at baseline</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Supporting-functions" data-toc-modified-id="Supporting-functions-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Supporting functions</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Subset-relevant-samples" data-toc-modified-id="Subset-relevant-samples-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Subset relevant samples</a></span></li><li><span><a href="#Test-UC-x-nonIBD" data-toc-modified-id="Test-UC-x-nonIBD-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Test UC x nonIBD</a></span></li><li><span><a href="#Test-CD-x-nonIBD" data-toc-modified-id="Test-CD-x-nonIBD-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Test CD x nonIBD</a></span></li></ul></li></ul></div>

# Proto SAIL - Analysis: CD x nonIBD and UC x nonIBD at baseline

## Imports

In [7]:
import numpy as np
# import scipy
# from scipy import stats
# import pandas
# import platform

In [8]:
## Get versions
# print(f"python\t{platform.python_version()}")
print(f"numpy\t{numpy.__version__}")
# print(f"scipy\t{scipy.__version__}")
# print(f"pandas\t{pandas.__version__}")
# ipython_version = !ipython --version
# print(f"ipython\t{ipython_version[0]}")

numpy	1.13.3


## Supporting functions

In [11]:
def ranksums(x, y):
    """
    Compute the Wilcoxon rank-sum statistic for two samples.
    The Wilcoxon rank-sum test tests the null hypothesis that two sets
    of measurements are drawn from the same distribution.  The alternative
    hypothesis is that values in one sample are more likely to be
    larger than the values in the other sample.
    This test should be used to compare two samples from continuous
    distributions.  It does not handle ties between measurements
    in x and y.  For tie-handling and an optional continuity correction
    see `scipy.stats.mannwhitneyu`.
    Parameters
    ----------
    x,y : array_like
        The data from the two samples
    Returns
    -------
    statistic : float
        The test statistic under the large-sample approximation that the
        rank sum statistic is normally distributed
    pvalue : float
        The two-sided p-value of the test
    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
    """
    x, y = map(np.asarray, (x, y))
    n1 = len(x)
    n2 = len(y)
    alldata = np.concatenate((x, y))
    ranked = rankdata(alldata)
    x = ranked[:n1]
    s = np.sum(x, axis=0)
    expected = n1 * (n1+n2+1) / 2.0
    z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
#     prob = 2 * distributions.norm.sf(abs(z))
    prob = 2 * np.random.normal(abs(z))
    

    return RanksumsResult(z, prob)

In [13]:
def dfrow_testranksums(s, index_subset1, index_subset2):
    x1 = s[index_subset1]
    x2 = s[index_subset2]
    Nnotnull1 = sum(x1 != 0)
    Nnotnull2 = sum(x2 != 0)
#     t = stats.ranksums(x1,x2)
    t = ranksums(x1,x2)
#     return pandas.Series([Nnotnull1, Nnotnull2, t.statistic, t.pvalue])
    return np.array([Nnotnull1, Nnotnull2, t.statistic, t.pvalue])

## Load data

In [14]:
datadir = "~/Downloads/201903_testcase"

In [15]:
# genfromtxt('my_file.csv', delimiter=',')
iHMPall_MSP_df = np.genfromtxt(
    f"{datadir}/iHMPall_MSP_df.tsv.gz",
    index_col=[0,1], sep = "\t")

iHMPall_metadata_df = np.genfromtxt(
    f"{datadir}/iHMPall_metadata_df.tsv.gz",
    index_col=0,
    sep = "\t")

iHMPall_norm_df = np.genfromtxt(
    f"{datadir}/iHMPall_norm_df.tsv.gz",
    index_col=0,
    sep = "\t")


TypeError: genfromtxt() got an unexpected keyword argument 'index_col'

## Subset relevant samples

In [15]:
i_nonIBD_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "nonIBD", ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist()

i_UC_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "UC",     ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist()

i_CD_baseline = iHMPall_metadata_df\
    .loc[iHMPall_metadata_df["diagnosis"] == "CD",     ["participantID", "diagnosis", "visit_num"]]\
    .reset_index()\
    .dropna(axis="index", how="any")\
    .groupby("participantID").first().loc[:,"index"].tolist()

## Test UC x nonIBD

In [16]:
iHMPall_MSP_test_UCxnonIBD_df = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_UC_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_UC_baseline].iloc[0,:].values, axis=1)\
    .apply(lambda x: dfrow_testranksums(x, i_nonIBD_baseline, i_UC_baseline), axis=1)

iHMPall_MSP_test_UCxnonIBD_df.columns = ["Nnotnull1", "Nnotnull2", "statistic", "pvalue"]

In [17]:
iHMPall_MSP_test_UCxnonIBD_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Nnotnull1,Nnotnull2,statistic,pvalue
msp_name,module_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
msp_0001,core,8.0,13.0,-0.955219,0.339467
msp_0002,core,2.0,8.0,-1.201260,0.229651
msp_0003,core,28.0,32.0,0.506555,0.612467
msp_0004,core,3.0,6.0,-0.571684,0.567536
msp_0005,core,28.0,33.0,-0.738123,0.460439
msp_0006,core,1.0,2.0,-0.151967,0.879213
msp_0007,core,28.0,30.0,1.244679,0.213250
msp_0008,core,28.0,33.0,-1.230206,0.218620
msp_0009,core,28.0,33.0,0.000000,1.000000
msp_0010,core,28.0,33.0,0.014473,0.988453


## Test CD x nonIBD

In [18]:
iHMPall_MSP_test_CDxnonIBD_df = iHMPall_MSP_df\
    .loc[iHMPall_MSP_df.index.get_level_values('module_name') == "core", i_nonIBD_baseline + i_CD_baseline]\
    .apply(lambda s: s / iHMPall_norm_df.loc[:, i_nonIBD_baseline + i_CD_baseline].iloc[0,:].values, axis=1)\
    .apply(lambda x: dfrow_testranksums(x, i_nonIBD_baseline, i_CD_baseline), axis=1)

iHMPall_MSP_test_CDxnonIBD_df.columns = ["Nnotnull1", "Nnotnull2", "statistic", "pvalue"]

In [19]:
iHMPall_MSP_test_CDxnonIBD_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Nnotnull1,Nnotnull2,statistic,pvalue
msp_name,module_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
msp_0001,core,8.0,14.0,0.093503,0.925504
msp_0002,core,2.0,8.0,-0.476865,0.633459
msp_0003,core,28.0,56.0,0.794774,0.426745
msp_0004,core,3.0,7.0,-0.130904,0.895851
msp_0005,core,28.0,57.0,-1.028532,0.303700
msp_0006,core,1.0,1.0,0.140254,0.888459
msp_0007,core,28.0,54.0,0.000000,1.000000
msp_0008,core,28.0,57.0,-1.898108,0.057682
msp_0009,core,28.0,57.0,-0.972430,0.330837
msp_0010,core,28.0,57.0,0.514266,0.607066
