In [64]:
import lateral_signaling as lsig

import os
from glob import glob

import pandas as pd
import numpy as np
import numba
from tqdm import tqdm

from collections import Counter

import holoviews as hv
hv.extension("bokeh")

import bokeh.io
bokeh.io.output_notebook()

import colorcet as cc

<hr>

In [51]:
sample_ids = np.array(
    [
        "a1",
        "a2",
        "a3",
        "a4",
        "a5",
        "a6",
        "a7",
        "e1",
        "e2",
        "e3",
        "f1",
        "f2",
        "f3",
        "f4",
        "g1",
        "g2",
        "g3",
        "g4",
        "h1",
        "h2",
        "h3",
        "h4",
    ]
)

sample_experiments = np.array(
    [
        "neg_control",
        "density",
        "density",
        "pos_control",
        "density",
        "density",
        "density",
        "ECM",
        "ECM",
        "ECM",
        "pos_control",
        "ECM",
        "ECM",
        "ECM",
        "cyto_tension",
        "FGF2",
        "cyto_tension",
        "cyto_tension",
        "cyto_tension_control",
        "FGF2_control",
        "cyto_tension_control",
        "cyto_tension_control",
    ]
)

sample_labels = np.array(
    [
        "control_neg_no_senders",
        "density_0.3",
        "density_0.5",
        "control_pos",
        "density_2.4",
        "density_3.6",
        "density_4.8",
        "ECM_PDMS_0.2",
        "ECM_PDMS_8.0",
        "ECM_PDMS_64.",
        "control_pos",
        "ECM_Fibronectin",
        "ECM_Gelatin",
        "ECM_Matrigel",
        "tension_ROCK-inhibitor",
        "FGF2",
        "tension_Blebbistatin",
        "tension_Latrunculin-A",
        "tensionctrl_ROCK-inhibitor",
        "FGF2ctrl",
        "tensionctrl_Blebbistatin",
        "tensionctrl_Latrunculin-A",
    ]
)

sample_plotlabels = np.array(
    [
        "- senders",
        "0.3",
        "0.5",
        "+ senders",
        "2.4",
        "3.6",
        "4.8",
        "PDMS (0.2 kPa)",
        "PDMS (8.0 kPa)",
        "PDMS (64. kPa)",
        "+ senders",
        "Fibronectin",
        "Gelatin",
        "Matrigel",
        "ROCK-inhibitor",
        "FGF2",
        "Blebbistatin",
        "Latrunculin-A",
        "ROCK-inhibitor (-S)",
        "FGF2 (NS)",
        "Blebbistatin (-S)",
        "Latrunculin-A (-S)",
    ]
)

sample_df = pd.DataFrame(
    dict(
        id=sample_ids,
        experiment=sample_experiments,
        label=sample_labels,
        plotlabel=sample_plotlabels,
    )
)

In [23]:
experiments = np.array([
    "neg_control",
    "pos_control",
    "density",
    "ECM",
    "tension",
    "tensionctrl",
    "FGF2",
    "FGF2ctrl",
])


In [25]:
# Remove data pertaining to FGF2 treatment
where_FGF2 = sample_df.label.str.startswith("FGF2")

sample_ids         = sample_ids[~where_FGF2]
sample_experiments = sample_experiments[~where_FGF2]
sample_labels      = sample_labels[~where_FGF2]
sample_plotlabels  = sample_plotlabels[~where_FGF2]

sample_df = sample_df.loc[~where_FGF2]

experiments = experiments[:-2]

In [53]:
n_samples = sample_df.id.size

In [54]:
sample_df

Unnamed: 0,id,experiment,label,plotlabel
0,a1,neg_control,control_neg_no_senders,- senders
1,a2,density,density_0.3,0.3
2,a3,density,density_0.5,0.5
3,a4,pos_control,control_pos,+ senders
4,a5,density,density_2.4,2.4
5,a6,density,density_3.6,3.6
6,a7,density,density_4.8,4.8
7,e1,ECM,ECM_PDMS_0.2,PDMS (0.2 kPa)
8,e2,ECM,ECM_PDMS_8.0,PDMS (8.0 kPa)
9,e3,ECM,ECM_PDMS_64.,PDMS (64. kPa)


In [55]:
data_dir = os.path.join(os.getcwd(), "FACS_data")
files_raw = pd.Series(glob(os.path.join(data_dir, "*.csv")))
files_raw.name = "filename"

In [56]:
filenames = []
data = []

for i, id_ in enumerate(sample_ids):
    f = files_raw[files_raw.str.contains(id_)].values[0]
    d = pd.read_csv(f).squeeze()
    
    filenames.append(f)
    data.append(d)

filenames = np.array(filenames)

<hr>

In [57]:
# Extract control data
pos_ctrl_samples = (sample_df["experiment"]=="pos_control").values.nonzero()[0]
pos_ctrl = np.concatenate([data[i] for i in pos_ctrl_samples])

# Indices of experimental (non-control) samples
exp_sample_idx = (~pd.Series(sample_experiments).str.endswith("control").values).nonzero()[0]

# Extract experimental data
exp_data_list = [data[i].values for i in exp_sample_idx]

In [73]:
# Change backend temporarily
hv.extension("matplotlib")

# Compare pos. control and experimental distributions as ECDFs 
pos_ctrl_ecdf = lsig.ecdf(pos_ctrl).opts(
#     height=200, 
#     width=200,
)
ecdfs = []
for i in range(n_samples):
    plot = pos_ctrl_ecdf * lsig.ecdf(
        data[i].values,
        label=sample_plotlabels[i],
    )
    plot.opts(
        xlabel="mCherry (AU)",
        ylabel="empirical CDF",
        legend_position="bottom_right",
    )
    
    ecdfs.append(plot)
    
hv.Layout(ecdfs).cols(4)

<hr>

### Null hypothesis significance testing with distributions

In our screen, our null expectation is that the experimental condition does not suppress Receiver expression. Specifically, the experimental distribution should not exceed the positive control distribution. 

By visual inspection, our data follow distributions that will not satisfy assumptions of Normality. In order to compare distributions without assuming a specific underlying type of distribution, we can use the Kolmogorov-Smirnov significance test, which only assumes that the underlying distributions are continuous. The two-sample, one-sided K-S test evaluates the null hypothesis that the cumulative density of one distribution is always greater than (or less than) another distribution. Here, our null hypothesis is that $F(x) \leq G(x)$, where $F(x)$ and $G(x)$ are the cumulative distributions of experimental and positive control.

To perform this test, we use the `ks_2samp` function from `scipy.stats`. Because we are performing multiple tests, we will apply Bonferroni correction of the resulting $p$-values, multiplying them by the number of tests.

In [52]:
# Get the number of tests to perform, used for Bonferroni correction
n_tests = len(exp_data_list)

In [53]:
# Perform Kolmogorov-Smirnov tests
from scipy import stats
ks_results = [
    stats.ks_2samp(exp_data, pos_ctrl, alternative="greater", mode="asymp")
    for exp_data in exp_data_list
]

In [59]:
[(l, int(d.mean() - pos_ctrl.mean()), r[0], r[1] * n_tests) for l, d, r in zip(sample_labels[exp_sample_idx], exp_data_list, ks_results)]

[('density_0.3', -52, 0.16700936901778585, 8.902075092293125e-65),
 ('density_0.5', -9, 0.03375959551487351, 0.03424801974125688),
 ('density_2.4', -65, 0.1894359231086446, 2.240557052248214e-83),
 ('density_3.6', -222, 0.6364799633172895, 0.0),
 ('density_4.8', -228, 0.6716848762416515, 0.0),
 ('ECM_PDMS_0.2', 13, 0.034320307035751396, 0.041820475379801894),
 ('ECM_PDMS_8.0', 31, 0.02304045042626715, 0.9131678632025907),
 ('ECM_PDMS_64.', 36, 0.005707705182840611, 12.5777911036085),
 ('control_pos_ECM_None', 0, 0.03271458297055277, 0.043577955826531595),
 ('ECM_Fibronectin', 9, 0.028726260981298907, 0.1724846821023931),
 ('ECM_Gelatin', 21, 0.014241782397722835, 4.948982798197649),
 ('ECM_Matrigel', 16, 0.02661179923900439, 0.3138968564647174),
 ('tension_ROCK-inhibitor', 85, 0.00020242914979757084, 14.993854551594294),
 ('tension_Blebbistatin', 109, 0.0001710278775440397, 14.995033336814405),
 ('tension_Latrunculin-A', 51, 0.04647307598568329, 4.946876888904334e-05)]

<hr>

## Quantifying suppression of synNotch activation

One way to measure the activation in each condition is to calculate the percentage of Receivers with fluorescence above a threshold $\phi_c$. In order to find this control percentage in an unbiased manner, we model the positive-control data distribution as a mixture of two independent Normally-distributed activated and quiescent populations (a Gaussian mixture model, or GMM). We then use this model to estimate $\phi_c$.

To accomplish this, we use the `GaussianMixture` object from `scikit-learn`.

In [429]:
from sklearn.mixture import GaussianMixture

# Construct Gaussian mixture model object
model = GaussianMixture(
    n_components=2, 
)

# Isolate positive control for Gaussian mixture modeling
pos_ctrl_gmm = pos_ctrl[:, np.newaxis]

# Perform model fitting and prediction
gmm_labels = model.fit_predict(pos_ctrl_gmm)

# Separate data by predicted label ("ON" vs "OFF")
pc_on   = pos_ctrl_gmm[gmm_labels == model.means_.argmax()]
pc_off  = pos_ctrl_gmm[gmm_labels == model.means_.argmin()]

# Calculate estimated fluorescence cutoff value (`phi_c`)
fluor_cutoff = np.mean([pc_off.max(), pc_on.min()])

# Calculate estimated percentage of activated cells in positive-control (`theta_c`)
pc_on_pct = pc_on.size / pos_ctrl_gm.size

# Estimate cutoff between ON and OFF states
print(f"""Approximate boundary value between OFF and ON states:

        {fluor_cutoff:.1f}

Using this cutoff, 
        
        {pc_on_pct * 100:.1f} %

of positive-control data points were classified as "ON".
""")

Approximate boundary value between OFF and ON states:

        337.5

Using this cutoff, 
        
        79.6 %

of positive-control data points were classified as "ON".



To get a sense for where this lies in the control distribution, we can plot it as a histogram or empirical cumulative density function (ECDF), colored by predicted state.

In [341]:
# Make data for hist and ECDF
pc_cutoff_data = {
    "fluor": np.sort(pos_ctrl_gm.ravel()),
    "percentile (ECDF)": np.arange(1, 1+pos_ctrl_gm.size)/pos_ctrl_gm.size,
    "GMM label": gmm_labels[np.argsort(pos_ctrl_gm.ravel())],
}

# Make plots
ecdf = hv.Scatter(
    pc_cutoff_data,
    kdims=["fluor"],
    vdims=["percentile (ECDF)", "GMM label"]
).groupby(
    "GMM label"
).opts(
    alpha=0.1,
    xlabel="mCherry (AU)",
).overlay(
).opts(
    show_legend=False,
)

hist = hv.Histogram(
    np.histogram(on_fluor.ravel(), np.linspace(0, 1000, 51))
).opts(
    alpha=0.5,
    xlabel="mCherry (AU)",
) * hv.Histogram(
    np.histogram(off_fluor.ravel(), np.linspace(0, 1000, 51))
).opts(
    alpha=0.5,
)

# Show plots
(hist + ecdf)

<hr>

Bootstrap sampling
* Consider experimental data of size `m`.
* Draw a random sample of size `m` from the experimental data values, with replacement (a "bootstrap" sample)
* Calculate the percentage of the bootstrap sample with "ON" expression ($\theta'$). This is the plug-in estimate.
* Repeat this many times and calculate the proportion of bootstrap samples with $\theta' >= \theta_c$. This is the $p$-value.

In [342]:
@numba.njit
def draw_bs_sample(data):
    """Draw a bootstrap sample from a 1D data set."""
    return np.random.choice(data, size=len(data))

In [343]:
@numba.njit
def draw_bs_reps_theta(x, fluor_cutoff, size=1):
    """Generate array of bootstrap replicates."""
    x_len = len(x)
    out = np.empty(size)
    for i in range(size):
        x_bs = draw_bs_sample(x)
        out[i] = np.sum(x_bs > fluor_cutoff) / x_len

    return out

In [191]:
# @numba.njit
# def diff_median(x, y):
#     """
#     Compute plug-in estimate for the difference in medians between distributions.
#     """
#     return np.median(y) - np.median(x)

In [193]:
# def draw_perm_reps(x, y, stat_fun, size=1):
#     """Generate array of permuation replicates."""
#     return np.array([stat_fun(*draw_perm_sample(x, y)) for _ in range(size)])

In [192]:
@numba.njit
def draw_perm_sample(x, y):
    """Generate a permutation sample."""
    concat_data = np.concatenate((x, y))
    np.random.shuffle(concat_data)

    return concat_data[:len(x)], concat_data[len(x):]

In [355]:
@numba.njit
def draw_perm_reps_diff_median(x, y, size=1):
    """Generate array of permuation replicates."""
    out = np.empty(size)
    for i in range(size):
        x_perm, y_perm = draw_perm_sample(x, y)
        out[i] = np.median(x_perm) - np.median(y_perm)

    return out

@numba.njit
def draw_perm_reps_diff_mean(x, y, size=1):
    """Generate array of permuation replicates."""
    out = np.empty(size)
    for i in range(size):
        x_perm, y_perm = draw_perm_sample(x, y)
        out[i] = np.mean(x_perm) - np.mean(y_perm)

    return out

In [345]:
# Draw replicates
n_samples = len(exp_data_list)
size = 10000
bs_reps_arr = np.empty((n_samples, size))
p_vals = np.empty((n_samples,))

iterator = range(n_samples)
iterator = tqdm(iterator)
for i in iterator:
    bs_reps_arr[i] = draw_bs_reps_theta((exp_data_list)[i], fluor_cutoff, size=size)
    p_vals[i] = np.sum(bs_reps_arr[i] >= pc_on_pct) / size

100%|██████████| 15/15 [00:21<00:00,  1.43s/it]


In [350]:
np.mean(pc_on_pct - bs_reps_arr, axis=1)

array([ 0.06491508,  0.02711356,  0.08530998,  0.39382684,  0.3838749 ,
        0.03272416,  0.02212174,  0.00439232,  0.01750203,  0.02172921,
        0.01011163,  0.02348725, -0.0742434 , -0.08640947, -0.10657911])

In [346]:
[(st, p) for st, p in zip(sample_type[exp_sample_idx], p_vals)]

[('density', 0.0),
 ('density', 0.0),
 ('density', 0.0),
 ('density', 0.0),
 ('density', 0.0),
 ('ECM_stiffness', 0.0),
 ('ECM_stiffness', 0.0001),
 ('ECM_stiffness', 0.2246),
 ('ECM_composition', 0.0006),
 ('ECM_composition', 0.0),
 ('ECM_composition', 0.0335),
 ('ECM_composition', 0.0),
 ('cyto_tension', 1.0),
 ('cyto_tension', 1.0),
 ('cyto_tension', 1.0)]

In [310]:
bs_reps_arr

array([[0.7911178 , 0.79149417, 0.79864509, ..., 0.79883327, 0.80579601,
        0.79770418],
       [0.73314968, 0.73516988, 0.7204775 , ..., 0.7322314 , 0.7241506 ,
        0.72635445],
       [0.77150943, 0.76962264, 0.77679245, ..., 0.77981132, 0.76132075,
        0.76679245],
       ...,
       [0.76697977, 0.76318642, 0.77023121, ..., 0.77077312, 0.77402457,
        0.75578035],
       [0.87388664, 0.86255061, 0.85526316, ..., 0.86619433, 0.87044534,
        0.87935223],
       [0.88164871, 0.88438515, 0.8828459 , ..., 0.8797674 , 0.88164871,
        0.8828459 ]])

In [353]:
np.array([d.mean() for d in exp_data_list])

array([511.3959596 , 554.24528302, 498.08391608, 341.24395323,
       335.63716026, 576.96502146, 595.07324655, 600.32744565,
       564.07194374, 572.80040434, 585.42586175, 580.49855491,
       648.84959514, 673.33812211, 615.27340475])

<hr>

In [357]:
# Compute test statistic for each data set in screen
diff_means = np.array([np.mean(x) - np.mean(pos_ctrl) for x in exp_data_list])

diff_means

In [361]:
# Draw replicates
as = len(exp_data_list)
size = 10000
perm_reps_arr = np.empty((n_tests, size))
perm_reps_pvals = np.empty((n_tests,))

iterator = range(n_tests)
iterator = tqdm(iterator)
for i in iterator:
    perm_reps_arr[i] = draw_perm_reps_diff_mean(pos_ctrl, exp_data_list[i], size=size)
    perm_reps_pvals[i] = np.sum(perm_reps_arr[i] <= diff_means[i]) / size

 13%|█▎        | 2/15 [16:34:22<107:43:26, 29831.31s/it]


KeyboardInterrupt: 

In [360]:
[(st, p) for st, p in zip(sample_type[exp_], perm_reps_pvals)]

array([0.    , 0.0027, 0.    , 0.    , 0.    , 0.9999, 1.    , 1.    ,
       0.5401, 0.9959, 1.    , 1.    , 1.    , 1.    , 1.    ])

In [215]:
# Compute test statistic for each data set in screen
diff_medians = np.array([np.median(x) - np.median(pos_ctrl) for x in exp_data_list])

diff_medians

In [212]:
# Draw replicates
n_tests = len(exp_data_list)
size = 10000
perm_reps_arr = np.empty((n_tests, size))
perm_reps_pvals = np.empty((n_tests,))

iterator = range(n_tests)
iterator = tqdm(iterator)
for i in iterator:
    perm_reps_arr[i] = draw_perm_reps_diff_median(pos_ctrl, exp_data_list[i], size=size)
    perm_reps_pvals[i] = np.sum(perm_reps_arr[i] <= diff_medians[i]) / size

100%|██████████| 15/15 [00:41<00:00,  2.73s/it]


In [214]:
[(st, p) for st, p in zip(sample_type[exp_], perm_reps_pvals)]

array([0.    , 0.0438, 0.    , 0.    , 0.    , 1.    , 1.    , 1.    ,
       0.9976, 1.    , 1.    , 1.    , 1.    , 1.    , 1.    ])

<hr>

In [36]:
%load_ext watermark

In [37]:
%watermark -p holoviews,jupyterlab,bokeh

holoviews : 1.14.3
jupyterlab: 1.2.6
bokeh     : 2.3.1

