# Non-parametric model comparison of FACS data using log-likelihood ratio (LLR) 
---

## Setup

__Environment__

In [1]:
import os

import pandas as pd
import numpy as np
import numba
from tqdm import tqdm

import holoviews as hv
hv.extension("matplotlib")

import lateral_signaling as lsig

__Functions__

In [2]:
def data_to_hist(d, bins, data_range=(0, 1000)):
    """Bin sampled values into a histogram.

    Thin wrapper around ``np.histogram``.

    Parameters
    ----------
    d : array-like
        Sampled values to bin.
    bins : int or sequence
        Forwarded as the ``bins`` argument of ``np.histogram``.
    data_range : tuple of (low, high), optional
        Lower and upper edge of the binned interval, forwarded as ``range``.
        Defaults to ``(0, 1000)``.

    Returns
    -------
    tuple
        ``(counts, bin_edges)`` as produced by ``np.histogram``.
    """
    counts, bin_edges = np.histogram(d, bins=bins, range=data_range)
    return counts, bin_edges

__Random number generation__

In [3]:
# Seeded generator, so random draws made from `rng` repeat between runs
seed = 2021
rng = np.random.default_rng(seed=seed)

__File I/O__

In [113]:
# Input directories (relative paths are resolved against the working directory)
data_dir = os.path.abspath("../data")
FACS_data_dir = os.path.join(data_dir, "FACS_data")

# Output directory for figures
save_dir = os.path.abspath("../plots")

# Switches controlling whether results / figures are written to disk
save_data = save_figs = False

# Figure export format and resolution
fmt, dpi = "png", 300

<hr>

## Read in metadata and data

In [71]:
# Load the table describing each FACS sample, then display it
metadata_path = os.path.join(FACS_data_dir, "metadata.csv")
metadata = pd.read_csv(metadata_path)
metadata

Unnamed: 0,filename,id,experiment,label,plotlabel
0,a1.csv,a1,neg_control,control_neg_no_senders,Neg. Ctrl.
1,a2.csv,a2,density,density_0.3,0.25x
2,a3.csv,a3,density,density_0.5,0.5x
3,a4.csv,a4,pos_control,control_pos,(▲) 1x
4,a5.csv,a5,density,density_2.4,2x
5,a6.csv,a6,density,density_3.6,3x
6,a7.csv,a7,density,density_4.8,4x
7,e1.csv,e1,ECM,ECM_PDMS_0.2,0.2 kPa PDMS
8,e2.csv,e2,ECM,ECM_PDMS_8.0,8.0 kPa PDMS
9,e3.csv,e3,ECM,ECM_PDMS_64.,64. kPa PDMS


In [72]:
# Load each sample's measurements, locating its file by sample ID
files_raw = metadata.filename
filenames, data = [], []

for sample_id in metadata.id.values:
    # Take the first filename that matches the ID (str.contains pattern match)
    matching_files = files_raw[files_raw.str.contains(sample_id)]
    sample_file = matching_files.values[0]

    # Read the sample's CSV; squeeze() collapses any length-1 axes
    sample = pd.read_csv(os.path.join(FACS_data_dir, sample_file)).squeeze()

    filenames.append(sample_file)
    data.append(sample)

# Keep the matched filenames as an array, in metadata row order
filenames = np.array(filenames)

In [73]:
# Boolean mask: True where the experiment name contains "control"
is_control = metadata.experiment.str.contains("control").values

# Integer row positions of control and of experimental samples
ctrldata_idx = np.nonzero(is_control)[0]
expdata_idx = np.nonzero(~is_control)[0]

In [74]:
# Split the raw samples into controls and experimental conditions.
# Controls are unpacked positionally: the first, second and third entries
# become nc, pc1 and pc2, and the last entry becomes pc12.
ctrl_samples = [data[i].values for i in ctrldata_idx]
nc, pc1, pc2, *_, pc12 = ctrl_samples
expdata_list = [data[i].values for i in expdata_idx]

# Report the sizes of positive controls 1 and 2
print("PC sample sizes:", pc1.size, ",", pc2.size)

# Report the mean and minimum size of the experimental samples
exp_sizes = [d.size for d in expdata_list]
print(f"Mean sample size of experimental samples: {np.mean(exp_sizes):.2f}")

min_samplesize = np.min(exp_sizes)
print(f"Min sample size of experimental samples: {min_samplesize}")

PC sample sizes: 5314 , 5546
Mean sample size of experimental samples: 5514.47
Min sample size of experimental samples: 4660


In [75]:
# Row positions of the samples to plot, in display order:
# 0, 22, then 1 through 14, then 16 and 17
idx_to_plot = np.array([0, 22, *range(1, 15), 16, 17])

<hr>

## Calculate PDF and log-PDF of reference distributions by binning

In [98]:
# Number of bins in histogram (passed to data_to_hist as `bins` when the
# per-sample histograms are computed)
nbins = 100

In [99]:
# Per-sample bin counts: one row per sample, one column per bin
data_hists = np.array([data_to_hist(d.values, nbins)[0] for d in data])

# Add a pseudocount of 1 to every bin so no bin has zero probability
# (avoids log of zero), then normalize each row to sum to 1
smoothed_counts = data_hists + 1
data_hists_pdf = smoothed_counts / smoothed_counts.sum(axis=1, keepdims=True)

# Base-10 logarithm of the binned probabilities
data_hists_logpdf = np.log10(data_hists_pdf)

In [100]:
# Log-PDF rows belonging to the control samples, in metadata order
ctrl_hist_logpdfs = data_hists_logpdf[ctrldata_idx]

# Positional unpacking: first three rows -> nc, pc1, pc2; last row -> pc12
(
    nc_hist_logpdf,
    pc1_hist_logpdf,
    pc2_hist_logpdf,
    *_,
    pc12_hist_logpdf,
) = ctrl_hist_logpdfs

In [101]:
# Log-likelihood of every sample under each reference (control) distribution:
# sum over bins of (observed count) * (log10 reference probability)
log_like_nc, log_like_pc1, log_like_pc2, log_like_pc12 = (
    (data_hists * ref_logpdf).sum(axis=1)
    for ref_logpdf in (
        nc_hist_logpdf,
        pc1_hist_logpdf,
        pc2_hist_logpdf,
        pc12_hist_logpdf,
    )
)

# Log-likelihood ratios (LLRs): each positive-control reference minus the
# negative-control reference
llr_pc1_nc, llr_pc2_nc, llr_pc12_nc = (
    log_like_pc - log_like_nc
    for log_like_pc in (log_like_pc1, log_like_pc2, log_like_pc12)
)

__Save results in metadata__

In [102]:
# New table = metadata plus one LLR column per positive-control reference
# (assign returns a new DataFrame; `metadata` itself is not modified)
metadata_res = metadata.assign(
    **{
        "log_likelihood_ratio_Pos1_Neg": llr_pc1_nc,
        "log_likelihood_ratio_Pos2_Neg": llr_pc2_nc,
        "log_likelihood_ratio_Pos_pooled_Neg": llr_pc12_nc,
    }
)

In [126]:
# Pick the rows to plot and number them 0..n-1 in display order; the
# numbering is exposed as an "id_num" column
llr_data = (
    metadata_res.iloc[idx_to_plot]
    .reset_index(drop=True)
    .rename_axis("id_num")
    .reset_index()
)

pooled_llr = llr_data["log_likelihood_ratio_Pos_pooled_Neg"]

# Every spike in the spike plot starts at x = 0
llr_data["x0"] = 0.

# Label x-positions: per-row min(LLR, 0), and the overall minimum LLR
llr_data["label_xval"] = np.minimum(pooled_llr.values, 0)
llr_data["label_xval2"] = pooled_llr.min()

In [127]:
# Palette: index 0 is the default, 1 marks the negative control and
# 2 marks positive controls
colors = np.array([lsig.black, lsig.cols_red[0], lsig.cols_teal[1]])

# Every spike starts with palette index 0
spike_colors_idx = np.zeros(llr_data.shape[0], dtype=int)

# Neg ctrl (row 0)
spike_colors_idx[0] = 1

# Pooled pos ctrl (row 1) and individual pos ctrls (rows 4 and 11)
spike_colors_idx[[1, 4, 11]] = 2

# Look up the color of each spike from its palette index
spike_colors = colors[spike_colors_idx]

In [130]:
# Axis limits and tick positions; the y-range pads one unit around the rows
n_spikes = llr_data.shape[0]
xlim = (-5000, 16500)
xticks = np.linspace(-5000, 15000, 5)
ylim = (-1, n_spikes + 1)

# Column names shared by the plot elements
llr_col = "log_likelihood_ratio_Pos_pooled_Neg"
row_col = "id_num"

# Marker at the tip of each spike
llr_points = hv.Points(llr_data, kdims=[llr_col, row_col]).opts(
    c=spike_colors,
    s=25,
)

# Sample labels, right-aligned at the minimum LLR and shifted left by 1000
llr_labels = hv.Labels(
    llr_data,
    kdims=["label_xval2", row_col],
    vdims=["plotlabel"],
).opts(
    horizontalalignment="right",
    xoffset = -1000,
)

# Horizontal spike from x0 to the LLR value on each row
llr_segments = hv.Segments(
    llr_data,
    ["x0", row_col, llr_col, row_col],
).opts(
    color=spike_colors,
)

# Vertical line at x = 0 spanning the full y-range
llr_yaxis = hv.Segments(
    (0, ylim[0], 0, ylim[1]),
).opts(
    color="k",
    lw=1,
)

# Axis-level options applied to the finished overlay
#     title="High-density conditions resemble No-Senders condition",
axis_opts = dict(
    xlabel = r"$\log_{10}\;\left[\frac{P(x | +Senders)}{P(x | -Senders)}\right]$",
    xlim=xlim,
    xticks=xticks,
    yaxis=False,
    ylim=ylim,
    aspect=0.7,
)

# NOTE(review): the VLine at -6000 lies left of xlim[0] = -5000, so it is
# outside the visible x-range -- confirm whether it is still needed
llr_overlay = (
    llr_points * llr_segments * llr_labels * llr_yaxis
    * hv.VLine(-6000)
)

llr_spikeplot = llr_overlay.opts(
    invert_yaxis=True,
    hooks=[lsig.remove_RT_spines],
).opts(**axis_opts)

In [131]:
# Display the spike plot at half the export resolution (dpi // 2)
hv.output(llr_spikeplot, dpi=dpi//2)

  arr = np.array(values)
  arr = np.array(values)


In [132]:
# Write the spike plot to save_dir when figure saving is switched on
if save_figs:
    fname = "FACS_llr_spikeplot"
    fpath = os.path.join(save_dir, f"{fname}.{fmt}")
    hv.save(llr_spikeplot, fpath, dpi=dpi)

  arr = np.array(values)
  arr = np.array(values)


<hr>

## How does bin size affect log-likelihood ratio?

__Calculate log-likelihood ratio (LLR) for a range of bin counts (1 to 10,000 bins)__

In [89]:
# Max number of bins to try
max_nbins = 10000

# Bin counts to sweep over: 1, 2, ..., max_nbins
nbins_range = 1 + np.arange(max_nbins)

# Initialize output: one row per bin count, one column per sample
llrs_nc_pc1  = np.zeros((max_nbins, len(data)))
llrs_nc_pc2  = np.zeros((max_nbins, len(data)))
llrs_nc_pc12 = np.zeros((max_nbins, len(data)))

# The loop variable is `_nbins` (underscore-prefixed like the other loop
# temporaries) so that the sweep does not overwrite the module-level `nbins`
# used by the single-resolution analysis.
for i, _nbins in enumerate(tqdm(nbins_range)):

    # Get number of observations in each bin
    _data_hists = np.array([data_to_hist(d.values, _nbins)[0] for d in data])

    # Add 1 to each bin to avoid div by 0. Then normalize and take the logarithm
    _data_hists_pdf    = (_data_hists + 1) / np.sum(_data_hists + 1, axis=1, keepdims=True)
    _data_hists_logpdf = np.log10(_data_hists_pdf)

    # Get ctrl and experimental sample histograms
    _nc_hist_logpdf, _pc1_hist_logpdf, _pc2_hist_logpdf, *_, _pc12_hist_logpdf = _data_hists_logpdf[ctrldata_idx]

    # Calculate the log likelihood of data given empirical distributions
    _log_like_nc    = np.sum(_data_hists *   _nc_hist_logpdf, axis=1)
    _log_like_pc1   = np.sum(_data_hists *  _pc1_hist_logpdf, axis=1)
    _log_like_pc2   = np.sum(_data_hists *  _pc2_hist_logpdf, axis=1)
    _log_like_pc12  = np.sum(_data_hists * _pc12_hist_logpdf, axis=1)

    # Get log-likelihood ratios (LLRs).
    # NOTE: these are NC minus PC, i.e. the opposite sign to the
    # `llr_pc*_nc` values (PC minus NC) computed at a single bin count.
    llrs_nc_pc1[i]  = _log_like_nc - _log_like_pc1
    llrs_nc_pc2[i]  = _log_like_nc - _log_like_pc2
    llrs_nc_pc12[i] = _log_like_nc - _log_like_pc12

100%|█████████████████████████████████| 10000/10000 [03:06<00:00, 53.74it/s]


__Plot LLR as a function of the number of histogram bins for each sample__

In [116]:
# One curve per plotted sample: LLR as a function of # of bins used
llr_curves = [
    hv.Curve((nbins_range, llr), label=label).opts(linewidth=1)
    for llr, label in zip(llrs_nc_pc1.T[idx_to_plot], metadata.plotlabel[idx_to_plot])
]

# Dotted reference line at a log-ratio of zero (equal likelihood)
llr_zero_line = hv.HLine(0).opts(
    c="k",
    linestyle="dotted",
    linewidth=1,
)

# Combine curves and reference line on a log-scaled x-axis
llr_nbins = hv.Overlay(llr_curves + [llr_zero_line]).opts(
    aspect=2,
    legend_position="right",
    logx=True,
    xlabel="# histogram bins",
    ylabel=r"$\log_{10}$(Likelihood ratio) (LLR)"
)

In [117]:
# Display the LLR-vs-bin-count plot at half the export resolution (dpi // 2)
hv.output(llr_nbins, dpi=dpi//2)

In [None]:
# Write the LLR-vs-bin-count plot to save_dir when figure saving is switched on
if save_figs:
    fname = "FACS_llr_vs_num_bins"
    fpath = os.path.join(save_dir, f"{fname}.{fmt}")
    hv.save(llr_nbins, fpath, dpi=dpi)

<hr>

__Save results in new metadata file__

In [115]:
# Write the metadata table with the added LLR columns next to the FACS data
if save_data:
    fname = "metadata_with_LLR"
    # Append the ".csv" extension so the output is recognizable as a CSV
    # file, mirroring how the figure-saving cells append their extension
    fpath = os.path.join(FACS_data_dir, fname + ".csv")
    metadata_res.to_csv(fpath)