### Bland-Altman analysis

Bland-Altman analysis was performed to test the agreement between the enDR3-DRIPc-seq and S9.6-DRIPc-seq methods.

https://www.jstor.org/stable/2987937?origin=crossref

https://biochemia-medica.com/en/journal/25/2/10.11613/BM.2015.015/fullArticle

https://www.sciencedirect.com/science/article/pii/S2452247318302462

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5800325/

https://www.statology.org/bland-altman-plot/

In [None]:
from typing import Optional

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns

from scipy import stats

In [None]:
# Make SVG text as font not as curves
mpl.rcParams['svg.fonttype'] = 'none'

# Put Arial first in the sans-serif fallback list. A fresh list is assigned
# (instead of inserting into the existing list in place) so that re-running
# this cell does not accumulate duplicate 'Arial' entries.
mpl.rcParams['font.sans-serif'] = ['Arial'] + [
    font_name for font_name in mpl.rcParams['font.sans-serif'] if font_name != 'Arial'
]
mpl.rcParams['font.family'] = 'sans-serif'

In [None]:
# Private copy of seaborn's current axes style. It is customised in the next
# cell and passed to `sns.axes_style()` when plotting.
sns_axes_style = sns.axes_style().copy()

In [None]:
# Enable the tick marks on the bottom (x) and left (y) axes in the copied style.
sns_axes_style['xtick.bottom'] = True
sns_axes_style['ytick.left'] = True

In [None]:
# Directory with the input count tables; the quantile-normalised table is
# written here as well.
SRC_DIR_PATH = Path(r"/path/to/analysis/correlations")
# Directory passed as `dest_dir` when generating the Bland-Altman figures.
DEST_DIR_PATH = Path(r"/path/to/analysis/figures/correlations")

In [None]:
# Library-size-normalised read counts file (read below as a tab-separated table).
# Rows are peaks, columns are samples; the counts are taken over overlapping peaks.
# The file may contain several samples and several replicates of each sample.
# All of them will be globally quantile normalised.
counts_file_name = "Dataset-1_vs_Dataset-2_overlap_counts_norm.txt"

In [None]:
# Load the counts table: tab-separated, first line is the header, '.' as decimal mark.
counts_normalised = pd.read_csv(SRC_DIR_PATH.joinpath(counts_file_name), header=0, sep='\t', decimal='.')

In [None]:
# Summary statistics of the loaded table, as a quick sanity check before normalisation.
counts_normalised.describe()

#### Quantile normalisation

The first step is to quantile normalise the dataset.

In [None]:
def quantile_normalize(df):
    """
    Quantile normalise the columns of a data frame.

    input: dataframe with numerical columns
    output: dataframe in which every value is replaced by the mean, taken
            across columns, of the values that share its within-column rank
    """
    # Sort each column independently: row i of the sorted matrix holds the
    # i-th smallest value of every column.
    sorted_columns = np.sort(df.values, axis=0)
    rank_means = pd.DataFrame(sorted_columns,
                              index=df.index,
                              columns=df.columns).mean(axis=1)
    # Key the reference distribution by 1-based rank so ranks can be mapped onto it.
    rank_means.index = np.arange(1, len(rank_means) + 1)

    # Tied values all receive the lowest rank of their group (method="min").
    ranks = df.rank(method="min")
    ranks_long = ranks.stack().astype(int)
    return ranks_long.map(rank_means).unstack()

In [None]:
# Quantile normalise the six columns at positions 3-8 only.
# NOTE(review): this presumably matches the 4 + 2 replicate columns averaged in
# `get_mean_data()` and assumes 'chr', 'start', 'end' occupy positions 0-2 —
# confirm against the input file and adjust the range if the layout differs.
counts_normalised_qn = quantile_normalize(counts_normalised.iloc[:, range(3, 9)])
# Re-attach the peak coordinate columns (unchanged) in front of the normalised values.
final_df = pd.concat([counts_normalised[['chr', 'start', 'end']].copy(), counts_normalised_qn], axis=1)

In [None]:
# Preview the first rows of the quantile-normalised table.
final_df.head()

In [None]:
# Save the quantile-normalised data into the source directory as a
# tab-separated file without the index column.
# Example file name
dest_file_name = "Dataset-1_vs_Dataset-2_overlap_counts_norm_qn.txt"
final_df.to_csv(SRC_DIR_PATH.joinpath(dest_file_name), sep='\t', index=False)

#### Actual Bland-Altman analysis

The code is written in a way that allows analysing multiple samples at a time.
Replicates are also handled, but one has to take care of them in the
`get_mean_data()` function.

In [None]:
def get_mean_data(data_df: pd.DataFrame, replicates: Optional[dict] = None) -> pd.DataFrame:
    """
    Get the mean of the replicate samples per data row (peak).

    The returned dataframe contains the 'chr', 'start', and 'end' columns
    followed by one column per dataset holding the row-wise mean of that
    dataset's replicate columns.

    Parameters
    ----------
    data_df
        Table with 'chr', 'start', 'end' and the replicate columns.
    replicates
        Mapping of output column name -> list of replicate column names to
        average. When omitted, the example layout is used: four replicates
        of Dataset 1 ('Dataset_1_1' .. 'Dataset_1_4') and two replicates of
        Dataset 2 ('Dataset_2_1', 'Dataset_2_2'), which yields the columns
        'chr', 'start', 'end', 'Dataset_1', 'Dataset_2'.

    Returns
    -------
    A new dataframe; `data_df` is not modified.

    Raises
    ------
    KeyError
        If a coordinate column or a listed replicate column is missing.
    """
    if replicates is None:
        # Default layout of the example data; built per call so the default
        # mapping can never be mutated between calls.
        replicates = {
            'Dataset_1': ['Dataset_1_1', 'Dataset_1_2', 'Dataset_1_3', 'Dataset_1_4'],
            'Dataset_2': ['Dataset_2_1', 'Dataset_2_2'],
        }

    mean_data = data_df[['chr', 'start', 'end']].copy()
    for dataset_name, replicate_columns in replicates.items():
        mean_data[dataset_name] = data_df[list(replicate_columns)].mean(axis=1)
    return mean_data

In [None]:
def get_datasets_mean_diff(data, col_1_name, col_2_name):
    """
    Build the per-peak table used for a Bland-Altman comparison of two columns.

    The result keeps 'chr', 'start', 'end' (so individual peaks can be looked
    up later) and the two compared columns, and adds:
      'mean'     - row-wise mean of the two columns (NaN values skipped),
      'diff'     - col_1_name minus col_2_name,
      'diff_prc' - 'diff' as a percentage of 'mean'.
    The input frame is left untouched.
    """
    kept_columns = ['chr', 'start', 'end', col_1_name, col_2_name]
    result = data[kept_columns].copy()

    pair = data[[col_1_name, col_2_name]]
    result['mean'] = pair.mean(axis=1, skipna=True)
    result['diff'] = data[col_1_name] - data[col_2_name]
    result['diff_prc'] = (result['diff'] / result['mean']) * 100

    return result

In [None]:
def get_data_stat_summary(data, verbose: bool = True):
    """
    Compute the Bland-Altman summary statistics of the 'diff' column.

    Missing differences (NaN) are dropped once up front so that every
    statistic below is computed on the same set of observations.

    Parameters
    ----------
    data
        Table with a 'diff' column (see `get_datasets_mean_diff()`).
    verbose
        When True, print the statistics.

    Returns
    -------
    dict with the keys:
      'mean_diff' - mean difference (bias),
      'sd_diff'   - sample standard deviation of the differences,
      'ci_diff'   - 95% t-based confidence interval of the mean difference,
      'pvalue'    - two-sided one-sample t-test p-value against a mean of 0,
      'loa_upper' - mean_diff + 1.96 * sd_diff,
      'loa_lower' - mean_diff - 1.96 * sd_diff.
    """
    diffs = data['diff'].dropna()

    mean_diff = diffs.mean()
    sd_diff = diffs.std()

    # The confidence level is passed positionally: SciPy renamed this keyword
    # from `alpha` to `confidence`, so the positional form is the only one
    # that works across SciPy versions.
    ci_diff = stats.t.interval(0.95, df=len(diffs) - 1,
                               loc=mean_diff, scale=stats.sem(diffs))

    ttest_res = stats.ttest_1samp(diffs, popmean=0, alternative='two-sided')

    upper_limit = mean_diff + 1.96 * sd_diff
    lower_limit = mean_diff - 1.96 * sd_diff

    if verbose:
        print(f"Mean diff: {mean_diff}")
        print(f"Diff STD: {sd_diff}")
        print(f"Diff 95% CI: {ci_diff}")
        print("One sample TTest under NULL (mu=0):")
        print(ttest_res)
        print(f"\nLimits of agreement (95%): [{lower_limit}, {upper_limit}]")

    return {
        'mean_diff': mean_diff,
        'sd_diff': sd_diff,
        'ci_diff': ci_diff,
        'pvalue': ttest_res.pvalue,
        'loa_upper': upper_limit,
        'loa_lower': lower_limit,
    }

In [None]:
def make_ba_plot(data, stat_summary: dict, title=None):
    """
    Draw a stand-alone Bland-Altman plot ('diff' against 'mean') on the current axes.

    Horizontal reference lines: the mean difference (black), zero (grey) and
    the upper (red, dashed) and lower (green, dashed) limits of agreement
    taken from `stat_summary`. Sets the global seaborn theme as a side effect.
    """
    sns.set_theme(style="white", font="Arial")

    # (stat_summary key or None for the zero line, line styling), in drawing order.
    reference_lines = (
        ('mean_diff', {'color': 'k', 'linewidth': 0.8}),
        (None, {'color': 'grey', 'linewidth': 0.8}),
        ('loa_upper', {'color': 'r', 'linewidth': 0.75, 'linestyle': '--'}),
        ('loa_lower', {'color': 'g', 'linewidth': 0.75, 'linestyle': '--'}),
    )

    with sns.axes_style(style=sns_axes_style):
        axes = sns.scatterplot(data=data, x='mean', y='diff')
        for summary_key, line_style in reference_lines:
            y_position = 0 if summary_key is None else stat_summary[summary_key]
            axes.axhline(y_position, **line_style)
        axes.set(title=title)
        axes.get_figure().tight_layout()

In [None]:
def make_ba_plot_axis(data, stat_summary: dict, fig_ax, title=None, ylim=(-2, 2)):
    """
    Draw a Bland-Altman plot ('diff' against 'mean') on an existing axes.

    Parameters
    ----------
    data
        Table with 'mean' and 'diff' columns.
    stat_summary
        Dict providing 'mean_diff', 'loa_upper' and 'loa_lower'.
    fig_ax
        Matplotlib axes to draw on.
    title
        Axes title.
    ylim
        (lower, upper) limits of the y-axis. Defaults to (-2, 2); pass None
        to let matplotlib autoscale so that no points are clipped.

    Horizontal reference lines: the mean difference (black), zero (grey) and
    the upper (red, dashed) and lower (green, dashed) limits of agreement.
    The limits of agreement and their total width are also written in the
    lower-left corner of the axes.
    """
    ba_plot = sns.scatterplot(data=data, x='mean', y='diff', ax=fig_ax, s=10)
    ba_plot.axhline(stat_summary['mean_diff'], color='k', linewidth=0.8)
    ba_plot.axhline(0, color='grey', linewidth=0.8)
    ba_plot.axhline(stat_summary['loa_upper'], color='r', linewidth=0.75, linestyle='--')
    ba_plot.axhline(stat_summary['loa_lower'], color='g', linewidth=0.75, linestyle='--')
    ba_plot.set(title=title)
    loa = f"LoA = [{stat_summary['loa_lower']:.2f}, {stat_summary['loa_upper']:.2f}] Total = {stat_summary['loa_upper']-stat_summary['loa_lower']:.2f}"
    # Position is given in axes coordinates, so the label stays in the corner
    # regardless of the data range.
    fig_ax.text(0.025, 0.025, loa,
        verticalalignment='bottom', horizontalalignment='left',
        transform=fig_ax.transAxes, fontsize=10)
    if ylim is not None:
        ba_plot.set_ylim(*ylim)
    ba_plot.get_figure().tight_layout()
In [None]:
def calculate_row_count(samples_cnt, col_cnt):
    """
    Return how many plot rows are needed to place `samples_cnt` plots
    (one per datasets pair) on a grid that is `col_cnt` columns wide.
    """
    full_rows, leftover = divmod(samples_cnt, col_cnt)
    # A partially filled last row still needs a row of its own.
    return full_rows + 1 if leftover else full_rows

In [None]:
def load_data(file_path) -> pd.DataFrame:
    """Read a tab-separated table whose first line is the header ('.' as decimal mark)."""
    table = pd.read_table(file_path, header=0, decimal='.')
    return table

In [None]:
# Number of subplot columns in the generated figure.
FIG_COLUMNS_COUNT = 1

def generate_save_ba_plots(file_path, samples_mapping: list, save_figs: bool = False, dest_dir: Optional[Path] = None, verbose: bool = False):
    """
    Run the Bland-Altman analysis for every pair in `samples_mapping` and
    draw one plot per pair on a single figure.

    Parameters
    ----------
    file_path
        Path to the tab-separated counts table (str or Path).
    samples_mapping
        List of (column_1, column_2, title) entries; the columns must exist in
        the frame returned by `get_mean_data()`.
    save_figs
        When True (and `dest_dir` is given), save the figure as
        '<input file stem>_ba.jpg' and '<input file stem>_ba.pdf' and close it.
    dest_dir
        Directory for the saved figures; created if it does not exist.
    verbose
        Passed to `get_data_stat_summary()` to print the statistics.

    Raises
    ------
    ValueError
        If `samples_mapping` is empty.
    """
    pairs_cnt = len(samples_mapping)
    if pairs_cnt == 0:
        raise ValueError("samples_mapping must contain at least one (column_1, column_2, title) entry")

    file_path = Path(file_path)
    tmp_mean_df = get_mean_data(load_data(file_path))

    base_file_name = file_path.stem
    sup_title = base_file_name.replace("_", " ")

    rows_cnt = calculate_row_count(pairs_cnt, FIG_COLUMNS_COUNT)

    if FIG_COLUMNS_COUNT < 2:
        fig_width = FIG_COLUMNS_COUNT*3+1
    else:
        fig_width = FIG_COLUMNS_COUNT*3+3

    fig_height = rows_cnt*3.5

    # squeeze=False keeps `axs` a 2-D array for every grid shape. Without it
    # matplotlib returns a bare Axes (1x1) or a 1-D array (single row or
    # single column), which breaks uniform indexing of the subplots.
    fig, axs = plt.subplots(rows_cnt, FIG_COLUMNS_COUNT, figsize=(fig_width, fig_height),
                            sharey=True, squeeze=False)

    fig.suptitle(sup_title)

    # `axs.flat` walks the grid row by row, so plots fill left-to-right, top-to-bottom.
    for counter, fig_ax in enumerate(axs.flat):
        if counter >= pairs_cnt:
            # Remove the unused cells of a partially filled last row.
            fig.delaxes(fig_ax)
            continue

        mapping = samples_mapping[counter]
        tmp_data_diff = get_datasets_mean_diff(tmp_mean_df, mapping[0], mapping[1])
        tmp_stat_summary = get_data_stat_summary(tmp_data_diff, verbose=verbose)
        make_ba_plot_axis(tmp_data_diff, tmp_stat_summary, fig_ax, title=mapping[2])

    if save_figs and dest_dir:
        dest_dir = Path(dest_dir)
        dest_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(dest_dir.joinpath(base_file_name+"_ba.jpg"), format='jpg', dpi=600, transparent=True)
        fig.savefig(dest_dir.joinpath(base_file_name+"_ba.pdf"), format='pdf', dpi=600, transparent=True)
        plt.close(fig)

In [None]:
# Each entry is a tuple: (column_1, column_2, title)
# diff = column_1 - column_2
# Provide one tuple per pair of datasets to compare; as many pairs as needed.
# Note that the sample names must be present in the dataframe generated
# in `get_mean_data()`
diff_pairs = [
    ("Dataset_1", "Dataset_2", "BA plot (DS_1 - DS_2)"),
]

#### Generate stats and plots. Save plots.

In [None]:
# Run the analysis on the quantile-normalised table saved earlier in this notebook.
# With save_figs=False nothing is written to disk; set it to True to save
# the figure into `dest_dir`.
generate_save_ba_plots(
    SRC_DIR_PATH.joinpath("Dataset-1_vs_Dataset-2_overlap_counts_norm_qn.txt"), 
    diff_pairs, 
    verbose=True,
    save_figs=False,
    dest_dir=DEST_DIR_PATH
)