# TMB Comparison

Fetch the tumor mutation burden (TMB) and compare it between the two groups.


In [None]:
import scanpy as sc
import anndata
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import harmonypy
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.font_manager
from matplotlib import font_manager
from matplotlib.font_manager import fontManager, FontProperties
from scipy.stats import ttest_ind, mannwhitneyu, shapiro
from io import StringIO
import json


from common_utils import find_arial_font, setup_dirs

# NOTE(review): helper from common_utils; presumably makes the Arial font
# available to matplotlib for the figures below — confirm in common_utils.
find_arial_font()

In [None]:
# Root output directory for this analysis.
# NOTE(review): OUTDIR_TMB is neither defined nor imported in the visible
# cells — confirm it is provided before this cell runs.
outDir = OUTDIR_TMB
# setup_dirs (from common_utils) returns three values, used below as the
# figures, data and tables directories.
# NOTE(review): whether it also creates them on disk is not visible here.
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Point scanpy's figure output at the figures directory.
sc.settings.figdir = figuresDir
# Save figures at 300 dpi with scanpy's vector-friendly option enabled.
sc.set_figure_params(dpi_save=300, vector_friendly=True)

In [None]:
def _format_p_label(p_value):
    """Return the annotation text for a p-value ("n.s." above 0.05)."""
    if p_value > 0.05:
        return "n.s."
    # Three decimals would render very small p-values as a misleading "p=0.000".
    if p_value < 0.001:
        return "p<0.001"
    return f"p={p_value:.3f}"


def plot_box(
    data, figsize=(4, 3), fname=None, tmb_col=None, titeStr=None, p_value=None
):
    """
    Plot boxplot with swarmplot overlay for TMB data

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe containing TMB data. Must have a "group" column whose
        values are "group1" / "group2" (the keys of the fixed colour map)
        and the column named by ``tmb_col``.
    figsize : tuple, optional
        Figure size, by default (4, 3)
    fname : str, optional
        Filename to save the plot under the module-level ``figuresDir``.
        When None the figure is only shown, not saved.
    tmb_col : str
        Column name for TMB data (required despite the None default)
    titeStr : str, optional
        Title for the plot
    p_value : float, optional
        P-value for the statistical test to be displayed on the plot.
        When None no significance bracket is drawn.

    Returns
    -------
    None. Shows the plot and, if ``fname`` is given, saves it to the
    figures directory.
    """
    colormap = {"group1": "blue", "group2": "red"}
    plt.figure(figsize=figsize)
    # hue mirrors x so the palette is applied per group; passing palette
    # without hue is deprecated in recent seaborn releases.
    sns.boxplot(
        data=data,
        x="group",
        y=tmb_col,
        hue="group",
        palette=colormap,
        width=0.3,
        fill=False,
        legend=False,
    )
    sns.swarmplot(
        data=data,
        x="group",
        y=tmb_col,
        hue="group",
        palette=colormap,
        alpha=0.8,
        legend=False,
    )

    data_max = data[tmb_col].max()
    # Default headroom: 10% above the largest observation.
    top = data_max + data_max * 0.1

    if p_value is not None:
        # Add horizontal bar for significance
        x1, x2 = 0, 1  # x-coordinates for Group 1 and Group 2
        y, h, col = (data_max + 0.01, 0.5, "k")
        plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, color=col)
        plt.text(
            (x1 + x2) * 0.5,
            y + h,
            _format_p_label(p_value),
            ha="center",
            va="bottom",
            color=col,
        )
        # The bracket tops out at y + h and the label sits above it; extend
        # the axis when the 10% headroom would otherwise clip them.
        bottom = plt.gca().get_ylim()[0]
        top = max(top, y + h + 0.12 * (y + h - bottom))

    plt.xlabel("")
    plt.ylabel("Tumor Mutation Burden")
    plt.title(titeStr)
    plt.grid(False)
    plt.ylim(top=top)
    plt.tight_layout()
    if fname is not None:
        plt.savefig(os.path.join(figuresDir, fname))
    plt.show()

In [None]:
# Load the clinical records of both groups, build one long table of
# per-sample TMB values, and test whether TMB differs between the groups.
# NOTE(review): both inputs carry a .tsv extension but are parsed as JSON —
# confirm the on-disk format matches.
group_1_path = os.path.join(tablesDir, "msk_met_2021_clinical_data_group_1.tsv")
group_2_path = os.path.join(tablesDir, "msk_met_2021_clinical_data.tsv")

# Explicit encoding so parsing does not depend on the platform default.
with open(group_1_path, "r", encoding="utf-8") as f:
    group1 = json.load(f)
with open(group_2_path, "r", encoding="utf-8") as f:
    group2 = json.load(f)

group1 = pd.DataFrame(group1)
group2 = pd.DataFrame(group2)

# Keep only the sample id and TMB value, tagging each row with its group.
g1 = group1[["sampleId", "TMB_NONSYNONYMOUS"]].copy()
g1["group"] = "group1"
g2 = group2[["sampleId", "TMB_NONSYNONYMOUS"]].copy()
g2["group"] = "group2"
# ignore_index avoids duplicated row labels in the combined table.
g = pd.concat([g1, g2], axis=0, ignore_index=True)

# Clean up
g["TMB_NONSYNONYMOUS"] = g["TMB_NONSYNONYMOUS"].astype(float)

# Compute the average TMB between the two groups
g_mean = g.groupby("group").agg({"TMB_NONSYNONYMOUS": "mean"}).reset_index()

# Per-group value lists for the tests; missing values are dropped so they
# cannot turn the test statistics into NaN (g itself is left untouched).
g1 = g[g["group"] == "group1"]["TMB_NONSYNONYMOUS"].dropna().tolist()
g2 = g[g["group"] == "group2"]["TMB_NONSYNONYMOUS"].dropna().tolist()

# Compute the p-value (these are unpaired samples)
# The t-test is reported for reference only; the Mann-Whitney p-value below
# is the one stored in p_value.
t, p = ttest_ind(g1, g2)
print(f"t-test: t = {t}, p = {p}")

# Shapiro-Wilk normality check for each group
_, p1 = shapiro(g1)
_, p2 = shapiro(g2)
print(f"Group 1 Normality p-value: {p1}")
print(f"Group 2 Normality p-value: {p2}")

stat, p_value = mannwhitneyu(g1, g2, alternative="two-sided")
print("Mann-Whitney U test")
print(f"U = {stat}, p = {p_value}")

In [None]:
# Draw the TMB box/swarm figure, annotated with the p-value held in p_value
plot_box(
    data=g,
    figsize=(3, 3),
    tmb_col="TMB_NONSYNONYMOUS",
    titeStr="TMB (non-synonymous)",
    p_value=p_value,
    fname="tmb_comparison_swarm_update_feb_11.pdf",
)

In [None]:
# Persist the combined per-sample TMB table as CSV (row index omitted)
g.to_csv(
    path_or_buf=os.path.join(dataDir, "tmb_records_jan09_2025.csv"),
    index=False,
)