# TMB Comparison



## Missing patients


P-0055204-T01-IM6

In [None]:
import scanpy as sc
import anndata
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import harmonypy
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.font_manager
from matplotlib import font_manager
from matplotlib.font_manager import fontManager, FontProperties




def setup_dirs(outDir):
    """Create the standard output sub-directories under ``outDir``.

    Ensures ``figures``, ``data`` and ``tables`` exist inside ``outDir``
    (existing directories are left untouched).

    Args:
        outDir: Root output directory.

    Returns:
        A ``(figures_path, data_path, tables_path)`` tuple.
    """
    subdir_paths = tuple(
        os.path.join(outDir, leaf) for leaf in ('figures', 'data', 'tables')
    )
    for path in subdir_paths:
        os.makedirs(path, exist_ok=True)
    return subdir_paths

def force_arial(arial_font_path='/home/salehis/projects/cdm/fonts/arial.ttf'):
    """Register an Arial TTF file with matplotlib's font manager.

    The function only adds the font file to ``font_manager.fontManager``; it
    does not change any rcParams itself.

    Args:
        arial_font_path: Path to the Arial ``.ttf`` file. Defaults to the
            location originally hard-coded in this notebook.

    Raises:
        Whatever ``fontManager.addfont`` raises when the file cannot be read
        (presumably ``FileNotFoundError`` for a missing path -- TODO confirm).
    """
    font_manager.fontManager.addfont(arial_font_path)
    print("Arial font forced")

# set the font
def find_arial_font():
    """Select Arial for seaborn plots, registering a bundled copy if needed.

    Scans the font files reported by ``font_manager.findSystemFonts()`` and
    takes the first whose path contains "arial" (case-insensitive). If one is
    found, its family name is passed to ``sns.set(font=...)``; otherwise
    ``force_arial()`` is called to register the fallback TTF.

    The "found" handling previously sat inside the search loop after the
    ``break`` and so could never run; it is now executed after the search.
    """
    # Substring match on the whole lower-cased path (not just the file name),
    # so any font file whose path mentions "arial" qualifies.
    arial_font = next(
        (path for path in font_manager.findSystemFonts() if "arial" in path.lower()),
        None,
    )

    if arial_font is None:
        print("Arial font not found")
        force_arial()
        return

    print("Found Arial font at: ", arial_font)
    prop = font_manager.FontProperties(fname=arial_font)
    # NOTE(review): sns.set() is called with only `font`; its other arguments
    # take seaborn's defaults, which may also restyle the plots -- confirm
    # this is intended.
    sns.set(font=prop.get_name())


def filter_genes(adata):
    """Return a copy of ``adata`` restricted to genes that pass a name filter.

    A gene (entry of ``adata.var.index``) is dropped when its name:
      * contains "MT-", "." or "-" (case-sensitive),
      * starts with "RP" (case-sensitive),
      * contains "linc" in any letter case,
      * contains any of TRA, TRB, TRG, TRD, IGKV, IGHV, IGLV or HLA in any
        letter case.

    These are plain substring/prefix tests, presumably aimed at mitochondrial,
    ribosomal, lincRNA, T-cell-receptor, immunoglobulin-variable and HLA
    genes. NOTE(review): because they are substring tests they also remove
    any other gene whose name happens to contain one of these patterns --
    confirm that is intended.

    Args:
        adata: Object exposing ``var.index`` and column selection via
            ``adata[:, names]`` (assumed to be an AnnData -- TODO confirm).

    Returns:
        ``adata[:, kept_names].copy()`` with the original gene order preserved.
    """
    upper_blocklist = ("TRA", "TRB", "TRG", "TRD", "IGKV", "IGHV", "IGLV", "HLA")

    def _is_kept(name):
        # Case-sensitive checks first ("MT-" is already covered by "-").
        if "MT-" in name or "." in name or "-" in name:
            return False
        if name.startswith("RP"):
            return False
        if "linc" in name.lower():
            return False
        upper_name = name.upper()
        return not any(tag in upper_name for tag in upper_blocklist)

    kept_names = [name for name in adata.var.index.tolist() if _is_kept(name)]
    return adata[:, kept_names].copy()

# Run the Arial lookup once, when this cell is executed.
find_arial_font()

In [None]:
# Root directory for every output of this notebook; setup_dirs creates the
# figures/data/tables sub-directories beneath it and returns their paths.
outDir = '/data1/shahs3/users/salehis/sclc/results/rebuttal/nat_methods/tmb'
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Point scanpy's figure directory at figuresDir and set its figure parameters
# (dpi_save=300, vector_friendly=True).
sc.settings.figdir = figuresDir
sc.set_figure_params(dpi_save=300, vector_friendly=True)

In [None]:
# Shell command (not Python): copy the files matching *.p* in the tmb/figures
# directory on host `iris` into the local SCLC_MET directory. With --relative,
# rsync recreates the source path that follows the "/./" marker under the
# destination.
rsync -azvp --relative \
    iris:/data1/shahs3/users/salehis/sclc/./results//rebuttal/nat_methods/tmb/figures/*.p* \
    /Users/salehis/Projects/sclc/rebuttal_code/SCLC_MET/

In [None]:
# Shell commands (not Python): send the data up -- copy every local *.tsv file
# in SCLC_MET/data/ to the tmb/tables directory on host `iris`.

cd /Users/salehis/Projects/sclc/rebuttal_code/SCLC_MET/data/
rsync -azvp *.tsv iris:/data1/shahs3/users/salehis/sclc/results/rebuttal/nat_methods/tmb/tables/

In [None]:
# Imports used by this cell, gathered once instead of being repeated mid-cell.
# (levene is imported as before but is not used in this cell.)
import json

from scipy.stats import ttest_ind, mannwhitneyu, shapiro, levene

group_1_path = os.path.join(tablesDir, 'msk_met_2021_clinical_data_group_1.tsv')
group_2_path = os.path.join(tablesDir, 'msk_met_2021_clinical_data.tsv')

# Despite the .tsv extension, these files are read as JSON.
with open(group_1_path, 'r') as f:
    group1 = json.load(f)
with open(group_2_path, 'r') as f:
    group2 = json.load(f)

group1 = pd.DataFrame(group1)
group2 = pd.DataFrame(group2)

# Long-format table: one row per sample with its TMB value and a group label.
g1_df = group1[['sampleId', 'TMB_NONSYNONYMOUS']].copy()
g1_df['group'] = 'group1'
g2_df = group2[['sampleId', 'TMB_NONSYNONYMOUS']].copy()
g2_df['group'] = 'group2'
g = pd.concat([g1_df, g2_df], axis=0)

g['TMB_NONSYNONYMOUS'] = g['TMB_NONSYNONYMOUS'].astype(float)

# Compute the average TMB between the two groups
g_mean = g.groupby('group').agg({'TMB_NONSYNONYMOUS': 'mean'}).reset_index()

# Plain lists of TMB values per group; these are what the tests below consume.
g1 = g[g['group'] == 'group1']['TMB_NONSYNONYMOUS'].tolist()
g2 = g[g['group'] == 'group2']['TMB_NONSYNONYMOUS'].tolist()

# Compute the p-value (these are unpaired samples).
# g1/g2 are lists here, so they are passed directly; indexing them with a
# column name (as the previous version did) raises TypeError.
t, p = ttest_ind(g1, g2)

# Shapiro-Wilk normality check for each group.
_, p1 = shapiro(g1)
_, p2 = shapiro(g2)
print(f"Group 1 Normality p-value: {p1}")
print(f"Group 2 Normality p-value: {p2}")

# Non-parametric comparison; p_value is read later by plot_box.
stat, p_value = mannwhitneyu(g1, g2, alternative='two-sided')
print('Mann-Whitney U test')
print(f'U = {stat}, p = {p_value}')

# Violin plot of TMB.
# A fresh figure is created and closed here; the earlier plt.clf() call was
# dropped because it acted on (or implicitly created) an unrelated figure.
fig, ax = plt.subplots(figsize=(4, 4))
sns.violinplot(data=g, x='group', y='TMB_NONSYNONYMOUS', ax=ax)
ax.set_ylabel('TMB (non-synonymous)')
ax.set_xlabel('')
ax.set_title('TMB (non-synonymous)')
ax.grid(False)
fig.savefig(os.path.join(figuresDir, 'tmb_violinplot.pdf'), bbox_inches='tight')
plt.close(fig)


In [None]:
def plot_box(data, pval=None, out_path=None):
    """Draw a box + swarm plot of TMB per group with a significance bracket.

    Args:
        data: Table with a ``group`` column and a numeric
            ``TMB_NONSYNONYMOUS`` column (assumed to be a pandas DataFrame
            with exactly two groups -- TODO confirm).
        pval: p-value shown above the bracket. Defaults to the notebook-level
            ``p_value`` so the original call ``plot_box(g)`` is unchanged.
        out_path: Where the PDF is written. Defaults to
            ``tmb_comparison_swarm.pdf`` inside the notebook-level
            ``figuresDir``.
    """
    # Fall back to the notebook globals the original implementation read.
    if pval is None:
        pval = p_value
    if out_path is None:
        out_path = os.path.join(figuresDir, "tmb_comparison_swarm.pdf")

    fig, ax = plt.subplots(figsize=(4, 3))
    # `palette` without `hue` is deprecated in seaborn >= 0.13 (the version
    # that introduced `fill`), so colour by the x variable and hide the legend.
    sns.boxplot(
        data=data,
        x="group",
        y="TMB_NONSYNONYMOUS",
        hue="group",
        palette="Set2",
        width=0.3,
        fill=False,
        legend=False,
        ax=ax,
    )
    sns.swarmplot(data=data, x="group", y="TMB_NONSYNONYMOUS", color=".25", alpha=0.8, ax=ax)

    # Add horizontal bar for significance
    x1, x2 = 0, 1  # x-coordinates for Group 1 and Group 2
    y_max = data["TMB_NONSYNONYMOUS"].max()
    y, h, col = (y_max + 0.01, 0.5, "k")
    ax.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, color=col)

    # A three-decimal format would print very small p-values as "p=0.000".
    if pval > 0.05:
        label = "n.s."
    elif pval < 0.001:
        label = "p<0.001"
    else:
        label = f"p={pval:.3f}"
    ax.text((x1 + x2) * 0.5, y + h, label, ha="center", va="bottom", color=col)

    # Customize plot
    ax.set_xlabel("")
    ax.set_ylabel("Tumor Mutation Burden")
    ax.set_title("TMB Comparison Between Groups")
    # remove the grid
    ax.grid(False)
    # Keep 10% headroom as before, but never less than what the fixed-height
    # bracket and its label need when the data maximum is small.
    ax.set_ylim(top=max(y_max + y_max * .1, y + 3 * h))
    fig.tight_layout()
    fig.savefig(out_path)
    plt.show()

# Draw the box/swarm comparison for the combined TMB table `g`.
plot_box(g)