# TMB Comparison



## Missing patients


### Group 1

P-0055204-T01-IM6


### Group 2

P-0058988-T02-IM7, NA
P-0060030-T01-IM7, NA
P-0052740-T01-IM6, NA
P-0052740-T02-IM6, NA
P-0060611-T01-IM7, NA
P-0058192-T01-IM6, NA
P-0058192-T02-IM7, NA
P-0025975-T01-IM6, 5.1881885699, https://www.cbioportal.org/patient/clinicalData?studyId=luad_mskimpact_2021&caseId=P-0025975
P-0023592-T01-IM6, 15.56456571, https://www.cbioportal.org/patient/clinicalData?studyId=lung_pdx_msk_2021&sampleId=P-0023592-T01-IM6
P-0048114-T01-IM6, 12.9704714248, https://www.cbioportal.org/patient/clinicalData?studyId=nsclc_ctdx_msk_2022&sampleId=P-0048114-T01-IM6
P-0026393-T02-IM6, 12.10577333, https://www.cbioportal.org/patient/clinicalData?studyId=lung_pdx_msk_2021&sampleId=P-0026393-T02-IM6


In [None]:
import scanpy as sc
import anndata
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import harmonypy
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.font_manager
from matplotlib import font_manager
from matplotlib.font_manager import fontManager, FontProperties




def setup_dirs(outDir):
    """Create the ``figures``, ``data`` and ``tables`` folders under ``outDir``.

    Folders that already exist are left as they are.

    Args:
        outDir: Root output directory (created implicitly if missing).

    Returns:
        Tuple ``(figuresDir, dataDir, tablesDir)`` with the three paths.
    """
    sub_dirs = tuple(
        os.path.join(outDir, leaf) for leaf in ('figures', 'data', 'tables')
    )
    for sub_dir in sub_dirs:
        os.makedirs(sub_dir, exist_ok=True)
    return sub_dirs

def force_arial(arial_font_path='/home/salehis/projects/cdm/fonts/arial.ttf'):
    """Register an Arial TTF file with matplotlib and make it the default font.

    Args:
        arial_font_path: Path to the Arial ``.ttf`` file. Defaults to the
            project copy this function previously hard-coded.
    """
    font_manager.fontManager.addfont(arial_font_path)
    prop = font_manager.FontProperties(fname=arial_font_path)
    # Registering the file only makes the font available; select it as the
    # default family so it is actually used (mirrors what find_arial_font
    # does through seaborn when a system Arial exists).
    mpl.rcParams['font.family'] = prop.get_name()
    print("Arial font forced")

# set the font
def find_arial_font():
    """Use a system Arial font for seaborn plots, else fall back to force_arial.

    Scans matplotlib's system font paths and takes the first one whose path
    contains "arial" (case-insensitive, so directory names match too). If one
    is found it is reported and passed to ``sns.set(font=...)``; otherwise
    ``force_arial()`` is called.
    """
    arial_font = None
    for font in font_manager.findSystemFonts():
        if "arial" in font.lower():
            arial_font = font
            break

    # This branch used to sit inside the loop after the `break`, where it
    # could only run while arial_font was still None, i.e. never.
    if arial_font is None:
        print("Arial font not found")
        force_arial()
        return

    print("Found Arial font at: ", arial_font)
    prop = font_manager.FontProperties(fname=arial_font)
    sns.set(font=prop.get_name())


def filter_genes(adata):
    """Return a copy of ``adata`` restricted to genes that pass the name filter.

    A gene name from ``adata.var.index`` is dropped when any of these hold:

    * it contains ``"MT-"`` or ``"."``, or starts with ``"RP"`` (case-sensitive);
    * it contains ``"linc"`` in any letter case;
    * its upper-cased form contains ``TRA``, ``TRB``, ``TRG``, ``TRD``,
      ``IGKV``, ``IGHV``, ``IGLV`` or ``HLA``;
    * it contains a hyphen.

    These are substring matches, so unrelated names that merely contain one
    of the tags (and every hyphenated name) are removed as well.
    """
    excluded_tags = ("TRA", "TRB", "TRG", "TRD", "IGKV", "IGHV", "IGLV", "HLA")

    def _is_kept(name):
        if "MT-" in name or "." in name or name.startswith("RP"):
            return False
        if "linc" in name.lower():
            return False
        upper_name = name.upper()
        if "-" in upper_name:
            return False
        return not any(tag in upper_name for tag in excluded_tags)

    kept = [name for name in adata.var.index.tolist() if _is_kept(name)]
    return adata[:, kept].copy()

# Configure the plotting font once for the notebook; find_arial_font() itself
# calls force_arial() when no system Arial is found.
find_arial_font()

In [None]:
# Root output directory for the TMB comparison; setup_dirs() creates the
# figures/, data/ and tables/ sub-directories beneath it.
outDir = '/data1/shahs3/users/salehis/sclc/results/rebuttal/nat_methods/tmb'
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Point scanpy's figure output at the same figures directory.
sc.settings.figdir = figuresDir
sc.set_figure_params(dpi_save=300, vector_friendly=True)

In [None]:
# Shell command (not Python): pull the figure files matching *.p* from the
# iris host into the local SCLC_MET project directory. With --relative, rsync
# recreates the source path from the "/./" marker onward under the destination.
rsync -azvp --relative \
    iris:/data1/shahs3/users/salehis/sclc/./results//rebuttal/nat_methods/tmb/figures/*.p* \
    /Users/salehis/Projects/sclc/rebuttal_code/SCLC_MET/

In [None]:
# Send the data up
# Shell commands (not Python): copy the local *.tsv files into the tables
# directory on the iris host.

cd /Users/salehis/Projects/sclc/rebuttal_code/SCLC_MET/data/
rsync -azvp *.tsv iris:/data1/shahs3/users/salehis/sclc/results/rebuttal/nat_methods/tmb/tables/

In [None]:
# Compare non-synonymous TMB between the two sample groups: load the clinical
# tables, append the manually curated group-2 samples, run the statistical
# tests and save a violin plot.
import json

from scipy.stats import ttest_ind, mannwhitneyu, shapiro, levene

group_1_path = os.path.join(tablesDir, 'msk_met_2021_clinical_data_group_1.tsv')
group_2_path = os.path.join(tablesDir, 'msk_met_2021_clinical_data.tsv')

# These are json files (despite the .tsv extension)
with open(group_1_path, 'r') as f:
    group1 = json.load(f)
with open(group_2_path, 'r') as f:
    group2 = json.load(f)

group1 = pd.DataFrame(group1)
group2 = pd.DataFrame(group2)

g1 = group1[['sampleId', 'TMB_NONSYNONYMOUS']].copy()
g1['group'] = 'group1'
g2 = group2[['sampleId', 'TMB_NONSYNONYMOUS']].copy()
g2['group'] = 'group2'
# ignore_index gives the combined table unique row labels instead of
# repeating each input frame's 0..n-1 index.
g = pd.concat([g1, g2], axis=0, ignore_index=True)

# Add the following to group 2 (Manually curated from cBioPortal)
manual_group2 = """
P-0025975-T01-IM6, 5.1881885699, https://www.cbioportal.org/patient/clinicalData?studyId=luad_mskimpact_2021&caseId=P-0025975
P-0023592-T01-IM6, 15.56456571, https://www.cbioportal.org/patient/clinicalData?studyId=lung_pdx_msk_2021&sampleId=P-0023592-T01-IM6
P-0048114-T01-IM6, 12.9704714248, https://www.cbioportal.org/patient/clinicalData?studyId=nsclc_ctdx_msk_2022&sampleId=P-0048114-T01-IM6
P-0026393-T02-IM6, 12.10577333, https://www.cbioportal.org/patient/clinicalData?studyId=lung_pdx_msk_2021&sampleId=P-0026393-T02-IM6
"""
# Each row is "sampleId, TMB, link"; strip the space that follows each comma.
manual_group2 = [
    [field.strip() for field in row.split(',')]
    for row in manual_group2.strip().split('\n')
]
manual_group2 = pd.DataFrame(manual_group2, columns=['sampleId', 'TMB_NONSYNONYMOUS', 'link'])
manual_group2['group'] = 'group2'
manual_group2['TMB_NONSYNONYMOUS'] = manual_group2['TMB_NONSYNONYMOUS'].astype(float)
g = pd.concat(
    [g, manual_group2[['sampleId', 'TMB_NONSYNONYMOUS', 'group']]],
    axis=0,
    ignore_index=True,
)

# Values loaded from the JSON tables may not be numeric yet; cast the column.
g['TMB_NONSYNONYMOUS'] = g['TMB_NONSYNONYMOUS'].astype(float)

# Compute the average TMB between the two groups
g_mean = g.groupby('group').agg({'TMB_NONSYNONYMOUS': 'mean'}).reset_index()

# From here on g1 / g2 are plain lists of TMB values (no longer DataFrames).
g1 = g[g['group'] == 'group1']['TMB_NONSYNONYMOUS'].tolist()
g2 = g[g['group'] == 'group2']['TMB_NONSYNONYMOUS'].tolist()

# Compute the p-value (these are unpaired samples)
t, p = ttest_ind(g1, g2)

# Normality check for each group
_, p1 = shapiro(g1)
_, p2 = shapiro(g2)
print(f"Group 1 Normality p-value: {p1}")
print(f"Group 2 Normality p-value: {p2}")

stat, p_value = mannwhitneyu(g1, g2, alternative='two-sided')
print('Mann-Whitney U test')
print(f'U = {stat}, p = {p_value}')

# Violin plot of TMB
# (plt.subplots already creates a fresh figure; a preceding plt.clf() would
# only open an extra empty figure that is never closed.)
fig, ax = plt.subplots(figsize=(4, 4))
sns.violinplot(data=g, x='group', y='TMB_NONSYNONYMOUS', ax=ax)
plt.ylabel('TMB (non-synonymous)')
plt.xlabel('')
# NOTE(review): the title reports the t-test p-value (`p`), while the box plot
# produced later is annotated with the Mann-Whitney `p_value` - confirm which
# one is intended here.
plt.title('TMB (non-synonymous) (p-value: {:.2e})'.format(p))
ax.grid(False)
plt.savefig(os.path.join(figuresDir, 'tmb_violinplot.pdf'), bbox_inches='tight')
plt.close(fig)


In [None]:
def plot_box(data, figsize=(4, 3), fname=None, tmb_col=None, titeStr=None, p_value=None):
    """Draw a box plot with overlaid swarm of ``tmb_col`` per ``group``.

    A significance bracket spanning the first two groups is drawn above the
    data and labelled from ``p_value``. The figure is saved under the
    module-level ``figuresDir`` and then shown.

    Args:
        data: DataFrame with a ``"group"`` column and the ``tmb_col`` column.
        figsize: Figure size passed to ``plt.figure``.
        fname: File name to save under ``figuresDir``; when ``None`` the
            figure is shown but not saved.
        tmb_col: Name of the numeric column to plot (required).
        titeStr: Plot title.
        p_value: p-value for the bracket label; when ``None`` no bracket is
            drawn.
    """
    plt.figure(figsize=figsize)
    # seaborn 0.13 deprecates `palette` without `hue`; assigning the x variable
    # to hue with legend=False is the documented replacement.
    sns.boxplot(
        data=data,
        x="group",
        y=tmb_col,
        hue="group",
        palette="Set2",
        width=0.3,
        fill=False,
        legend=False,
    )
    sns.swarmplot(data=data, x="group", y=tmb_col, color=".25", alpha=0.8)

    data_max = data[tmb_col].max()
    y_top = data_max + data_max * .1

    if p_value is not None:
        # Add horizontal bar for significance
        x1, x2 = 0, 1  # x-coordinates for Group 1 and Group 2
        y, h, col = (data_max + 0.01, 0.5, "k")
        plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, color=col)
        if p_value > 0.05:
            label = "n.s."
        elif p_value < 0.001:
            # Three decimals would print "p=0.000"; use scientific notation.
            label = f"p={p_value:.1e}"
        else:
            label = f"p={p_value:.3f}"
        plt.text(
            (x1 + x2) * 0.5,
            y + h,
            label,
            ha="center",
            va="bottom",
            color=col,
        )
        # The bracket height is a fixed 0.5 data units, so for small values
        # the 10% headroom alone would clip it; leave room for bracket + label.
        y_top = max(y_top, y + 3 * h)

    # Customize plot
    plt.xlabel("")
    plt.ylabel("Tumor Mutation Burden")
    plt.title(titeStr)
    # remove the grid
    plt.grid(False)
    plt.ylim(top=y_top)
    plt.tight_layout()
    if fname is not None:
        plt.savefig(os.path.join(figuresDir, fname))
    plt.show()

# Box/swarm plot of the combined table `g`, annotated with the current
# `p_value` (set by the Mann-Whitney test in the cell above).
plot_box(g, fname="tmb_comparison_swarm.pdf", tmb_col='TMB_NONSYNONYMOUS', titeStr='TMB (non-synonymous)', p_value=p_value)

## Comparison with the cBioPortal GitHub data

In [None]:
# Tab-separated clinical sample table; the cells below look samples up by its
# "#Sample Identifier" column and read "Impact TMB Score".
large_path = '/data1/shahs3/users/leej39/dataset/msk_solid_heme/data_clinical_sample.txt'
large = pd.read_csv(large_path, sep='\t')


In [None]:

# Sample ids for group 1, one per line.
g1_ids = """
P-0000113-T01-IM3
P-0045188-T01-IM6
P-0046012-T01-IM6
P-0055204-T01-IM6
P-0036308-T01-IM6
P-0028176-T01-IM6
"""

# Sample ids for group 2, one per line.
g2_ids = """
P-0020524-T01-IM6
P-0050527-T01-IM6
P-0018985-T01-IM6
P-0027732-T01-IM6
P-0058988-T02-IM7
P-0009560-T01-IM5
P-0060030-T01-IM7
P-0007427-T01-IM5
P-0011456-T01-IM5
P-0052740-T01-IM6
P-0052740-T02-IM6
P-0034259-T01-IM6
P-0011404-T01-IM5
P-0009458-T01-IM5
P-0048114-T01-IM6
P-0007578-T01-IM5
P-0011181-T01-IM5
P-0005122-T01-IM5
P-0026393-T01-IM6
P-0026393-T02-IM6
P-0024245-T01-IM6
P-0023819-T01-IM6
P-0024602-T01-IM6
P-0010351-T01-IM5
P-0011034-T01-IM5
P-0016476-T01-IM6
P-0037667-T01-IM6
P-0010066-T01-IM5
P-0004453-T01-IM5
P-0018330-T01-IM6
P-0021206-T01-IM6
P-0023296-T01-IM6
P-0008000-T01-IM5
P-0006785-T01-IM5
P-0060611-T01-IM7
P-0000387-T01-IM3
P-0004617-T01-IM5
P-0009613-T01-IM5
P-0058192-T01-IM6
P-0058192-T02-IM7
P-0025975-T01-IM6
P-0025975-T02-IM6
P-0032648-T01-IM6
P-0005805-T01-IM5
P-0028087-T01-IM6
P-0039729-T01-IM6
P-0034597-T01-IM6
P-0023592-T01-IM6
P-0036630-T01-IM6
P-0019009-T01-IM6
P-0028616-T01-IM6
P-0017529-T01-IM5
P-0006463-T01-IM5
P-0021685-T01-IM6
P-0002050-T01-IM3
P-0041471-T01-IM6
"""


# Turn each multi-line string into a list of ids (one element per line).
g1_ids = g1_ids.strip().split('\n')
g2_ids = g2_ids.strip().split('\n')

# Notebook display of the two group sizes.
len(g1_ids), len(g2_ids)

In [None]:
# Compare "Impact TMB Score" between the two id lists using the large clinical
# sample table: subset, validate, plot and run a Mann-Whitney U test.
from scipy.stats import mannwhitneyu  # imported here so the cell runs on its own

requested_ids = g1_ids + g2_ids
sub = large[large["#Sample Identifier"].isin(requested_ids)].copy()

# Assert that the individual sample_ids are there. This runs before the row
# count check so that a failure names the missing ids, and the membership set
# is built once rather than rebuilding a list for every sample.
present_ids = set(sub["#Sample Identifier"])
missing_ids = [sample_id for sample_id in requested_ids if sample_id not in present_ids]
assert not missing_ids, f"{missing_ids} is missing"
assert sub.shape[0] == len(requested_ids), (
    "Row count does not match the number of requested samples "
    "(duplicate sample ids in the table or in the id lists?)"
)

# List the TMB-related columns available in the table
for ii in sub.columns:
    if "TMB" in ii.upper():
        print(ii)

# Extract impact TMB Score
sub = sub[["#Sample Identifier", "Impact TMB Score"]].copy()
# Add group
sub["group"] = "group1"
sub.loc[sub["#Sample Identifier"].isin(g2_ids), "group"] = "group2"

sub["Impact TMB Score"] = sub["Impact TMB Score"].astype(float)

# Violin plots
# (plt.subplots already creates a fresh figure; a preceding plt.clf() would
# only open an extra empty figure that is never closed.)
fig, ax = plt.subplots(figsize=(4, 4))
sns.violinplot(data=sub, x="group", y="Impact TMB Score", ax=ax, inner="point")
plt.ylabel("Impact TMB Score")
plt.xlabel("")
plt.title("Impact TMB Score")
ax.grid(False)
plt.savefig(os.path.join(figuresDir, "impact_tmb_violinplot.pdf"), bbox_inches="tight")
plt.close(fig)

# Now Compute the p-value using Mann-Whitney U test
g1 = sub[sub["group"] == "group1"]["Impact TMB Score"].tolist()
g2 = sub[sub["group"] == "group2"]["Impact TMB Score"].tolist()
stat, p_value = mannwhitneyu(g1, g2, alternative="two-sided")
print("Mann-Whitney U test")
print(f"U = {stat}, p = {p_value}")

# plot the box plot
plot_box(
    sub,
    fname="tmb_comparison_swarm_full.pdf",
    tmb_col="Impact TMB Score",
    titeStr="TMB",
    p_value=p_value,
)