In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker
import sigProfilerPlotting as sigPlt
from SigProfilerAssignment import Analyzer as Analyze

# One-time setup, disabled by wrapping it in a bare string literal: the string
# is evaluated and discarded, so the two statements inside it never run.
# NOTE(review): presumably this installs the GRCh38 reference used by
# SigProfilerMatrixGenerator and only needs to run once per environment —
# confirm before re-enabling.
"""
from SigProfilerMatrixGenerator import install as genInstall
genInstall.install('GRCh38')
"""

# Global plotting defaults: seaborn "ticks" style, Arial font, 1.1x font scale.
sns.set_theme(font="Arial", font_scale=1.1, style='ticks')
# Explicitly switch on the top and right axes spines.
plt.rc("axes.spines", top=True, right=True)

def reverse_complement(string):
    """Return the reverse complement of a DNA sequence.

    Args:
        string: Sequence made up of the upper-case bases A, T, G and C.

    Returns:
        The complemented sequence read in the opposite direction,
        e.g. ``"AAC"`` -> ``"GTT"``.

    Raises:
        ValueError: If the sequence contains any character other than
            A, T, G or C (lower-case bases included).
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    complemented = []
    for base in string:
        if base not in pairing:
            raise ValueError("Invalid character other than A,T,G and C")
        complemented.append(pairing[base])

    # Complement first, then flip the read direction.
    complemented.reverse()
    return ''.join(complemented)

def trinuc_context_change(string):
    """Express a trinucleotide substitution with a pyrimidine (C/T) centre.

    Example: ``AAC>ACC`` has a purine centre, so both sides are
    reverse-complemented and ``GTT>GGT`` is returned. A substitution whose
    reference centre is already C or T is returned unchanged.

    Args:
        string: Substitution written as ``'NNN>NNN'`` (7 characters with
            ``'>'`` in the middle).

    Returns:
        The substitution in pyrimidine-centred notation.

    Raises:
        ValueError: If ``string`` is not in the ``'NNN>NNN'`` layout, or if a
            side that has to be reverse-complemented contains a character
            other than A, T, G or C.
    """
    well_formed = len(string) == 7 and string[3] == '>'
    if not well_formed:
        raise ValueError("Input string must be in the format 'NNN>NNN'")

    # The centre base of the reference trinucleotide decides the strand.
    if string[1] in ('C', 'T'):
        return string

    sides = string.split('>')
    reference, mutated = sides[0], sides[1]
    return reverse_complement(reference) + '>' + reverse_complement(mutated)

# The six pyrimidine-centred substitution classes.
# TODO: also write a .SBS6.all matrix.
sbs6 = ("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")

# The 96 SBS channels in SigProfiler notation ("A[C>A]A", ...), ordered by
# 5' base, then substitution class, then 3' base.
sbs96_sigprofiler = [
    f"{five_prime}[{substitution}]{three_prime}"
    for five_prime in "ACGT"
    for substitution in sbs6
    for three_prime in "ACGT"
]

# Lookup table from "NNN>NNN" notation to the SigProfiler label,
# e.g. "ACA>AAA" -> "A[C>A]A".
sbs96 = {
    f"{label[0]}{label[2]}{label[-1]}>{label[0]}{label[-3]}{label[-1]}": label
    for label in sbs96_sigprofiler
}

RuntimeError: CPU dispatcher tracer already initlized

# 1. Single Base Substitution (SBS96 Signature)

#### Before running the cells below, run `_01.sbs96_extraction_mutyper.sh` to generate the `.SBS96` files

In [None]:
# Root directory holding the per-sample `.SBS96` spectra and receiving all outputs.
# NOTE: the name shadows the `dir` builtin; it is kept because later cells read it.
dir = "/mmfs1/gscratch/stergachislab/mhsohny/SMaHT/Improving_SomaticVariantCalling_through_DSA/Fiber-seq/VariantCalls_DeepVariant_1.6.1/Mutational_Spectrum/01.SBS"

# Sample name -> sub-directory / file tag of its passage-specific matrix.
passage_prefix = {
    "COLO829T_PassageB_DSA": "B-Specific",
    "COLO829T_PassageA_DSA": "A-Specific",
}


def _sbs96_matrix(spectrum_path, sample):
    """Build a complete, ordered SBS96 count matrix from a spectrum file.

    The spectrum file is read as a headerless, tab-separated table with two
    rows (mutation types such as ``AAC>ACC`` and their counts) and is
    transposed so that every mutation type becomes one row.

    Args:
        spectrum_path: Path of the `.SBS96` spectrum file.
        sample: Sample name, used as the header of the count column.

    Returns:
        DataFrame with the columns ``MutationType`` and ``<sample>``. It
        always has exactly one row per channel of ``sbs96_sigprofiler``, in
        that order; channels missing from the spectrum get a count of 0.

    Raises:
        ValueError: If a mutation type is not in ``'NNN>NNN'`` format or a
            count is not numeric.
    """
    spectrum = pd.read_csv(spectrum_path, sep="\t", header=None).T
    spectrum.columns = ['SBS96_pre', 'Count']

    # Pyrimidine-centred notation first, then the SigProfiler label.
    # Labels that are not one of the 96 channels map to NaN.
    channels = spectrum['SBS96_pre'].apply(trinuc_context_change).map(sbs96)
    # With header=None the counts are read as strings.
    counts = pd.to_numeric(spectrum['Count'])

    unmapped = channels.isna()
    if unmapped.any():
        # Keeping these would write rows without a MutationType label.
        print(f"WARNING: {spectrum_path}: dropping mutation types that are not SBS96 channels: "
              f"{sorted(spectrum.loc[unmapped, 'SBS96_pre'])}")

    # Sum per channel (both strands collapse onto one label), then force the
    # exact 96-row layout; a matrix with missing channels is not a valid
    # 96-context input for the plotting and assignment tools.
    return (
        counts[~unmapped]
        .groupby(channels[~unmapped])
        .sum()
        .reindex(sbs96_sigprofiler, fill_value=0)
        .rename_axis('MutationType')
        .rename(sample)
        .reset_index()
    )


for sample in ["COLO829T_PassageB_DSA", "COLO829T_PassageA_DSA"]:
    prefix = passage_prefix[sample]

    # Sub-directory -> (input spectrum, output matrix file name).
    # FIXME(review): all three subsets are built from the same
    # `{sample}.SBS96` spectrum, exactly as before, so the "Shared" and
    # passage-specific matrices are copies of "All". Point the first element
    # at subset-specific spectra if those exist.
    subsets = {
        "All": (f"{dir}/{sample}.SBS96", f"{sample}.SBS96.all"),
        "Shared": (f"{dir}/{sample}.SBS96", f"{sample}_Shared.SBS96.all"),
        prefix: (f"{dir}/{sample}.SBS96", f"{sample}_{prefix}.SBS96.all"),
    }

    for subset, (spectrum_path, matrix_name) in subsets.items():
        out_dir = f"{dir}/{sample}/{subset}"
        # Creates missing parent directories too and raises on failure.
        os.makedirs(out_dir, exist_ok=True)

        df = _sbs96_matrix(spectrum_path, sample)
        df.to_csv(f"{out_dir}/{matrix_name}", sep='\t', index=False)

In [None]:
# Options shared by every COSMIC signature assignment run below.
cosmic_fit_options = dict(
    input_type="matrix",
    context_type="96",
    collapse_to_SBS96=True,
    cosmic_version=3.4,
    exome=False,
    genome_build="GRCh38",
    signature_database=None,
    exclude_signature_subgroups=None,
    export_probabilities=True,
    export_probabilities_per_mutation=False,
    make_plots=True,
    sample_reconstruction_plots="pdf",
    verbose=False,
)

for sample in ["COLO829T_PassageB_DSA", "COLO829T_PassageA_DSA"]:
    # Tag of the passage-specific matrix for this sample.
    if sample == "COLO829T_PassageB_DSA":
        prefix = "B-Specific"
    elif sample == "COLO829T_PassageA_DSA":
        prefix = "A-Specific"

    # (sub-directory, matrix file name), processed in the order
    # All -> Shared -> Passage-Specific.
    matrices = [
        ("All", f"{sample}.SBS96.all"),
        ("Shared", f"{sample}_Shared.SBS96.all"),
        (prefix, f"{sample}_{prefix}.SBS96.all"),
    ]

    for subset, matrix_name in matrices:
        out_dir = f"{dir}/{sample}/{subset}"
        matrix_file = f"{out_dir}/{matrix_name}"

        # One plot with raw counts, then one with percentages.
        for project, as_percentage in ((f"{sample}", False), (f"{sample}.percentage", True)):
            sigPlt.plotSBS(
                matrix_path=matrix_file,
                output_path=out_dir,
                project=project,
                plot_type="96",
                savefig_format="pdf",
                percentage=as_percentage)

        # Assign COSMIC signatures; results are written next to the matrix.
        Analyze.cosmic_fit(matrix_file, out_dir, **cosmic_fit_options)

There may be an issue with the formatting of your matrix file.
There may be an issue with the formatting of your matrix file.
Assigning COSMIC sigs or Signature Database ...... 


SystemExit: Signatures Database and Samples are of same context type and is not equal to 96. please rerun by setting the flag "collapse_to_SBS96 = False "

# 2. Double Base Substitution (DBS)