In [1]:
### This notebook analyzes candidate mutation positions and checks whether they lie in a splicing region
import pandas as pd
import os,pickle,re
from tqdm import tqdm

In [8]:
### Locate the synonymous candidate list and the MAF directory
dir_out_fig4 = './figure4/'
feature_type = 'histology'
syn_nsyn = 'syn'
run = 'cohort_090821'

# Candidate table: the first CSV column is consumed as the index on read,
# then the frame is re-indexed by its 'gene' column.
df_syn = pd.read_csv(
    os.path.join(dir_out_fig4, f'{feature_type}.syn.df_all_forheatmap.{run}.csv'),
    index_col=0,
).set_index('gene')

# Per-cohort MAF files are read from <dir_maf>/<feature_type>/<cohort>.csv
dir_maf = '../maf_out/maf_cohorts_060121'
feature_type = 'histology'

In [9]:
# Read the transcript annotation dictionaries.
#   dict_transcript_info: transcript id -> record with 'chr', 'CDS', 'exon' and 'strand' entries
#   dict_name:            transcript id -> gene name
# NOTE: unpickling can execute arbitrary code; only load these files from a trusted location.
# The context managers make sure the file handles are closed after loading.
with open('../anno_ref/proc_refs/dict_transcript_info_062121.pkl', 'rb') as handle:
    dict_transcript_info = pickle.load(handle)
with open(os.path.join('../anno_ref/proc_refs', 'dict_name_forcov_062121.pkl'), 'rb') as handle:
    dict_name = pickle.load(handle)
# dict_record = pickle.load(open('../anno_ref/proc_refs/dict_record_new.pkl','rb'))

In [10]:
# Find the transcript for each candidate gene.
# Build the reverse lookup (gene name -> transcript id) in a single pass over dict_name
# instead of rescanning every transcript for every candidate gene. When several
# transcripts carry the same gene name, the one that comes last in dict_name wins,
# which is the same result the nested scan produced.
# Assumes gene names in dict_name are hashable (e.g. strings) -- TODO confirm.
transcript_by_gene = {gene_name: transcript_id for transcript_id, gene_name in dict_name.items()}

# Candidate gene name -> transcript id, in candidate-table order; genes without
# a matching transcript are left out.
dict_name_id = {
    candidate: transcript_by_gene[candidate]
    for candidate in df_syn.index.tolist()
    if candidate in transcript_by_gene
}

In [11]:
### Get the synonymous mutations of the candidate gene in its cohort
def get_syn_mut(gene):
    """Return the 'Silent' MAF rows of ``gene`` together with its cohort label.

    Reads the module-level ``df_syn`` (candidate table indexed by gene),
    ``dir_maf`` and ``feature_type``.

    Parameters
    ----------
    gene
        Gene symbol used both as the row label in ``df_syn`` and as the
        ``Hugo_Symbol`` value to select in the MAF.

    Returns
    -------
    df_gene_mut : pandas.DataFrame
        MAF rows whose ``Hugo_Symbol`` equals ``gene`` and whose
        ``Variant_Classification`` equals ``'Silent'``; the frame is indexed by
        (Hugo_Symbol, Variant_Classification, Donor_ID).
    histology
        The ``'feature'`` entry of ``df_syn`` for ``gene``; it also names the
        MAF file that is read.
    """
    # The cohort label doubles as the file stem of the tab-separated MAF.
    histology = df_syn.loc[gene, 'feature']
    maf_path = os.path.join(dir_maf, feature_type, histology + '.csv')
    df_maf = pd.read_csv(maf_path, sep='\t').set_index(
        ['Hugo_Symbol', 'Variant_Classification', 'Donor_ID']
    )

    # All donors' synonymous mutations of this gene within the cohort
    df_gene_mut = df_maf.loc[pd.IndexSlice[gene, 'Silent', :], :]
    return df_gene_mut, histology

In [34]:
# Determine whether the mutation position is close to either edge of an exon
def find_splicing_pos(mut_pos, cds, gene_strand):
    """Flag a position lying strictly inside an exon, fewer than 20 bp from an edge.

    Parameters
    ----------
    mut_pos
        Position of the mutation.
    cds
        Indexable pair; ``cds[0]`` is treated as the low edge and ``cds[1]``
        as the high edge of the exon.
    gene_strand
        ``'-'`` swaps which edge is reported as 5 prime; any other value is
        handled as the forward strand.

    Returns
    -------
    (flag, effect)
        ``(1, <description>)`` on a hit, ``(0, None)`` otherwise. Both edge
        comparisons are exclusive, so a position equal to an edge is no hit.
        The 5 prime edge is tested first and wins if both windows match.
    """
    five_prime = 'within 20bp of 5 prime splicing site'
    three_prime = 'within 20bp of 3 prime splicing site'

    low_edge, high_edge = cds[0], cds[1]
    near_low = low_edge < mut_pos < low_edge + 20
    near_high = high_edge - 20 < mut_pos < high_edge

    # On the minus strand the high edge is reported as the 5 prime side.
    if gene_strand == '-':
        checks = ((near_high, five_prime), (near_low, three_prime))
    else:
        checks = ((near_low, five_prime), (near_high, three_prime))

    for is_hit, label in checks:
        if is_hit:
            return 1, label
    return 0, None

In [35]:
# Determine whether the mutation position is close to the start or end of the first CDS exon
def find_initiation_pos(mut_pos, cds, gene_strand):
    """Flag a position near the low edge (10 bp) or high edge (20 bp) of an exon.

    Parameters
    ----------
    mut_pos
        Position of the mutation.
    cds
        Indexable pair; ``cds[0]`` is the low edge and ``cds[1]`` the high edge.
    gene_strand
        Accepted for signature parity with ``find_splicing_pos`` but not used:
        the same edges are tested for every strand.
        NOTE(review): for '-' strand genes the initiation site may sit at the
        ``cds[1]`` edge instead -- confirm against how the CDS list is ordered.

    Returns
    -------
    (flag, effect)
        ``(1, <description>)`` on a hit, ``(0, None)`` otherwise. Comparisons
        are exclusive on both sides; the initiation window is checked first.
    """
    low_edge, high_edge = cds[0], cds[1]

    if low_edge < mut_pos < low_edge + 10:
        return 1, 'within 10bp of initiation site'
    if high_edge - 20 < mut_pos < high_edge:
        return 1, 'within 20bp of 3 prime splicing site'
    return 0, None

In [36]:
def make_gene_df(gene, histology, mutation, exon, effect):
    """Build a one-row DataFrame describing a single hit and echo it to stdout.

    The columns are, in order: 'Gene', 'Histology', 'Mutation', 'Exon', 'Effect'.
    The underlying dict (each value wrapped in a one-element list) is printed
    before the frame is returned.
    """
    column_names = ('Gene', 'Histology', 'Mutation', 'Exon', 'Effect')
    cell_values = (gene, histology, mutation, exon, effect)
    record = {column: [value] for column, value in zip(column_names, cell_values)}
    print(record)
    return pd.DataFrame(record)

In [47]:
# Scan the synonymous mutations of every candidate gene and record those that
# fall close to a CDS-exon edge (or to the start of the first CDS exon).
splice_columns = ['Gene', 'Histology', 'Mutation', 'Exon', 'Effect']
splice_frames = []  # one single-row frame per hit; concatenated once after the loop

for genes in dict_name_id:
    print(f'searching for gene {genes}')
    transcript = dict_name_id[genes]
    transcript_info = dict_transcript_info[transcript]
    chromosome = transcript_info['chr']
    CDS = transcript_info['CDS']
    len_cds = len(CDS); len_exon = len(transcript_info['exon'])
    # If the transcript has more exons than CDS segments (extra UTR exons), shift
    # the reported exon number by that difference; the +1 makes numbering 1-based.
    # NOTE(review): this treats every extra exon as preceding CDS[0] -- confirm.
    n_plusidx = len_exon - len_cds + 1
    strand = transcript_info['strand']
    df_synmut, his = get_syn_mut(genes)

    # NOTE(review): unique() is applied before the chromosome prefix is dropped, so
    # two Genome_Change strings that differ only before ':' yield repeated rows.
    for mutations in df_synmut['Genome_Change'].unique().tolist():
        change = mutations.split(':')[1]
        mutation = chromosome + ':g.' + change
        # Parse the coordinate from the part after ':' only. Taking the second
        # number of the full string breaks for chromosome names without digits
        # (e.g. chrX), where the coordinate is the first number.
        position_match = re.search(r'\d+', change)
        if position_match is None:
            raise ValueError(f'cannot parse a genomic position from {mutations!r}')
        muts = int(position_match.group())

        for idx, exons in enumerate(CDS):
            # The first CDS segment is checked for initiation-site proximity,
            # all others for splice-site proximity.
            locate = find_initiation_pos if idx == 0 else find_splicing_pos
            res, eff = locate(muts, exons, strand)
            if res == 1:
                exn = idx + n_plusidx
                df_gene_splice = make_gene_df(genes, his, mutation, exn, eff)
                splice_frames.append(df_gene_splice)

# Concatenate once (linear instead of quadratic, and no concat against an empty
# frame); ignore_index gives the rows a unique 0..n-1 index instead of all zeros.
if splice_frames:
    df_splice = pd.concat(splice_frames, axis=0, ignore_index=True)
else:
    df_splice = pd.DataFrame(columns=splice_columns)

searching for gene BCL2


  if (await self.run_code(code, result,  async_=asy)):


searching for gene SRSF2
{'Gene': ['SRSF2'], 'Histology': ['Lymph-BNHL'], 'Mutation': ['chr17:g.74733234G>A'], 'Exon': [2], 'Effect': ['within 20bp of 3 prime splicing site']}
searching for gene ITLN1


  if (await self.run_code(code, result,  async_=asy)):


searching for gene XIRP2
{'Gene': ['XIRP2'], 'Histology': ['Skin-Melanoma'], 'Mutation': ['chr2:g.168096370A>G'], 'Exon': [7], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene ACTA1


  if (await self.run_code(code, result,  async_=asy)):


{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567630A>G'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567636C>A'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567639C>A'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567630A>G'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567636C>A'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
{'Gene': ['ACTA1'], 'Histology': ['Lymph-CLL'], 'Mutation': ['chr1:g.229567639C>A'], 'Exon': [6], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene GLRA3


  if (await self.run_code(code, result,  async_=asy)):


searching for gene SOX18
{'Gene': ['SOX18'], 'Histology': ['Eso-AdenoCA'], 'Mutation': ['chr20:g.62680308C>T'], 'Exon': [2], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene DMRTB1
searching for gene SIGLEC15
{'Gene': ['SIGLEC15'], 'Histology': ['Panc-AdenoCA'], 'Mutation': ['chr18:g.43418687G>C'], 'Exon': [4], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene TP53I3
searching for gene CALR


  if (await self.run_code(code, result,  async_=asy)):


searching for gene HIST1H4E
searching for gene HIST1H3G
{'Gene': ['HIST1H3G'], 'Histology': ['Breast-AdenoCA'], 'Mutation': ['chr6:g.26271214C>A'], 'Exon': [1], 'Effect': ['within 10bp of initiation site']}
searching for gene PURA
searching for gene TUBA4A
searching for gene HIST1H2BK
searching for gene AKAP2
searching for gene NOL9
searching for gene ACTRT2
searching for gene NACA
searching for gene PRX
searching for gene MSRB2
searching for gene ITPR2
searching for gene SH3BGR


  if (await self.run_code(code, result,  async_=asy)):


{'Gene': ['SH3BGR'], 'Histology': ['Panc-Endocrine'], 'Mutation': ['chr21:g.40834315G>A'], 'Exon': [3], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene ACVRL1
{'Gene': ['ACVRL1'], 'Histology': ['Prost-AdenoCA'], 'Mutation': ['chr12:g.52307539C>T'], 'Exon': [4], 'Effect': ['within 20bp of 3 prime splicing site']}
{'Gene': ['ACVRL1'], 'Histology': ['Prost-AdenoCA'], 'Mutation': ['chr12:g.52307549T>C'], 'Exon': [4], 'Effect': ['within 20bp of 3 prime splicing site']}
searching for gene TPM2
searching for gene ZFP69


In [48]:
# Display the collected hits: one row per (mutation, CDS exon) match
df_splice

Unnamed: 0,Gene,Histology,Mutation,Exon,Effect
0,SRSF2,Lymph-BNHL,chr17:g.74733234G>A,2,within 20bp of 3 prime splicing site
0,XIRP2,Skin-Melanoma,chr2:g.168096370A>G,7,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567630A>G,6,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567636C>A,6,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567639C>A,6,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567630A>G,6,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567636C>A,6,within 20bp of 5 prime splicing site
0,ACTA1,Lymph-CLL,chr1:g.229567639C>A,6,within 20bp of 5 prime splicing site
0,SOX18,Eso-AdenoCA,chr20:g.62680308C>T,2,within 20bp of 5 prime splicing site
0,SIGLEC15,Panc-AdenoCA,chr18:g.43418687G>C,4,within 20bp of 5 prime splicing site
