In [1]:
### This notebook analyze candidate mutation positions and see if they lies in the splicing region
import pandas as pd
import os,pickle,re
from tqdm import tqdm

In [2]:
### Read the synonymous candidate list/maf directory
dir_out_fig4 = './figure4/'
feature_type = 'histology';syn_nsyn = 'syn';run = 'cohort_new_newcova_nohypermutator'
df_syn = pd.read_csv(os.path.join(dir_out_fig4,feature_type+'.syn.df_all_forheatmap.'+run+'.csv'),index_col = 0)
df_syn = df_syn.set_index('gene')

dir_maf = '../data/maf/histology_nohypermutator'

In [3]:
df_syn

Unnamed: 0_level_0,X,p,q,feature,exp.nonexp,FDR
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BCL2,0,0.0,0.0,Lymph-BNHL,exp,8.984726e-47
SRSF2,2,0.0,0.0,Lymph-BNHL,exp,8.984726e-47
ITLN1,52770,0.0,0.0,CNS-PiloAstro,exp,8.984726e-47
ACTA1,211080,0.0,0.0,Lymph-CLL,exp,8.984726e-47
SIGLEC15,316623,0.0,0.0,Panc-AdenoCA,exp,8.984726e-47
TP53I3,316624,0.0,0.0,Panc-AdenoCA,exp,8.984726e-47
LONRF3,351800,0.0,0.0,Liver-HCC,exp,8.984726e-47
RTN4,351801,0.0,0.0,Liver-HCC,exp,8.984726e-47
PPWD1,386980,0.0,0.0,Head-SCC,exp,8.984726e-47
MAGEC1,527700,0.0,0.0,Breast-AdenoCA,exp,8.984726e-47


In [4]:
# Read transcript info file
dict_transcript_info = pickle.load(open('../data/proc_refs/dict_transcript_info_062121.pkl','rb'))
dict_name = pickle.load(open(os.path.join('../data/proc_refs','dict_name_forcov_102121.pkl'), 'rb'))
# dict_record = pickle.load(open('../anno_ref/proc_refs/dict_record_new.pkl','rb'))

In [5]:
# Find transcript for the gene
# Create dictionary for candidate gene name - transcript id
dict_name_id = {}
for sig_genes in df_syn.index.tolist():
    for transcript,gene in dict_name.items():
        if gene == sig_genes:
            dict_name_id[sig_genes] = transcript

In [13]:
dict_transcript_info['ENST00000359995']

{'strand': '-',
 'chr': 'chr17',
 'exon': [[74732881, 74733456], [74732236, 74732546], [74730201, 74731240]],
 'CDS': [[74732881, 74733242], [74732246, 74732546]],
 'UTR': [[74733243, 74733456], [74732236, 74732245], [74730201, 74731240]],
 'transcript': [74730201, 74733456]}

In [7]:
### Get the synonymous mutations of the candidate gene in the cohort
def get_syn_mut(gene):
    global df_syn
    
    histology = df_syn.loc[gene,'feature'] #histology type
    # open maf file
    df_maf = pd.read_csv(os.path.join(dir_maf,histology+'.csv'), sep = '\t')
    df_maf = df_maf.set_index(['Hugo_Symbol','Variant_Classification','Donor_ID' ])
    # total number of donors in this cohort
    ldonor_all = df_maf.index.get_level_values('Donor_ID').unique().tolist()
    #print(f'Total {len(ldonor_all)} patients')
    
    # Find synonymous mutations
    df_gene_mut = df_maf.loc[pd.IndexSlice[gene, 'Silent',:],:]### All patients' specific histology in this histology type
    # Number of patients have synonymous mutations
    ldonor = df_gene_mut.index.get_level_values('Donor_ID').unique().tolist()
    #print(f'{len(ldonor)} patients have synonymous mutations, they are: {ldonor}')

    return df_gene_mut,histology

In [8]:
# Determine if the mutaiton position is close to the exon start-end
def find_splicing_pos(mut_pos, cds, gene_strand):
    flag = 0; effect = None
    if gene_strand == '-':
        if mut_pos< cds[1] and mut_pos > cds[1]-20:
            effect = 'within 20bp of 5 prime splicing site';flag=1
        elif mut_pos > cds[0] and mut_pos< cds[0] + 20:
            effect = 'within 20bp of 3 prime splicing site';flag=1
    else:
        if mut_pos> cds[0] and mut_pos < cds[0]+20:
            effect = 'within 20bp of 5 prime splicing site';flag = 1
        elif mut_pos < cds[1] and mut_pos> cds[1] - 20:
            effect = 'within 20bp of 3 prime splicing site';flag = 1
    return flag, effect

In [9]:
# Determine if the mutaiton position is close to the exon start-end
def find_initiation_pos(mut_pos, cds, gene_strand):
    flag = 0; effect = None
    # if gene_strand == '-':
    #     if mut_pos< cds[1] and mut_pos > cds[1]-10:
    #         effect = 'within 10bp of initiation site';flag = 1
    #     elif mut_pos > cds[0] and mut_pos< cds[0] +20:
    #         effect = 'within 20bp of 3 prime splicing site';flag = 1
    # else:
    if mut_pos > cds[0] and mut_pos< cds[0] + 10:
        effect = 'within 10bp of initiation site';flag = 1
    elif mut_pos < cds[1] and mut_pos> cds[1]- 20:
        effect = 'within 20bp of 3 prime splicing site';flag = 1
    return flag, effect

In [10]:
def make_gene_df(gene, histology, mutation, exon, effect):
    dict_gene_splice = {'Gene':[gene], 'Histology':[histology], 'Mutation':[mutation], 'Exon':[exon], 'Effect':[effect]}
    df = pd.DataFrame.from_dict(dict_gene_splice)
    print(dict_gene_splice)
    return df

In [15]:
df_splice = pd.DataFrame(columns = ['Gene','Histology', 'Mutation', 'Exon', 'Effect'])
for genes in dict_name_id:
    print(f'searching for gene {genes}')
    transcript = dict_name_id[genes]
    chromosome = dict_transcript_info[transcript]['chr']
    CDS = dict_transcript_info[transcript]['CDS']
    EXON = dict_transcript_info[transcript]['exon']
    len_cds = len(CDS);len_exon = len(dict_transcript_info[transcript]['exon'])
    # If there is extra UTR exon, add the index by +1 in the following session
    n_plusidx = len_exon-len_cds+1
    strand = dict_transcript_info[transcript]['strand']
    df_synmut, his = get_syn_mut(genes)

    # If negative strand then reverse the exon orders
    # if strand == '-':
    #     CDS.reverse()
    print(df_synmut['Genome_Change'].unique().tolist())
    for mutations in df_synmut['Genome_Change'].unique().tolist():
        change = mutations.split(':')[1]
        mutation = chromosome+':g.'+change
        muts = int(re.findall(r'\d+|X|Y', mutation)[1])
        for idx,exons in enumerate(CDS):
            if idx == 0: # Find initiation mutations
                res, eff = find_initiation_pos(muts, exons, strand)
                if res == 1:
                    exn = idx +n_plusidx; df_gene_splice = make_gene_df(genes, his, mutation, exn, eff)
                    df_splice = pd.concat([df_splice, df_gene_splice], axis = 0)
            else:
                res, eff = find_splicing_pos(muts, exons, strand)
                if res ==1:
                    exn = idx+n_plusidx; df_gene_splice = make_gene_df(genes, his, mutation, exn, eff)
                    df_splice = pd.concat([df_splice, df_gene_splice], axis = 0)

searching for gene BCL2
['g.chr18:60985471G>A', 'g.chr18:60985492C>T', 'g.chr18:60985573G>A', 'g.chr18:60985366A>G', 'g.chr18:60985435C>T', 'g.chr18:60985834C>T', 'g.chr18:60985876C>A', 'g.chr18:60985833G>A', 'g.chr18:60985359G>A', 'g.chr18:60985366A>T', 'g.chr18:60985861C>T', 'g.chr18:60985549G>A', 'g.chr18:60985711C>A', 'g.chr18:60985876C>T', 'g.chr18:60985528G>A', 'g.chr18:60985540G>A', 'g.chr18:60985713G>T', 'g.chr18:60985363C>T', 'g.chr18:60985417G>A', 'g.chr18:60985717T>C', 'g.chr18:60985795C>G', 'g.chr18:60985876C>G', 'g.chr18:60985582G>A', 'g.chr18:60985753G>A', 'g.chr18:60985384G>A', 'g.chr18:60985545G>A', 'g.chr18:60985738C>T', 'g.chr18:60985615C>T', 'g.chr18:60985444C>T', 'g.chr18:60985867G>A', 'g.chr18:60985735G>A', 'g.chr18:60985714G>A']
searching for gene SRSF2
['g.chr17:74732997G>C', 'g.chr17:74733006C>G', 'g.chr17:74733234G>A']
{'Gene': ['SRSF2'], 'Histology': ['Lymph-BNHL'], 'Mutation': ['chr17:g.74733234G>A'], 'Exon': [2], 'Effect': ['within 20bp of 3 prime splicing s

  if (await self.run_code(code, result,  async_=asy)):


['g.chr5:64881864T>A', 'g.chr5:64881882A>T']
searching for gene MAGEC1
['g.chrX:140993787G>A', 'g.chrX:140993823G>A']
searching for gene PURA
['g.chr5:139494369T>A', 'g.chr5:139494363T>C', 'g.chr5:139494468T>C', 'g.chr5:139494471G>C', 'g.chr5:139494492G>C', 'g.chr5:139494495T>C', 'g.chr5:139494501A>G']
searching for gene HIST1H2BK
['g.chr6:27114272C>G', 'g.chr6:27114271_27114272GC>AA', 'g.chr6:27114449G>A', 'g.chr6:27114467G>A', 'g.chr6:27114271G>A', 'g.chr6:27114440C>T']
searching for gene NOL9
['g.chr1:6614184G>A', 'g.chr1:6614410C>G', 'g.chr1:6614182C>G', 'g.chr1:6614362G>A', 'g.chr1:6614524G>A', 'g.chr1:6614329G>C', 'g.chr1:6614311G>A']
searching for gene ZCCHC5
['g.chrX:77913840C>T', 'g.chrX:77912781G>A', 'g.chrX:77912744G>A']
searching for gene PRX
['g.chr19:40902642C>T', 'g.chr19:40900053G>T']
searching for gene ITPR2


  if (await self.run_code(code, result,  async_=asy)):


['g.chr12:26755360C>T', 'g.chr12:26784883G>T', 'g.chr12:26784934G>T']
searching for gene SH3BGR
['g.chr21:40834315G>A', 'g.chr21:40834462C>T']
{'Gene': ['SH3BGR'], 'Histology': ['Panc-Endocrine'], 'Mutation': ['chr21:g.40834315G>A'], 'Exon': [3], 'Effect': ['within 20bp of 5 prime splicing site']}
searching for gene ACVRL1
['g.chr12:52307539C>T', 'g.chr12:52307549T>C', 'g.chr12:52314611G>A']
{'Gene': ['ACVRL1'], 'Histology': ['Prost-AdenoCA'], 'Mutation': ['chr12:g.52307539C>T'], 'Exon': [4], 'Effect': ['within 20bp of 3 prime splicing site']}
{'Gene': ['ACVRL1'], 'Histology': ['Prost-AdenoCA'], 'Mutation': ['chr12:g.52307549T>C'], 'Exon': [4], 'Effect': ['within 20bp of 3 prime splicing site']}
searching for gene MSRB2
['g.chr10:23384576C>T', 'g.chr10:23384552C>G']
searching for gene TPM2
['g.chr9:35689758G>T', 'g.chr9:35689782C>G']
searching for gene ZFP69
['g.chr1:40961323C>G', 'g.chr1:40961446C>T']
searching for gene SOX18
['g.chr20:62680308C>T', 'g.chr20:62680134C>T', 'g.chr20:626

  if (await self.run_code(code, result,  async_=asy)):


['g.chr1:182025565G>A', 'g.chr1:182025649G>A', 'g.chr1:182026306C>T']
searching for gene GLRA3
['g.chr4:175710034A>T', 'g.chr4:175710037A>T']
searching for gene TMEM129
['g.chr4:1720112G>A', 'g.chr4:1720274G>A']


In [17]:
df_splice.to_csv('df_splice.csv')

In [4]:
df_splice = pd.read_csv('df_splice.csv')

In [5]:
df_splice

Unnamed: 0.1,Unnamed: 0,Gene,Histology,Mutation,Exon,Effect
0,0,SRSF2,Lymph-BNHL,chr17:g.74733234G>A,2,within 20bp of 3 prime splicing site
1,0,ACTA1,Lymph-CLL,chr1:g.229567630A>G,6,within 20bp of 5 prime splicing site
2,0,ACTA1,Lymph-CLL,chr1:g.229567636C>A,6,within 20bp of 5 prime splicing site
3,0,ACTA1,Lymph-CLL,chr1:g.229567639C>A,6,within 20bp of 5 prime splicing site
4,0,SIGLEC15,Panc-AdenoCA,chr18:g.43418687G>C,4,within 20bp of 5 prime splicing site
5,0,RTN4,Liver-HCC,chr2:g.55200752C>T,6,within 20bp of 5 prime splicing site
6,0,SH3BGR,Panc-Endocrine,chr21:g.40834315G>A,3,within 20bp of 5 prime splicing site
7,0,ACVRL1,Prost-AdenoCA,chr12:g.52307539C>T,4,within 20bp of 3 prime splicing site
8,0,ACVRL1,Prost-AdenoCA,chr12:g.52307549T>C,4,within 20bp of 3 prime splicing site
9,0,SOX18,Eso-AdenoCA,chr20:g.62680308C>T,2,within 20bp of 5 prime splicing site


In [6]:
dict_name_id

{'BCL2': 'ENST00000398117',
 'SRSF2': 'ENST00000359995',
 'ITLN1': 'ENST00000326245',
 'ACTA1': 'ENST00000366684',
 'SIGLEC15': 'ENST00000389474',
 'TP53I3': 'ENST00000335934',
 'LONRF3': 'ENST00000371628',
 'RTN4': 'ENST00000394609',
 'PPWD1': 'ENST00000261308',
 'MAGEC1': 'ENST00000285879',
 'PURA': 'ENST00000331327',
 'HIST1H2BK': 'ENST00000396891',
 'NOL9': 'ENST00000377705',
 'ZCCHC5': 'ENST00000321110',
 'PRX': 'ENST00000291825',
 'ITPR2': 'ENST00000381340',
 'SH3BGR': 'ENST00000333634',
 'ACVRL1': 'ENST00000388922',
 'MSRB2': 'ENST00000376510',
 'TPM2': 'ENST00000378292',
 'ZFP69': 'ENST00000372706',
 'SOX18': 'ENST00000340356',
 'NCAN': 'ENST00000252575',
 'HIST1H4E': 'ENST00000360441',
 'DMRTB1': 'ENST00000371445',
 'KLHL21': 'ENST00000377658',
 'ADAM17': 'ENST00000310823',
 'ZNF648': 'ENST00000339948',
 'GLRA3': 'ENST00000274093',
 'TMEM129': 'ENST00000382936'}