In [1]:
### This notebook analyzes BCL-2 mutations and prepares the inputs for the permutation analysis

import pandas as pd
import os
import pickle
from Bio import SeqIO
from Bio.Seq import Seq
from utils import *
import pickle
import re

In [2]:
# Base directory (relative to the notebook) for files written by this analysis
dir_out = './figure6/'

***1. BCL-2 mutation information (input for the R lollipop plot and permutation test)***

In [3]:
# Load the Lymph-BNHL mutation table (MAF).
# The file carries a .csv extension but is tab-delimited.
dir_maf = '../data/maf/histology_nohypermutator'
feat = 'Lymph-BNHL.csv'

df_mut = pd.read_csv(
    os.path.join(dir_maf, feat),
    delimiter='\t',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Keep only the silent (synonymous) mutations annotated to BCL2
is_silent = df_mut['Variant_Classification'] == 'Silent'
is_bcl2 = df_mut['Hugo_Symbol'] == 'BCL2'
df_bcl2_syn = df_mut[is_silent & is_bcl2]

# Optional export of the synonymous-mutation table (permutation / lollipop plot input)
#df_bcl2_syn.to_csv(os.path.join(dir_out,'data','df_bcl2_syn.csv'))

In [5]:
# Display the BCL2 silent-mutation table
df_bcl2_syn

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,...,i_signature_R1,i_signature_R2,i_snv_near_indel,t_alt_count,t_ref_count,i_model_score,i_n_vaf,Project_Code,Donor_ID,categ
14041,BCL2,18,60985471,60985471,+,Silent,SNP,G,G,A,...,False,False,False,5.0,12.0,,,Lymph-BNHL,DO27805,3
14042,BCL2,18,60985492,60985492,+,Silent,SNP,C,C,T,...,False,False,False,4.0,12.0,,,Lymph-BNHL,DO27805,3
16683,BCL2,18,60985573,60985573,+,Silent,SNP,G,G,A,...,False,False,False,9.0,21.0,,,Lymph-BNHL,DO52651,3
20582,BCL2,18,60985366,60985366,+,Silent,SNP,A,A,G,...,False,False,False,9.0,6.0,,,Lymph-BNHL,DO52671,5
36626,BCL2,18,60985435,60985435,+,Silent,SNP,C,C,T,...,False,False,False,8.0,19.0,,,Lymph-BNHL,DO52685,3
36627,BCL2,18,60985834,60985834,+,Silent,SNP,C,C,T,...,False,False,False,4.0,15.0,,,Lymph-BNHL,DO52685,3
36628,BCL2,18,60985876,60985876,+,Silent,SNP,C,C,A,...,False,False,False,3.0,13.0,,,Lymph-BNHL,DO52685,4
48763,BCL2,18,60985834,60985834,+,Silent,SNP,C,C,T,...,False,False,False,7.0,18.0,,,Lymph-BNHL,DO52695,3
66082,BCL2,18,60985833,60985833,+,Silent,SNP,G,G,A,...,False,False,False,3.0,21.0,,,Lymph-BNHL,DO27857,3
69695,BCL2,18,60985359,60985359,+,Silent,SNP,G,G,A,...,False,False,False,9.0,16.0,,,Lymph-BNHL,DO52690,3


In [7]:
# Total number of synonymous BCL2 mutations (one row per mutation)
df_bcl2_syn.shape[0]

41

In [8]:
# Number of distinct donors carrying at least one synonymous BCL2 mutation
df_bcl2_syn['Donor_ID'].nunique(dropna=False)

26

In [None]:
# bcl2_syn_mut = df_bcl2_syn['Start_position'].to_list() # Don't know what this is for

***2. BCL-2 CDS sequence (permutation test; the sequence is copied and pasted into R)***

In [9]:
# Load annotation data
# NOTE(review): pickle.load can execute arbitrary code; only load these files
# from a trusted source.
# Context managers make sure the pickle file handles are closed after reading.
with open('../anno_ref/proc_refs/dict_name_062121.pkl', 'rb') as fh:
    dict_name = pickle.load(fh)
with open('../anno_ref/proc_refs/dict_transcript_info_062121.pkl', 'rb') as fh:
    dict_transcript_info = pickle.load(fh)

# Index every FASTA record of the reference genome by its record ID
dict_record = SeqIO.to_dict(SeqIO.parse('../anno_ref/gencode_v19/GRCh37.p13.genome.fa', 'fasta'))

In [10]:
# BH domain (start, end) boundaries, taken manually from UniProt
BH4 = (27, 90)
BH3 = (276, 321)
BH1 = (405, 465)
BH2 = (558, 606)

In [11]:
# The canonical transcript is ENST00000398117
# Look up its entry in the transcript-info dictionary loaded from the pickle above
bcl2_info = dict_transcript_info['ENST00000398117']

In [12]:
### Build the BCL2 CDS sequence from its transcript annotation
# Transcript ID and strand are named once and shared by the three helper calls.
bcl2_tx, bcl2_strand = 'ENST00000398117', '-'

list_cds = get_mrna_position(bcl2_tx, dict_transcript_info, bcl2_strand)
seq_transcript = get_transcript_sequence(
    bcl2_tx, dict_transcript_info, dict_record, bcl2_strand
)
seq_cds = get_cdna_sequence(list_cds, bcl2_strand, seq_transcript)

In [51]:
### how many patients have BH4 mutation

# BH4 position in genomic coordinates.
# NOTE(review): 60985899 is presumably the genomic coordinate of the first CDS
# base; offsets are subtracted because the transcript is handled as '-' strand
# elsewhere in this notebook — TODO confirm.
# 27 and 90 mirror the BH4 = (27, 90) tuple defined earlier in the notebook.
BH4_start = 60985899-27+1
BH4_end = 60985899-90+1

# Print out Patient ID that have BH4 mutation
dlist = []
# pos_list is appended to in the loop but was never created in this notebook,
# which raises NameError on a fresh kernel (unless another module happens to
# export it); initialise it here so the cell is self-contained.
pos_list = []
for i in df_bcl2_syn.index:
    # The second run of digits in Genome_Change is taken as the position
    # (presumably the first is the chromosome number, e.g. chr18 — TODO confirm).
    # Raw string avoids the invalid '\d' escape-sequence warning.
    pos = re.findall(r'\d+',df_bcl2_syn.loc[i,'Genome_Change'])[1]
    pos_list.append(pos)
    # BH4_end < BH4_start because CDS offsets are subtracted from the anchor
    if int(pos) >= BH4_end and int(pos) <=BH4_start:
        print(df_bcl2_syn.loc[i,'Donor_ID'])
        dlist.append(df_bcl2_syn.loc[i,'Donor_ID'])

# Number of distinct donors with at least one mutation inside BH4
len(set(dlist))

DO52685
DO52695
DO27857
DO52652
DO27859
DO27809
DO52664
DO52689
DO52669


9

***3. The data goes into R***
See `06-BCL-2_enrichment.R` for the permutation plot.

***
### Section 2: BCL-2 patients with translocation events

In [34]:
# Load the fusion table and keep the records that involve BCL2
dir_refs = '../data/anno_refs/'
exp_dir = '../data/anno_refs/pcawg_rnaseq/'

df_fus = pd.read_csv(os.path.join(dir_refs, 'pcawg_fusion.tsv'), delimiter='\t')
# A fusion is kept when either partner gene ID equals ENSG00000171791
partner_is_bcl2 = df_fus[['gene_id1', 'gene_id2']].eq('ENSG00000171791').any(axis=1)
df_bcl2 = df_fus[partner_is_bcl2]

In [35]:
# Donors that have both a BCL2 fusion and a synonymous BCL2 mutation
fus_pats = df_bcl2['icgc_donor_id'].tolist()
bcl_synpat = df_bcl2_syn['Donor_ID'].unique().tolist()
fus_syn_pat = [*set(fus_pats).intersection(bcl_synpat)]

In [42]:
# Show the donors found in both the fusion and the synonymous-mutation sets
fus_syn_pat

['DO27785', 'DO27857', 'DO52689', 'DO27859']

In [39]:
# List the columns available in the BCL2 fusion table
df_bcl2.columns

Index(['fusion_id', 'aliquot_id', 'known_gene1', 'known_gene2', 'gene_id1',
       'gene_id2', 'chr1', 'break1', 'chr2', 'break2', 'frameshift', 'exon',
       'UTR', 'SV', 'SV_sv_id', 'SV2_sv_id', 'class', 'SV_match_composite',
       'tx1_coord', 'tx2_coord', 'tx_bkpt1', 'tx_bkpt2', 'project_code',
       'histology_abbreviation', 'wgs_aliquot_id', 'icgc_donor_id'],
      dtype='object')

In [50]:
# Mean of the break1 coordinate across the BCL2 fusion records
df_bcl2['break1'].mean()

60818657.6

In [40]:
# Show the breakpoint, class, histology and donor columns for every BCL2 fusion record
df_bcl2[['fusion_id','chr1', 'break1', 'chr2', 'break2','class','histology_abbreviation','icgc_donor_id']]

Unnamed: 0,fusion_id,chr1,break1,chr2,break2,class,histology_abbreviation,icgc_donor_id
41,BCL2->IGHG1,18,60847806,14,106209206,composite,Lymph-BNHL,DO27785
42,BCL2->IGHG1,18,60843599,14,106209408,SV independent,Lymph-BNHL,DO27857
46,BCL2->IGHG1,18,60812505,14,106209198,composite,Lymph-BNHL,DO27859
47,BCL2->IGHG1,18,60794689,14,106209407,composite,Lymph-BNHL,DO52656
49,BCL2->IGHG1,18,60794689,14,106209407,composite,Lymph-BNHL,DO52689


In [52]:
# Difference between the mean start position of the synonymous BCL2 mutations
# and the mean break1 coordinate of the BCL2 fusions
df_bcl2_syn['Start_position'].mean()-df_bcl2['break1'].mean()

166976.96097560972

{'DO27785', 'DO27857', 'DO27859', 'DO52689'}