# <center>Generate COSMIC legacy db</center>

<div style="text-align: justify">This page documents the download, cleanup and annotation process and the generation of the legacy db file for COSMIC.<p>1. You need to specify the working directory below, where the SNVindel and fusion data downloaded from <a href="https://cancer.sanger.ac.uk/cosmic">https://cancer.sanger.ac.uk/cosmic</a> can be found. The default working directory is the directory that contains this notebook file.<br>2. You need to specify the cosmic version and genome version. <br>3. Follow the wiki page (<a href="https://wiki.stjude.org/display/compbio/How+to+clean+and+annotate+the+COSMIC+database">https://wiki.stjude.org/display/compbio/How+to+clean+and+annotate+the+COSMIC+database</a>) to clean and annotate the COSMIC database.<br><font color = red>4. This notebook is supposed to run on the HPC. <br>5. Run each of the cells that start with %%writefile first. <br>6. Session setup. Run the following commands before you start your jupyter notebook:<br>&nbsp;&nbsp;&nbsp;&nbsp;setcbenv prod<br>&nbsp;&nbsp;&nbsp;&nbsp;cbload clinical-classifier<br>&nbsp;&nbsp;&nbsp;&nbsp;cbload snv-annovar<br>&nbsp;&nbsp;&nbsp;&nbsp;cbload snv-vep<br>&nbsp;&nbsp;&nbsp;&nbsp;cbload util-scripts<br>&nbsp;&nbsp;&nbsp;&nbsp;module load vcftools<br>&nbsp;&nbsp;&nbsp;&nbsp;module load htslib/1.15.1<br>&nbsp;&nbsp;&nbsp;&nbsp;module load vep/v100</font>
</div>


1. <font color = blue>&nbsp;&nbsp;&nbsp;&nbsp;Session Setup<br></font>

In [106]:
#check the Session setup
import shutil
import subprocess as sp  # no longer used by this cell; kept in case other cells still refer to `sp`

# Look the tool up on PATH instead of executing it and searching the shell's
# error text: the wording of that text differs between shells (bash prints
# "command not found", dash prints only "not found"), so the old check could
# report success when the tool was actually missing.
sessionSetup = shutil.which('vcf2tab.pl') is not None
if sessionSetup:
    print('Session setup successfully!')
else:
    print('Please finish the session setup steps as shown in RED above in the instruction')

Session setup successfully!


2. <font color = blue>&nbsp;&nbsp;&nbsp;&nbsp;Specify the working directory<br>&nbsp;&nbsp;&nbsp;&nbsp;Specify genome version</font>

In [111]:
# `os` is needed by os.chdir below; import it here so this cell also works on a fresh kernel
import os

#specify the working directory
working_dir = './'
os.chdir(working_dir)
print('working directory: '+working_dir)

#genome version
genome_v = 'hg19'  # specify the genome version ( hg19 or hg38 )
# maps the short genome name used in script/db file names to the build name used in COSMIC file names
genome_version_name = {'hg38':'GRCh38','hg19':'GRCh37'}
if genome_v not in genome_version_name:
    # fail here rather than with a KeyError in a later cell
    raise ValueError('genome_v must be one of: '+', '.join(sorted(genome_version_name)))
print('genome version: '+genome_v)

working directory: ./
genome version: hg19


3. <font color = blue>&nbsp;&nbsp;&nbsp;&nbsp;Check the downloaded files<br></font>

In [72]:
import os
import sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)

#check if the required files are available
cosmicFiles = [x+'_'+genome_version_name[genome_v]+'.tsv.gz' 
               for x in ['CosmicCodingMuts','CosmicFusionExport','CosmicMutantExport']]
# collect every missing file first so the user sees the complete list in one run
missingFiles = [f for f in cosmicFiles if not os.path.isfile(f)]
if missingFiles:
    print('Error:')
    for f in missingFiles:
        print('\tmissing file: '+f)
    print('\tPlease download the file from https://cancer.sanger.ac.uk/cosmic')
    sys.exit(1)
print('All files have been downloaded. Please move forward to the next step!')


All files have been downloaded. Please move forward to the next step!


4. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;VCF allele extraction</font>

In [5]:
#VCF allele extraction
import os

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)

# A job script exists for each of the two supported builds; any other value of
# genome_v submits nothing.
if genome_v in ('hg38', 'hg19'):
    os.system('bsub <vcf_allele_ext_{}.sh'.format(genome_v))
    

Job <171020600> is submitted to queue <standard>.


5. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;genomic allele annotation</font>

In [18]:
#genomic allele annotation
import os
import sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)

# this file must exist before the annotation job is submitted
cooked_tab_file = 'CosmicCodingMuts_{}.tsv.gz.cooked.tab'.format(genome_version_name[genome_v])
extraction_done = os.path.isfile(cooked_tab_file)
if not extraction_done:
    print('VCF allele extraction is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
os.system('bsub <genoallel_anno_{}.sh'.format(genome_v))



Job <171022769> is submitted to queue <standard>.


0

6. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;population frequency filtering</font>

In [41]:
#population frequency filtering
import os
import sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)

# this file must exist before the filtering job is submitted
genomic_tab_file = 'CosmicMutantExport_{}.tsv.gz.genomic.tab'.format(genome_version_name[genome_v])
annotation_done = os.path.isfile(genomic_tab_file)
if not annotation_done:
    print('genomic allele annotation is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
os.system('bsub <freqfilter_{}.sh'.format(genome_v))


Job <171025499> is submitted to queue <standard>.


      A default memory request of 2.50 GB has been placed for this job
      The job will be killed if   2.50 GB of memory is used


0

7. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;cleaning</font>

In [45]:
#cleaning
import os,sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
    
# the file checked here carries the exac.tab suffix that the population frequency
# filtering scripts pass to mux.pl, so that is the step to name in the message
exac_filter_file = 'CosmicMutantExport_'+genome_version_name[genome_v]+'.tsv.gz.genomic.tab.exac.tab'
if not os.path.isfile(exac_filter_file):
    print('population frequency filtering is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
os.system('bsub <cleaning_'+genome_v+'.sh')


Job <171036674> is submitted to queue <standard>.


0

8. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;sorting</font>

In [46]:
#sorting based on genomic position
import os,sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
    
# the file checked here carries the cleaned.tab suffix set by the cleaning scripts,
# so that is the step to name in the message
cleaned_file = 'CosmicMutantExport_'+genome_version_name[genome_v]+'.tsv.gz.genomic.tab.exac.tab.cleaned.tab'
if not os.path.isfile(cleaned_file):
    print('cleaning is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
os.system('bsub <sorting_'+genome_v+'.sh')


Job <171041603> is submitted to queue <standard>.


0

9. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;VEP annotation</font>

In [53]:
#VEP annotation
import os
import sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
# submit the VEP job script that matches the selected genome version
os.system('bsub <vep_%s.sh' % genome_v)


Job <171044928> is submitted to queue <standard>.


      A default memory request of 2.50 GB has been placed for this job
      The job will be killed if   2.50 GB of memory is used


0

10. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;duplicate record removal</font>

In [58]:
#duplicate record removal
import os,sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
    
# the file checked here carries the hgvs.tab suffix set by the VEP annotation scripts,
# so that is the step to name in the message
hgvs_file = 'CosmicMutantExport_'+genome_version_name[genome_v]+'.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab'
if not os.path.isfile(hgvs_file):
    print('VEP annotation is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
    
os.system('bsub <duprm_'+genome_v+'.sh')



Job <171063972> is submitted to queue <standard>.


0

11. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;generate snvindel file for db loading</font>

In [65]:
#generate snvindel file for db loading
import os,sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
    
uniq_file = 'CosmicMutantExport_'+genome_version_name[genome_v]+'.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab.unique.tab'
if not os.path.isfile(hgvs_file):
    print('genomic allele annotation is not done yet! Please wait for the last step to be finished!')
    sys.exit(1)
snvindelOutputFile = 'cosmic.snvindel.'+genome_v
!python3 gencosmicsnvindel.py $uniq_file $snvindelOutputFile
print('cosmic.snvindel.'+genome_v+' generated!')


Done


12. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;generate fusion file for db loading</font>

In [68]:
#generate fusion file for db loading
import os,sys

if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
fusionFile = 'CosmicFusionExport_'+genome_version_name[genome_v]+'.tsv.gz'
fusionOutFile = 'cosmic.fusion.'+genome_v
!python3 cosmic_fusion.py $fusionFile $genome_v $fusionOutFile
print('cosmic.fusion.'+genome_v+' generated!')

49957 lines have no fusion name
1169 lines with ensemble name cannot be converted to refseq
13812 duplicated lines removed
7 skipped for lines without breakpoint info
5619 skipped for ? in fusion name
cosmic.fusion.hg38 generated!


13. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;db loading</font>

In [69]:
#db loading
import os,sys
if not sessionSetup:
    print('Please finish the session setup steps as shown in RED above in the instruction')
    sys.exit(1)
snvindelFile = 'cosmic.snvindel.'+genome_v
fusionFile = 'cosmic.fusion.'+genome_v
if not os.path.isfile(snvindelFile):
    print('Please run Part8 (step 7) to generate snvindel file!')
    sys.exit(1)
if not os.path.isfile(fusionFile):
    print('Please run Part9 (step 8) to generate fusion file!')
dbout = 'cosmic.'+genome_v+'.db'
sqlFile = 'cosmic.'+genome_v+'.db.sql'
!sqlite3 $dbout <$sqlFile
if os.path.isfile(dbout):
    print(dbout+' generated!')

cosmic.hg38.db generated!


14. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;generate sliced db file</font> <p>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;TP53    NM_000546 <br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;KRAS    NM_033360<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NRAS    NM_002524<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;FLT3    NM_004119 <br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BRAF    NM_004333<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ABL1    NM_005157 ABL1-BCR fusion （AML）<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CTNNB1  NM_001904 T41A<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;FGFR1   NM_023110 N546K<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;H3-3A   NM_002107 K28M G35W

In [115]:
#check the snvindel file and fusion file
import os
import sys

snvindelOutputFile = 'cosmic.snvindel.'+genome_v
fusionOutFile = 'cosmic.fusion.'+genome_v
if not os.path.isfile(snvindelOutputFile):
    print('Please run step-11 to generate snvindel data!')
    sys.exit(1)
if not os.path.isfile(fusionOutFile):
    print('Please run step-12 to generate fusion data!')
    sys.exit(1)
    
slicedSnvindelFile = 'cosmic.snvindel.slice.'+genome_v
slicedFusionFile = 'cosmic.fusion.slice.'+genome_v
sliceddbout = 'cosmic.slice.'+genome_v+'.db'
!printf "NM_000546\nNM_033360\nNM_002524\nNM_004119\nNM_004333\nNM_005157\nNM_001904\nNM_023110\nNM_002107\n" >gene4sliceddb
!grep -w -f gene4sliceddb $snvindelOutputFile > $slicedSnvindelFile
!grep -w -f gene4sliceddb $fusionOutFile >$slicedFusionFile
slicedsqlFile = 'cosmic.slice.'+genome_v+'.db.sql'
!sqlite3 $sliceddbout <$slicedsqlFile
if os.path.isfile(sliceddbout):
    print(sliceddbout+' generated!')



cosmic.slice.hg19.db generated!


15. <font color=blue>&nbsp;&nbsp;&nbsp;&nbsp;delete intermediate files</font>

In [74]:
#intermediate files deletion
import os

intermFiles = [x for x in os.listdir() if x.endswith('.tab') or x.endswith('log')]

for f in intermFiles:
    os.system('rm -f '+f)
    print(f+' deleted!')
print("Congratulations!!! Legacy db building for "+genome_v+' is done!')

CosmicMutantExport_GRCh38.tsv.gz.genomic.tab deleted!
step1_hg38.elog deleted!
step3_hg38.log deleted!
CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab deleted!
CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab deleted!
step4-5_hg38.log deleted!
CosmicCodingMuts_GRCh38.tsv.gz.cooked.tab deleted!
step6_hg38.log deleted!
step4-5_hg38.elog deleted!
step6_hg38.elog deleted!
CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab.unique.tab deleted!
step4_hg38.log deleted!
CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab deleted!
step5_hg38.elog deleted!
step5_hg38.log deleted!
step1_hg38.log deleted!
step2_hg38.log deleted!
CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.null_positions.tab deleted!
step4_hg38.elog deleted!
step3_hg38.elog deleted!
step2_hg38.elog deleted!
Congratulations!!! Legacy db building for hg38 is done!


## script files (need to run each of the following cells first)

In [82]:
%%writefile vcf_allele_ext_hg19.sh
#!/bin/bash
#BSUB -J VCF_Allele_ext_hg19
#BSUB -q standard
#BSUB -M 10000
#BSUB -e vcf_allele_ext_hg19.elog
#BSUB -o vcf_allele_ext_hg19.log

# VCF allele extraction for hg19 (GRCh37); submitted by the notebook with `bsub <vcf_allele_ext_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The next notebook step waits for CosmicCodingMuts_GRCh37.tsv.gz.cooked.tab
# (presumably written by this command - confirm against vcf2tab.pl).
vcf2tab.pl -file CosmicCodingMuts_GRCh37.tsv.gz -cosmic 2 -lite -filter-duplicates-strict 1 -duplicate-add-field COSMIC_id



Overwriting vcf_allele_ext_hg19.sh


In [83]:
%%writefile vcf_allele_ext_hg38.sh
#!/bin/bash
#BSUB -J VCF_Allele_ext_hg38
#BSUB -q standard
#BSUB -M 10000
#BSUB -e vcf_allele_ext_hg38.elog
#BSUB -o vcf_allele_ext_hg38.log

# VCF allele extraction for hg38 (GRCh38); submitted by the notebook with `bsub <vcf_allele_ext_hg38.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The next notebook step waits for CosmicCodingMuts_GRCh38.tsv.gz.cooked.tab
# (presumably written by this command - confirm against vcf2tab.pl).
vcf2tab.pl -file CosmicCodingMuts_GRCh38.tsv.gz -cosmic 2 -lite -filter-duplicates-strict 1 -duplicate-add-field COSMIC_id



Overwriting vcf_allele_ext_hg38.sh


In [84]:
%%writefile genoallel_anno_hg19.sh
#!/bin/bash
#BSUB -J genoallel_anno_hg19
#BSUB -q standard
#BSUB -M 10000
#BSUB -e genoallel_anno_hg19.elog
#BSUB -o genoallel_anno_hg19.log

# Genomic allele annotation for hg19 (GRCh37); submitted with `bsub <genoallel_anno_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# Inputs: the COSMIC mutant export and the cooked table from the VCF allele extraction step.
COSMIC_TAB=CosmicMutantExport_GRCh37.tsv.gz
COSMIC_VCF_COOKED=CosmicCodingMuts_GRCh37.tsv.gz.cooked.tab
cosmic_cleaner.pl -cosmic "$COSMIC_TAB" -vcf-allele-annotate "$COSMIC_VCF_COOKED"

Overwriting genoallel_anno_hg19.sh


In [85]:
%%writefile genoallel_anno_hg38.sh
#!/bin/bash
#BSUB -J genoallel_anno_hg38
#BSUB -q standard
#BSUB -M 10000
#BSUB -e genoallel_anno_hg38.elog
#BSUB -o genoallel_anno_hg38.log

# Genomic allele annotation for hg38 (GRCh38); submitted with `bsub <genoallel_anno_hg38.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# Inputs: the COSMIC mutant export and the cooked table from the VCF allele extraction step.
COSMIC_TAB=CosmicMutantExport_GRCh38.tsv.gz
COSMIC_VCF_COOKED=CosmicCodingMuts_GRCh38.tsv.gz.cooked.tab
cosmic_cleaner.pl -cosmic "$COSMIC_TAB" -vcf-allele-annotate "$COSMIC_VCF_COOKED"

Overwriting genoallel_anno_hg38.sh


In [86]:
%%writefile freqfilter_hg19.sh
#!/bin/bash
#BSUB -J freqfilter_hg19
#BSUB -q standard
#BSUB -e freqfilter_hg19.elog
#BSUB -o freqfilter_hg19.log

# Population frequency filtering for hg19 (GRCh37); submitted with `bsub <freqfilter_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The quoted -template must stay on one logical line: a raw newline inside the
# quotes would split the option -ignore-unusable in two.
mux.pl -file CosmicMutantExport_GRCh37.tsv.gz.genomic.tab \
    -template 'tag_exac.pl -genome GRCh37-lite -file %s -bambino -ignore-unusable -no-annotation -require-af-le 0.001' \
    -suffix exac.tab -count 100000 -ram 500 -wait 30 -clean glob

Overwriting freqfilter_hg19.sh


In [87]:
%%writefile freqfilter_hg38.sh
#!/bin/bash
#BSUB -J freqfilter_hg38
#BSUB -q standard
#BSUB -e freqfilter_hg38.elog
#BSUB -o freqfilter_hg38.log

# Population frequency filtering for hg38 (GRCh38); submitted with `bsub <freqfilter_hg38.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The quoted -template must stay on one logical line: a raw newline inside the
# quotes would split the option -no-annotation in two.
mux.pl -file CosmicMutantExport_GRCh38.tsv.gz.genomic.tab \
    -template 'tag_exac.pl -genome GRCh38 -file %s -bambino -ignore-unusable -no-annotation -require-af-le 0.001' \
    -suffix exac.tab -count 100000 -ram 500 -wait 30 -clean glob

Overwriting freqfilter_hg38.sh


In [88]:
%%writefile cleaning_hg19.sh
#!/bin/bash
#BSUB -J cleaning_hg19
#BSUB -q standard
#BSUB -M 20000
#BSUB -e cleaning_hg19.elog
#BSUB -o cleaning_hg19.log

# Cleaning step for hg19 (GRCh37); submitted with `bsub <cleaning_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# Any extra arguments given to this script are forwarded unchanged to cosmic_cleaner.pl.
export CS_DIR=/rgs01/project_space/zhanggrp/ClinicalSeq/common
export SUPPORT_DIR=$CS_DIR/germline_PublicDB/COSMIC/clean_support
cosmic_cleaner.pl \
    -cosmic CosmicMutantExport_GRCh37.tsv.gz.genomic.tab.exac.tab \
    -genome GRCh37-lite \
    -hypermutable-samples "$CS_DIR/hypermutable_sample_0819_2013.lst" \
    -cancer-gene-list "$CS_DIR/cancer_gene.lst" \
    -bad-gene-list "$CS_DIR/bad_gene.lst" \
    -refgene-fasta "$SUPPORT_DIR/human.rna.fna.gz" \
    -tolerant 1 \
    -bad-literature "$CS_DIR/bad_gene_literature.lst.mod" \
    -bad-variants "$CS_DIR/bad_CHEK2_site.txt" \
    -no-summary -no-recurrence \
    -gedi-validated "$SUPPORT_DIR/gedi_validated.tab" \
    -out-suffix cleaned.tab "$@"

Writing cleaning_hg19.sh


In [89]:
%%writefile cleaning_hg38.sh
#!/bin/bash
#BSUB -J cleaning_hg38
#BSUB -q standard
#BSUB -M 20000
#BSUB -e cleaning_hg38.elog
#BSUB -o cleaning_hg38.log

# Cleaning step for hg38 (GRCh38); submitted with `bsub <cleaning_hg38.sh`.
# Unlike the hg19 script, this one uses a lifted-over gedi_validated table and passes -reannotate 2.
# Any extra arguments given to this script are forwarded unchanged to cosmic_cleaner.pl
# ("$@" keeps arguments that contain spaces intact, which $* did not).
export CS_DIR=/rgs01/project_space/zhanggrp/ClinicalSeq/common
export SUPPORT_DIR=$CS_DIR/germline_PublicDB/COSMIC/clean_support
cosmic_cleaner.pl \
    -cosmic CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab \
    -genome GRCh38 \
    -hypermutable-samples "$CS_DIR/hypermutable_sample_0819_2013.lst" \
    -cancer-gene-list "$CS_DIR/cancer_gene.lst" \
    -bad-gene-list "$CS_DIR/bad_gene.lst" \
    -refgene-fasta "$SUPPORT_DIR/human.rna.fna.gz" \
    -tolerant 1 \
    -bad-literature "$CS_DIR/bad_gene_literature.lst.mod" \
    -bad-variants "$CS_DIR/bad_CHEK2_site.txt" \
    -no-summary -no-recurrence \
    -gedi-validated "$SUPPORT_DIR/hg38/gedi_validated.tab.liftover.tab.edit" \
    -reannotate 2 \
    -out-suffix cleaned.tab "$@"

Writing cleaning_hg38.sh


In [90]:
%%writefile sorting_hg19.sh
#!/bin/bash
#BSUB -J sorting_hg19
#BSUB -q standard
#BSUB -M 30000
#BSUB -e sorting_hg19.elog
#BSUB -o sorting_hg19.log

# Sort the cleaned hg19 table by chromosome and position, keeping the header line
# first, and replace the input file with the sorted result.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.

# Abort on the first failing command so that a broken intermediate file can never
# be moved over the input table by the final mv.
set -e

IN=CosmicMutantExport_GRCh37.tsv.gz.genomic.tab.exac.tab.cleaned.tab

head -n 1 "$IN" >"$IN.sorted"
tail -n +2 "$IN" >x.hg19
cut -f38,39 x.hg19 >y.hg19 #check if the chromosome and position are always on 38th and 39th columns
# prepend the two sort keys to every row (the old script read the non-existent file y.hghg19 here)
paste y.hg19 x.hg19 >z.hg19
sort -k1,1 -k2,2n z.hg19 >z.hg19.sorted
# drop the two prepended key columns again
cut -f 3- z.hg19.sorted >>"$IN.sorted"
rm -f x.hg19 y.hg19 z.hg19 z.hg19.sorted
mv "$IN.sorted" "$IN"

Writing sorting_hg19.sh


In [91]:
%%writefile sorting_hg38.sh
#!/bin/bash
#BSUB -J sorting_hg38
#BSUB -q standard
#BSUB -M 30000
#BSUB -e sorting_hg38.elog
#BSUB -o sorting_hg38.log

# Sort the cleaned hg38 table by chromosome and position, keeping the header line
# first, and replace the input file with the sorted result.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.

# Abort on the first failing command so that a broken intermediate file can never
# be moved over the input table by the final mv.
set -e

IN=CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab

head -n 1 "$IN" >"$IN.sorted"
tail -n +2 "$IN" >x.hg38
cut -f38,39 x.hg38 >y.hg38 #check if the chromosome and position are always on 38th and 39th columns
# prepend the two sort keys to every row
paste y.hg38 x.hg38 >z.hg38
sort -k1,1 -k2,2n z.hg38 >z.hg38.sorted
# drop the two prepended key columns again
cut -f 3- z.hg38.sorted >>"$IN.sorted"
rm -f x.hg38 y.hg38 z.hg38 z.hg38.sorted
mv "$IN.sorted" "$IN"


Writing sorting_hg38.sh


In [92]:
%%writefile vep_hg19.sh
#!/bin/bash
#BSUB -J vep_hg19
#BSUB -q standard
#BSUB -e vep_hg19.elog
#BSUB -o vep_hg19.log

# VEP annotation for hg19 (GRCh37); submitted with `bsub <vep_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# Any extra arguments given to this script are forwarded unchanged to mux.pl.
GENOME=GRCh37-lite
mux.pl -file CosmicMutantExport_GRCh37.tsv.gz.genomic.tab.exac.tab.cleaned.tab \
    -ram 2250 -suffix hgvs.tab -wait 30 -clean glob -pool 400 -count 10000 -bsub-cores 2 \
    -template "bambino2vep.pl -file %s -genome $GENOME -bambino" "$@"

Writing vep_hg19.sh


In [93]:
%%writefile vep_hg38.sh
#!/bin/bash
#BSUB -J vep_hg38
#BSUB -q standard
#BSUB -e vep_hg38.elog
#BSUB -o vep_hg38.log

# VEP annotation for hg38 (GRCh38); submitted with `bsub <vep_hg38.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# Any extra arguments given to this script are forwarded unchanged to mux.pl.
GENOME=GRCh38
mux.pl -file CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab \
    -ram 2250 -suffix hgvs.tab -wait 30 -clean glob -pool 400 -count 10000 -bsub-cores 2 \
    -template "bambino2vep.pl -file %s -genome $GENOME -bambino" "$@"

Writing vep_hg38.sh


In [94]:
%%writefile duprm_hg19.sh
#!/bin/bash
#BSUB -J duprm_hg19
#BSUB -q standard
#BSUB -M 10000
#BSUB -e duprm_hg19.elog
#BSUB -o duprm_hg19.log

# Duplicate record removal for hg19 (GRCh37); submitted with `bsub <duprm_hg19.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The next notebook step reads <input>.unique.tab (presumably written by this command - confirm).
bambino2annovar.pl -filter-sample-duplicates -f-sample ID_sample -f-gene vep_sj_gene -f-aa vep_sj_aachange \
    -file CosmicMutantExport_GRCh37.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab



Writing duprm_hg19.sh


In [95]:
%%writefile duprm_hg38.sh
#!/bin/bash
#BSUB -J duprm_hg38
#BSUB -q standard
#BSUB -M 10000
#BSUB -e duprm_hg38.elog
#BSUB -o duprm_hg38.log

# Duplicate record removal for hg38 (GRCh38); submitted with `bsub <duprm_hg38.sh`.
# The shebang must be the first line of the written file, hence no blank line after %%writefile.
# The next notebook step reads <input>.unique.tab (presumably written by this command - confirm).
bambino2annovar.pl -filter-sample-duplicates -f-sample ID_sample -f-gene vep_sj_gene -f-aa vep_sj_aachange \
    -file CosmicMutantExport_GRCh38.tsv.gz.genomic.tab.exac.tab.cleaned.tab.hgvs.tab


Writing duprm_hg38.sh


In [96]:
%%writefile gencosmicsnvindel.py

#!/usr/bin/python3

#generate file for db file building

import sys

if len(sys.argv) < 3:
        print('usage: python3 '+sys.argv[1]+' <*.tab.cleaned.tab.hgvs.tab.unique.tab> <output>')
        sys.exit(1)

header = ['Gene name', 'Accession Number', 'Gene CDS length', 'HGNC ID', 'Sample name', 'ID_sample', 'ID_tumour', 'Primary site', 'Site subtype 1', 'Site subtype 2', 'Site subtype 3', 'Primary histology', 'Histology subtype 1', 'Histology subtype 2', 'Histology subtype 3', 'Genome-wide screen', 'GENOMIC_MUTATION_ID', 'LEGACY_MUTATION_ID', 'MUTATION_ID', 'Mutation CDS', 'Mutation AA', 'Mutation Description', 'Mutation zygosity', 'LOH', 'GRCh', 'Mutation genome position', 'Mutation strand', 'Resistance Mutation', 'Mutation somatic status', 'Pubmed_PMID', 'ID_STUDY', 'Sample Type', 'Tumour origin', 'Age', 'HGVSP', 'HGVSC', 'HGVSG', 'Chr', 'Pos', 'Chr_Allele', 'Alternative_Allele', 'sj_diagnosis', 'sj_subtype', 'sj_subgroup', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra', 'vep_HGVSp', 'vep_HGVSc', 'vep_sj_gene', 'vep_sj_class', 'vep_sj_aachange', 'vep_sj_cdna', 'vep_pubmed', 'vep_sj_note', 'vep_sj_filter_isoform', 'vep_sj_filter_isoform_unversioned', 'vep_sj_filter_isoform_preferred', 'vep_result_count']

inputfile = sys.argv[1]
fh = open(inputfile)
out = open(sys.argv[2],'w')
fileheader = fh.readline().strip().split('\t')
if not fileheader == header:
        print('please check the file header:')
        print('the header should be in the order:')
        for e in header:
                print(e)
        sys.exit(1)
for line in fh:
        l = line.replace('\n','').split('\t')
        OUTL = l[0:27]
        OUTL.append('') # no snp info from VEP+ pipeline
        OUTL.append(l[27])
        OUTL.extend(['']*2) # no fathmm_prediction and fathmm_score
        OUTL.extend(l[28:34])
        OUTL.extend(l[41:44]) # sj_diagnosis, subtype, subgroup
        #OUTL.extend(l[34:37]) # sj_diagnosis, subtype, subgroup
        OUTL.extend(l[37:41]) # chr, pos, ref, alt
        OUTL.extend(['']*3) # no genomic_parsable, type and notes
        #OUTL.extend(l[41:44]) # no genomic_parsable, type and notes
        OUTL.append('') #no annovar_region 
        OUTL.append(l[57])
        OUTL.extend(['']*2) # no annovar_exonic_function and annovar_exonic_function_gene
        OUTL.append(l[57])
        OUTL.append(l[44]) #gi
        OUTL.extend(l[58:61]) # class, aachange,cdna
        OUTL.append('') # no exception
        OUTL.append(l[62]) #note
        if '.' in l[63]:
                ISO = l[63].split('.')[0]
        else:
                ISO = l[63]
        OUTL.extend([ISO,l[65]])
        out.write('\t'.join(OUTL)+'\n')
fh.close()
out.close()

Overwriting gencosmicsnvindel.py


In [97]:
%%writefile cosmic_fusion.py 

#!/usr/bin/python3

import sys
import gzip
import json
import re

if len(sys.argv) == 1:
        print('Usage: python3 '+sys.argv[0]+' <CosmicfusionExport_GRCh37.tsv.gz> <hg19/hg38> <output>')
        sys.exit(1)
#####
#function
def FINDTRANSID(fusionName):
        """Return the first two ENST transcript IDs found in a fusion name.

        Each ID is captured from 'ENST' up to, but not including, the next '.',
        so a version suffix is dropped.

        Returns a 2-tuple of strings, or False when the name does not contain
        two such IDs.
        """
        # raw string: '\.' is a regex escape, not a Python string escape
        searchRT = re.search(r'.*?(ENST.*?)\..*?(ENST.*?)\.',fusionName)
        if not searchRT:
                return False
        # the pattern has exactly two groups, so a match always yields a pair
        return searchRT.groups()
def PARSEJS(breakpoints):
        """Build the two breakpoint dicts of a fusion from ten raw field values.

        breakpoints holds, in order: chromosome, strand, gene name, genome start
        and genome stop of the first partner, followed by the same five fields
        of the second partner.

        Returns (ajs, bjs). Each dict has 'chr', 'name' and 'strand'; 'position'
        is added only when the strand is '+' or '-'. Prints a message and exits
        with status 1 when a gene name is empty.
        """
        (chra, stda, ga, posastart, posastop,
         chrb, stdb, gb, posbstart, posbstop) = breakpoints

        def with_chr_prefix(chrom):
                # names already starting with 'chr' (any case) are lower-cased,
                # all others get 'chr' prepended unchanged
                lowered = chrom.lower()
                return lowered if lowered.startswith('chr') else 'chr'+chrom

        def gene_symbol(raw):
                # keep only the part before the first '_' when there is one
                return raw.strip().split('_')[0] if '_' in raw else raw

        ajs = {'chr': with_chr_prefix(chra)}
        bjs = {'chr': with_chr_prefix(chrb)}
        for side, raw_name in ((ajs, ga), (bjs, gb)):
                symbol = gene_symbol(raw_name)
                if not symbol:
                        print('no gene name: '+'\t'.join(breakpoints))
                        sys.exit(1)
                side['name'] = symbol
        ajs['strand'] = stda
        bjs['strand'] = stdb
        # first partner: stop coordinate on '+', start coordinate on '-', minus 1
        if stda in ('+', '-'):
                ajs['position'] = int(posastop if stda == '+' else posastart) - 1
        # second partner: start coordinate on '+', stop coordinate on '-', minus 1
        if stdb in ('+', '-'):
                bjs['position'] = int(posbstart if stdb == '+' else posbstop) - 1
        return ajs,bjs

# ---- main script ----
# Reads the gzipped COSMIC fusion export and writes one tab-delimited line per
# unique (sample, fusion name) pair whose two ENST transcripts can both be
# mapped through the ref2ens table and whose breakpoint fields are all present.

rawfile = sys.argv[1]
genomename = sys.argv[2]
outfile = sys.argv[3]
builds = {
        'hg19':{
                'ref2ens':'/research/rgs01/resgen/legacy/gb_customTracks/tp/jwang/tools/cosmic_update/dataset/hg19/wgEncodeGencodeRefSeqV34lift37.txt'
        },
        'hg38':{
                'ref2ens':'/research/rgs01/resgen/legacy/gb_customTracks/tp/jwang/tools/cosmic_update/dataset/hg38/wgEncodeGencodeRefSeqV34.txt'
        }
}

if genomename not in builds:
        print('Unknown genome version: '+genomename+' (expected one of: '+', '.join(sorted(builds))+')')
        sys.exit(1)
genome = builds[genomename]

# Lookup from the unversioned ID in column 1 to the unversioned ID in column 2 of
# the mapping file (presumably Ensembl transcript -> RefSeq; confirm against the file).
refseqmap={}
with open(genome['ref2ens']) as mapfh:
        for line in mapfh:
                l = line.strip().split('\t')
                if len(l) > 1 and l[0] and l[1]:
                        refseqmap[l[0].split('.')[0]] = l[1].split('.')[0]


JSINFO = ["5'_CHROMOSOME","5'_STRAND","5'_GENE_NAME","5'_GENOME_START_FROM","5'_GENOME_STOP_FROM","3'_CHROMOSOME","3'_STRAND","3'_GENE_NAME","3'_GENOME_START_FROM","3'_GENOME_STOP_FROM"]
extInfo = ['PRIMARY_SITE','SITE_SUBTYPE_1','SITE_SUBTYPE_2','SITE_SUBTYPE_3','PRIMARY_HISTOLOGY','HISTOLOGY_SUBTYPE_1','HISTOLOGY_SUBTYPE_2','HISTOLOGY_SUBTYPE_3','FUSION_TYPE','PUBMED_PMID']
# every column that is looked up by name below
requiredCols = ['SAMPLE_ID','SAMPLE_NAME','TRANSLOCATION_NAME'] + JSINFO + extInfo

nofusionname = 0
skipquestionmark = 0
norefseq_nomatchensemble = 0
nopairtransid = 0
nobreakpointinfo = 0
duplicated = 0

with gzip.open(rawfile) as fh, open(outfile,'w') as out:
        head = fh.readline().decode('utf-8').strip().split('\t')
        headidx = {x:head.index(x) for x in head}
        # validate the header once, before the first data line is read
        missingCols = [c for c in requiredCols if c not in headidx]
        if missingCols:
                print('Check header: missing column(s): '+', '.join(missingCols))
                sys.exit(1)

        seenFusions = set()  # (sampleID, fusion name) pairs already handled
        for line in fh:
                l = line.decode('utf-8').replace('\n','').split('\t')
                sampleID = l[headidx['SAMPLE_ID']]
                sampleName = l[headidx['SAMPLE_NAME']]
                fus = l[headidx['TRANSLOCATION_NAME']]
                if not fus:
                        nofusionname += 1
                        continue
                if '?' in fus:
                        skipquestionmark += 1
                        continue
                transcript_id = FINDTRANSID(fus)
                if not transcript_id:
                        nopairtransid += 1
                        continue
                id1,id2 = transcript_id
                # a fusion counts as seen even if it is skipped further down
                if (sampleID, fus) in seenFusions:
                        duplicated += 1
                        continue
                seenFusions.add((sampleID, fus))
                refid1 = refseqmap.get(id1, False)
                refid2 = refseqmap.get(id2, False)
                if not refid1 or not refid2:
                        norefseq_nomatchensemble += 1
                        continue
                #json info
                breakpointinfo = [l[headidx[x]] for x in JSINFO]
                if not all(breakpointinfo):
                        nobreakpointinfo += 1
                        continue
                ajs,bjs = PARSEJS(breakpointinfo)
                ajs['isoform'] = refid1
                bjs['isoform'] = refid2
                JS = [{'a':ajs,'b':bjs,'translocationname':fus}]
                name = '.'.join([ajs['name'],bjs['name']])
                iso = '.'.join([refid1,refid2])
                out.write('\t'.join([sampleID,sampleName,name,iso,json.dumps(JS,sort_keys=True)]+[l[headidx[x]] for x in extInfo])+'\n')

# summary of everything that was skipped
if nofusionname:
        print(str(nofusionname)+' lines have no fusion name')
if norefseq_nomatchensemble:
        print(str(norefseq_nomatchensemble)+' lines with ensemble name cannot be converted to refseq')
if nopairtransid:
        print(str(nopairtransid)+' lines have not paired transcript ID')
if duplicated:
        print(str(duplicated)+' duplicated lines removed')
if nobreakpointinfo:
        print(str(nobreakpointinfo)+' skipped for lines without breakpoint info')
if skipquestionmark:
        print(str(skipquestionmark)+' skipped for ? in fusion name')


Overwriting cosmic_fusion.py


In [98]:
%%writefile cosmic.hg19.db.sql

-- Builds the hg19 COSMIC db: (re)creates both tables, imports the two
-- tab-delimited files and creates the lookup indexes.

drop table if exists cosmic_hg19;
create table cosmic_hg19 (
gene1 character varying(100) null,
gene1accession character varying(100) null,
gene1cdslen integer null,
hgnc_id integer null,
sample_name character varying(255),
id_sample integer,
id_tumor integer,
primary_site character varying(255),
site_subtype1 character varying(255),
site_subtype2 character varying(255),
site_subtype3 character varying(255),
primary_histology character varying(255),
histology_subtype1 character varying(255),
histology_subtype2 character varying(255),
histology_subtype3 character varying(255),
genome_wide_screen character varying(255),
genomic_mutation_id character varying(255),
legacy_mutation_id character varying(255),
mutation_id character varying(255),
mutation_cds character varying(255),
mutation_aa character varying(255),
mutation_description character varying(255),
mutation_zygosity character varying(255),
loh character varying(255),
grch integer,
genome_coordinate character varying(255),
strand character(1),
snp character varying(10),
resistance_mutation character varying(10),
fathmm_prediction character varying(255),
fathmm_score character varying(255),
mutation_somatic_status character varying(255),
pmid integer,
id_study integer,
sample_source character varying(255),
tumor_origin character varying(255),
age character varying(255),
sj_diagnosis character varying(255),
sj_subtype character varying(255),
sj_subgroup character varying(255),
chr character(2),
position integer,
reference_allele character varying(255),
mutant_allele character varying(255),
genomic_parsable character(1),
genomic_parsable_type character varying(255),
genomic_parsable_notes character varying(255),
annovar_region character varying(255),
annovar_region_gene character varying(255),
annovar_exonic_function character varying(255),
annovar_exonic_function_gene character varying(255),
annovar_sj_gene character varying(255),
annovar_sj_gi integer,
annovar_sj_class character varying(255),
annovar_sj_aachange character varying(255),
annovar_sj_cdna character varying(255),
annovar_sj_exception character varying(255),
-- a missing space used to turn this column into "annovar_sj_notecharacter"
annovar_sj_note character varying(255),
annovar_sj_filter_isoform character varying(255),
annovar_sj_filter_isoform_preferred character(1)
);

drop table if exists cosmic_fusion;
create table cosmic_fusion (
sample_id int,
sample_name character varying(255),
genes text,
isoforms text,
fusions json,
primarysite character,
sitesubtype1 character,
sitesubtype2 character,
sitesubtype3 character,
primaryhistology character,
histologysubtype1 character,
histologysubtype2 character,
histologysubtype3 character,
note character,
pmid integer
);

.mode tabs
.import cosmic.fusion.hg19 cosmic_fusion
.import cosmic.snvindel.hg19 cosmic_hg19


create index cosmic_hg19_refseq on cosmic_hg19(annovar_sj_filter_isoform);
create index cosmicfusiongene on cosmic_fusion(genes);
create index cosmicfusionisoform on cosmic_fusion(isoforms);


Overwriting cosmic.hg19.db.sql


In [104]:
%%writefile cosmic.slice.hg19.db.sql

-- snvindel table of the sliced db; same column layout as the table in cosmic.hg19.db.sql
drop table if exists cosmic_hg19;
create table cosmic_hg19 (
gene1 character varying(100) null,
gene1accession character varying(100) null,
gene1cdslen integer null,
hgnc_id integer null,
sample_name character varying(255),
id_sample integer,
id_tumor integer,
primary_site character varying(255),
site_subtype1 character varying(255),
site_subtype2 character varying(255),
site_subtype3 character varying(255),
primary_histology character varying(255),
histology_subtype1 character varying(255),
histology_subtype2 character varying(255),
histology_subtype3 character varying(255),
genome_wide_screen character varying(255),
genomic_mutation_id character varying(255),
legacy_mutation_id character varying(255),
mutation_id character varying(255),
mutation_cds character varying(255),
mutation_aa character varying(255),
mutation_description character varying(255),
mutation_zygosity character varying(255),
loh character varying(255),
grch integer,
genome_coordinate character varying(255),
strand character(1),
snp character varying(10),
resistance_mutation character varying(10),
fathmm_prediction character varying(255),
fathmm_score character varying(255),
mutation_somatic_status character varying(255),
pmid integer,
id_study integer,
sample_source character varying(255),
tumor_origin character varying(255),
age character varying(255),
sj_diagnosis character varying(255),
sj_subtype character varying(255),
sj_subgroup character varying(255),
chr character(2),
position integer,
reference_allele character varying(255),
mutant_allele character varying(255),
genomic_parsable character(1),
genomic_parsable_type character varying(255),
genomic_parsable_notes character varying(255),
annovar_region character varying(255),
annovar_region_gene character varying(255),
annovar_exonic_function character varying(255),
annovar_exonic_function_gene character varying(255),
annovar_sj_gene character varying(255),
annovar_sj_gi integer,
annovar_sj_class character varying(255),
annovar_sj_aachange character varying(255),
annovar_sj_cdna character varying(255),
annovar_sj_exception character varying(255),
-- a missing space used to turn this column into "annovar_sj_notecharacter"
annovar_sj_note character varying(255),
annovar_sj_filter_isoform character varying(255),
annovar_sj_filter_isoform_preferred character(1)
);

-- Fusion table: dropped first (if present) and then recreated, so this
-- script can be executed again against an existing database file.
-- It is filled by the `.import cosmic.fusion.slice.hg19 cosmic_fusion` command below.
DROP TABLE IF EXISTS cosmic_fusion;
CREATE TABLE cosmic_fusion (
    sample_id          int,
    sample_name        character varying(255),
    genes              text,
    isoforms           text,
    fusions            json,
    primarysite        character,
    sitesubtype1       character,
    sitesubtype2       character,
    sitesubtype3       character,
    primaryhistology   character,
    histologysubtype1  character,
    histologysubtype2  character,
    histologysubtype3  character,
    note               character,
    pmid               integer
);

-- Put the sqlite3 shell in tab-separated mode, then bulk-load both tables
-- from the files named on the .import lines.
.mode tabs
.import cosmic.fusion.slice.hg19 cosmic_fusion
.import cosmic.snvindel.slice.hg19 cosmic_hg19

-- Indexes are created after the data has been loaded.
CREATE INDEX cosmic_hg19_refseq
    ON cosmic_hg19 (annovar_sj_filter_isoform);
CREATE INDEX cosmicfusiongene
    ON cosmic_fusion (genes);
CREATE INDEX cosmicfusionisoform
    ON cosmic_fusion (isoforms);

Writing cosmic.slice.hg19.db.sql


In [99]:
%%writefile cosmic.hg38.db.sql

-- SNV/indel table of the hg38 database. It is dropped first (if present) and
-- recreated, so the script can be re-run against an existing database file.
-- Rows are loaded by `.import cosmic.snvindel.hg38 cosmic` further down;
-- .import fills columns positionally, so the column order here has to match
-- the field order of that tab-separated file.
DROP TABLE IF EXISTS cosmic;
CREATE TABLE cosmic (
    gene1 character varying(100) null,
    gene1accession character varying(100) null,
    gene1cdslen integer null,
    hgnc_id integer null,
    sample_name character varying(255),
    id_sample integer,
    id_tumor integer,
    primary_site character varying(255),
    site_subtype1 character varying(255),
    site_subtype2 character varying(255),
    site_subtype3 character varying(255),
    primary_histology character varying(255),
    histology_subtype1 character varying(255),
    histology_subtype2 character varying(255),
    histology_subtype3 character varying(255),
    genome_wide_screen character varying(255),
    genomic_mutation_id character varying(255),
    legacy_mutation_id character varying(255),
    mutation_id character varying(255),
    mutation_cds character varying(255),
    mutation_aa character varying(255),
    mutation_description character varying(255),
    mutation_zygosity character varying(255),
    loh character varying(255),
    grch integer,
    genome_coordinate character varying(255),
    strand character(1),
    snp character varying(10),
    resistance_mutation character varying(10),
    fathmm_prediction character varying(255),
    fathmm_score character varying(255),
    mutation_somatic_status character varying(255),
    pmid integer,
    id_study integer,
    sample_source character varying(255),
    tumor_origin character varying(255),
    age character varying(255),
    -- sj_* columns
    sj_diagnosis character varying(255),
    sj_subtype character varying(255),
    sj_subgroup character varying(255),
    chr character(2),
    position integer,
    reference_allele character varying(255),
    mutant_allele character varying(255),
    genomic_parsable character(1),
    genomic_parsable_type character varying(255),
    genomic_parsable_notes character varying(255),
    -- annovar_* columns
    annovar_region character varying(255),
    annovar_region_gene character varying(255),
    annovar_exonic_function character varying(255),
    annovar_exonic_function_gene character varying(255),
    annovar_sj_gene character varying(255),
    annovar_sj_gi integer,
    annovar_sj_class character varying(255),
    annovar_sj_aachange character varying(255),
    annovar_sj_cdna character varying(255),
    annovar_sj_exception character varying(255),
    -- The space between the column name and its type is required: without it
    -- SQLite creates a column called "annovar_sj_notecharacter" of type
    -- "varying(255)".
    annovar_sj_note character varying(255),
    annovar_sj_filter_isoform character varying(255),
    annovar_sj_filter_isoform_preferred character(1)
);

-- Fusion table: dropped first (if present) and then recreated, so this
-- script can be executed again against an existing database file.
-- It is filled by the `.import cosmic.fusion.hg38 cosmic_fusion` command below.
DROP TABLE IF EXISTS cosmic_fusion;
CREATE TABLE cosmic_fusion (
    sample_id          int,
    sample_name        character varying(255),
    genes              text,
    isoforms           text,
    fusions            json,
    primarysite        character,
    sitesubtype1       character,
    sitesubtype2       character,
    sitesubtype3       character,
    primaryhistology   character,
    histologysubtype1  character,
    histologysubtype2  character,
    histologysubtype3  character,
    note               character,
    pmid               integer
);

-- Put the sqlite3 shell in tab-separated mode, then bulk-load both tables
-- from the files named on the .import lines.
.mode tabs
.import cosmic.fusion.hg38 cosmic_fusion
.import cosmic.snvindel.hg38 cosmic

-- Indexes are created after the data has been loaded.
CREATE INDEX cosmic_refseq
    ON cosmic (annovar_sj_filter_isoform);
CREATE INDEX cosmicfusiongene
    ON cosmic_fusion (genes);
CREATE INDEX cosmicfusionisoform
    ON cosmic_fusion (isoforms);


Overwriting cosmic.hg38.db.sql


In [105]:
%%writefile cosmic.slice.hg38.db.sql

-- SNV/indel table of the hg38 slice database. It is dropped first (if present)
-- and recreated, so the script can be re-run against an existing database file.
-- Rows are loaded by `.import cosmic.snvindel.slice.hg38 cosmic` further down;
-- .import fills columns positionally, so the column order here has to match
-- the field order of that tab-separated file.
DROP TABLE IF EXISTS cosmic;
CREATE TABLE cosmic (
    gene1 character varying(100) null,
    gene1accession character varying(100) null,
    gene1cdslen integer null,
    hgnc_id integer null,
    sample_name character varying(255),
    id_sample integer,
    id_tumor integer,
    primary_site character varying(255),
    site_subtype1 character varying(255),
    site_subtype2 character varying(255),
    site_subtype3 character varying(255),
    primary_histology character varying(255),
    histology_subtype1 character varying(255),
    histology_subtype2 character varying(255),
    histology_subtype3 character varying(255),
    genome_wide_screen character varying(255),
    genomic_mutation_id character varying(255),
    legacy_mutation_id character varying(255),
    mutation_id character varying(255),
    mutation_cds character varying(255),
    mutation_aa character varying(255),
    mutation_description character varying(255),
    mutation_zygosity character varying(255),
    loh character varying(255),
    grch integer,
    genome_coordinate character varying(255),
    strand character(1),
    snp character varying(10),
    resistance_mutation character varying(10),
    fathmm_prediction character varying(255),
    fathmm_score character varying(255),
    mutation_somatic_status character varying(255),
    pmid integer,
    id_study integer,
    sample_source character varying(255),
    tumor_origin character varying(255),
    age character varying(255),
    -- sj_* columns
    sj_diagnosis character varying(255),
    sj_subtype character varying(255),
    sj_subgroup character varying(255),
    chr character(2),
    position integer,
    reference_allele character varying(255),
    mutant_allele character varying(255),
    genomic_parsable character(1),
    genomic_parsable_type character varying(255),
    genomic_parsable_notes character varying(255),
    -- annovar_* columns
    annovar_region character varying(255),
    annovar_region_gene character varying(255),
    annovar_exonic_function character varying(255),
    annovar_exonic_function_gene character varying(255),
    annovar_sj_gene character varying(255),
    annovar_sj_gi integer,
    annovar_sj_class character varying(255),
    annovar_sj_aachange character varying(255),
    annovar_sj_cdna character varying(255),
    annovar_sj_exception character varying(255),
    -- The space between the column name and its type is required: without it
    -- SQLite creates a column called "annovar_sj_notecharacter" of type
    -- "varying(255)".
    annovar_sj_note character varying(255),
    annovar_sj_filter_isoform character varying(255),
    annovar_sj_filter_isoform_preferred character(1)
);

-- Fusion table: dropped first (if present) and then recreated, so this
-- script can be executed again against an existing database file.
-- It is filled by the `.import cosmic.fusion.slice.hg38 cosmic_fusion` command below.
DROP TABLE IF EXISTS cosmic_fusion;
CREATE TABLE cosmic_fusion (
    sample_id          int,
    sample_name        character varying(255),
    genes              text,
    isoforms           text,
    fusions            json,
    primarysite        character,
    sitesubtype1       character,
    sitesubtype2       character,
    sitesubtype3       character,
    primaryhistology   character,
    histologysubtype1  character,
    histologysubtype2  character,
    histologysubtype3  character,
    note               character,
    pmid               integer
);

-- Put the sqlite3 shell in tab-separated mode, then bulk-load both tables
-- from the files named on the .import lines.
.mode tabs
.import cosmic.fusion.slice.hg38 cosmic_fusion
.import cosmic.snvindel.slice.hg38 cosmic

-- Indexes are created after the data has been loaded.
CREATE INDEX cosmic_refseq
    ON cosmic (annovar_sj_filter_isoform);
CREATE INDEX cosmicfusiongene
    ON cosmic_fusion (genes);
CREATE INDEX cosmicfusionisoform
    ON cosmic_fusion (isoforms);


Writing cosmic.slice.hg38.db.sql
