In [1]:
import pandas as pd
import os

## ANNOVAR configuration

In [2]:
ANNOVAR_GZ = "/home/ma/Downloads/annovar.latest.tar.gz"
PARENT_DIR = "/home/ma/Downloads"

# Extract the annovar files if they are not already extracted
if not os.path.exists(os.path.join(PARENT_DIR, "annovar")):
    !tar -xzf {ANNOVAR_GZ} -C {PARENT_DIR}

In [3]:
# Path to the directory containing the unpacked ANNOVAR files
ANNOVAR_DIR = PARENT_DIR + "/annovar"

# show the contents of the directory
!ls $ANNOVAR_DIR

annotate_variation.pl  example			   table_annovar.pl
coding_change.pl       humandb			   variants_reduction.pl
convert2annovar.pl     retrieve_seq_from_fasta.pl


In [4]:
# show the example files
!ls $ANNOVAR_DIR/example

ex1.avinput	       example.tab_region  grantham.matrix
ex2.vcf		       gene_fullxref.txt   README
example.simple_region  gene_xref.txt	   snplist.txt


In [5]:
# show the first 10 lines of the example avinput file
!head -n 10 $ANNOVAR_DIR/example/ex1.avinput

1	948921	948921	T	C	comments: rs15842, a SNP in 5' UTR of ISG15
1	1404001	1404001	G	T	comments: rs149123833, a SNP in 3' UTR of ATAD3C
1	5935162	5935162	A	T	comments: rs1287637, a splice site variant in NPHP4
1	162736463	162736463	C	T	comments: rs1000050, a SNP in Illumina SNP arrays
1	84875173	84875173	C	T	comments: rs6576700 or SNP_A-1780419, a SNP in Affymetrix SNP arrays
1	13211293	13211294	TC	-	comments: rs59770105, a 2-bp deletion
1	11403596	11403596	-	AT	comments: rs35561142, a 2-bp insertion
1	105492231	105492231	A	ATAAA	comments: rs10552169, a block substitution
1	67705958	67705958	G	A	comments: rs11209026 (R381Q), a SNP in IL23R associated with Crohn's disease
2	234183368	234183368	A	G	comments: rs2241880 (T300A), a SNP in the ATG16L1 associated with Crohn's disease


In [6]:
# show the last 10 lines of the example avinput file
!tail -n 10 $ANNOVAR_DIR/example/ex1.avinput

16	50756540	50756540	G	C	comments: rs2066845 (G908R), a non-synonymous SNP in NOD2
16	50763778	50763778	-	C	comments: rs2066847 (c.3016_3017insC), a frameshift SNP in NOD2
13	20763686	20763686	G	-	comments: rs1801002 (del35G), a frameshift mutation in GJB2, associated with hearing loss
13	20797176	21105944	0	-	comments: a 342kb deletion encompassing GJB6, associated with hearing loss
8	8887543	8887543	A	T	comments: a mutation that abolishes stop codon
8       8887539 8887539 A       T	comments: a mutation that results in premature stop codon
8       8887536 8887537 AG      GATT	comments: a mutation that creates a stop codon 2 amino acids downstream
8       8887540 8887540 G       GGAA	comments: a mutation that results in insertion of a new amino acid
5       1295288 1295288 G       A	comments: a variant upstream of transcriptional start site
chr14   95602958        95602958        A       C	comments: a variant that affects splicing of UTR regions


we can see that the input file has exceptions to its input fields

## Prepare input files

We parse the TSV file to create a new file with the correct input fields

Ultimately we aim to use the output gathered from Clara Parabricks as input.

In [7]:
TSV_FILE = "/mnt/nas/wgs/geneseeq/BX231U0168/BX231U0168-WGS-R1_sorted_dedup_recal_filtered_snps_indels_funcotated_PASS.tsv"
AVINPUT_FILE = "/mnt/nas/wgs/geneseeq/BX231U0168/av.avinput"

In [8]:
# show the first 10 lines of the TSV file
!head -n 10 $TSV_FILE

CHROM	POS	REF	ALT	QUAL	FILTER	DP	AF	AC	AN	BaseQRankSum	ClippingRankSum	DP	ExcessHet	FS	InbreedingCoeff	MQ	MQRankSum	QD	ReadPosRankSum	SOR	Funcotator
chr1	15903	G	GC	93.79999999999998	PASS	3	1.00	2	2	NA	NA	3	0.0000	0.000	NA	33.67	NA	31.27	NA	2.833	NA
chr1	16288	C	G	214.64	PASS	45	0.500	1	2	3.169	NA	45	0.0000	18.138	NA	40.84	-3.491	4.77	0.990	3.767	NA
chr1	16298	C	T	555.64	PASS	42	0.500	1	2	5.014	NA	42	0.0000	17.101	NA	40.19	-4.452	14.25	0.846	2.809	NA
chr1	17365	C	G	100.64	PASS	22	0.500	1	2	-0.663	NA	22	0.0000	2.611	NA	40.67	-0.588	4.57	-0.314	0.119	NA
chr1	17614	G	A	364.64	PASS	36	0.500	1	2	1.182	NA	36	0.0000	0.000	NA	43.08	-0.021	11.05	0.219	0.420	NA
chr1	20316	GA	G	214.60000000000002	PASS	28	0.500	1	2	-0.436	NA	28	0.0000	6.779	NA	29.57	-0.092	8.94	0.806	2.067	NA
chr1	49298	T	C	985.06	PASS	31	1.00	2	2	NA	NA	31	0.0000	0.000	NA	44.64	NA	31.78	NA	0.756	NA
chr1	51803	T	C	741.64	PASS	37	0.500	1	2	-3.253	NA	37	0.0000	18.711	NA	44.59	-0.847	21.19	0.472	1.032	NA
chr1	51898	C	A	115.64	PASS	40	

In [9]:
# read the TSV file into a pandas dataframe
df = pd.read_csv(TSV_FILE, sep="\t")
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,QUAL,FILTER,DP,AF,AC,AN,...,DP.1,ExcessHet,FS,InbreedingCoeff,MQ,MQRankSum,QD,ReadPosRankSum,SOR,Funcotator
0,chr1,15903,G,GC,93.8,PASS,3,1.0,2,2,...,3,0.0,0.0,,33.67,,31.27,,2.833,
1,chr1,16288,C,G,214.64,PASS,45,0.5,1,2,...,45,0.0,18.138,,40.84,-3.491,4.77,0.99,3.767,
2,chr1,16298,C,T,555.64,PASS,42,0.5,1,2,...,42,0.0,17.101,,40.19,-4.452,14.25,0.846,2.809,
3,chr1,17365,C,G,100.64,PASS,22,0.5,1,2,...,22,0.0,2.611,,40.67,-0.588,4.57,-0.314,0.119,
4,chr1,17614,G,A,364.64,PASS,36,0.5,1,2,...,36,0.0,0.0,,43.08,-0.021,11.05,0.219,0.42,


In [10]:
columns = ["CHROM", "POS", "REF", "ALT"]

# create a new dataframe with only the columns we need
df = df[columns]
df.head()

Unnamed: 0,CHROM,POS,REF,ALT
0,chr1,15903,G,GC
1,chr1,16288,C,G
2,chr1,16298,C,T
3,chr1,17365,C,G
4,chr1,17614,G,A


however the CHROM field is not in the correct format

In [11]:
# show the unique chromosomes
df["CHROM"].unique()

array(['chr1', 'chr10', 'chr11', 'chr11_KI270721v1_random', 'chr12',
       'chr13', 'chr14', 'chr14_GL000009v2_random',
       'chr14_GL000225v1_random', 'chr14_KI270722v1_random',
       'chr14_GL000194v1_random', 'chr14_KI270723v1_random',
       'chr14_KI270724v1_random', 'chr14_KI270725v1_random',
       'chr14_KI270726v1_random', 'chr15', 'chr16',
       'chr16_KI270728v1_random', 'chr17', 'chr17_GL000205v2_random',
       'chr17_KI270729v1_random', 'chr17_KI270730v1_random', 'chr18',
       'chr19', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random',
       'chr1_KI270708v1_random', 'chr1_KI270709v1_random',
       'chr1_KI270710v1_random', 'chr1_KI270711v1_random',
       'chr1_KI270712v1_random', 'chr1_KI270713v1_random',
       'chr1_KI270714v1_random', 'chr2', 'chr20', 'chr21', 'chr22',
       'chr22_KI270731v1_random', 'chr22_KI270732v1_random',
       'chr22_KI270733v1_random', 'chr22_KI270734v1_random',
       'chr22_KI270735v1_random', 'chr22_KI270736v1_random',
       '

In [12]:
# keep only the integer part of the CHROM column following the "chr" prefix, removing anything after _ if present, and convert to integer
df["CHROM_INT"] = df["CHROM"].str.replace("chr", "").str.split("_", expand=True)[0].astype(int)

In [13]:
# show the unique chromosomes
df["CHROM_INT"].unique()


array([ 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  2, 20, 21, 22,  3])

In [14]:
df["CHROM_INT"].nunique()


16

In [15]:
# remove the CHROM column
df = df.drop("CHROM", axis=1)

In [16]:
# rename POS to Start
df = df.rename(columns={"POS": "Start"})

In [17]:
df.sample(10)

Unnamed: 0,Start,REF,ALT,CHROM_INT
1453344,1320613,C,G,16
2052798,71315651,C,T,2
1958883,13106598,C,T,2
2605005,23593143,G,T,3
2446975,21374062,G,C,21
1801784,62645605,T,TAGAGAG,18
507561,93295779,C,G,10
503533,90247350,T,A,10
907170,67867522,T,C,12
1299236,95539883,G,GTA,14


In [18]:
# add an End column, its value being Start plus length of REF minus 1
df["End"] = df["Start"] + df["REF"].str.len() - 1

In [19]:
# show the first 10 rows of the dataframe
df.head()

Unnamed: 0,Start,REF,ALT,CHROM_INT,End
0,15903,G,GC,1,15903
1,16288,C,G,1,16288
2,16298,C,T,1,16298
3,17365,C,G,1,17365
4,17614,G,A,1,17614


In [20]:
# reorder the columns to match the order of the example avinput file
df = df[["CHROM_INT", "Start", "End", "REF", "ALT"]]

In [21]:
df.sample(10)

Unnamed: 0,CHROM_INT,Start,End,REF,ALT
872082,12,45041768,45041768,T,C
1638632,17,34440360,34440360,A,G
1403083,15,72137648,72137648,G,A
2539709,22,41564087,41564089,TAA,T
1306666,14,99591302,99591302,G,C
682269,11,66269799,66269799,A,G
2537614,22,39423345,39423345,T,C
1995784,2,36397044,36397044,G,A
2153369,2,146130496,146130496,G,T
2235956,2,203229032,203229032,T,C


In [22]:
# save the dataframe to a avinput file
df.to_csv(AVINPUT_FILE, sep="\t", index=False, header=False)


## Download annotation databases

only to be run once

In [23]:
# download database files
# !perl $ANNOVAR_DIR/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene $ANNOVAR_DIR/humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGene.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneMrna.fa.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_refGeneVersion.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the '/home/ma/Downloads/annovar/humandb' directory


In [24]:
# download database files
# !perl $ANNOVAR_DIR/annotate_variation.pl -buildver hg38 -downdb cytoBand $ANNOVAR_DIR/humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the '/home/ma/Downloads/annovar/humandb' directory


In [25]:
# download database files
# !perl $ANNOVAR_DIR/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 $ANNOVAR_DIR/humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_exac03.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_exac03.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the '/home/ma/Downloads/annovar/humandb' directory


In [26]:
# download database files
# !perl $ANNOVAR_DIR/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 $ANNOVAR_DIR/humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_avsnp147.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_avsnp147.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the '/home/ma/Downloads/annovar/humandb' directory


In [27]:
# download database files
# !perl $ANNOVAR_DIR/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a $ANNOVAR_DIR/humandb/

NOTICE: Web-based checking to see whether ANNOVAR new version is available ... Done
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_dbnsfp30a.txt.gz ... OK
NOTICE: Downloading annotation database http://www.openbioinformatics.org/annovar/download/hg38_dbnsfp30a.txt.idx.gz ... OK
NOTICE: Uncompressing downloaded files
NOTICE: Finished downloading annotation files for hg38 build version, with files saved at the '/home/ma/Downloads/annovar/humandb' directory


In [31]:
# show the hg38 database files
!ls -lh $ANNOVAR_DIR/humandb/hg38_*

-rw-rw-r-- 1 ma ma  5.9G 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_avsnp147.txt
-rw-rw-r-- 1 ma ma  884M 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_avsnp147.txt.idx
-rw-rw-r-- 1 ma ma   54K 10月 29 12:15 /home/ma/Downloads/annovar/humandb/hg38_cytoBand.txt
-rw-rw-r-- 1 ma ma   14G 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_dbnsfp30a.txt
-rw-rw-r-- 1 ma ma   18M 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_dbnsfp30a.txt.idx
-rw-rw-r-- 1 ma ma  600M 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_exac03.txt
-rw-rw-r-- 1 ma ma   23M 12月  6  2019 /home/ma/Downloads/annovar/humandb/hg38_exac03.txt.idx
-rw-rw-r-- 1 ma ma  289M 8月   3  2022 /home/ma/Downloads/annovar/humandb/hg38_refGeneMrna.fa
-rw-rw-r-- 1 ma ma   25M 8月   3  2022 /home/ma/Downloads/annovar/humandb/hg38_refGene.txt
-rw-rw-r-- 1 ma ma 1011K 8月   3  2022 /home/ma/Downloads/annovar/humandb/hg38_refGeneVersion.txt


## Run ANNOVAR

TODO: fix output file path

In [33]:
# annotate the avinput file
!perl $ANNOVAR_DIR/table_annovar.pl $AVINPUT_FILE $ANNOVAR_DIR/humandb/ -buildver hg38  -out myanno -remove  -protocol refGene,cytoBand,exac03,avsnp147,dbnsfp30a -operation gx,r,f,f,f -nastring . -csvout -polish -xref $ANNOVAR_DIR/example/gene_xref.txt

-----------------------------------------------------------------
NOTICE: Processing operation=gx protocol=refGene

NOTICE: Running with system command <annotate_variation.pl -geneanno -buildver hg38 -dbtype refGene -outfile myanno.refGene -exonsort -nofirstcodondel /mnt/nas/wgs/geneseeq/BX231U0168/av.avinput /home/ma/Downloads/annovar/humandb/>
NOTICE: Output files are written to myanno.refGene.variant_function, myanno.refGene.exonic_variant_function
NOTICE: Reading gene annotation from /home/ma/Downloads/annovar/humandb/hg38_refGene.txt ... Done with 88819 transcripts (including 21511 without coding sequence annotation) for 28307 unique genes
NOTICE: Processing next batch with 2591148 unique variants in 2591148 input lines
NOTICE: Finished analyzing 1000000 query variants
NOTICE: Finished analyzing 2000000 query variants
NOTICE: Reading FASTA sequences from /home/ma/Downloads/annovar/humandb/hg38_refGeneMrna.fa ... Done with 17080 sequences

------------------------------------------

## Interpret the results