# Experimenting with Annovar

In [None]:
import json
import rsidx
import re
import sqlite3
import pandas as pd
import pysam
from search_your_dna.hg_util import get_assembly_metadata_df
from search_your_dna.util import read_raw_zipped_vcf_file, get_vcf_file_header_line_number

## Preprocessing before annotations

ANNOVAR ignores alt contigs entirely, so rsIDs could not be matched to variants located on them. To work around this, I compile a new VCF file in which the alt contigs that should be used are incorporated into the corresponding main contig.

In [None]:
# Root directory of the project checkout.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
project_root_dir = "/home/s/src/search_your_dna/"
# Input VCF, given relative to the notebook's working directory.
raw_vcf_file = "data/GFX0237425.GRCh38.p7.vcf"
# Regex for the start of the VCF column-header line. A raw string is used so that
# ``\s`` reaches the regex engine as-is instead of being an invalid string escape.
header_pattern = r"#CHROM\s+POS\s+ID\s+REF\s+ALT"

In [None]:
# Load the raw VCF records and preview the first rows.
# NOTE(review): the result is used like a pandas DataFrame below — presumably that is what the helper returns.
raw_vcf_df = read_raw_zipped_vcf_file(raw_vcf_file)
raw_vcf_df.head()

In [None]:
# Peek at the records whose #CHROM is the alt contig chrX_KI270913v1_alt.
raw_vcf_df[raw_vcf_df["#CHROM"] == "chrX_KI270913v1_alt"].head()

In [None]:
# Read counts keyed by region name and then by contig name.
region_contig_read_counts_file = f"{project_root_dir}/my_grch38_p7_build/region_contig_read_counts.json"
with open(region_contig_read_counts_file, "r") as read_counts_f:
    region_contig_read_counts = json.load(read_counts_f)

# GRCh38.p7 assembly report and region definitions, combined into one metadata table.
assembly_report_file = f"{project_root_dir}/data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_report.txt"
assembly_regions_file = f"{project_root_dir}/data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_regions.txt"

assembly_metadata_df = get_assembly_metadata_df(
    assembly_report_file=assembly_report_file,
    assembly_regions_file=assembly_regions_file,
)


def calc_alt_contigs_to_use(region_contig_read_counts, assembly_metadata_df):
    """Pick, for every region, the non-main contig that has the highest read count.

    Args:
        region_contig_read_counts: mapping ``region name -> {contig name -> read count}``.
            The contig name ``"main"`` is treated as the main contig.
        assembly_metadata_df: DataFrame with the columns ``region_name``, ``chromosome``,
            ``chromosome_start`` and ``chromosome_stop``. The first row matching a region
            supplies that region's coordinates.

    Returns:
        DataFrame with the columns ``chrom``, ``start``, ``stop``, ``contig`` and ``region``,
        one row per region whose best-covered contig is not ``"main"``. Regions without
        any read counts are skipped.

    Raises:
        IndexError: if a region with read counts has no row in ``assembly_metadata_df``.
    """
    columns = ["chrom", "start", "stop", "contig", "region"]
    # Rows are collected first and the frame is built once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for region, contig_read_count in region_contig_read_counts.items():
        if not contig_read_count:
            # No contig to choose from for this region.
            continue

        region_metadata_df = assembly_metadata_df[assembly_metadata_df["region_name"] == region]
        chrom = region_metadata_df["chromosome"].iloc[0]
        chrom_start = region_metadata_df["chromosome_start"].iloc[0]
        chrom_stop = region_metadata_df["chromosome_stop"].iloc[0]

        # Stable ascending sort: on equal counts the contig listed last wins.
        best_contig, _ = sorted(contig_read_count.items(), key=lambda item: item[1])[-1]
        if best_contig != "main":
            records.append(
                {"chrom": chrom, "start": chrom_start, "stop": chrom_stop, "contig": best_contig, "region": region}
            )
    return pd.DataFrame(records, columns=columns)

# Choose the alt contigs, order them by chromosome and start coordinate, then preview.
alt_contigs_to_use = calc_alt_contigs_to_use(
    region_contig_read_counts, assembly_metadata_df
).sort_values(by=["chrom", "start"])
alt_contigs_to_use.head()

### Remove main contig parts that shouldn't be used

In [None]:
# For each selected alt contig, drop the main-chromosome records that fall inside
# the region's [start, stop] interval (both ends inclusive).
for _, alt_contig in alt_contigs_to_use.iterrows():
    length_before = raw_vcf_df.shape[0]
    print(alt_contig["contig"], "vcf length before", length_before)

    on_main_chrom = raw_vcf_df["#CHROM"] == "chr" + alt_contig["chrom"]
    inside_region = (alt_contig["start"] <= raw_vcf_df["POS"]) & (raw_vcf_df["POS"] <= alt_contig["stop"])
    raw_vcf_df = raw_vcf_df[~(on_main_chrom & inside_region)]

    length_after = raw_vcf_df.shape[0]
    print("\t\tlength after", length_after)
    print("\t\t\tremoved snps", length_before - length_after)

### Make alt contigs that should be used part of main contig

In [None]:
def transform_row(contig_start):
    """Build a row transformer that relocates an alt-contig record onto its main contig.

    The returned callable modifies the given row in place and returns it:
    ``#CHROM`` is cut down to the text before its first underscore and
    ``contig_start`` is added to ``POS``.
    """
    def _relocate(vcf_row):
        main_chrom, *_ = vcf_row["#CHROM"].split("_")
        vcf_row["#CHROM"] = main_chrom
        vcf_row["POS"] = contig_start + vcf_row["POS"]
        return vcf_row

    return _relocate

# Move the records of every selected alt contig onto its main chromosome:
# take them out of the table, rewrite #CHROM/POS via transform_row, and add them back.
for index, row in alt_contigs_to_use.iterrows():
    length_before = raw_vcf_df.shape[0]
    print(row["contig"], "vcf length before", length_before)
    alt_contig_rows_in_raw_vcf_df = raw_vcf_df[raw_vcf_df["#CHROM"] == row["contig"]]
    raw_vcf_df = raw_vcf_df[raw_vcf_df["#CHROM"] != row["contig"]]
    updated_alt_contig_rows_in_raw_vcf_df = alt_contig_rows_in_raw_vcf_df.apply(transform_row(row["start"]), axis="columns")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    raw_vcf_df = pd.concat([raw_vcf_df, updated_alt_contig_rows_in_raw_vcf_df], ignore_index=True)
    print("\t\talt_contig length", updated_alt_contig_rows_in_raw_vcf_df.shape[0])
    print("\t\tlength after", raw_vcf_df.shape[0])
    # Net change in row count; this should print 0 because rows are only relocated.
    print("\t\t\tlost snps", raw_vcf_df.shape[0] - length_before)

In [None]:
# Table dimensions after the alt-contig records were relocated.
raw_vcf_df.shape

In [None]:
# Original records of the alt contig handled in the last loop iteration (before the transform).
alt_contig_rows_in_raw_vcf_df.tail()

In [None]:
# The same records after transform_row rewrote #CHROM and POS.
updated_alt_contig_rows_in_raw_vcf_df.tail()

In [None]:
# Inspect the second (index, row) pair of the selected alt contigs.
list(alt_contigs_to_use.iterrows())[1:2]


### Remove all other alt contigs

In [None]:
# Contig names that are kept: chr1 through chr22, followed by chrX, chrY and chrM.
CHROM_TO_KEEP = [f"chr{autosome}" for autosome in range(1, 23)] + ["chrX", "chrY", "chrM"]

# Keep only the records whose #CHROM is listed in CHROM_TO_KEEP.
print("\t\tlength before", len(raw_vcf_df))
on_kept_chrom = raw_vcf_df["#CHROM"].isin(CHROM_TO_KEEP)
raw_vcf_df = raw_vcf_df[on_kept_chrom]
print("\t\tlength after", len(raw_vcf_df))

### Sort vcf

In [None]:
# Order records by contig name, then position, and renumber the index from 0.
# NOTE(review): #CHROM holds strings, so contigs sort lexicographically (chr1, chr10, ..., chr2).
raw_vcf_df = raw_vcf_df.sort_values(by=["#CHROM", "POS"]).reset_index(drop=True)

### Store new vcf


In [None]:
# Output path of the rewritten VCF, relative to the notebook's working directory.
high_coverage_alt_contigs_vcf_file = "data/GFX0237425.GRCh38.p7.using_high_coverage_alt_contigs.vcf"

In [None]:
# Collect the lines of the raw VCF that come before line number `header_row_number`,
# leaving out the ``##contig`` declarations of contigs that are not in CHROM_TO_KEEP.
header_row_number = get_vcf_file_header_line_number(file_name=raw_vcf_file)
# The contig ID ends at the first "," or at the closing ">", so declarations that
# carry no further fields (``##contig=<ID=chr1>``) are parsed correctly as well.
contig_id_pattern = re.compile(r"##contig=<ID=([^,>\s]*)")
header_text = []
with open(raw_vcf_file, "r") as f:
    for row_counter, line_text in enumerate(f):
        if row_counter >= header_row_number:
            break
        contig_match = contig_id_pattern.match(line_text)
        if contig_match and contig_match.group(1) not in CHROM_TO_KEEP:
            continue
        header_text.append(line_text)

In [None]:
# Write the collected header lines, followed by the table as tab-separated text
# (column names first, no index column).
# newline="" is what pandas asks for when to_csv receives an open file object,
# and index=False is the documented way to omit the index.
with open(high_coverage_alt_contigs_vcf_file, "w", newline="") as f:
    f.writelines(header_text)
    raw_vcf_df.to_csv(f, sep="\t", index=False)

## Using Annovar to annotate variants

Example from: https://annovar.openbioinformatics.org/en/latest/user-guide/startup/

In [None]:
# Download the hg38 annotation databases into data/humandb/.
# cytoBand is the only one requested without `-webfrom annovar`.
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene data/humandb/
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb cytoBand data/humandb/
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar exac03 data/humandb/
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp147 data/humandb/
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp150 data/humandb/
!~/bin/annovar/annotate_variation.pl -buildver hg38 -downdb -webfrom annovar dbnsfp30a data/humandb/

In [None]:
# Annotate the rewritten VCF with the refGene and avsnp150 protocols (operations gx and f);
# output files are prefixed data/GFX0237425.GRCh38.p7.annotated.
# The commented-out command below is a wider run that also uses cytoBand, exac03 and dbnsfp30a.
!~/bin/annovar/table_annovar.pl data/GFX0237425.GRCh38.p7.using_high_coverage_alt_contigs.vcf data/humandb/ -buildver hg38 -out data/GFX0237425.GRCh38.p7.annotated -remove -protocol refGene,avsnp150 -operation gx,f -nastring . -vcfinput -polish -xref ~/bin/annovar/example/gene_xref.txt
# !~/bin/annovar/table_annovar.pl data/GFX0237425.GRCh38.p7.using_high_coverage_alt_contigs.vcf data/humandb/ -buildver hg38 -out data/GFX0237425.GRCh38.p7.annotated -remove -protocol refGene,cytoBand,exac03,avsnp150,dbnsfp30a -operation gx,r,f,f,f -nastring . -vcfinput -polish -xref ~/bin/annovar/example/gene_xref.txt

### Move rsid to the ID column

In [None]:
# Copy the annotated VCF, replacing the ID column of each record with the value of
# the ``avsnp150`` entry found in that record's INFO column.
header_pattern = "#CHROM\tPOS\tID"
id_column = 2
# INFO is the 8th of the fixed VCF columns (CHROM POS ID REF ALT QUAL FILTER INFO).
# A fixed index works with any number of FORMAT/sample columns after it.
info_column = 7
rsid_info_prefix = "avsnp150="

with open("data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.vcf", "r") as raw_f:
    with open("data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf", "w") as new_f:
        passed_header = False
        for line_text in raw_f:
            if not passed_header:
                # Everything up to and including the column-header line is copied verbatim.
                if re.search(header_pattern, line_text):
                    passed_header = True
                new_f.write(line_text)
                continue

            line_parts = line_text.split("\t")
            if len(line_parts) > info_column:
                # strip() guards against the trailing newline when INFO is the last column.
                info_entries = line_parts[info_column].strip().split(";")
                rsid_from_info = next(
                    (entry[len(rsid_info_prefix):] for entry in info_entries if entry.startswith(rsid_info_prefix)),
                    None,
                )
                # Records without an avsnp150 entry keep their original ID.
                if rsid_from_info:
                    line_parts[id_column] = rsid_from_info
            # Short or blank lines fall through unchanged.
            new_f.write("\t".join(line_parts))

### Compress and create tabix index

In [None]:
# Compress the updated VCF with bgzip, then build a tabix index for the .vcf.gz file.
!bgzip -c data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf > data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz
!tabix -p vcf data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz

### Create index for rsid

In [None]:
# Build the rsidx index for the compressed VCF; the index file is written to
# data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.rsidx (no ".gz" in its name).
!rsidx index data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.rsidx

## Viewing results

In [None]:
# Open the annotated, indexed VCF and print the records of a small chr6 window
# together with the first value of their avsnp150 INFO entry.
vcf_my = pysam.VariantFile("data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz")

# Other regions / contigs tried while exploring:
# for id, rec in enumerate(vcf_my.fetch("chr1", 30404080, 30404280)):
# for id, rec in enumerate(vcf_my.fetch("chr6", 30404080, 30404280)):
# for id, rec in enumerate(vcf_my.fetch("chr8_KI270810v1_alt")):
# for id, rec in enumerate(vcf_my.fetch("chr8_KI270814v1_alt")):
# for id, rec in enumerate(vcf_my.fetch("chr6_GL000254v2_alt")):
# for id, rec in enumerate(vcf_my.fetch("chr16_KI270853v1_alt")):
recs = {}
for rec_number, rec in enumerate(vcf_my.fetch("chr6", 29_831_013, 29_831_027)):
    rsid_annotation = rec.info["avsnp150"]
    print((rsid_annotation[0], str(rec)))
    # Keep both the annotation and the record, keyed by fetch order.
    recs[rec_number] = (rsid_annotation, rec)

# rs9380142_pos = 29831017
# recs = {}
# for id, rec in enumerate(vcf_my.fetch("chr6", rs9380142_pos - 100000, rs9380142_pos + 10000)): # rs9380142 is visible here
#     print(rec)
#     recs[id] = rec

In [None]:
# Look up a handful of rsIDs through the rsidx index and print the first matching VCF line
# (None when nothing matches).
rsidlist = ['rs1260965680', 'rs1309677886', 'rs1174660622', 'rs1291927541', "rs4259577", "rs9380142"]
# These paths must be the exact files written by the bgzip and `rsidx index` steps:
# the index is named "...updated.vcf.rsidx" and both files sit directly under data/.
annotated_vcf_gz_file = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz"
rsidx_file = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.rsidx"

db = sqlite3.connect(rsidx_file)
try:
    # rsidx reads the VCF itself from the path it is given, so the file is not opened here.
    res = next(rsidx.search.search(rsidlist, db, annotated_vcf_gz_file), None)
    print(res)
finally:
    # A sqlite3 connection used as a context manager is not closed on exit; close it explicitly.
    db.close()