# Searching for an alternative way to calculate my genotype

In [None]:
import pysam
import glob
import gzip
import json
import re
import shutil
import sqlite3
import time
from collections import defaultdict
from functools import reduce
from pathlib import Path
from typing import Any, List, Union, Dict, Iterable, Set

import numpy as np
import pandas as pd
import pysam
import requests
from IPython.core.display import display

from search_your_dna.hg_util import get_assembly_metadata_df
from search_your_dna.pgscatalog import calc_polygenic_score
from search_your_dna.snp_store import create_snp_db_rsid_index, insert_genotype_to_db
from search_your_dna.util import CHROM_LIST, get_my_snps_for_chromosome, get_chrom_reads_in_pos, is_alignment_supported, \
    calc_genotype_for_chrom_snp_reads

The general idea is to use my VCF file and the all-variants VCF file (`00-All.vcf.gz`) to search for my genotype.

* Search needs to support two kinds of input: chrom-pos_hg38_v7 or rsid.
* Ideally, it should also accept lists of inputs and return genotypes for PGS files.
* Search also needs to take into account alt contigs that have higher read coverage than the corresponding section of the main chromosome.
* Output needs to be a genotype string: one or two characters, or a longer string in the case of insertions. Deletions are marked with D.

### Search by chrom-pos_hg38_v7

Look up entries in my VCF file first. If nothing is found, look up the value in the alignment file. If nothing is found there either, look up the value in the reference genome.

### Search by rsid

First look up the chrom-pos_hg38_v7 values in the all-variants VCF file, then follow the chrom-pos_hg38_v7 search process.


### Search details

Take alt contigs into account when fetching records from my VCF.
To return the genotype, the AC values in the INFO field of my VCF need to be checked.
Consider looking into imputation?


In [None]:
# Notebook-wide setup: opens the VCF/BAM inputs and loads assembly metadata.
# Every name assigned here is a module-level global used by later cells.

# NOTE: the value ends with "/" and every f-string below adds another "/",
# so the resulting paths contain "//".
project_root_dir = "/home/s/src/search_your_dna/"

# vcf_my: my own variant calls; vcf_all: the "00-All" VCF used for rsid lookups
# (presumably the dbSNP all-variants release — confirm).
vcf_my = pysam.VariantFile(f"{project_root_dir}/data/my_genome_data/GFX0237425.GRCh38.p7.vcf.gz")
vcf_all = pysam.VariantFile(f"{project_root_dir}/data/grch38.p7/00-All.vcf.gz")

# Disabled exploratory probe for rs9380142, kept for reference. It queries the
# same position three ways: on "chr6" in my VCF, on "6" in the all-variants VCF,
# and on the alt contig 'chr6_GL000254v2_alt' after subtracting the contig start.
# rs9380142_pos = 29831017
#
# my_recs = {}
# for id, rec in enumerate(vcf_my.fetch("chr6", rs9380142_pos - 100000, rs9380142_pos+20)): # rs9380142 is visible here
#     print(rec)
#     my_recs[id] = rec
# for rec in vcf_all.fetch("6", rs9380142_pos - 1, rs9380142_pos):
#     print(rec)
# rs9380142_pos_on_contig = rs9380142_pos - 28510120 # 28510120 start of alt contig
# for rec in vcf_my.fetch('chr6_GL000254v2_alt', rs9380142_pos_on_contig - 10):
#     print(rec)

# Alignment files for both builds, opened in binary ("rb") mode.
bam_file_grch37 = f"{project_root_dir}/data/my_genome_data/GFX0237425.bam"
bam_file_hg38 = f"{project_root_dir}/data/my_genome_data/GFX0237425.GRCh38.p7.bam"
alignment_data_hg37 = pysam.AlignmentFile(bam_file_grch37, "rb")
alignment_data_hg38 = pysam.AlignmentFile(bam_file_hg38, "rb")

# Read counts keyed by region name, each value mapping a contig name (or "main")
# to its count — this is the shape calc_alt_contigs_to_use iterates over.
with open(f"{project_root_dir}/my_grch38_p7_build/region_contig_read_counts.json", "r") as f:
    region_contig_read_counts = json.load(f)

assembly_report_file = f"{project_root_dir}/data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_report.txt"
assembly_regions_file = f"{project_root_dir}/data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_regions.txt"

# Combined assembly metadata; later code reads its "region_name", "chromosome",
# "chromosome_start", "chromosome_stop" and "ucsc_style_name" columns.
assembly_metadata_df = get_assembly_metadata_df(assembly_report_file=assembly_report_file,
                                                assembly_regions_file=assembly_regions_file)


def calc_alt_contigs_to_use(region_contig_read_counts, assembly_metadata_df):
    """Pick, for each region, the alt contig whose read count beats the main chromosome.

    Args:
        region_contig_read_counts: Mapping of region name to a mapping of
            contig name (or the literal ``"main"``) to its read count.
        assembly_metadata_df: DataFrame with at least the columns
            ``region_name``, ``chromosome``, ``chromosome_start`` and
            ``chromosome_stop``. The first row matching a region is used.

    Returns:
        A DataFrame with columns ``chrom``, ``start``, ``stop``, ``contig`` and
        ``region``: one row for every region whose highest-count contig is not
        ``"main"``. It is empty (but still has those columns) when no region
        qualifies.

    Raises:
        IndexError: If a region has no row in ``assembly_metadata_df`` or its
            read-count mapping is empty.
    """
    columns = ["chrom", "start", "stop", "contig", "region"]
    rows = []
    for region, contig_read_count in region_contig_read_counts.items():
        region_metadata_df = assembly_metadata_df[assembly_metadata_df["region_name"] == region]
        chrom = region_metadata_df["chromosome"].iloc[0]
        chrom_start = region_metadata_df["chromosome_start"].iloc[0]
        chrom_stop = region_metadata_df["chromosome_stop"].iloc[0]

        # Stable sort + last element: on a tie the contig listed last wins.
        best_contig = sorted(contig_read_count.items(), key=lambda item: item[1])[-1][0]
        if best_contig != "main":
            rows.append(
                {"chrom": chrom, "start": chrom_start, "stop": chrom_stop, "contig": best_contig, "region": region})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)

# Regions where an alt contig has the highest read count. This is a module-level
# global that find_contig_and_pos_to_use reads directly.
alt_contigs_to_use = calc_alt_contigs_to_use(region_contig_read_counts, assembly_metadata_df)

# Empty frame with the intended result columns; nothing in this part of the
# notebook fills it in.
res_df = pd.DataFrame(columns=["rsid", "chrom", "pos", "genotype"])

### Implement search by chrom-pos_hg38_v7


In [None]:
def find_contig_and_pos_to_use(chrom, pos):
    """Translate a chromosome position to an alt contig position where one applies.

    Reads the module-level ``alt_contigs_to_use`` and ``assembly_metadata_df``.

    Args:
        chrom: Chromosome name, compared against the ``chrom`` column of
            ``alt_contigs_to_use``.
        pos: Position on that chromosome.

    Returns:
        ``(contig, position)``. When no row of ``alt_contigs_to_use`` covers
        ``pos`` (inclusive ``start``..``stop``), the inputs are returned
        unchanged. Otherwise the first covering row's contig is returned,
        with ``pos`` reduced by that contig's ``chromosome_start``.

    Raises:
        IndexError: If the chosen contig has no row in ``assembly_metadata_df``.
    """
    same_chrom = alt_contigs_to_use["chrom"] == chrom
    starts_at_or_before = alt_contigs_to_use["start"] <= pos
    stops_at_or_after = pos <= alt_contigs_to_use["stop"]
    covering_rows = alt_contigs_to_use[same_chrom & starts_at_or_before & stops_at_or_after]

    # No alt contig covers this position: stay on the main chromosome.
    if len(covering_rows) == 0:
        return chrom, pos

    alt_contig = covering_rows["contig"].to_list()[0]
    # The offset comes from the contig's own metadata row (matched on
    # "ucsc_style_name"), not from the "start" column used in the range test.
    contig_rows = assembly_metadata_df[assembly_metadata_df["ucsc_style_name"] == alt_contig]
    contig_chrom_start = contig_rows["chromosome_start"].to_list()[0]
    return alt_contig, pos - contig_chrom_start

def find_genome_from_the_vcf_file(chrom, pos):
    """Print the VCF records within 10 positions of ``pos`` from three lookups.

    Reads the module-level ``vcf_my`` and ``vcf_all``. The lookups run in
    this order, each line prefixed with its label:

    * ``"my "`` - my VCF, on the contig/position from ``find_contig_and_pos_to_use``.
    * ``"mym"`` - my VCF, on the main chromosome named ``chr<chrom>``.
    * ``"all"`` - the all-variants VCF, on ``chrom`` as given.

    Args:
        chrom: Chromosome name without the ``chr`` prefix.
        pos: Position on that chromosome.

    Returns:
        None; output goes to stdout only.
    """
    half_window = 10
    contig, contig_position = find_contig_and_pos_to_use(chrom, pos)

    # (label, VCF handle, contig name, centre position) for each lookup.
    # NOTE(review): when no alt contig applies, the first lookup queries
    # vcf_my with the bare chromosome name, unlike the "chr"-prefixed
    # second lookup - confirm the contig naming in vcf_my.
    lookups = (
        ("my ", vcf_my, contig, contig_position),
        ("mym", vcf_my, f"chr{chrom}", pos),
        ("all", vcf_all, chrom, pos),
    )
    for label, vcf, contig_name, centre in lookups:
        for rec in vcf.fetch(contig_name, centre - half_window, centre + half_window):
            print(label, rec)

# Earlier probe positions on chromosome 5, disabled but kept for reference.
# NOTE(review): the setup cell places rs9380142 on chr6 at 29831017, so the
# rs9380142 label on the second probe looks stale - confirm.
# find_genome_from_the_vcf_file("5", 1081122) # rs959632155
# find_genome_from_the_vcf_file("5", 1081236) # rs9380142
# Active probe; the original note does not name an rsid for this position.
find_genome_from_the_vcf_file("5", 1081553) #


#### Need to check statistics on how many alt contig rsids I can find.

In [None]:
# For the FIRST alt contig only, pair each record from my VCF with the records
# the all-variants VCF holds at the matching main-chromosome coordinates.
# Results are left in the notebook globals xmy, xall and xcomp.

# `ran` limits the work to one contig: it becomes True after the first pass,
# and every later row hits `continue`. The loop still visits all rows, so
# `row` ends up holding the last row, not the one that was processed.
ran = False
xmy = {}  # record index -> record from my VCF on the alt contig
xall = {}  # rebound for every record below; afterwards holds only the last record's matches
# NOTE(review): defaultdict() without a default_factory behaves like a plain
# dict (missing keys still raise KeyError) - a plain {} may have been intended.
xcomp = defaultdict()
for _, row in alt_contigs_to_use.iterrows():
    if ran:
        continue
    print(row["contig"], row["chrom"], row["start"], row["stop"])

    for index, rec_my in enumerate(vcf_my.fetch(row["contig"])):
        # print("my", rec_my)
        xmy[index] = rec_my
        xall = {}
        # Shift contig coordinates onto the chromosome by adding row["start"],
        # widening the window by one position on each side.
        # NOTE(review): find_contig_and_pos_to_use takes its offset from the
        # contig's "chromosome_start" metadata instead - confirm both agree.
        for index_2, rec_all in enumerate(vcf_all.fetch(row["chrom"], row["start"] + rec_my.start - 1, row["start"] + rec_my.stop + 1)):
            # print("all", rec_all)
            xall[index_2] = (rec_all.ref, rec_all.alts)
        # "my": (ref, alts) of my record; "al": index -> (ref, alts) of each match.
        xcomp[index] = {"my": (rec_my.ref, rec_my.alts), "al": xall}
    ran = True
