# Creating the SNP database

In [None]:
import json
import sqlite3
import time
from functools import reduce
from pathlib import Path
from typing import Dict

import pandas as pd
import pysam

from search_your_dna.hg_util import get_assembly_metadata_df
from search_your_dna.snp_store import persist_all_snps_to_db, create_snp_db_schema, create_snp_db_chrom_pos_index, \
    create_snp_db_rsid_index, insert_genotype_to_db
from search_your_dna.util import CHROM_LIST, get_my_snps_for_chromosome, _get_contig, get_chrom_reads_in_pos, \
    calc_genotype_for_chrom_snp_reads

In [None]:
# Path of the SQLite file that holds the SNP database; it is passed to the schema, index and
# genotype helpers and to sqlite3.connect in the cells of this notebook.
snp_db_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite"

## Create SQL database

In [None]:
# Guard: stop if the database file already exists, because this cell is only meant to run against a
# file that does not exist yet.
assert not Path(snp_db_file).exists(), "Warning! This is intended to be run only for populating a new database."
# Project helper from search_your_dna.snp_store; presumably creates the tables in the new file.
create_snp_db_schema(snp_db_file=snp_db_file)

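## Get all SNP chr/pos values from NCBI

The full SNP list is available for download from https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
as `00-All.vcf.gz`. The import cell below reads the uncompressed file `00-All.vcf`, so decompress the download first.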

## Store results in a sqlite db

In [None]:
# Uncompressed VCF file with the SNP records that get loaded into the database.
all_rsid_file = "/home/s/src/search_your_dna/data/00-All.vcf"
# Connection handed to persist_all_snps_to_db in the next cell.
# NOTE(review): no visible cell closes this connection -- confirm whether that is intended.
conn = sqlite3.connect(snp_db_file)

In [None]:
%%time
# Load the records of the VCF file into the database through the open connection.
# NOTE(review): the helper lives in search_your_dna.snp_store; whether it commits the
# transaction itself is not visible from this notebook -- confirm.
persist_all_snps_to_db(conn, all_rsid_file)

## Build indices for fast lookups

In [None]:
%%time
# Build the lookup indices after the bulk load.
# Presumably one index on (chrom, pos) and one on rsid, judging by the helper names -- the
# exact index definitions live in search_your_dna.snp_store.
create_snp_db_chrom_pos_index(snp_db_file=snp_db_file)
create_snp_db_rsid_index(snp_db_file=snp_db_file)

## Store my genotype

**Important**

This section depends on the JSON file `region_contig_read_counts.json`, which can be generated with the notebook `selecting_contigs_for_alignment.ipynb`.

The file is needed to select which reference genome contigs to use.

In [None]:
# BAM files with the sequencing reads; judging by the names, one is aligned against GRCh37
# and the other against GRCh38.p7.
bam_file_grch37 = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.bam"
bam_file = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.GRCh38.p7_v2.bam"
# Open both alignments for binary reading ("rb").
# NOTE(review): alignment_data_grch37 is not used by any visible cell, and neither file is closed.
alignment_data_grch37 = pysam.AlignmentFile(bam_file_grch37, "rb")
alignment_data_grch38 = pysam.AlignmentFile(bam_file, "rb")

# Read counts per region and contig, consumed by calc_alt_contigs_to_use as
# {region name: {contig name: read count}}.
with open("my_grch38_p7_build/region_contig_read_counts.json", "r") as f:
    region_contig_read_counts = json.load(f)

# Assembly report and regions files (relative paths) from which the metadata frame is built.
assembly_report_file = "data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_report.txt"
assembly_regions_file = "data/grch38.p7/GCA_000001405.22_GRCh38.p7_assembly_regions.txt"

# Metadata frame; calc_alt_contigs_to_use reads its region_name, chromosome, chromosome_start
# and chromosome_stop columns.
assembly_metadata_df = get_assembly_metadata_df(assembly_report_file = assembly_report_file, assembly_regions_file=assembly_regions_file)

### Select contigs to use

In [None]:
# Same database path as assigned earlier in the notebook; presumably repeated so that this cell
# can run without executing the database-creation cells first.
snp_db_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite"


def calc_alt_contigs_to_use(region_contig_read_counts, assembly_metadata_df):
    """Pick, per region, the alt contig to use in place of the main assembly.

    For every region the contig with the highest read count is selected (on equal counts the
    contig listed last wins). Regions whose selected contig is ``"main"`` are left out.

    Args:
        region_contig_read_counts: Mapping ``region name -> {contig name -> read count}``.
        assembly_metadata_df: DataFrame with the columns ``region_name``, ``chromosome``,
            ``chromosome_start`` and ``chromosome_stop``. Only the first row matching a
            region is read.

    Returns:
        DataFrame with the columns ``chrom``, ``start``, ``stop``, ``contig`` and ``region``,
        one row per region whose selected contig is not ``"main"``. Empty (but with these
        columns) when no region qualifies.

    Raises:
        IndexError: If a region has no row in ``assembly_metadata_df`` or no contig counts.
    """
    columns = ["chrom", "start", "stop", "contig", "region"]
    # Rows are gathered in a plain list and turned into a DataFrame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole frame per row.
    rows = []
    for region, contig_read_count in region_contig_read_counts.items():
        region_metadata_df = assembly_metadata_df[assembly_metadata_df["region_name"] == region]
        chrom = region_metadata_df["chromosome"].iloc[0]
        chrom_start = region_metadata_df["chromosome_start"].iloc[0]
        chrom_stop = region_metadata_df["chromosome_stop"].iloc[0]

        # Stable ascending sort: the last element is the highest count, ties go to the later entry.
        current_contig, _ = sorted(contig_read_count.items(), key=lambda item: item[1])[-1]
        if current_contig == "main":
            continue
        rows.append(
            {"chrom": chrom, "start": chrom_start, "stop": chrom_stop, "contig": current_contig, "region": region}
        )
    return pd.DataFrame(rows, columns=columns)


alt_contigs_to_use = calc_alt_contigs_to_use(region_contig_read_counts, assembly_metadata_df)

# Accumulates the genotype rows of every chromosome processed by the loop below.
res_df = pd.DataFrame(columns=["chrom", "pos", "genotype"])

# NOTE(review): the slice skips the first 18 entries of the reversed chromosome list -- presumably
# to resume a partially completed run; confirm before re-running from scratch.
for chrom in CHROM_LIST[::-1][18:]:
    print("inserting chrom", chrom)
    start_time = time.time()
    # setting these here to flush away previous values from the memory
    genotype_df = None
    genotype_main_df = None
    genotype_alt_contigs_dfs = []
    ## get main assembly genotype values
    genotype_main_df = get_my_snps_for_chromosome(alignment_data=alignment_data_grch38, snp_db_file=snp_db_file,
                                                  chrom=chrom,
                                                  cache_root_path="data/my_genotype_in_pos_hg38_main_contig_reads")
    getting_main_snp_finish_time = time.time()
    ## get active alt contig values
    chrom_alt_contigs = alt_contigs_to_use[alt_contigs_to_use["chrom"] == chrom]
    for chrom_alt_contig_dict in chrom_alt_contigs.to_dict(orient="records"):
        start = chrom_alt_contig_dict["start"]
        stop = chrom_alt_contig_dict["stop"]
        contig = chrom_alt_contig_dict["contig"]
        print(f"\tLooking into contig {contig}")

        # One short-lived connection per contig, always closed again (also when the query fails).
        _conn = sqlite3.connect(snp_db_file)
        try:
            # Bound parameters instead of string formatting; sqlite3 only binds native Python
            # types, hence the explicit str/int conversions of the pandas values.
            contig_rsid_pos_df = pd.read_sql_query(
                "SELECT rsid, pos FROM all_snp_pos WHERE chrom = ? AND pos >= ? AND pos <= ?",
                con=_conn,
                params=(str(chrom), int(start), int(stop)),
            )
        finally:
            _conn.close()
        print(f"\tFound #{len(contig_rsid_pos_df.index)} snps in the database")

        # The alt contig is queried with positions relative to the region start ...
        rel_positions = set(contig_rsid_pos_df["pos"] - start)
        my_alt_contig_relative_pos_snp_reads = get_chrom_reads_in_pos(alignment_data=alignment_data_grch38,
                                                                      contig=contig, positions=rel_positions)
        # ... and the reads are keyed back by chromosome position afterwards.
        my_alt_contig_snp_reads = {
            start + rel_pos: reads for rel_pos, reads in my_alt_contig_relative_pos_snp_reads.items()
        }
        my_alt_contig_genotype_df = calc_genotype_for_chrom_snp_reads(my_alt_contig_snp_reads, chrom=chrom, sex="male")
        my_alt_contig_genotype_df["chrom"] = chrom
        genotype_alt_contigs_dfs.append(my_alt_contig_genotype_df)

    if genotype_alt_contigs_dfs:
        ## merge main and alt contig values so that if some chrom/pos match then prefer alt contigs
        # Boolean mask over the main rows: True where the position lies inside the [min, max]
        # position span of at least one alt contig frame.
        main_pos = genotype_main_df["pos"]
        alt_contigs_in_main = reduce(
            lambda acc, x: acc | x,
            [(main_pos >= genotype_alt_contigs_df["pos"].min()) & (main_pos <= genotype_alt_contigs_df["pos"].max())
             for genotype_alt_contigs_df in genotype_alt_contigs_dfs]
        )

        main_contig_without_alt_contig_values = genotype_main_df[~alt_contigs_in_main]
        genotype_df = pd.concat([main_contig_without_alt_contig_values] + genotype_alt_contigs_dfs, sort=True)
    else:
        genotype_df = genotype_main_df
    getting_alt_contig_snp_finish_time = time.time()

    # Persist the merged genotype both in the database and as a per-chromosome CSV file.
    insert_genotype_to_db(snp_db_file=snp_db_file, genotype_df=genotype_df, genotype_col_name="genotype_hg38")
    genotype_df[["pos", "genotype"]].to_csv(f"/home/s/src/search_your_dna/data/my_genotype_hg38/my_chrom_{chrom}_snp.csv")
    inserting_snp_time = time.time()
    print(
        f"\tTime for getting snp info: {getting_main_snp_finish_time - start_time}/{getting_alt_contig_snp_finish_time - getting_main_snp_finish_time}. SQL insert took: {inserting_snp_time - getting_alt_contig_snp_finish_time}")
    res_df = pd.concat([res_df, genotype_df])
