# Get results based on my SNP values

In [None]:
import glob
import json
import sqlite3
import time
from functools import reduce
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import pysam
from IPython.core.display import display

from search_your_dna.pgscatalog import get_all_pgs_api_data, read_or_download_pgs_scoring_file, calc_polygenic_score
from search_your_dna.snp_store import persist_all_snps_to_db, query_my_genotypes_for_rsids
from search_your_dna.util import chrom_list, get_genotype_for_chrom_pos, calc_genotype_for_chrom_snp_reads, \
    get_chrom_reads_in_pos, get_my_genotypes_for_pgs, merge_pgs_with_my_genotype, filter_out_none_effect_alleles, \
    filter_out_effect_alleles, get_my_snps_for_chromosome, load_vcf_to_df, read_raw_zipped_polygenic_score_file

In [None]:
# File passed to load_vcf_to_df as its cache, and the SQLite file used as the SNP database.
cache_file_name = "data/vcf_records.parquet.gz"
snp_db_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite"

# Variant call files (CNV, indel, SNP and SV), each wrapped in a Path.
vcf_file_paths = list(map(Path, (
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.cnv.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.indel.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.snp.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.sv.vcf.gz",
)))


## Load variant files

In [None]:
# Load the configured VCF files into vcf_df. cache_file_name is handed to the
# loader (presumably an on-disk cache that avoids re-parsing — confirm in
# search_your_dna.util.load_vcf_to_df).
vcf_df = load_vcf_to_df(vcf_files=vcf_file_paths, cache_file_name=cache_file_name)
# Last expression of the cell, so the notebook displays the shape.
vcf_df.shape

In [None]:
# Keep only the variant rows whose POS column equals 7383583.
# NOTE(review): df1 is assigned but not displayed in this cell — confirm whether
# it was meant to be shown.
df1 = vcf_df.loc[vcf_df["POS"].eq(7383583)]

## Load polygenic risk scores to analyse

### For Alzheimer's

In [None]:
# Scoring file PGS000025 (the notebook section labels it as the Alzheimer's score).
polygenic_risk_score_file_alzheimer = "data/PGS000025.txt.gz"
# Read the gzipped scoring file into pgs_025_df.
pgs_025_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_alzheimer)
# Last expression of the cell, so the notebook displays it.
pgs_025_df

### For Schizophrenia

In [None]:
# Scoring file PGS000133 (the notebook section labels it as the Schizophrenia score).
polygenic_risk_score_file_schizophrenia = "data/PGS000133.txt.gz"
# Read the gzipped scoring file into pgs_133_df.
pgs_133_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_schizophrenia)
# Last expression of the cell, so the notebook displays it.
pgs_133_df

## Get my genotype for the disease

In [None]:
# Older alignment file, opened for binary reading.
bam_file_old = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.bam"
alignment_data_old = pysam.AlignmentFile(bam_file_old, mode="rb")

# Alignment file whose name carries the GRCh38.p7 tag; this is the handle the
# later cells pass around as alignment_data.
bam_file = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.GRCh38.p7.bam"
alignment_data = pysam.AlignmentFile(bam_file, mode="rb")

In [None]:
# Spot-check my genotype at two hand-picked (chromosome, position) pairs.
# Each result is wrapped in a 1-tuple before being displayed.
for trait_label, contig, position in (
    ("ALPHA-1 ANTITRYPSIN DEFICIENCY", "14", 94847386),
    ("Aspirin", "5", 179220638),
):
    print(trait_label)
    display((get_genotype_for_chrom_pos(alignment_data, contig, position),))

### Collect PGS data

#### Get all traits available in the PGS Catalog

In [None]:
# Fetch the data behind the "trait/all" endpoint of the PGS API
# (presumably every page of results — confirm in search_your_dna.pgscatalog).
all_traits_result = get_all_pgs_api_data("trait/all")

In [None]:
# Tabulate the trait records fetched from the PGS API.
all_pgs_traits_df = pd.DataFrame(data=all_traits_result)
# Print the column index so the available fields are visible at a glance.
print(all_pgs_traits_df.columns)
# Last expression of the cell, so the notebook displays the table.
all_pgs_traits_df

In [None]:
# Collect the unique PGS ids referenced by any trait. Each cell of
# "associated_pgs_ids" is iterated as a collection of ids. A set comprehension
# flattens them in a single pass instead of repeatedly concatenating lists.
pgs_ids = {
    pgs_id
    for trait_pgs_ids in all_pgs_traits_df["associated_pgs_ids"]
    for pgs_id in trait_pgs_ids
}

#### Get PGS entities from the PGS Catalog

#### Download all pgs scoring files

In [None]:
# Read or download the scoring file for every known PGS id, in sorted order.
# A failure for one id is reported and the loop moves on to the next id.
for pgs_id in sorted(pgs_ids):
    try:
        read_or_download_pgs_scoring_file(pgs_id)
    except Exception as e:
        # Name the failing id so the problem file can be found afterwards.
        print(f"Something went wrong when parsing pgs file {pgs_id}:", e)
    # Throttle after every attempt, including failed ones, so a run of errors
    # does not hit the API with back-to-back requests.
    time.sleep(0.5)

### Creating snp database
#### Get all SNP chr/pos values from NCBI

Available for download from https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
as `00-All.vcf.gz`.

#### Store results in a sqlite db

In [None]:
# Path of the NCBI "00-All" VCF file.
# NOTE(review): the path has no .gz suffix, so the downloaded 00-All.vcf.gz
# presumably has to be decompressed first — confirm.
all_rsid_file = "/home/s/src/search_your_dna/data/00-All.vcf"
# Open (or create) the SQLite database file used as the SNP store.
conn = sqlite3.connect(snp_db_file)

In [None]:
# Hand the open connection and the VCF path to the SNP store helper
# (presumably fills the all_snp_pos table queried in the next cell — confirm
# in search_your_dna.snp_store).
persist_all_snps_to_db(conn, all_rsid_file)

In [None]:
# Sanity check: list the distinct chrom values present in the all_snp_pos table.
all_snp_pos = pd.read_sql(sql="SELECT distinct (chrom) FROM all_snp_pos", con=conn)
# Last expression of the cell, so the notebook displays the result.
all_snp_pos

#### Find my genotype for all SNP values

In [None]:
%%time
# Run the per-chromosome SNP lookup for every entry of chrom_list, timing the
# whole cell. The return value is discarded, so the helper is presumably used
# for its side effects (e.g. writing to snp_db_file) — confirm in
# search_your_dna.util.
for chrom in chrom_list:
    get_my_snps_for_chromosome(alignment_data=alignment_data, snp_db_file=snp_db_file, chrom=chrom)

#### Store SNP values in the SQLite database

### Calculate PGS for my dna



#### Test with a single PGS file

In [None]:
%%time
pgs_025_score, pgs_025_df = calc_polygenic_score(snp_db_file, "data/PGS000025.txt.gz")
display("Score pgs025: " + str(pgs_025_score))
display(pgs_025_df)

#### Run all PGS calculations

In [None]:
%time
pgs_for_traits = {}
pgs_for_traits_errors = {}
for pgs_file in glob.glob("data/PGS*.txt.gz"):
    try:
        pgs_for_traits[pgs_file] = calc_polygenic_score(snp_db_file, pgs_file, max_pgs_alleles=1000)
    except Exception as e:
        print("Failed to analyze", pgs_file, e)
        pgs_for_traits_errors[pgs_file] = (e, e.__traceback__)