# Get results based on my SNP values

In [None]:
import json
import sqlite3
import time
from functools import reduce
from pathlib import Path

import numpy as np
import pandas as pd
import pysam
from IPython.core.display import display

from search_your_dna.pgscatalog import get_all_pgs_api_data, read_or_download_pgs_scoring_file
from search_your_dna.snp_store import persist_all_snps_to_db
from search_your_dna.util import chrom_list, get_genotype_for_chrom_pos, calc_genotype_for_chrom_snp_reads, \
    get_chrom_reads_in_pos, get_my_genotypes_for_pgs, merge_pgs_with_my_genotype, filter_out_none_effect_alleles, \
    filter_out_effect_alleles, get_my_snps_for_chromosome, load_vcf_to_df, read_raw_zipped_polygenic_score_file

In [None]:
# Input locations for this notebook: the cache file handed to the VCF loader,
# the SQLite file used for the SNP database cells, and my four VCF exports.
cache_file_name = "data/vcf_records.parquet.gz"
vcf_database_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite"
vcf_file_paths = list(
    map(
        Path,
        (
            "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.cnv.vcf.gz",
            "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.indel.vcf.gz",
            "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.snp.vcf.gz",
            "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.sv.vcf.gz",
        ),
    )
)


## Load variant files

In [None]:
# Parse the VCF files listed in vcf_file_paths into a single table.
# NOTE(review): cache_file_name is presumably used by load_vcf_to_df to reuse a cached copy — confirm in util.
vcf_df = load_vcf_to_df(vcf_files=vcf_file_paths, cache_file_name=cache_file_name)
# Show the table's shape as a quick sanity check of the load.
vcf_df.shape

In [None]:
# Rows of my variant table whose POS equals 7383583. The match is on position only,
# so the same coordinate on any chromosome is selected.
# df1 is not displayed or used again in the visible part of the notebook.
df1 = vcf_df.loc[vcf_df["POS"] == 7383583]

## Load polygenic risk scores to analyse

### For Alzheimer's

In [None]:
# Read the PGS000025 scoring file (used below as the Alzheimer's score) and show it.
polygenic_risk_score_file_alzheimer = "data/PGS000025.txt.gz"
pgs_025_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_alzheimer)
pgs_025_df

### For Schizophrenia

In [None]:
# Read the PGS000133 scoring file (used below as the schizophrenia score) and show it.
polygenic_risk_score_file_schizophrenia = "data/PGS000133.txt.gz"
pgs_133_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_schizophrenia)
pgs_133_df

## Get my genotype for the disease

In [None]:
# Open my aligned reads for reading ("rb" = read, binary/BAM).
# The handle is shared by the genotype lookups below and is closed once the
# schizophrenia analysis is finished.
bam_file = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.bam"
alignment_data = pysam.AlignmentFile(bam_file, "rb")

In [None]:
# Spot-check my genotype at two individual (chromosome, position) coordinates.
# Each result is wrapped in a 1-tuple before being handed to display().
print("ALPHA-1 ANTITRYPSIN DEFICIENCY")
display((get_genotype_for_chrom_pos(alignment_data, "14", 94847386),))
print("Aspirin")
display((get_genotype_for_chrom_pos(alignment_data, "5", 179220638),))

In [None]:
# Same chromosome-14 position as the spot check above, but going through the two
# lower-level helpers directly (presumably: collect reads at a set of positions,
# then derive the genotype from those reads — confirm in util).
calc_genotype_for_chrom_snp_reads(get_chrom_reads_in_pos(alignment_data, "14", {94847386}))


In [None]:
%%time
# Look up my genotypes for the variants in the PGS000025 table and show them.
# NOTE(review): the CSV name is presumably a cache that get_my_genotypes_for_pgs
# reads from / writes to — confirm in util.
my_genotypes_for_pgs_025_cache_file = "my_genotypes_for_pgs_025.csv"
my_genotypes_for_pgs_025 = get_my_genotypes_for_pgs(alignment_data, pgs_025_df, my_genotypes_for_pgs_025_cache_file)
display(my_genotypes_for_pgs_025)

In [None]:
%%time
# Look up my genotypes for the variants in the PGS000133 table and show them.
# Unlike the PGS000025 call, this one passes filter=False.
# NOTE(review): what the filter flag controls is defined in util — confirm there.
my_genotypes_for_pgs_133_cache_file = "my_genotypes_for_pgs_133.csv"
my_genotypes_for_pgs_133 = get_my_genotypes_for_pgs(alignment_data, pgs_133_df, my_genotypes_for_pgs_133_cache_file, filter=False)
display(my_genotypes_for_pgs_133)

## Search for Alzheimer's

### Polygenic risk score

In [None]:
# Show the PGS000025 scoring table again for reference.
pgs_025_df

### Select my alleles for the variants listed in the PGS

In [None]:
# Show my genotypes for the PGS000025 variants.
my_genotypes_for_pgs_025

In [None]:
# Join the PGS000025 variants with my genotypes and pass the result through
# filter_out_none_effect_alleles (presumably keeps only rows where I carry the
# effect allele — confirm in util), then show what is left.
my_alzheimers_snps_df = filter_out_none_effect_alleles(
    merge_pgs_with_my_genotype(pgs_025_df, my_genotypes_for_pgs_025)
)
my_alzheimers_snps_df

In [None]:
# Plain sum of effect_weight over the remaining rows; every row counts once,
# independent of how many copies of the allele I carry.
my_alzheimers_snps_df["effect_weight"].sum()

### Score combined as a weighted sum of allele dosages multiplied by their corresponding effect sizes

In [None]:
# Dosage-weighted score for PGS000025:
#   sum(count of effect allele in my genotype * effect_weight)
merged_df = merge_pgs_with_my_genotype(pgs_025_df, my_genotypes_for_pgs_025)

# Compare each of the two alleles in my genotype with the variant's effect allele.
first_allele_is_effect = merged_df["genotype"].map(lambda genotype: genotype[0]) == merged_df["effect_allele"]
second_allele_is_effect = merged_df["genotype"].map(lambda genotype: genotype[1]) == merged_df["effect_allele"]

# Store the matches as 0/1 so they can be added up into a dosage of 0, 1 or 2.
merged_df["effect_allele_1"] = first_allele_is_effect.astype(int)
merged_df["effect_allele_2"] = second_allele_is_effect.astype(int)
merged_df["gene_dosage"] = merged_df["effect_allele_1"] + merged_df["effect_allele_2"]

# Per-variant contribution to the score.
merged_df["effect"] = merged_df["gene_dosage"] * merged_df["effect_weight"]
merged_df

In [None]:
# Total score: sum of the dosage-weighted effects over all merged variants.
merged_df["effect"].sum()

## Search for schizophrenia

### Get disease related SNPs

In [None]:
# Load a saved JSON response (presumably from an NCBI SNP summary query — the
# file lives under the IDE's httpRequests folder) and build a lookup of
# rsID -> [chromosome, position] for the schizophrenia-related variants.
schizophrenia_snvs_ncbi_response = "/home/s/src/search_your_dna/.idea/httpRequests/2020-10-28T094336.200.json"
# JSON is UTF-8 by specification; do not depend on the locale's default encoding.
with open(schizophrenia_snvs_ncbi_response, encoding="utf-8") as f:
    schizophrenia_data = json.load(f)
rsIDs_schizophrenia = schizophrenia_data["result"]["uids"]


schizophrenia_chr_positions = {}
for rsID in rsIDs_schizophrenia:
    variant = schizophrenia_data["result"][rsID]
    # "chr_sort" is zero-padded text; drop the padding to get the chromosome label.
    chromosome = variant["chr_sort"].lstrip("0")
    # int() already accepts leading zeros. Stripping them first would turn an
    # all-zero value into "" and make int() raise ValueError.
    position = int(variant["location_sort"])
    schizophrenia_chr_positions[rsID] = [chromosome, position]

### Select only the disease variants that I have

In [None]:
# Positions of the schizophrenia-related variants as np.int64 values for the
# isin() lookup below. Entries whose position is 99999999999999999999 are
# dropped first (presumably a "no location" placeholder in the source data;
# the value is also too large for an int64).
schizophrenia_snv_positions = [
    np.int64(position)
    for _chromosome, position in schizophrenia_chr_positions.values()
    if position != 99999999999999999999
]
# NOTE(review): rows are matched on POS alone, so a variant at the same
# coordinate on a different chromosome matches too — consider also filtering
# on chromosome.
df_row_selector = vcf_df["POS"].isin(schizophrenia_snv_positions)
my_schizophrenia_matches = vcf_df.loc[df_row_selector]
my_schizophrenia_matches

### Select my alleles for the variants listed in the PGS

In [None]:
# Show my genotypes for the PGS000133 variants.
my_genotypes_for_pgs_133

In [None]:
# Join the PGS000133 variants with my genotypes, then derive two views of the
# merged table using the util filters and show both.
# NOTE(review): presumably "for" = rows where I carry the effect allele and
# "against" = rows where I do not — confirm against the filter implementations.
my_schizophrenia_snps_df = merge_pgs_with_my_genotype(pgs_133_df, my_genotypes_for_pgs_133)
my_for_schizophrenia_snps_df = filter_out_none_effect_alleles(my_schizophrenia_snps_df)
my_against_schizophrenia_snps_df = filter_out_effect_alleles(my_schizophrenia_snps_df)
display(my_for_schizophrenia_snps_df)
display(my_against_schizophrenia_snps_df)

In [None]:
# Dosage-weighted score for PGS000133:
#   sum(count of effect allele in my genotype * effect_weight)
# Note that this rebinds merged_df, replacing the PGS000025 table.
merged_df = merge_pgs_with_my_genotype(pgs_133_df, my_genotypes_for_pgs_133)

# Compare each of the two alleles in my genotype with the variant's effect allele.
first_allele_is_effect = merged_df["genotype"].map(lambda genotype: genotype[0]) == merged_df["effect_allele"]
second_allele_is_effect = merged_df["genotype"].map(lambda genotype: genotype[1]) == merged_df["effect_allele"]

# Store the matches as 0/1 so they can be added up into a dosage of 0, 1 or 2.
merged_df["effect_allele_1"] = first_allele_is_effect.astype(int)
merged_df["effect_allele_2"] = second_allele_is_effect.astype(int)
merged_df["gene_dosage"] = merged_df["effect_allele_1"] + merged_df["effect_allele_2"]

# Per-variant contribution to the score.
merged_df["effect"] = merged_df["gene_dosage"] * merged_df["effect_weight"]
merged_df

In [None]:
# Total score: sum of the dosage-weighted effects over all merged variants.
merged_df["effect"].sum()

In [None]:
# Release the BAM file handle; cells that use alignment_data cannot be re-run
# after this without re-opening it.
alignment_data.close()

### Collect PGS data

#### Get all traits available in pgs catalogue

In [None]:
# Fetch the "trait/all" data through the project's PGS Catalog API helper.
all_traits_result = get_all_pgs_api_data("trait/all")

In [None]:
# Put the trait records into a DataFrame, list the available columns, and show the table.
all_pgs_traits_df = pd.DataFrame(all_traits_result)
print(all_pgs_traits_df.columns)
all_pgs_traits_df

In [None]:
# Unique PGS ids across all traits. Each entry of "associated_pgs_ids" holds a
# list of ids; flatten them in one pass rather than concatenating lists with
# reduce, which copies the growing accumulator on every step.
pgs_ids = {
    pgs_id
    for trait_pgs_ids in all_pgs_traits_df["associated_pgs_ids"]
    for pgs_id in trait_pgs_ids
}

#### Get pgs entities from pgs

#### Download all pgs scoring files

In [None]:
# Make sure the scoring file for every PGS id has been read or downloaded.
# A failure is reported together with the id that caused it and then skipped,
# so one bad file does not abort the whole run.
for pgs_id in sorted(pgs_ids):
    try:
        read_or_download_pgs_scoring_file(pgs_id)
    except Exception as e:  # deliberate best-effort: carry on with the remaining ids
        print(f"Something went wrong when parsing pgs file {pgs_id}:", e)
    else:
        time.sleep(0.5)  # Not to overload api with requests

### Creating snp database
#### get all SNP chr/pos values from ncbi

The file is available for download from https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
as `00-All.vcf.gz`; the cell below reads the decompressed `00-All.vcf`.

#### Store results in a sqlite db

In [None]:
# Path of the uncompressed NCBI all-SNP VCF and a connection to the SQLite
# file named by vcf_database_file (sqlite3.connect creates the file if it does not exist).
all_rsid_file = "/home/s/src/search_your_dna/data/00-All.vcf"
conn = sqlite3.connect(vcf_database_file)

In [None]:
# Load the SNP records from all_rsid_file into the database behind conn.
# NOTE(review): table layout and commit handling live in snp_store — confirm there.
persist_all_snps_to_db(conn, all_rsid_file)

In [None]:
# Sanity check: list the distinct chromosome values stored in the all_snp_pos table.
# Despite its name, the variable holds only that list of chromosomes, not positions.
all_snp_pos = pd.read_sql("SELECT distinct (chrom) FROM all_snp_pos", con=conn)
all_snp_pos

#### find my genotype for all SNP values

In [None]:
%%time
# Run the per-chromosome SNP extraction for every chromosome in chrom_list.
# NOTE(review): results are presumably written to per-chromosome files (a later
# cell reads data/my_chrom_16_snp.csv) — confirm in util.
for chrom in chrom_list:
    get_my_snps_for_chromosome(snp_db_file=vcf_database_file, chrom=chrom)

#### store SNP values in sqlite database

In [None]:
# Load my chromosome-16 SNP table from CSV, using the first column as the index
# (the index is used as the position when the table is written to the database below).
my_chrom_16_snps_file = Path("data/my_chrom_16_snp.csv")
my_chrom_16_snps = pd.read_csv(my_chrom_16_snps_file, index_col=0)
my_chrom_16_snps

In [None]:
# Add a "genotype" column to the all_snp_pos table in the backup copy of the SNP database.
# NOTE(review): this rebinds vcf_database_file to the *_backup file, so any cell
# run afterwards that uses vcf_database_file works on the backup — confirm that
# is intended.
vcf_database_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite_backup"
conn = sqlite3.connect(vcf_database_file)
cur = conn.cursor()
# SQLite raises an error when ALTER TABLE adds a column that already exists,
# so look at the current columns first; this keeps the cell safe to re-run.
# PRAGMA table_info yields one row per column with the column name at index 1.
existing_columns = {row[1].lower() for row in cur.execute("PRAGMA table_info(all_snp_pos)")}
if "genotype" not in existing_columns:
    cur.execute("ALTER TABLE all_snp_pos ADD genotype CHAR(2)")
conn.commit()

In [None]:
%%time
# Write my chromosome-16 genotypes into the all_snp_pos table of the backup database.
vcf_database_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite_backup"
conn = sqlite3.connect(vcf_database_file)
cur = conn.cursor()
# Bind the values as parameters instead of formatting them into the SQL text,
# so unusual characters in the CSV data cannot break or alter the statement.
# str()/int() hand sqlite3 plain Python values (NumPy scalars are not bound as
# numbers) and keep the text that the formatted statement used to write.
cur.executemany(
    "UPDATE all_snp_pos SET genotype = ? WHERE chrom = '16' and pos = ?",
    (
        (str(genotype), int(pos))
        for pos, genotype in my_chrom_16_snps["genotype"].items()
    ),
)
conn.commit()

### Calculate PGS for my dna

