# Get results based on my SNP values

In [None]:
import gzip
import json
import re
import shutil
import sqlite3
import time
from collections import defaultdict
from functools import reduce
from pathlib import Path
from typing import Any, List, Union, Dict, Iterable

import numpy as np
import pandas as pd
import pysam
import requests
from IPython.core.display import display


In [None]:
# Location of the parquet cache that holds the combined VCF records.
cache_file_name = "data/vcf_records.parquet.gz"
# Gzipped VCF files to load.
vcf_file_paths = [
    Path("/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.cnv.vcf.gz"),
    Path("/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.indel.vcf.gz"),
    Path("/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.snp.vcf.gz"),
    Path("/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.sv.vcf.gz"),
]

In [None]:
def get_file_header_line_number(file_name: Union[str, Path], header_pattern: str) -> int:
    """Return the 0-based number of the first line matching ``header_pattern``.

    The file is opened as gzip and every line is decoded as UTF-8 before it
    is matched with ``re.search``.

    Args:
        file_name: Path of a gzip-compressed text file.
        header_pattern: Regular expression identifying the header line.

    Returns:
        Index of the first matching line, counting from 0.

    Raises:
        Exception: If no line of the file matches ``header_pattern``.
    """
    # Compile once instead of resolving the pattern again for every line.
    header_regex = re.compile(header_pattern)
    with gzip.open(str(file_name), "rb") as f:
        for line_number, line in enumerate(f):
            if header_regex.search(line.decode("utf-8")):
                return line_number
    raise Exception(f"Couldn't find header in file {file_name}. Expected header: {header_pattern}")

def get_vcf_file_header_line_number(file_name: Union[str, Path]) -> int:
    """Return the 0-based line number of the VCF column-header row.

    Args:
        file_name: Path of a gzip-compressed VCF file.

    Returns:
        Line number of the ``#CHROM POS ID REF ALT QUAL FILTER INFO`` row.

    Raises:
        Exception: If the file contains no such row.
    """
    return get_file_header_line_number(
        file_name=file_name,
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        header_pattern=r"#CHROM\s+POS\s+ID\s+REF\s+ALT\s+QUAL\s+FILTER\s+INFO",
    )

def get_polygenic_score_file_header_line_number(file_name: Union[str, Path]) -> int:
    """Return the 0-based line number of a polygenic score file's column header.

    Args:
        file_name: Path of a gzip-compressed polygenic score file.

    Returns:
        Line number of the ``rsID chr_name chr_position effect_allele`` row.

    Raises:
        Exception: If the file contains no such row.
    """
    return get_file_header_line_number(
        file_name=file_name,
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        header_pattern=r"rsID\s+chr_name\s+chr_position\s+effect_allele",
    )

def read_raw_zipped_vcf_file(file_name: Union[str, Path]) -> pd.DataFrame:
    """Read a gzipped VCF file into a DataFrame.

    Everything above the column-header row is skipped. All columns are read
    as strings, then ``POS`` is converted to ``int64``.

    Args:
        file_name: Path of the gzip-compressed VCF file.

    Returns:
        One row per VCF record.

    Raises:
        Exception: If the VCF column-header row cannot be found.
    """
    header_row_number = get_vcf_file_header_line_number(file_name=file_name)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    result = pd.read_csv(file_name, sep=r"\s+", skiprows=header_row_number, dtype=str)
    result["POS"] = result["POS"].astype(np.int64)
    return result

def read_raw_zipped_polygenic_score_file(file_name: Union[str, Path]) -> pd.DataFrame:
    """Read a gzipped polygenic score file into a DataFrame.

    Everything above the column-header row is skipped. All columns are read
    as strings, then ``effect_weight`` is converted to ``float64`` and
    ``chr_name`` / ``chr_position`` to ``int64``.

    Args:
        file_name: Path of the gzip-compressed score file.

    Returns:
        One row per scored variant.

    Raises:
        Exception: If the column-header row cannot be found.
    """
    header_row_number = get_polygenic_score_file_header_line_number(file_name=file_name)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    result = pd.read_csv(file_name, sep=r"\s+", skiprows=header_row_number, dtype=str)
    # np.float was removed in NumPy 1.24; np.float64 is the dtype it stood for.
    result["effect_weight"] = result["effect_weight"].astype(np.float64)
    # NOTE(review): this conversion fails for non-numeric chromosome names
    # (e.g. "X") - confirm the score files in use only list numbered chromosomes.
    result["chr_name"] = result["chr_name"].astype(np.int64)
    result["chr_position"] = result["chr_position"].astype(np.int64)
    return result


def load_vcf_to_df(vcf_files: List[Union[str, Path]], cache_file_name: str = "data/vcf_records.parquet.gz") -> pd.DataFrame:
    """Load several gzipped VCF files into one DataFrame, cached as parquet.

    When ``cache_file_name`` already exists it is returned as-is and
    ``vcf_files`` is not read at all.

    Args:
        vcf_files: Paths of the gzip-compressed VCF files to concatenate.
        cache_file_name: Parquet file used to cache the combined records.

    Returns:
        The concatenated VCF records with a fresh 0..n-1 index.
    """
    if Path(cache_file_name).exists():
        return pd.read_parquet(cache_file_name)

    dfs = []
    for vcf_file_path in vcf_files:
        print(f"Reading in source vcf file {vcf_file_path}")
        dfs.append(read_raw_zipped_vcf_file(vcf_file_path))
    raw_vcf_data = pd.concat(dfs, ignore_index=True)
    # Make sure the cache directory exists so the parsed data is not lost
    # to a missing-directory error at write time.
    Path(cache_file_name).parent.mkdir(parents=True, exist_ok=True)
    raw_vcf_data.to_parquet(cache_file_name)
    return raw_vcf_data

def load_polygenic_score_file_to_df(file_name: Union[str, Path]) -> pd.DataFrame:
    """Load a polygenic score file; delegates to ``read_raw_zipped_polygenic_score_file``."""
    return read_raw_zipped_polygenic_score_file(file_name=file_name)

## Load variant files

In [None]:
# Load the VCF records (read from the cache file when it exists) and show the table's dimensions.
vcf_df = load_vcf_to_df(vcf_files=vcf_file_paths, cache_file_name=cache_file_name)
vcf_df.shape

In [None]:
# Rows whose POS equals 7383583; the match is on position only, the chromosome is not checked.
df1 = vcf_df.loc[vcf_df["POS"] == 7383583]

## Load polygenic risk scores to analyse

### For Alzheimer's

In [None]:
# Read the PGS000025 scoring file into a DataFrame and display it.
polygenic_risk_score_file_alzheimer = "data/PGS000025.txt.gz"
pgs_025_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_alzheimer)
pgs_025_df

### For Schizophrenia

In [None]:
# Read the PGS000133 scoring file into a DataFrame and display it.
polygenic_risk_score_file_schizophrenia = "data/PGS000133.txt.gz"
pgs_133_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_schizophrenia)
pgs_133_df

## Get my genotype for the disease

In [None]:
# Open the BAM alignment file in binary read mode ("rb"); the functions below read it via `alignment_data`.
bam_file = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.bam"
alignment_data = pysam.AlignmentFile(bam_file, "rb")


In [None]:
def get_read_values_for_allele(chrom: Union[int,str], pos: int) -> Dict[int, List[str]]:
    """Collect the base call of every read covering one reference position.

    Reads from the module-level ``alignment_data`` alignment file.

    Args:
        chrom: Chromosome name; converted with ``str`` before the lookup.
        pos: Reference position, treated as 1-based (see the comment below).

    Returns:
        ``{pos: [call, ...]}`` with one call per read. A read with a deletion
        at the position contributes ``"DEL"``. The dict is empty when no read
        provides a call for ``pos``.
    """
    sequence: Dict[int, List[str]] = {}
    for pileupcolumn in alignment_data.pileup(str(chrom), pos - 1, pos + 1):
        # pysam documents pileup column positions as 0-based, whereas `pos`
        # follows the 1-based convention, hence the +1.
        if pileupcolumn.pos + 1 != pos:
            continue
        reads_at_current_position = []
        for pileupread in pileupcolumn.pileups:
            if pileupread.is_del:
                reads_at_current_position.append("DEL")
            elif pileupread.query_position is None:
                # Reference skip: pysam reports no query position, so the
                # read has no base here and indexing with None would fail.
                continue
            else:
                reads_at_current_position.append(pileupread.alignment.query_sequence[pileupread.query_position])
        if not reads_at_current_position:
            print(f"Chromosome {chrom} position {pos} does not have any READS")
            continue
        sequence[pos] = reads_at_current_position
    return sequence

def genotype_from_reads(reads):
    """Derive a two-character genotype from the calls observed at one position.

    Args:
        reads: Iterable of calls. ``"A"``, ``"C"``, ``"G"``, ``"T"`` are
            counted as bases; ``"D"`` and ``"DEL"`` are both counted as a
            deletion. Any other value (e.g. ``"N"``) is ignored.

    Returns:
        The most frequent call twice (e.g. ``"AA"``) when it makes up more
        than 90% of the counted calls, otherwise the two most frequent calls
        in descending order of frequency (e.g. ``"AG"``). A deletion is
        written as ``"D"``.

    Raises:
        ValueError: If ``reads`` contains no countable call.
    """
    counts = {"A": 0, "C": 0, "G": 0, "T": 0, "D": 0}
    for read in reads:
        # Deletions may be reported as "DEL"; fold them into the
        # one-character code so the genotype stays two characters long.
        call = "D" if read == "DEL" else read
        if call in counts:
            counts[call] += 1
    total = sum(counts.values())
    if total == 0:
        raise ValueError("Cannot derive a genotype: no A/C/G/T/deletion calls among the reads")
    # sorted() is stable, so ties keep the A, C, G, T, D order of `counts`.
    first, second = sorted(counts, key=counts.__getitem__, reverse=True)[:2]
    if counts[first] / total > 0.9:
        return f"{first}{first}"
    return f"{first}{second}"

def calculate_chromosome_read_values(loci_df: pd.DataFrame) -> Dict[str, Any]:
    """Collect the read calls for every locus listed in ``loci_df``.

    Args:
        loci_df: DataFrame with ``chr_name`` and ``chr_position`` columns.

    Returns:
        ``{chromosome: {position: [call, ...]}}``. Every chromosome that
        appears in ``loci_df`` gets an entry, even when none of its positions
        produced reads.
    """
    chromosome_read_values: Dict[str, Any] = {}
    for entry in loci_df.to_dict(orient="records"):
        chrom = entry["chr_name"]
        allele_read_values = get_read_values_for_allele(chrom, int(entry["chr_position"]))
        # Update the per-chromosome dict in place instead of rebuilding a
        # merged copy for every locus.
        chromosome_read_values.setdefault(chrom, {}).update(allele_read_values)
    return chromosome_read_values

def calc_genotypes(loci_df: pd.DataFrame) -> pd.DataFrame:
    """Compute my genotype at every locus of ``loci_df`` that has reads.

    Args:
        loci_df: DataFrame with ``chr_name`` and ``chr_position`` columns.

    Returns:
        DataFrame with columns ``chr``, ``pos`` and ``genotype`` (string),
        one row per position for which reads were found.
    """
    chromosome_read_values = calculate_chromosome_read_values(loci_df)

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for chrom, pos_reads in chromosome_read_values.items():
        for pos, reads in pos_reads.items():
            rows.append({"chr": chrom, "pos": pos, "genotype": genotype_from_reads(reads)})
    seq = pd.DataFrame(rows, columns=["chr", "pos", "genotype"])
    seq["genotype"] = seq["genotype"].astype(str)
    return seq

def get_my_genotypes_for_pgs(pgs_df: pd.DataFrame, cache_file_name: str, filter: bool = False) -> pd.DataFrame:
    """Return my genotypes for the loci of a polygenic score, cached as CSV.

    Args:
        pgs_df: Score table with ``chr_name``, ``chr_position`` and
            ``effect_weight`` columns.
        cache_file_name: File name of the CSV cache inside ``data/``. When
            the cache exists it is returned and the other arguments are
            ignored.
        filter: When true, only loci whose absolute effect weight exceeds
            the mean absolute effect weight are genotyped.

    Returns:
        DataFrame of genotypes, freshly computed or read from the cache.
    """
    cache_file = f"data/{cache_file_name}"
    if Path(cache_file).exists():
        return pd.read_csv(cache_file, index_col=None)

    loci = pgs_df
    if filter:
        abs_weight = pgs_df["effect_weight"].abs()
        loci = pgs_df[abs_weight > abs_weight.mean()]
    my_genotypes = calc_genotypes(loci)
    my_genotypes.to_csv(cache_file, index=None)
    return my_genotypes

def merge_pgs_with_my_genotype(pgs_df: pd.DataFrame, my_genome_df: pd.DataFrame) -> pd.DataFrame:
    """Join my genotypes with the score table on chromosome and position.

    Only loci present in both tables are kept. The result has the columns
    ``chr``, ``pos``, ``genotype``, ``effect_allele``, ``reference_allele``
    and ``effect_weight``.
    """
    output_columns = ["chr", "pos", "genotype", "effect_allele", "reference_allele", "effect_weight"]
    joined = pd.merge(
        my_genome_df,
        pgs_df,
        left_on=["chr", "pos"],
        right_on=["chr_name", "chr_position"],
    )
    return joined[output_columns]

def filter_out_none_effect_alleles(merged_pgs_with_my_genotype):
    """Keep the rows where either genotype character equals the effect allele."""
    genotype = merged_pgs_with_my_genotype["genotype"]
    effect_allele = merged_pgs_with_my_genotype["effect_allele"]
    first_matches = genotype.map(lambda g: g[0]).eq(effect_allele)
    second_matches = genotype.map(lambda g: g[1]).eq(effect_allele)
    return merged_pgs_with_my_genotype[first_matches | second_matches]

def filter_out_effect_alleles(merged_pgs_with_my_genotype):
    """Keep the rows where neither genotype character equals the effect allele."""
    genotype = merged_pgs_with_my_genotype["genotype"]
    effect_allele = merged_pgs_with_my_genotype["effect_allele"]
    first_differs = genotype.map(lambda g: g[0]).ne(effect_allele)
    second_differs = genotype.map(lambda g: g[1]).ne(effect_allele)
    return merged_pgs_with_my_genotype[first_differs & second_differs]

def get_genotype_for_chrom_pos(chrom: str, pos: int) -> str:
    """Return my genotype at a single chromosome position.

    Raises:
        Exception: If no reads were found for the position.
    """
    reads = get_read_values_for_allele(chrom, pos)
    if len(reads) == 0:
        raise Exception(f"no reads found for chr{chrom}:{pos}")
    return genotype_from_reads(reads[pos])

In [None]:
# Look up my genotype at two individual loci and display each under its label.
print("ALPHA-1 ANTITRYPSIN DEFICIENCY")
display(get_genotype_for_chrom_pos("14", 94847386))
print("Aspirin")
display(get_genotype_for_chrom_pos("5", 179220638))


In [None]:
%%time
# Genotypes at the PGS000025 loci (computed once, then read from the CSV cache under data/).
my_genotypes_for_pgs_025_cache_file = "my_genotypes_for_pgs_025.csv"
my_genotypes_for_pgs_025 = get_my_genotypes_for_pgs(pgs_025_df, my_genotypes_for_pgs_025_cache_file)
display(my_genotypes_for_pgs_025)

In [None]:
%%time
# Genotypes at the PGS000133 loci (computed once, then read from the CSV cache under data/); no weight filtering.
my_genotypes_for_pgs_133_cache_file = "my_genotypes_for_pgs_133.csv"
my_genotypes_for_pgs_133 = get_my_genotypes_for_pgs(pgs_133_df, my_genotypes_for_pgs_133_cache_file, filter=False)
display(my_genotypes_for_pgs_133)

## Search for Alzheimer's

### Polygenic risk score

In [None]:
# Display the PGS000025 score table.
pgs_025_df

### Select my alleles for list in PGS

In [None]:
# Display my genotypes at the PGS000025 loci.
my_genotypes_for_pgs_025

In [None]:
# Join my genotypes with the score table, then keep only the loci where my genotype contains the effect allele.
my_alzheimers_snps_df = merge_pgs_with_my_genotype(pgs_025_df, my_genotypes_for_pgs_025)
my_alzheimers_snps_df = filter_out_none_effect_alleles(my_alzheimers_snps_df)
my_alzheimers_snps_df

In [None]:
# Sum of the effect weights over those loci; each locus counts once, whether I carry one or two copies.
my_alzheimers_snps_df["effect_weight"].sum()

### Score combined as a weighted sum of allele dosages multiplied by their corresponding effect sizes

In [None]:
# Per-locus effect = (number of effect alleles in my genotype) * effect_weight.
merged_df = merge_pgs_with_my_genotype(pgs_025_df, my_genotypes_for_pgs_025).assign(
    # 1 when the first / second genotype character is the effect allele, else 0.
    effect_allele_1=lambda df: (df["genotype"].map(lambda g: g[0]) == df["effect_allele"]).astype(int),
    effect_allele_2=lambda df: (df["genotype"].map(lambda g: g[1]) == df["effect_allele"]).astype(int),
    gene_dosage=lambda df: df["effect_allele_1"] + df["effect_allele_2"],
    effect=lambda df: df["gene_dosage"] * df["effect_weight"],
)
merged_df

In [None]:
# Total score: sum of the per-locus effects.
merged_df["effect"].sum()

## Search for schizophrenia

### Get disease related SNPs

In [None]:
# Load a saved NCBI response and map every rsID to [chromosome, position].
schizophrenia_snvs_ncbi_response = "/home/s/src/search_your_dna/.idea/httpRequests/2020-10-28T094336.200.json"
with open(schizophrenia_snvs_ncbi_response, encoding="utf-8") as f:
    schizophrenia_data = json.load(f)
rsIDs_schizophrenia = schizophrenia_data["result"]["uids"]


schizophrenia_chr_positions = {}
for rsID in rsIDs_schizophrenia:
    variant = schizophrenia_data["result"][rsID]
    # The sort fields are zero-padded; drop the padding from the chromosome name.
    chromosome = variant["chr_sort"].lstrip("0")
    # int() already accepts zero-padded digits. Stripping the zeros first
    # would turn an all-zero value into "" and make int() raise.
    position = int(variant["location_sort"])
    schizophrenia_chr_positions[rsID] = [chromosome, position]

### Select only the disease variants that I have

In [None]:
# NOTE(review): 99999999999999999999 looks like a placeholder for variants
# without a known location - confirm; such entries are left out here.
schizophrenia_snv_positions = [
    np.int64(chr_and_position[1])
    for chr_and_position in schizophrenia_chr_positions.values()
    if chr_and_position[1] != 99999999999999999999
]
# Select my variants by position only; the chromosome is not compared.
my_schizophrenia_matches = vcf_df.loc[vcf_df["POS"].isin(schizophrenia_snv_positions)]
df_row_selector = vcf_df["POS"].isin(schizophrenia_snv_positions)
my_schizophrenia_matches

### Select my alleles for list in PGS

In [None]:
# Display my genotypes at the PGS000133 loci.
my_genotypes_for_pgs_133

In [None]:
# Join my genotypes with the PGS000133 score table, then split the loci into
# those where my genotype contains the effect allele ("for") and those where
# it does not ("against").
my_schizophrenia_snps_df = merge_pgs_with_my_genotype(pgs_133_df, my_genotypes_for_pgs_133)
my_for_schizophrenia_snps_df = filter_out_none_effect_alleles(my_schizophrenia_snps_df)
my_against_schizophrenia_snps_df = filter_out_effect_alleles(my_schizophrenia_snps_df)
display(my_for_schizophrenia_snps_df)
display(my_against_schizophrenia_snps_df)

In [None]:
# Per-locus effect = (number of effect alleles in my genotype) * effect_weight.
merged_df = merge_pgs_with_my_genotype(pgs_133_df, my_genotypes_for_pgs_133).assign(
    # 1 when the first / second genotype character is the effect allele, else 0.
    effect_allele_1=lambda df: (df["genotype"].map(lambda g: g[0]) == df["effect_allele"]).astype(int),
    effect_allele_2=lambda df: (df["genotype"].map(lambda g: g[1]) == df["effect_allele"]).astype(int),
    gene_dosage=lambda df: df["effect_allele_1"] + df["effect_allele_2"],
    effect=lambda df: df["gene_dosage"] * df["effect_weight"],
)
merged_df

In [None]:
# Total score: sum of the per-locus effects.
merged_df["effect"].sum()

In [None]:
# Close the alignment file; the genotype helpers that read `alignment_data` cannot be used after this.
alignment_data.close()

### Collect PGS data

#### Get all traits available in pgs catalogue

In [None]:
def get_all_pgs_api_data(api_endpoint: str):
    """Fetch all result pages of a PGS Catalog REST endpoint, cached as JSON.

    Args:
        api_endpoint: Endpoint path below ``https://www.pgscatalog.org/rest/``,
            e.g. ``"trait/all"``. Slashes are replaced by dashes to build the
            cache file name inside ``data/``.

    Returns:
        The concatenated ``results`` entries of every page, or the content
        of the cache file when it exists.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    cache_file = f"data/pgs_catalog_{api_endpoint.replace('/', '-')}.json"
    if Path(cache_file).exists():
        print(f"Found cache file {cache_file}. Loading data from cache.")
        with open(cache_file, "r") as f:
            return json.load(f)
    limit = 50
    offset = 0
    traits = []
    while True:
        url = f"https://www.pgscatalog.org/rest/{api_endpoint}?limit={limit}&offset={offset}"
        print(f"Requesting pgs data from {url}")
        # A timeout keeps a stalled connection from hanging the notebook.
        traits_response = requests.get(url=url, timeout=60)
        # Surface HTTP errors directly instead of as a KeyError on "results".
        traits_response.raise_for_status()
        data = traits_response.json()
        traits.extend(data["results"])
        if data.get("next") is None:
            break
        offset += limit
    # Make sure the cache directory exists before writing.
    Path(cache_file).parent.mkdir(parents=True, exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(traits, f)
    return traits

In [None]:
# Fetch every trait from the PGS Catalog "trait/all" endpoint (or load it from the local cache).
all_traits_result = get_all_pgs_api_data("trait/all")

In [None]:
# Put the trait records into a DataFrame, print its column names and display it.
all_pgs_traits_df = pd.DataFrame(all_traits_result)
print(all_pgs_traits_df.columns)
all_pgs_traits_df

In [None]:
# Unique PGS ids across the "associated_pgs_ids" lists of all traits.
pgs_ids = {
    associated_id
    for associated_ids in all_pgs_traits_df["associated_pgs_ids"].to_list()
    for associated_id in associated_ids
}

#### Get pgs entities from pgs

In [None]:
def download_file(url: str, local_filename: str) -> None:
    """Stream the content at ``url`` into ``local_filename``.

    The data is first written to ``<local_filename>.part`` and only renamed
    to ``local_filename`` after the download finished, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address to download from.
        local_filename: Destination path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    partial_file = Path(f"{local_filename}.part")
    # A timeout keeps a stalled connection from hanging the notebook.
    with requests.get(url, stream=True, timeout=60) as r:
        # Check the status before writing, so an error page is never
        # stored as if it were the requested file.
        r.raise_for_status()
        with open(partial_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    partial_file.replace(local_filename)

def read_or_download_pgs_scoring_file(pgs_id: str):
    """Return the scoring file of ``pgs_id`` as a DataFrame, downloading it if needed.

    The file is cached as ``data/<pgs_id>.txt.gz``. When the cache is
    missing, the score's metadata is requested from the PGS Catalog REST API
    and the file referenced by its ``ftp_scoring_file`` entry is downloaded.

    Args:
        pgs_id: PGS Catalog score id, e.g. ``"PGS000025"``.

    Returns:
        The parsed scoring file.

    Raises:
        requests.HTTPError: If the metadata request answers with an error status.
    """
    cache_file = f"data/{pgs_id}.txt.gz"
    if Path(cache_file).exists():
        print(f"Found cache file {cache_file}. Loading data from cache.")
    else:
        url = f"https://www.pgscatalog.org/rest/score/{pgs_id}"
        print(f"Requesting pgs data from {url}")
        # A timeout keeps a stalled connection from hanging the notebook.
        data = requests.get(url, timeout=60)
        # Surface HTTP errors directly instead of as a JSON/KeyError below.
        data.raise_for_status()
        scoring_file_url = data.json()["ftp_scoring_file"]
        download_file(scoring_file_url, cache_file)
    return read_raw_zipped_polygenic_score_file(cache_file)

#### Download all pgs scoring files

In [None]:
# Best effort: a failure for one score is reported and the loop moves on to the next.
for pgs_id in sorted(pgs_ids):
    try:
        read_or_download_pgs_scoring_file(pgs_id)
    except Exception as e:
        # Name the failing score so it can be retried or inspected.
        print(f"Something went wrong when parsing pgs file {pgs_id}:", e)
    else:
        time.sleep(0.5)  # Not to overload api with requests

### Creating snp database
#### Get all SNP chr/pos values from NCBI

Available for download from https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
as `00-All.vcf.gz`.

#### Store results in a sqlite db

In [None]:
# Source VCF listing all SNPs, and the SQLite database the chromosome/position pairs are stored in.
all_rsid_file = "/home/s/src/search_your_dna/data/00-All.vcf"
vcf_database_file = "/home/s/src/search_your_dna/data/ncbi_snpdb_all_ids.sqlite"
# Module-level connection and cursor, shared by the helper functions below.
conn = sqlite3.connect(vcf_database_file)
cursor = conn.cursor()

In [None]:
# Create the (chrom, pos) table unless it already exists.
cursor.execute('''create table if not exists
	all_snp_pos
(
	chrom text,
	pos int
)''')
# The unique index makes inserting an already stored (chrom, pos) pair fail.
cursor.execute('''create unique index if not exists
	all_snp_pos_chrom_pos_index
on
	all_snp_pos
(
	chrom,
	pos
)''')
conn.commit()

In [None]:
def chunked_insert(all_values: Iterable, chunk_size = 10000):
    """Insert ``(chrom, pos)`` pairs into ``all_snp_pos``, committing per chunk.

    Uses the module-level ``cursor`` and ``conn``.

    Args:
        all_values: Iterable of ``(chrom, pos)`` tuples. Nothing happens
            when it is empty.
        chunk_size: Number of rows inserted per commit.
    """
    all_values = list(all_values)
    if not all_values:
        # Nothing to insert; also avoids indexing into an empty list below.
        return
    print(f"Inserting chrom {all_values[0][0]} values, totalling {len(all_values)}")
    for i in range(0, len(all_values), chunk_size):
        # Bound parameters instead of SQL text assembled from repr(): the
        # values are quoted by the driver, and slicing past the end of the
        # list simply yields the shorter final chunk.
        cursor.executemany(
            "INSERT INTO all_snp_pos (chrom,pos) VALUES (?, ?)",
            all_values[i:i + chunk_size],
        )
        conn.commit()

def database_is_already_populated():
    """Return True when the ``all_snp_pos`` table holds at least one row.

    Uses the module-level ``cursor``.
    """
    cursor.execute("SELECT * FROM all_snp_pos limit 1")
    return bool(cursor.fetchall())

def persist_all_snps_to_db(file_name: Union[str, Path]) -> None:
    """Store the (chromosome, position) pair of every record of a VCF file.

    Lines up to and including the ``#CHROM ...`` column-header row are
    skipped. The remaining records are collected per chromosome and handed to
    ``chunked_insert`` whenever the chromosome changes, and once more at the
    end of the file.

    Args:
        file_name: Path of an uncompressed, tab-separated VCF file.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    header_regex = re.compile(r"#CHROM\s+POS\s+ID\s+REF\s+ALT\s+QUAL\s+FILTER\s+INFO")
    snps = set()
    # No chromosome seen yet. Assuming a fixed first chromosome would
    # trigger a flush of the still-empty set when the file starts elsewhere.
    last_chrom = None
    with open(str(file_name), "r") as f:
        passed_header = False
        for line_text in f:
            if not passed_header:
                passed_header = header_regex.search(line_text) is not None
                continue
            # Only the first two columns are needed; do not split the rest.
            line_parts = line_text.split("\t", 2)
            if len(line_parts) < 2:
                # Blank or malformed line without a position column.
                continue
            current_chrom = line_parts[0]
            if current_chrom != last_chrom:
                if snps:
                    chunked_insert(all_values=snps)
                    snps = set()
                last_chrom = current_chrom
            snps.add((current_chrom, line_parts[1]))

    # Persist the final chromosome as well, unless the file had no records.
    if snps:
        chunked_insert(all_values=snps)

In [None]:
# Load every (chromosome, position) pair of the source VCF into the SQLite table.
persist_all_snps_to_db(all_rsid_file)

In [None]:
# Chromosome names: '1' to '22', followed by 'MT', 'X' and 'Y'.
chrom_list = [str(number) for number in range(1, 23)] + ['MT', 'X', 'Y']

In [None]:
# List the distinct chromosome names stored in the database.
all_snp_pos = pd.read_sql("SELECT distinct (chrom) FROM all_snp_pos", con=conn)
all_snp_pos

#### find my genotype for all SNP values

#### store SNP values in sqlite database

### Calculate PGS for my dna

