# Get results based on my SNP values

In [None]:
import gzip
import json
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, List, Union, Dict

import numpy as np
import pandas as pd
from IPython.core.display import display


In [None]:
# Parquet file in which the concatenated VCF records are cached between runs.
cache_file_name = "data/vcf_records.parquet.gz"
# Gzipped VCF inputs; judging by the file names they hold CNV, indel, SNP and SV calls
# for the same sample.
vcf_file_paths = [Path(f) for f in [
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.cnv.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.indel.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.filtered.snp.vcf.gz",
    "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.sv.vcf.gz"
]]

In [None]:
def get_file_header_line_number(file_name: Union[str, Path], header_pattern: str) -> int:
    """Find the first line of a gzipped text file that matches a regular expression.

    Args:
        file_name: Path of the gzip-compressed file to scan.
        header_pattern: Regular expression searched for (``re.search``) in every
            line after decoding it as UTF-8.

    Returns:
        Zero-based index of the first matching line.

    Raises:
        Exception: If no line of the file matches ``header_pattern``.
    """
    with gzip.open(str(file_name), "r") as zipped:
        # The file is read as bytes, so each line is decoded before matching.
        for index, raw_line in enumerate(zipped):
            if re.search(header_pattern, raw_line.decode("utf-8")) is not None:
                return index
    raise Exception(f"Couldn't find header in file {file_name}. Expected header: {header_pattern}")

def get_vcf_file_header_line_number(file_name: Union[str, Path]) -> int:
    """Return the line index of the VCF column-header line (``#CHROM POS ID ...``).

    Args:
        file_name: Path of the gzip-compressed VCF file.

    Returns:
        Whatever ``get_file_header_line_number`` reports for the header pattern,
        i.e. the zero-based index of the first matching line.

    Raises:
        Exception: Propagated from ``get_file_header_line_number`` when the
            file has no such header line.
    """
    return get_file_header_line_number(
        file_name=file_name,
        # Raw string: "\s" inside a normal literal is an invalid escape sequence
        # that newer Python versions warn about. The pattern text is unchanged.
        header_pattern=r"#CHROM\s+POS\s+ID\s+REF\s+ALT\s+QUAL\s+FILTER\s+INFO\s+FORMAT"
    )

def get_polygenic_score_file_header_line_number(file_name: Union[str, Path]) -> int:
    """Return the line index of a polygenic score file's column-header line.

    The header is recognised by the column sequence
    ``rsID chr_name chr_position effect_allele``.

    Args:
        file_name: Path of the gzip-compressed scoring file.

    Returns:
        Whatever ``get_file_header_line_number`` reports for the header pattern,
        i.e. the zero-based index of the first matching line.

    Raises:
        Exception: Propagated from ``get_file_header_line_number`` when the
            file has no such header line.
    """
    return get_file_header_line_number(
        file_name=file_name,
        # Raw string: "\s" inside a normal literal is an invalid escape sequence
        # that newer Python versions warn about. The pattern text is unchanged.
        header_pattern=r"rsID\s+chr_name\s+chr_position\s+effect_allele"
    )

def read_raw_zipped_vcf_file(file_name: Union[str, Path]) -> pd.DataFrame:
    """Read the record table of a gzipped VCF file into a DataFrame.

    Everything above the ``#CHROM ...`` header line is skipped, so that line
    supplies the column names. All columns are read as strings; ``POS`` is
    then converted to ``int64``.

    Args:
        file_name: Path of the gzip-compressed VCF file.

    Returns:
        One row per VCF record.
    """
    header_row_number = get_vcf_file_header_line_number(file_name=file_name)
    # Raw string for the separator regex: "\s" in a normal literal is an invalid escape.
    result = pd.read_csv(file_name, sep=r"\s+", skiprows=header_row_number, dtype=str)
    result["POS"] = result["POS"].astype(np.int64)
    return result

def read_raw_zipped_polygenic_score_file(file_name: Union[str, Path]) -> pd.DataFrame:
    """Read the variant table of a gzipped polygenic score file into a DataFrame.

    Everything above the column-header line is skipped, so that line supplies
    the column names. All columns are read as strings; ``effect_weight`` is
    then converted to ``float64`` and ``chr_position`` to ``int64``.

    Args:
        file_name: Path of the gzip-compressed scoring file.

    Returns:
        One row per variant in the scoring file.
    """
    header_row_number = get_polygenic_score_file_header_line_number(file_name=file_name)
    # Raw string for the separator regex: "\s" in a normal literal is an invalid escape.
    result = pd.read_csv(file_name, sep=r"\s+", skiprows=header_row_number, dtype=str)
    # `np.float` was only an alias of the builtin float and no longer exists in
    # current NumPy releases; np.float64 is the dtype it resolved to.
    result["effect_weight"] = result["effect_weight"].astype(np.float64)
    result["chr_position"] = result["chr_position"].astype(np.int64)
    return result


def load_vcf_to_df(
    vcf_files: List[Union[str, Path]], cache_file_name: str = "data/vcf_records.parquet.gz"
) -> pd.DataFrame:
    """Load the records of several VCF files into one DataFrame, with a parquet cache.

    If ``cache_file_name`` exists it is returned as-is and ``vcf_files`` is
    ignored. Otherwise every file is parsed, the tables are concatenated with a
    fresh index, and the result is written to ``cache_file_name``.

    Args:
        vcf_files: Paths of the gzip-compressed VCF files to read.
        cache_file_name: Location of the parquet cache.

    Returns:
        The concatenated VCF records.

    Raises:
        ValueError: From ``pd.concat`` when no cache exists and ``vcf_files``
            is empty.
    """
    cache_path = Path(cache_file_name)
    if cache_path.exists():
        return pd.read_parquet(cache_file_name)

    dfs = []
    for vcf_file_path in vcf_files:
        print(f"Reading in source vcf file {vcf_file_path}")
        dfs.append(read_raw_zipped_vcf_file(vcf_file_path))
    raw_vcf_data = pd.concat(dfs, ignore_index=True)

    # Create the cache directory first, so a missing "data/" folder does not
    # discard the parsing work just done.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the ".gz" suffix does not by itself select gzip compression;
    # to_parquet uses its own default codec unless `compression=` is passed.
    raw_vcf_data.to_parquet(cache_file_name)
    return raw_vcf_data

def load_polygenic_score_file_to_df(file_name: Union[str, Path]) -> pd.DataFrame:
    """Load a gzipped polygenic score file; delegates to ``read_raw_zipped_polygenic_score_file``."""
    return read_raw_zipped_polygenic_score_file(file_name=file_name)

## Load variant files

In [None]:
# Load all variant records (from the parquet cache when it exists) and show the table's dimensions.
vcf_df = load_vcf_to_df(vcf_files=vcf_file_paths, cache_file_name=cache_file_name)
vcf_df.shape

In [None]:
# Records whose POS equals 7383583; the chromosome is not part of the condition.
df1 = vcf_df.loc[vcf_df["POS"] == 7383583]

## Search for Alzheimer's

In [None]:
# Read the PGS000025 scoring file into a DataFrame and display it.
polygenic_risk_score_file_alzheimer = "data/PGS000025.txt.gz"
pgs_025_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file_alzheimer)
pgs_025_df

## Search for schizophrenia

### Get disease related SNPs

In [None]:
# Saved JSON response containing the schizophrenia-related SNVs; the ids are listed
# under result.uids and each id has its own entry under result.
schizophrenia_snvs_ncbi_response = "/home/s/src/search_your_dna/.idea/httpRequests/2020-10-28T094336.200.json"
with open(schizophrenia_snvs_ncbi_response, encoding="utf-8") as f:
    schizophrenia_data = json.load(f)
rsIDs_schizophrenia = schizophrenia_data["result"]["uids"]


# Map every rsID to [chromosome, position], both taken from the zero-padded sort keys.
schizophrenia_chr_positions = {}
for rsID in rsIDs_schizophrenia:
    variant = schizophrenia_data["result"][rsID]
    chromosome = variant["chr_sort"].lstrip("0")
    # int() already ignores leading zeros. Stripping them by hand turned an
    # all-zero value into "" and made int() raise ValueError.
    position = int(variant["location_sort"])
    schizophrenia_chr_positions[rsID] = [chromosome, position]

### Select only disease variants that I have

In [None]:
# Positions of the disease SNVs as int64, leaving out entries whose position is
# the 99999999999999999999 placeholder (presumably "no known location"; it would
# not fit into an int64 anyway).
schizophrenia_snv_positions = [
    np.int64(entry[1])
    for entry in schizophrenia_chr_positions.values()
    if entry[1] != 99999999999999999999
]
# NOTE(review): rows are matched on POS alone, so a variant at the same position
# on a different chromosome is selected as well.
df_row_selector = vcf_df["POS"].isin(schizophrenia_snv_positions)
my_schizophrenia_matches = vcf_df[df_row_selector]
my_schizophrenia_matches

### Calculate polygenic score

In [None]:
# Read the PGS000133 scoring file into a DataFrame and display it.
polygenic_risk_score_file = "data/PGS000133.txt.gz"
pgs_133_df = read_raw_zipped_polygenic_score_file(polygenic_risk_score_file)
pgs_133_df

## Get genotype

In [None]:
import pysam
# Path of the BAM file with the aligned reads; this is a plain string, not a file handle.
bam_file = "/home/s/Dropbox/Siim/health/genetest_2020/GFX0237425.bam"
# Open handle on the BAM file ("rb" = read a binary BAM); used by the pileup lookups below.
alignment_data = pysam.AlignmentFile(bam_file, "rb")
# Open handle on the reference FASTA (GRCh37.p13 according to the file name).
ref_data = pysam.FastaFile("/home/s/src/data/ncbi-genomes-2020-11-01/GCF_000001405.25_GRCh37.p13_genomic.fna")


In [None]:
def get_read_values_for_allele(chrom: str, pos: int) -> Dict[int, List[str]]:
    """Collect what every read of ``alignment_data`` reports at a single position.

    Args:
        chrom: Contig name handed to ``alignment_data.pileup``.
        pos: Position to inspect. It is used as the start of the pileup region
            and compared directly with ``pileupcolumn.pos``.

    Returns:
        ``{pos: [symbol, ...]}`` with one symbol per usable read at the
        position: the read's base there, or ``"DEL"`` when the read has a
        deletion at that position. The dict is empty when no read provides a
        symbol.
    """
    # NOTE(review): pysam documents pileup regions and `pileupcolumn.pos` as
    # 0-based. If callers pass 1-based positions, this inspects the base one
    # position to the right - confirm the convention of the inputs.
    sequence: Dict[int, List[str]] = {}
    for pileupcolumn in alignment_data.pileup(chrom, pos, pos + 1):
        # pileup() also yields neighbouring columns covered by the same reads.
        if pileupcolumn.pos != pos:
            continue
        pileups = pileupcolumn.pileups
        if len(pileups) == 0:
            print(f"Chromosome {chrom} position {pos} does not have any READS")
            continue
        reads_at_current_position = []
        for pileupread in pileups:
            if pileupread.is_del:
                reads_at_current_position.append("DEL")
            elif pileupread.query_position is None:
                # No read base at this column (reference skip); indexing the
                # sequence with None would raise TypeError.
                continue
            else:
                reads_at_current_position.append(
                    pileupread.alignment.query_sequence[pileupread.query_position]
                )
        if not reads_at_current_position:
            # Every read was skipped above; treat it like an uncovered position.
            print(f"Chromosome {chrom} position {pos} does not have any READS")
            continue
        sequence[pileupcolumn.pos] = reads_at_current_position
    return sequence

def genotype_from_reads(reads: List[str]) -> str:
    """Call a two-allele genotype from the symbols observed at one position.

    Symbols are tallied over the categories ``A``, ``C``, ``G``, ``T`` and
    ``DEL``. If the most frequent category accounts for more than 90% of the
    tallied reads, the genotype is that category twice (e.g. ``"AA"``).
    Otherwise it is the most frequent category followed by the second most
    frequent one (e.g. ``"AG"``). Ties are resolved in the order
    A, C, G, T, DEL.

    Args:
        reads: Observed symbols, one per read. Anything outside the five
            categories (for example an uncalled base ``"N"``) is ignored
            instead of raising ``KeyError``.

    Returns:
        The concatenation of the two called alleles.

    Raises:
        ValueError: If ``reads`` contains no A/C/G/T/DEL symbol at all
            (previously this surfaced as ``ZeroDivisionError``).
    """
    counts = {"A": 0, "C": 0, "G": 0, "T": 0, "DEL": 0}
    for read in reads:
        if read in counts:
            counts[read] += 1

    total = sum(counts.values())
    if total == 0:
        raise ValueError("Cannot call a genotype: no A/C/G/T/DEL reads were supplied")

    # sorted() is stable (also with reverse=True), so equal counts keep the
    # A, C, G, T, DEL order of the dict.
    ranked = sorted(counts, key=counts.__getitem__, reverse=True)
    most_common, runner_up = ranked[0], ranked[1]
    if counts[most_common] / total > 0.9:
        return f"{most_common}{most_common}"
    return f"{most_common}{runner_up}"

def calculate_chromosome_read_values(loci_df: pd.DataFrame) -> Dict[str, Any]:
    """Gather the observed read symbols for every locus of ``loci_df``.

    Args:
        loci_df: Table with a ``chr_name`` and a ``chr_position`` column; one
            row per locus to look up.

    Returns:
        ``{chr_name: {position: [symbol, ...]}}`` as produced by
        ``get_read_values_for_allele``. A chromosome gets an entry (possibly
        empty) as soon as one of its loci has been looked up.
    """
    chromosome_read_values: Dict[str, Any] = {}
    for entry in loci_df.to_dict(orient="records"):
        chrom = entry["chr_name"]
        allele_read_values = get_read_values_for_allele(chrom, int(entry["chr_position"]))
        # Update the per-chromosome dict in place. Rebuilding it with
        # {**old, **new} for every locus copied all accumulated entries each time.
        chromosome_read_values.setdefault(chrom, {}).update(allele_read_values)
    return chromosome_read_values

def calc_genotypes(loci_df: pd.DataFrame) -> pd.DataFrame:
    """Call a genotype for every locus of ``loci_df`` that has reads.

    Args:
        loci_df: Table with a ``chr_name`` and a ``chr_position`` column.

    Returns:
        DataFrame with the columns ``chr``, ``pos`` and ``value`` (the called
        genotype), one row per position for which reads were found. It is
        empty, with those columns, when nothing was found.
    """
    chromosome_read_values = calculate_chromosome_read_values(loci_df)

    # Collect plain rows and build the frame once: DataFrame.append no longer
    # exists in current pandas and copied the whole frame for every row.
    rows = []
    for chrom, pos_reads in chromosome_read_values.items():
        for pos, reads in pos_reads.items():
            rows.append({"chr": chrom, "pos": pos, "value": genotype_from_reads(reads)})
    return pd.DataFrame(rows, columns=["chr", "pos", "value"])

def get_my_genotypes_for_pgs(pgs_df: pd.DataFrame, cache_file_name: str, filter: bool = False) -> pd.DataFrame:
    """Return the genotypes at the loci of a polygenic score, cached as CSV.

    If ``data/<cache_file_name>`` exists it is read and returned, and the other
    arguments are not used. Otherwise the genotypes are computed with
    ``calc_genotypes``, written to that file without an index column, and
    returned.

    Args:
        pgs_df: Scoring table; needs the columns used by ``calc_genotypes`` and,
            when ``filter`` is set, ``effect_weight``.
        cache_file_name: File name of the cache inside the ``data`` directory.
        filter: When true, only variants whose absolute ``effect_weight`` is
            above the mean absolute weight are genotyped.

    Returns:
        DataFrame of genotypes, freshly computed or read from the cache.
    """
    cache_file = Path(f"data/{cache_file_name}")
    if cache_file.exists():
        return pd.read_csv(cache_file, index_col=None)

    # Create the cache directory before the slow genotyping, so a missing
    # "data/" folder cannot make the final write fail.
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    if filter:
        pgs_df_abs_weight = np.abs(pgs_df["effect_weight"])
        pgs_df = pgs_df[pgs_df_abs_weight > pgs_df_abs_weight.mean()]
    my_genotypes = calc_genotypes(pgs_df)
    my_genotypes.to_csv(cache_file, index=False)
    return my_genotypes

In [None]:
%%time
# Genotypes at the loci of the PGS000025 table: computed from the BAM reads when the
# CSV cache under data/ is missing, read from that cache otherwise.
my_genotypes_for_pgs_025_cache_file = "my_genotypes_for_pgs_025.csv"
my_genotypes_for_pgs_025 = get_my_genotypes_for_pgs(pgs_025_df, my_genotypes_for_pgs_025_cache_file)
display(my_genotypes_for_pgs_025)

In [None]:
%%time
# Same for the PGS000133 table; filter=True restricts the lookup to variants whose
# absolute effect weight is above the mean (only applied when the cache is missing).
my_genotypes_for_pgs_133_cache_file = "my_genotypes_for_pgs_133.csv"
my_genotypes_for_pgs_133 = get_my_genotypes_for_pgs(pgs_133_df, my_genotypes_for_pgs_133_cache_file, filter=True)
display(my_genotypes_for_pgs_133)

In [None]:
# Split every genotype string into its two alleles. An allele is a single base or
# the three-letter "DEL" marker, so taking characters 0 and 1 misreads genotypes
# that contain "DEL" (for "DELDEL" it compared "D" with "E" and reported a
# heterozygous locus).
a = my_genotypes_for_pgs_133["value"].map(lambda x: "DEL" if x.startswith("DEL") else x[0])
b = my_genotypes_for_pgs_133["value"].map(lambda x: x[3:] if x.startswith("DEL") else x[1:])

# Show only the heterozygous loci (the two alleles differ).
my_genotypes_for_pgs_133[a != b]

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

In [None]:
# Without a path argument, to_csv() returns the CSV text, which the notebook shows as the cell output.
my_genotypes_for_pgs_025.to_csv()

In [None]:
# Release the BAM handle. `bam_file` is only the path string and has no close()
# (calling it raised AttributeError); the open handle is `alignment_data`.
alignment_data.close()

### Compile an overview