# Calculating my polygenic scores

In [None]:
import sys
import traceback
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import sqlite3
from typing import List, Optional, Tuple
import pysam
import rsidx
from tqdm import tqdm
from IPython.core.display import display
from search_your_dna.pgscatalog import read_or_download_pgs_scoring_file, PGS_METHOD_MAPPING_TO_METHOD_CATEGORIES, \
    MethodCategories
from search_your_dna.util import read_raw_zipped_polygenic_score_file

# Annotated VCF holding my variants; handed to rsidx as the file to search.
file_my_vcf = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.gz"
# rsidx index for the VCF above; search_for_rsids opens it as a SQLite database.
file_my_vcf_rsidx = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf.rsidx"

In [None]:
def search_for_rsids(rsids: List[str]) -> list:
    """Look up the records for the given rsids in my VCF via the rsidx index.

    Args:
        rsids: rsid strings to look up.

    Returns:
        A list holding every item yielded by ``rsidx.search.search`` for these
        rsids. The results are materialised before the index connection is
        closed, so the list is safe to use afterwards.
    """
    # ``with sqlite3.connect(...)`` only wraps a transaction (commit/rollback);
    # it does not close the connection, so close it explicitly to avoid
    # leaking one handle per call.
    db = sqlite3.connect(file_my_vcf_rsidx)
    try:
        return list(rsidx.search.search(rsids, db, file_my_vcf))
    finally:
        db.close()

# Example lookup; the rsid list is left empty here - add rsids to query specific variants.
rsids = []
search_for_rsids(rsids)

def _allele_count_from_info(info: str) -> int:
    """Return the integer value stored under the exact ``AC`` key of a VCF INFO field."""
    for entry in info.split(";"):
        key, sep, value = entry.partition("=")
        if sep and key == "AC":
            # Multi-allelic records list one count per ALT allele; keep the
            # first one, which is what the earlier positional parse picked up.
            return int(value.split(",")[0])
    raise ValueError(f"No AC entry found in INFO field: {info[:80]!r}")


def to_gene_dosage_df(variance_str: List[str]) -> pd.DataFrame:
    """Build a per-rsid dosage table from VCF record lines.

    Args:
        variance_str: VCF data lines (one whitespace-separated record each),
            as returned by ``search_for_rsids``.

    Returns:
        A DataFrame with columns ``rsid`` (third VCF column) and
        ``gene_dosage`` (numeric value of the INFO ``AC`` entry). The columns
        are present even when no lines are given.

    Raises:
        ValueError: if a line has fewer than 8 columns or its INFO field has
            no parseable ``AC`` entry.
    """
    gene_dosages = []
    for v in variance_str:
        fields = v.split()
        if len(fields) < 8:
            raise ValueError(f"Expected at least 8 VCF columns, got {len(fields)}: {v[:80]!r}")
        # Only ID (index 2) and INFO (index 7) are needed; any number of
        # FORMAT/sample columns may follow.
        rsid, info = fields[2], fields[7]
        gene_dosages.append({"rsid": rsid, "gene_dosage": _allele_count_from_info(info)})
    # Naming the columns keeps the frame usable (and mergeable) when empty.
    res = pd.DataFrame(gene_dosages, columns=["rsid", "gene_dosage"])
    res["gene_dosage"] = pd.to_numeric(res["gene_dosage"])
    return res


def clean_rsids(rsids: pd.Series, pgs_name: str) -> List[str]:
    """Filter a scoring file's rsid column down to plain, single rsids.

    Drops, in order, missing values, values that do not start with ``rs``,
    and values containing a comma or underscore. Each non-empty group of
    dropped values is reported with ``print``.

    Args:
        rsids: the rsid column of a polygenic score table.
        pgs_name: label used in the printed messages.

    Returns:
        The remaining rsids as a list; empty when nothing usable is left.
    """
    missing = rsids.isna()
    if missing.any():
        print(f"PGS {pgs_name} has {int(missing.sum())} missing rsids")
        rsids = rsids[~missing]
    if rsids.empty:
        # An all-missing column is float dtype and has no ``.str`` accessor.
        return []
    # Coerce so the string accessor works even if non-string values slipped in.
    rsids = rsids.astype(str)
    start_with_rs = rsids.str.startswith("rs").astype(bool)
    if (~start_with_rs).any():
        print(f"PGS {pgs_name} has {int((~start_with_rs).sum())} non rsid values")
        rsids = rsids[start_with_rs]
    values_with_commas_or_underscores = rsids.str.contains("[,_]").astype(bool)
    if values_with_commas_or_underscores.any():
        print(f"PGS {pgs_name} has {int(values_with_commas_or_underscores.sum())} rsids containing multiple values")
        rsids = rsids[~values_with_commas_or_underscores]
    return rsids.to_list()

def calc_polygenic_score(max_pgs_alleles: Optional[int] = None, pgs_df: Optional[pd.DataFrame] = None, pgs_file: Optional[str] = None) -> Tuple[float, pd.DataFrame]:
    """Compute a polygenic score for my VCF from one scoring table.

    Args:
        max_pgs_alleles: if given, refuse tables with more rows than this.
        pgs_df: scoring table with at least ``rsid`` and ``effect_weight``
            columns. When omitted, the table is read from ``pgs_file``.
        pgs_file: path of the scoring file; used to load the table when
            ``pgs_df`` is omitted, and as the label in messages.

    Returns:
        ``(score, merged_df)`` where ``score`` is the sum of
        ``gene_dosage * effect_weight`` and ``merged_df`` is the scoring table
        joined with the dosages found in the VCF.

    Raises:
        ValueError: if neither ``pgs_df`` nor ``pgs_file`` is given.
        Exception: if the table has more than ``max_pgs_alleles`` rows.
    """
    if pgs_df is None:
        if pgs_file is None:
            raise ValueError("Either pgs_df or pgs_file must be provided")
        pgs_df = read_raw_zipped_polygenic_score_file(pgs_file)
    if max_pgs_alleles is not None and len(pgs_df.index) > max_pgs_alleles:
        raise Exception(f"Too many snps for {pgs_file}. Total {len(pgs_df.index)}")
    # pgs_file is optional when a DataFrame is passed in; Path(None) would raise.
    pgs_name = Path(pgs_file).stem if pgs_file is not None else "<in-memory>"
    pgs_rsids = clean_rsids(pgs_df['rsid'], pgs_name)
    my_variance = search_for_rsids(pgs_rsids)
    gene_dosage_df = to_gene_dosage_df(my_variance)
    merged_df = pgs_df.merge(gene_dosage_df, on="rsid", how="outer")
    # Variants in the scoring table that were not found in the VCF count as dosage 0.
    merged_df["gene_dosage"] = merged_df["gene_dosage"].fillna(0)
    # NOTE(review): the dosage is applied without comparing the table's effect
    # allele to the VCF ALT allele - confirm this is intended.
    merged_df["effect"] = merged_df["gene_dosage"] * merged_df["effect_weight"]
    return merged_df["effect"].sum(), merged_df


# Score one scoring file; calc_polygenic_score raises if it has more than 1000 rows.
pgs, pgs_df = calc_polygenic_score(max_pgs_alleles=1000, pgs_file="data/pgs/PGS000034.txt.gz")
# Bare expression so the notebook shows the score as the cell output.
pgs

In [None]:
# Score every downloaded scoring file with at most 200 variants.
# Failures are collected in `errors` (message + traceback) instead of aborting the batch.
errors = {}
score_rows = []
for pgs_file in tqdm(sorted(glob("data/pgs/PGS00*.txt.gz"))):
    try:
        pgs, _ = calc_polygenic_score(max_pgs_alleles=200, pgs_file=pgs_file)
    except Exception as e:
        # Deliberately broad: one bad file must not stop the remaining ones.
        errors[pgs_file] = [str(e), ''.join(traceback.format_exception(None, e, e.__traceback__))]
    else:
        score_rows.append({"file": pgs_file, "score": pgs})
# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every iteration.
all_pgs_scores = pd.DataFrame(score_rows, columns=["file", "score"])
all_pgs_scores

In [None]:
# The stem of "<id>.txt.gz" is "<id>.txt"; cutting the last four characters leaves the bare id.
all_pgs_scores["pgs_id"] = all_pgs_scores["file"].map(lambda file_name: Path(file_name).stem[:-4])
# Put the id first, then persist the table as tab-separated text without the index.
column_order = ["pgs_id", "score", "file"]
all_pgs_scores = all_pgs_scores[column_order]
all_pgs_scores.to_csv("data/pgs_results.csv", sep="\t", index=None)

# Add PGS metadata

In [None]:
# Fetch the metadata of every scored PGS and attach method and trait columns.
pgs_score_dfs = []
method_values = []
method_parsed_values = []
trait_values = []
for _, pgs_row in tqdm(all_pgs_scores.iterrows(), total=len(all_pgs_scores)):
    metadata_json, _ = read_or_download_pgs_scoring_file(pgs_row["pgs_id"])
    method_name = metadata_json["method_name"]
    method_values.append(method_name)
    # .get() yields None for method names missing from the mapping.
    method_parsed_values.append(PGS_METHOD_MAPPING_TO_METHOD_CATEGORIES.get(method_name))
    trait_values.append(metadata_json["trait_reported"])

# Build each Series once, already aligned to the score table's index:
# Series.append was removed in pandas 2.0 and copied the data on every call.
method = pd.Series(method_values, index=all_pgs_scores.index, dtype=object)
method_parsed = pd.Series(method_parsed_values, index=all_pgs_scores.index, dtype=object)
trait = pd.Series(trait_values, index=all_pgs_scores.index, dtype=object)

all_pgs_scores_with_metadata = all_pgs_scores.copy(deep=True)
all_pgs_scores_with_metadata["method"] = method
all_pgs_scores_with_metadata["method_parsed"] = method_parsed
all_pgs_scores_with_metadata["trait"] = trait
all_pgs_scores_with_metadata = all_pgs_scores_with_metadata[["pgs_id", "trait", "score", "method_parsed"]]
all_pgs_scores_with_metadata