# Calculating my polygenic scores

In [None]:
import traceback
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import sqlite3
from typing import List, Optional, Tuple, Dict
import pysam
import rsidx
from tqdm import tqdm
from search_your_dna.pgscatalog import read_or_download_pgs_scoring_file, PGS_METHOD_MAPPING_TO_METHOD_CATEGORIES, \
    calc_polygenic_score, calc_all_polygenic_scores, to_gene_dosage_df, clean_rsids, do_polygenic_score_calculation, \
    calc_all_polygenic_scores_parallel, _do_calc_polygenic_score_single_input_arg, get_pgs_id_from_filename, \
    get_pgs_metadata
from search_your_dna.util import read_raw_zipped_polygenic_score_file, \
    read_raw_zipped_polygenic_score_file_with_chrom_pos, search_for_rsids

# Path to the annotated VCF file that every scoring call below receives as `my_vcf_file`.
my_vcf_file = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf"

## Calculate scores for PGS scoring files that contain rsIDs

In [None]:
# Score a single PGS file (PGS000325). An empty string is passed as the
# hg19 rsid/chrom-pos mapping file for this call.
pgs_file = "data/pgs/PGS000325.txt.gz"
res1 = calc_polygenic_score(
    my_vcf_file=my_vcf_file,
    pgs_file=pgs_file,
    hg19_rsid_chrom_pos_mapping_file="",
    max_pgs_alleles=200,
)
# Last expression of the cell, so the notebook displays the result.
res1


## Calculate scores for PGS scoring files that contain only chrom/pos values

### Download the rsID-to-hg19 chrom/pos metadata file with ANNOVAR

In [None]:
# Download the avsnp150 database for build hg19 from the annovar web source into data/humandb/.
!~/bin/annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar avsnp150 data/humandb/

### Create tabix index for metadata file

In [None]:
# Compress the avsnp150 table with bgzip; -c writes to stdout, so the uncompressed file is left in place.
!bgzip -c data/humandb/hg19_avsnp150.txt > data/humandb/hg19_avsnp150.txt.gz
# Build the tabix index: column 1 is the sequence name, column 2 the begin position, column 3 the end position.
!tabix --begin 2 --end 3 --sequence 1 data/humandb/hg19_avsnp150.txt.gz

### Calculate pgs score

In [None]:
# Score a single PGS file (PGS000004), this time supplying the bgzip-compressed
# hg19 avsnp150 table as the rsid/chrom-pos mapping file.
hg19_rsid_chrom_pos_mapping_file = "data/humandb/hg19_avsnp150.txt.gz"
pgs_file = "data/pgs/PGS000004.txt.gz"

res2 = calc_polygenic_score(
    my_vcf_file=my_vcf_file,
    pgs_file=pgs_file,
    hg19_rsid_chrom_pos_mapping_file=hg19_rsid_chrom_pos_mapping_file,
    max_pgs_alleles=5000,
)

## Calculating all PGS values for the existing PGS files

In [None]:
%%time
pgs_ids = [get_pgs_id_from_filename(file) for file in sorted(glob("data/pgs/PGS00*.txt.gz"))]
num_parallel_processes, max_pgs_alleles = 8, 100_000
all_pgs_scores = calc_all_polygenic_scores_parallel(pgs_ids=pgs_ids, my_vcf_file=my_vcf_file, num_parallel_processes=num_parallel_processes, max_pgs_alleles=max_pgs_alleles)
errors = all_pgs_scores[["pgs_id", "error"]][~all_pgs_scores["error"].isna()]

pgs_metadata_df = pd.DataFrame(columns=["pgs_id", "trait", "method_categorized", "method", "ancestry"])
for pgs_metadata_file in sorted(glob("data/pgs/PGS00*.json")):
    pgs_id = Path(pgs_metadata_file).stem
    pgs_metadata_df = pgs_metadata_df.append(get_pgs_metadata(pgs_id), ignore_index=True)


result_df = all_pgs_scores[["pgs_id", "score"]].set_index("pgs_id").join(pgs_metadata_df.set_index("pgs_id"), on="pgs_id")

result_df.to_csv("data/pgs_results.csv", index=None, sep="\t")