# Calculating my polygenic scores

In [1]:
import traceback
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import sqlite3
from typing import List, Optional, Tuple, Dict
import pysam
import rsidx
from tqdm import tqdm
from search_your_dna.pgscatalog import read_or_download_pgs_scoring_file, PGS_METHOD_MAPPING_TO_METHOD_CATEGORIES, \
    calc_polygenic_score, calc_all_polygenic_scores, to_gene_dosage_df, clean_rsids, do_polygenic_score_calculation, \
    calc_all_polygenic_scores_parallel
from search_your_dna.util import read_raw_zipped_polygenic_score_file, \
    read_raw_zipped_polygenic_score_file_with_chrom_pos, search_for_rsids

# Path to the personal VCF file that the score calculations below read.
my_vcf_file = "data/GFX0237425.GRCh38.p7.annotated.hg38_multianno.updated.vcf"
# Forwarded as `max_pgs_alleles` to calc_polygenic_score; presumably an upper bound on
# the number of alleles a PGS scoring file may contain — TODO confirm against pgscatalog.
max_pgs_alleles = 200

## Calculate scores for PGS score files that contain rsids

In [3]:
# Score one PGS Catalog file; no hg19 rsid/chrom-pos mapping file is supplied
# (empty string) for this run.
pgs_file = "data/pgs/PGS000001.txt.gz"
res1 = calc_polygenic_score(
    my_vcf_file=my_vcf_file,
    pgs_file=pgs_file,
    hg19_rsid_chrom_pos_mapping_file="",
    max_pgs_alleles=max_pgs_alleles,
)
# Last expression of the cell: display the returned result.
res1


calc pgs based on rsid


(1.1707461914041657,
           rsid  effect_weight  gene_dosage    effect
 0    rs2736108      -0.064112            1 -0.064112
 1    rs2588809       0.064570            2  0.129140
 2     rs999737      -0.079151            1 -0.079151
 3     rs865686      -0.107029            2 -0.214057
 4    rs2981579       0.225062            1  0.225062
 5    rs7072776       0.056475            2  0.112950
 6   rs13387042      -0.128515            1 -0.128515
 7   rs16857609       0.069619            1  0.069619
 8    rs4973768       0.089658            2  0.179316
 9   rs10941679       0.113150            1  0.113150
 10    rs889312       0.111184            2  0.222367
 11   rs2046210       0.046024            1  0.046024
 12  rs13281615       0.090754            1  0.090754
 13   rs2380205      -0.023166            2 -0.046333
 14  rs10995190      -0.155134            1 -0.155134
 15    rs704010       0.067565            1  0.067565
 16   rs1292011      -0.081319            1 -0.081319
 17   r

## Calculate scores for PGS score files that contain only chrom/pos values

### Download the rsid-to-hg19 chrom/pos metadata file with ANNOVAR

In [None]:
# Download the hg19 `avsnp150` database from the ANNOVAR mirror into data/humandb/.
# NOTE(review): the script path ~/bin/annovar is machine-specific — adjust to your ANNOVAR install.
!~/bin/annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar avsnp150 data/humandb/

### Create tabix index for metadata file

In [None]:
# Compress the downloaded table with bgzip; -c writes to stdout, so the original .txt is kept.
!bgzip -c data/humandb/hg19_avsnp150.txt > data/humandb/hg19_avsnp150.txt.gz
# Build the tabix index: column 1 is the sequence name, column 2 the start, column 3 the end position.
!tabix --begin 2 --end 3 --sequence 1 data/humandb/hg19_avsnp150.txt.gz

### Calculate pgs score

In [None]:
# Score a PGS file, this time passing the bgzipped avsnp150 table as the
# hg19 rsid/chrom-pos mapping file.
hg19_rsid_chrom_pos_mapping_file = "data/humandb/hg19_avsnp150.txt.gz"
pgs_file = "data/pgs/PGS000007.txt.gz"

res2 = calc_polygenic_score(
    my_vcf_file=my_vcf_file,
    pgs_file=pgs_file,
    hg19_rsid_chrom_pos_mapping_file=hg19_rsid_chrom_pos_mapping_file,
    max_pgs_alleles=max_pgs_alleles,
)

## Calculating all PGS values for existing pgs files

In [2]:
# Score every PGS file matching data/pgs/PGS00*.txt.gz (in sorted filename order).
# NOTE(review): `errors` is returned but never inspected in this notebook.
all_pgs_scores, errors = calc_all_polygenic_scores_parallel(
    files=sorted(glob("data/pgs/PGS00*.txt.gz")),
    my_vcf_file=my_vcf_file,
)

calc pgs based on rsid for PGS000002


In [None]:
# Persist the scores without the DataFrame index. `index` is documented as a bool, so
# pass False explicitly instead of relying on None being falsy.
# Note: the output is tab-separated even though the file name ends in .csv.
all_pgs_scores.to_csv("data/pgs_results.csv", index=False, sep="\t")

In [None]:
# Order the results by score, then by PGS id (ascending, the pandas default).
all_pgs_scores = all_pgs_scores.sort_values(["score", "pgs_id"])

