### Download wrapper function for plink

In [None]:
!pip install -U cython numpy

In [None]:
!git clone https://github.com/KangchengHou/dask-pgen.git

In [None]:
!cd dask-pgen; pip install -e .

In [None]:
# Mark the dapgen command-line script as executable so it can be invoked directly.
!chmod +x dask-pgen/bin/dapgen

In [None]:
# Run the `score` subcommand with no arguments — presumably a smoke test of the
# install (expected to print usage or an argument error); confirm.
!./dask-pgen/bin/dapgen score

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import seaborn as sns
from tqdm import tqdm
from IPython.display import display, Markdown, Latex

DATASET = os.getenv('WORKSPACE_CDR')
bucket = os.getenv('WORKSPACE_BUCKET')
print(f"DATASET={DATASET}")
print(f"bucket={bucket}")

LOCALDIR = "pgs-test"
!mkdir -p {LOCALDIR}/genotype/
!mkdir -p {LOCALDIR}/pgs_file/
!gsutil ls gs://path/to/genotype/
!gsutil cp gs://path/to/genotype/* {LOCALDIR}/genotype/


pgs_dict = {
    "PGS004759": "PGS_Catalog",
}

column_dict = {
    "hm_chr": "CHROM",
    "hm_pos": "POS",
    "other_allele": "REF",
    "effect_allele": "ALT",
    "effect_weight": "WEIGHT"
}

for pgs_id in tqdm(pgs_dict.keys()):
    pgs_df = pd.read_csv(
        "https://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/" + 
        f"{pgs_id}/ScoringFiles/Harmonized/{pgs_id}_hmPOS_GRCh38.txt.gz", 
        sep='\t', comment='#', low_memory=False
    )[column_dict.keys()].rename(columns=column_dict)
    print(f"{len(pgs_df)} SNPs for {pgs_id}/{pgs_dict[pgs_id]}")
    pgs_df["CHROM"] = pgs_df["CHROM"].astype(str)
    pgs_df = pgs_df[pgs_df.CHROM.isin(np.arange(1, 23).astype(str))].dropna()
    pgs_df['POS'] = pgs_df["POS"].astype(int)
    pgs_df.to_csv(f"{LOCALDIR}/pgs_file/{pgs_id}.weight.tsv", sep='\t', index=False)
    print(f"{len(pgs_df)} SNPs for after filtering")
    
    !./dask-pgen/bin/dapgen score \
        --plink "{LOCALDIR}/genotype/*.bed" \
        --weights {LOCALDIR}/pgs_file/{pgs_id}.weight.tsv \
        --out {LOCALDIR}/pgs_file/{pgs_id}.score.tsv \
        --chrom-col CHROM --pos-col POS --alt-col ALT --ref-col REF --weight-col-prefix WEIGHT \
        --threads 12 --memory 30000

In [None]:
# Calculate PGS using weights from Als et al. 
column_dict = {
    "CHR": "CHROM",
    "POS_hg37": "POS_37",
    "POS_hg38": "POS",
    "A1": "REF",
    "A2": "ALT",
    "WEIGHT": "WEIGHT"
}

pgs_df = pd.read_csv(
    "./pgs-test/score/als_weights_pgs_hg38.txt",
    sep='\t', low_memory=False
)[column_dict.keys()].rename(columns=column_dict)
print(f"{len(pgs_df)} SNPs")
pgs_df["CHROM"] = pgs_df["CHROM"].astype(str)
pgs_df = pgs_df[pgs_df.CHROM.isin(np.arange(1, 23).astype(str))].dropna()
pgs_df['POS'] = pgs_df["POS"].astype(int)
pgs_df.to_csv(f"{LOCALDIR}/pgs_file/MDD.weight.tsv", sep='\t', index=False)
print(f"{len(pgs_df)} SNPs for after filtering")

!./dask-pgen/bin/dapgen score \
        --plink "{LOCALDIR}/genotype/*.bed" \
        --weights {LOCALDIR}/pgs_file/MDD.weight.tsv \
        --out {LOCALDIR}/pgs_file/MDD.score.tsv \
        --chrom-col CHROM --pos-col POS --alt-col REF --ref-col ALT --weight-col-prefix WEIGHT \
        --threads 12 --memory 30000