# PRS for imputed data

In [None]:
#set up Hail requirements on instance
import os
os.system('wget https://github.com/adoptium/temurin21-binaries/releases/download/jdk-21.0.4%2B7/OpenJDK21U-jre_x64_linux_hotspot_21.0.4_7.tar.gz')
!tar -xvzf OpenJDK21U-jre_x64_linux_hotspot_21.0.4_7.tar.gz
!pip install pyspark
# Set the JAVA_HOME environment variable
os.environ['JAVA_HOME'] = '/opt/notebooks/jdk-21.0.4+7-jre' 
os.environ['PATH'] = f"{os.environ['JAVA_HOME']}/bin:" + os.environ['PATH']

In [None]:
#set up Hail on instance
# Install Hail with pip (forcing a reinstall), then import it.
!pip install hail --force-reinstall
import hail as hl
# idempotent=True is passed so that calling init again (e.g. re-running this cell)
# is tolerated -- NOTE(review): confirm against the Hail version installed.
hl.init(idempotent=True)
# Use GRCh38 as the default reference genome for the Hail calls that follow.
hl.default_reference('GRCh38')

In [None]:
#Download .bgen and .sample files for imputed data for each chromosome 1-22 into instance
# NOTE: this cell is a template. The text in angle brackets looks like a placeholder
# that must be replaced with real file names, and the cell as written issues only
# one download per file type -- repeat (or loop) for every chromosome.
import os
os.system('dx download <bgen file name>')         
os.system('dx download <sample file name>')

In [None]:
#Loop through chromosomes 1-22 and index files using Hail
# A failure on one chromosome is reported and the loop moves on to the next one.
for chromosome in range(1, 23):
    bgen_file = f'<bgen_name_chr{chromosome}>.bgen'
    print(f"Indexing {bgen_file}...")
    try:
        hl.index_bgen(bgen_file)
    except Exception as e:
        print(f"Error indexing file for chromosome {chromosome}: {e}")
    else:
        print(f"{bgen_file} indexed successfully.")

In [None]:
#Download PRSedm into instance
# Fetches the prsedm 1.0.0 wheel into the current directory, then installs the
# package with pip and imports it.
# NOTE(review): the pip command below names the package, not the downloaded .whl
# file, so the wheel fetched by wget does not appear to be what gets installed.
# Confirm which is intended (install the wheel file, or drop the download).
import os
os.system('wget https://files.pythonhosted.org/packages/6e/d9/ebd00d933502674a1072f226bc429e5092ab365941262def01f4cffdbb44/prsedm-1.0.0-py3-none-any.whl')
!pip install prsedm
import prsedm

In [None]:
from datetime import datetime

# Extract the PRS variants from each chromosome's BGEN file and write one
# temporary VCF per chromosome (chr1_temp.vcf.bgz ... chr22_temp.vcf.bgz).
#
# The filtering, QC and export steps run INSIDE the chromosome loop. Previously
# they ran once after the loop, so only the last imported chromosome was
# filtered and a single chr22 file was written, while the later
# `bcftools concat` cell expects all 22 files.
entry_fields = ['GT', 'GP']

#Get list of SNPs to extract from PRS model
# (the same for every chromosome, so it is built once, before the loop)
df = prsedm.get_snp_db('t1dgrs2-sharp24')
df = df.drop_duplicates().reset_index(drop=True)

#Build a list of variant positions to extract
# Each entry is an interval string running from the position to position + 1.
pos_col='position_hg38' #change to hg19 if required
variantIntervals = [
    f"chr{row['contig_id']}:{row[pos_col]}-{row[pos_col] + 1}"
    for _, row in df.iterrows()
]
print(f"Number of unique variants to extract: {len(variantIntervals)}")

#Chunk up the regions in case there are too many variants requested
chunk_size=1000
chunked_intervals = [variantIntervals[i:i + chunk_size] for i in range(0, len(variantIntervals), chunk_size)]

# Chromosomes for which a VCF was actually written; reported after the loop.
exported_chromosomes = []

#Loop through chromosomes 1-22
for chromosome in range(1, 23):
    bgen_file = f'<bgen_name_chr{chromosome}>.bgen'
    sample_file = f'<sample_name_chr{chromosome}>.sample'

    #Import BGEN file into Hail
    print(f"Importing {bgen_file} into Hail...")
    try:
        mt = hl.import_bgen(bgen_file, sample_file=sample_file, entry_fields=entry_fields)
        print(f"{bgen_file} imported successfully.")
    except Exception as e:
        # Report the failure and move on; the remaining chromosomes are still processed.
        print(f"Error importing {bgen_file}: {e}")
        continue

    #Filter this chromosome's MatrixTable to the requested intervals, one chunk at a time
    print("Retrieve chunks from VDS and densify...")
    mt_subsets = []
    for i,chunk in enumerate(chunked_intervals):
        print(f"Processing chunk: {i+1}")
        bgen_filtered = hl.filter_intervals(mt,[hl.parse_locus_interval(x) for x in chunk])
        bgen_filtered=hl.split_multi_hts(bgen_filtered)
        mt_subsets.append(bgen_filtered)

    # No chunks means there were no intervals to extract.
    if not mt_subsets:
        print("No MatrixTables were imported; nothing to combine.")
        continue

    #Combine chunks
    print("Combining retrieved chunks...")
    combined_mt = mt_subsets[0]
    for mt_n in mt_subsets[1:]:
        combined_mt = combined_mt.union_rows(mt_n)

    #Process the merged data
    print("Processing merged MT...")
    combined_mt = hl.variant_qc(combined_mt)
    combined_mt = combined_mt.annotate_rows(info=hl.struct(AF=combined_mt.variant_qc.AF))

    #Export to VCF (one file per chromosome, named after the chromosome just processed)
    print("Exporting to VCF...")
    start = datetime.now()
    hl.export_vcf(combined_mt, f'./chr{chromosome}_temp.vcf.bgz')
    print(f"Export took {(datetime.now() - start).total_seconds():.2f} seconds")
    exported_chromosomes.append(chromosome)

# The concat step that follows lists all 22 per-chromosome files, so make any gap visible here.
missing_chromosomes = [c for c in range(1, 23) if c not in exported_chromosomes]
if missing_chromosomes:
    print(f"WARNING: no VCF was written for chromosome(s): {missing_chromosomes}")

In [None]:
#Use bcftools to merge VCF files
!bcftools concat -Oz chr1_temp.vcf.bgz chr2_temp.vcf.bgz chr3_temp.vcf.bgz chr4_temp.vcf.bgz chr5_temp.vcf.bgz chr6_temp.vcf.bgz chr7_temp.vcf.bgz chr8_temp.vcf.bgz chr9_temp.vcf.bgz chr10_temp.vcf.bgz chr11_temp.vcf.bgz chr12_temp.vcf.bgz chr13_temp.vcf.bgz chr14_temp.vcf.bgz chr15_temp.vcf.bgz chr16_temp.vcf.bgz chr17_temp.vcf.bgz chr18_temp.vcf.bgz chr19_temp.vcf.bgz chr20_temp.vcf.bgz chr21_temp.vcf.bg chr22_temp.vcf.bgz -o all_data.vcf.gz

In [None]:
#Index files
# Runs tabix on the merged VCF with the "vcf" preset (-p vcf); -f is passed so an
# existing index is overwritten when the cell is re-run.
!tabix -fp vcf all_data.vcf.gz

In [None]:
#Download the reference TOPMED data first, using the download command for your
#individual account (not shown here), then index the downloaded file(s).
#The loop below runs tabix on each name listed after "in"; as written that is the
#single file reference.vcf.gz -- replace it with the real reference file name(s).
!for f in reference.vcf.gz;do tabix -f $f;done

In [None]:
#Generate GRS
# Input is the merged, indexed VCF written by the bcftools concat cell.
# (`vcf` was previously passed without ever being defined.)
vcf = "all_data.vcf.gz"

# Placeholder: replace with the location of the TOPMED reference files.
# Kept as a quoted string, like the other placeholders in this notebook, so the cell parses.
refvcf = "<path to TOPMED reference files>"

# NOTE(review): the variants were extracted with prsedm.get_snp_db('t1dgrs2-sharp24'),
# while scoring below uses prsflags="t1dgrs2-luckett25" -- confirm that both steps are
# meant to use these models and that the extracted variants cover the scored model.
output = prsedm.gen_dm(vcf=vcf,
                       col="GP",
                       build="hg38",
                       prsflags="t1dgrs2-luckett25",
                       impute=1,
                       refvcf=refvcf,
                       norm=1,
                       ntasks=16,
                       parallel=1,
                       batch_size=1)

#Save results
output.to_csv("prsedm_result.csv", index=False)