# Intro

**Author:** Stephan Cordogan

**Import a PRS scoring file with GRCh38 coordinates** [(a)](#Import-PRS-and-organize-data). If your scoring file uses a different genome build, convert its positions to GRCh38 first, for example with the UCSC liftOver tool.  
Use the same Hail environment that was used previously.

# Import Necessary Packages

In [None]:
from datetime import datetime
import os
import pandas as pd
import hail as hl

In [None]:
# Record the wall-clock time at which the notebook run started.
start = datetime.now()

In [None]:
# Workspace bucket, read from the WORKSPACE_BUCKET environment variable
# (os.getenv returns None when the variable is not set).
bucket = os.getenv('WORKSPACE_BUCKET')
bucket

In [None]:
# Start Hail with GRCh38 as the default reference genome for this session.
hl.init(default_reference = "GRCh38")

## Loading the Hail MatrixTable containing the variants

In [None]:
# Path of the Hail MatrixTable to analyse, taken from an environment variable.
# Exactly one of the assignments below should be active; the others are
# alternative datasets kept for reference.
# mt_path = os.getenv("WGS_CLINVAR_SPLIT_HAIL_PATH")
mt_path = os.getenv("WGS_ACAF_THRESHOLD_SPLIT_HAIL_PATH")
# mt_path = os.getenv("WGS_ACAF_THRESHOLD_MULTI_HAIL_PATH")

mt_path

In [None]:
# Open the MatrixTable stored at mt_path.
mt = hl.read_matrix_table(mt_path)

Specify a subset of the data if desired

In [None]:
# test_intervals = ['chr5']

In [None]:
# mt = hl.filter_intervals(
#     mt,
#     [hl.parse_locus_interval(x,)
#      for x in test_intervals])

In [None]:
# mt.count()

## Load phenotypic data and link with genotype data

Read the phenotype file from your workspace bucket, created in Notebook 1

In [None]:
# Location of the phenotype TSV inside the workspace bucket.
phenotype_filename = f'{bucket}/data/genomics_phenotypes.tsv'
phenotype_filename

In [None]:
# Load the phenotype table keyed by person_id. person_id is forced to a
# string; the types of all other columns are imputed from the data.
phenotypes = hl.import_table(
    phenotype_filename,
    key='person_id',
    types={'person_id': hl.tstr},
    impute=True,
)

Keep only samples which have phenotype values

In [None]:
# Keep only the columns (samples) whose key is present in the phenotype table.
mt = mt.semi_join_cols(phenotypes)

In [None]:
# Attach each sample's phenotype row (looked up by sample ID mt.s) as the
# column field `pheno`.
mt = mt.annotate_cols(pheno = phenotypes[mt.s])

# Pre-process the genomic data and perform QC

Remove related samples

In [None]:
# TSV listing the samples flagged for relatedness.
related_samples_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv"

In [None]:
# Table of relatedness-flagged samples, keyed by sample_id (read as a string).
related_remove = hl.import_table(
    related_samples_path,
    key="sample_id",
    types={"sample_id": "tstr"},
)

# related_remove.count()

In [None]:
# Drop every column (sample) whose key appears in the related-samples table.
mt = mt.anti_join_cols(related_remove)
#mt.count()

Import predicted ancestry


In [None]:
# TSV with the predicted ancestry of each sample.
ancestry_pred_path = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"

In [None]:
# Predicted-ancestry table keyed by research_id. research_id is read as a
# string and pca_features as an array of floats; other columns are imputed.
ancestry_field_types = {
    "research_id": "tstr",
    "pca_features": hl.tarray(hl.tfloat),
}
ancestry_pred = hl.import_table(
    ancestry_pred_path,
    key="research_id",
    impute=True,
    types=ancestry_field_types,
)

In [None]:
# Attach each sample's ancestry row (looked up by sample ID mt.s) as the
# column field `ancestry_pred`.
mt = mt.annotate_cols(ancestry_pred = ancestry_pred[mt.s])

Optionally restrict the analysis to one predicted genetic-ancestry group (here `eur`)

In [None]:
# Keep a handle on the unfiltered MatrixTable, then continue with only the
# samples whose predicted ancestry is "eur".
mt_orig = mt
mt_eur = mt_orig.filter_cols(mt_orig.ancestry_pred.ancestry_pred == "eur")
# mt_pheno_eur.col.describe()
mt = mt_eur

## Import PRS and organize data

Unzip, inspect, import, and rearrange the PRS information

In [None]:
# Show the current working directory, where the relative paths used by the
# shell commands in this notebook are resolved.
print(os.getcwd())

In [None]:
# Decompress the scoring file with the shell.
# NOTE(review): gunzip normally replaces the .gz archive with the decompressed
# file; if the working directory is the flagshipgwas folder, the archive the
# following Python cell opens will no longer exist. Run only one of the two.
!gunzip PGS001101_with_pos38.txt.gz

In [None]:
import gzip
import os
import shutil

# Decompress the PRS scoring file with the standard library.
prs_gz_path = '/home/jupyter/workspaces/flagshipgwas/PGS001101_with_pos38.txt.gz'
prs_txt_path = '/home/jupyter/workspaces/flagshipgwas/PGS001101_with_pos38.txt'

# A shell `gunzip` of the same file removes the .gz archive, so only skip the
# decompression when the archive is gone AND the text file is already there.
# If neither file exists, gzip.open still raises FileNotFoundError as before.
if os.path.exists(prs_gz_path) or not os.path.exists(prs_txt_path):
    with gzip.open(prs_gz_path, 'rb') as f_in, open(prs_txt_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
from itertools import islice

# Print the first 16 lines of the file to inspect it.
# islice stops cleanly at end-of-file, and end='' avoids doubling the newline
# that each line already carries.
with open('/home/jupyter/workspaces/flagshipgwas/PGS001101_with_pos38.txt', 'r') as file:
    for line in islice(file, 16):
        print(line, end='')

In [None]:
# Read the tab-separated scoring file; lines starting with '#' are skipped.
BF_PRS = pd.read_csv(
    '/home/jupyter/workspaces/flagshipgwas/PGS001101_with_pos38.txt',
    delimiter='\t',
    comment='#',
)

# Rename the columns used downstream. effect_allele and other_allele already
# carry the names used later, so they need no entry.
column_renames = {
    'chr_name': 'chromosome',
    'chr_position': 'position',
    'effect_weight': 'beta',
}
BF_PRS = BF_PRS.rename(columns=column_renames)

In [None]:
# Show the first five rows of the scoring table after renaming.
print(BF_PRS.head(5))

In [None]:
ht = hl.Table.from_pandas(BF_PRS)

# Cast chromosome to string.
contig = hl.str(ht['chromosome'])
# GRCh38 contigs are named 'chr1', 'chrX', ...; add the prefix when the
# scoring file uses bare names such as '1' so that hl.locus accepts them.
# Names that already start with 'chr' are left unchanged.
contig = hl.if_else(contig.startswith('chr'), contig, hl.str('chr') + contig)

# Store the normalised contig and the position cast to integer.
ht = ht.annotate(
    chromosome_str=contig,
    position_int=hl.int(ht['position'])
)

# Keep only rows with a positive position.
ht = ht.filter(ht.position_int > 0)

# Convert chromosome and position to a locus (the two helper fields are
# dropped by transmute).
ht = ht.transmute(locus=hl.locus(ht.chromosome_str, ht.position_int))

ht.show(5)

In [None]:
# Print the schema of the MatrixTable.
mt.describe()

In [None]:
# Keep only the fields needed for scoring, then key the table by locus and
# both alleles and preview it.
prs_fields = ('effect_allele', 'other_allele', 'beta', 'locus')
ht = ht.select(*prs_fields).key_by('locus', 'effect_allele', 'other_allele')
ht.show(5)

In [None]:
# Re-key by locus only, so the table can be looked up with a locus alone.
# NOTE(review): if the scoring file has several rows at the same locus, a
# locus-only lookup can return just one of them -- confirm loci are unique.
ht = ht.key_by('locus')

In [None]:
# Number of (rows, columns) in the MatrixTable before PRS filtering.
mt.count()

Generate genotype matrix filtered to SNPs present in PRS (no allele swapping)

In [None]:
# Attach the PRS row found at each variant's locus (missing when the locus is
# not in the scoring table).
mt = mt.annotate_rows(ht_data=ht[mt.locus])

# Keep only variants present in the PRS whose ALT allele is the effect allele
# and whose REF allele is the other allele (direct match, no allele swapping).
mt = mt.filter_rows(
    hl.is_defined(mt.ht_data)
    & (mt.alleles[1] == mt.ht_data.effect_allele)
    & (mt.alleles[0] == mt.ht_data.other_allele)
)

prs_filtered_mt = mt
mt_save_path = f'{bucket}/data/prs_filtered_mt.mt'
# overwrite=True lets the cell be re-run without failing on the existing output.
prs_filtered_mt.write(mt_save_path, overwrite=True)

# Report the dimensions from the written copy. Counting the unwritten
# pipeline in the middle of the cell discarded its result and forced an
# additional full pass over the data.
print(hl.read_matrix_table(mt_save_path).count())

# mt = mt.filter_rows(
#     hl.is_defined(mt.ht_data) & (
#         (mt.alleles[1] == mt.ht_data.effect_allele) & (mt.alleles[0] == mt.ht_data.other_allele)  # direct match
#         |
#         (mt.alleles[0] == mt.ht_data.effect_allele) & (mt.alleles[1] == mt.ht_data.other_allele)  # switched alleles
#     )
# )

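# NOTE(review): in the disabled snippet below, the if_else condition
# (alleles[0] == effect_allele and alleles[1] == other_allele) is the
# "switched alleles" case of the filter above, yet it is labelled as the
# direct match. Swap the two branches before re-enabling this code.
# mt = mt.annotate_rows(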
#     gwas = hl.if_else(
#         (mt.alleles[0] == mt.ht_data.effect_allele) & (mt.alleles[1] == mt.ht_data.other_allele),
#         # direct match, no changes needed
#         mt.ht_data,
#         # switched alleles- swap alleles and multiply beta by -1
#         mt.ht_data.annotate(
#             effect_allele = mt.ht_data.other_allele,
#             other_allele = mt.ht_data.effect_allele,
#             beta = -mt.ht_data.beta
#         )
#     )
# )

## Calculate PRS

In [None]:
# Reload the PRS-filtered MatrixTable from the workspace bucket.
mt_file_path = f'{bucket}/data/prs_filtered_mt.mt'

mt = hl.read_matrix_table(mt_file_path)


In [None]:
# Print the schema of the reloaded MatrixTable.
mt.describe()

In [None]:
# Compute a per-sample polygenic risk score with mean-imputed genotypes and
# export it together with the case/control flag.
#
# The row filter used to build this MatrixTable kept only variants whose ALT
# allele (alleles[1]) equals the PRS effect allele, so GT.n_alt_alleles() is
# the effect-allele dosage.

# Mean dosage per variant over the called genotypes only. hl.agg.mean ignores
# missing values; coding missing calls as 0 would bias the mean downwards.
mt = mt.annotate_rows(mean_dosage=hl.agg.mean(mt.GT.n_alt_alleles()))

N = 3  # number of decimal places kept in the imputed dosage
scale = 10 ** N
# Round half up to N decimals.
rounded_mean_dosage = hl.floor(mt.mean_dosage * scale + 0.5) / scale

# A Call holds integer allele indices only, so the (fractional) imputed value
# is stored as a float dosage rather than as a genotype call.
mt = mt.annotate_entries(
    dosage=hl.or_else(hl.float64(mt.GT.n_alt_alleles()), rounded_mean_dosage)
)

# PRS = sum over variants of effect weight x effect-allele dosage.
mt = mt.annotate_cols(prs=hl.agg.sum(mt.ht_data.beta * mt.dosage))

cols_table = mt.cols()
# Select the relevant fields from the column table.
prs_table = cols_table.select(cols_table.prs, cols_table.pheno.has_pheno)
# prs_table.show(5)

prs_table_save_path = f'{bucket}/data/prs_table.tsv.bgz'

prs_table.export(prs_table_save_path)


In [None]:
scores_file_path = f'{bucket}/data/prs_table.tsv.bgz'
!gsutil cp {scores_file_path} .
!bgzip -d prs_table.tsv.bgz 

# Compare PRS values across cases and controls

Note that this is only a quick, crude test: it does not adjust for any other covariates.

In [None]:
# Load the decompressed, tab-separated score table into pandas.
prs_table = pd.read_csv('prs_table.tsv', delimiter='\t')

In [None]:
# Preview the first rows of the score table.
prs_table.head()

In [None]:
# Mean PRS among controls (has_pheno == 0) and cases (has_pheno == 1).
is_control = prs_table['has_pheno'] == 0
is_case = prs_table['has_pheno'] == 1

average_prs_pheno_0 = prs_table.loc[is_control, 'prs'].mean()
average_prs_pheno_1 = prs_table.loc[is_case, 'prs'].mean()

print(f"Average PRS for controls: {average_prs_pheno_0}")
print(f"Average PRS for cases: {average_prs_pheno_1}")


In [None]:
from scipy.stats import ttest_ind

# PRS values of the two groups: has_pheno == 0 and has_pheno == 1.
prs_pheno_0 = prs_table.loc[prs_table['has_pheno'] == 0, 'prs']
prs_pheno_1 = prs_table.loc[prs_table['has_pheno'] == 1, 'prs']

# Two-sample t-test without assuming equal variances (equal_var=False).
t_stat, p_value = ttest_ind(prs_pheno_0, prs_pheno_1, equal_var=False)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Report the outcome against the 0.05 threshold.
verdict = (
    "Statistically Significant (p < 0.05)"
    if p_value < 0.05
    else "Not Statistically Significant (p >= 0.05)"
)
print(verdict)