This workflow is used to:
  1. Apply selected PGS scores to the imputed genotype data
  1. Standardize raw scores and calculate odds ratios between cases and controls
  1. Generate visualization plots

In [2]:
# Import all necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import seaborn as sns
import numpy as np

Step 1) **Apply** selected PGS scores to the imputed genotype data using **pgs-calc** (https://github.com/lukfor/pgs-calc)

In [None]:
import os

basedir = "/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs"

# Change working directory
os.chdir(basedir)

imputed_genotypes = "/labs/tassimes/rodrigoguarischi/projects/sea/imputed_data/michigan_hrc/*.vcf.gz"
# imputed_genotypes = "/labs/tassimes/rodrigoguarischi/projects/sea/imputed_data/topmed/liftover_hg19/*no_chr_prefix.vcf.gz"

ref_weights=["wGRS49", "PGS000349", "PGS000018", "PGS000667", "PGS000889"]
ref_weights_folder="./pgs_reference_weights/"

# Create full path to files
ref_weights_paths = ",".join([ref_weights_folder + pgs_name + ".txt.gz" for pgs_name in ref_weights])

for min_r2 in [0, 0.3, 0.5, 0.8]:
    
    print("Calculating scores for {0} at min R2 >= {1}".format( ", ".join(ref_weights), min_r2 ) )
    
    output_scores_filename = "_".join(ref_weights) + "_r" + str(min_r2).replace(".","") + ".scores.txt"
    html_report_filename = "_".join(ref_weights) + "_r" + str(min_r2).replace(".","") + ".html"
    
    !./pgs-calc/pgs-calc apply \
        --ref {ref_weights_paths} \
        --minR2 {min_r2} \
        --threads 22 \
        --no-ansi \
        # Run pgs-calc using GENOTYPE information, instead of DOSAGE (the default)
        # --genotypes=GT \ 
        --report-html={html_report_filename} \
        --out {output_scores_filename} \
        {imputed_genotypes}

Step 2) **Standardize** raw scores and calculate **odds ratios** between cases and controls

In [8]:
# Load the SEA subject phenotypes (indexed by "seaid") and turn the numeric
# sex / race codes into readable labels
recode_map = {
    "sex": { 1:"Male", 2:"Female" },
    "race": { 1:"White", 2:"Black" },
}
sea_phenotypes = pd.read_table(
    "./raw_files/SEA/SEA_Phase2_Subject_Phenotypes.txt",
    index_col="seaid",
).replace( recode_map )

# Report the number of subjects in each race/sex group, then preview the table
race_sex_counts = sea_phenotypes.groupby(["race","sex"])["sex"].count()
print( race_sex_counts )
sea_phenotypes.head()

race   sex   
Black  Female     92
       Male      412
White  Female    128
       Male      436
Name: sex, dtype: int64


Unnamed: 0_level_0,dbGaP_SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,cr,rltotal,rlmean
seaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,3.3,0.0,0.0,0.0
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,3.4,0.6,0.6,0.2
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,4.0,0.0,0.0,0.0
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,1.7,1.0,15.8,5.266667
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,0.0,0.0,0.0,0.0


In [4]:
# Read files with raw scores of multiple PRSs and merge them with the phenotypes

import os, glob

# Folder holding the pgs-calc score files to load. Result sets used previously:
#   "./apply_grs/results_genotypes", "./apply_grs/results_dosage",
#   "./apply_grs/michigan_hrc_20220419"
scores_dir = "./apply_grs/topmed_20220419"

# glob returns matches in arbitrary order; sorting keeps the column order of the
# merged table stable from run to run
score_files = sorted( glob.glob( os.path.join( scores_dir, "*.scores.txt" ) ) )
if not score_files:
    # Without this check the merge would silently contain phenotypes only and
    # later cells would fail with a KeyError on the first score column
    raise FileNotFoundError( "No '*.scores.txt' files found in {0}".format( scores_dir ) )

# Phenotypes first, followed by one table of scores per file
tables_to_merge = [ sea_phenotypes ]

for current_score_file in score_files:
    pgs_results = pd.read_table( current_score_file, sep = "," )
    # Keep only the part of the sample name that precedes the first underscore
    pgs_results["sample"] = pgs_results["sample"].str.split("_", expand = True)[0]
    pgs_results = pgs_results.set_index("sample")
    # Last "_"-separated token of the file name, before its first "." (e.g. "r03");
    # appended to every score column so the different files do not collide
    min_r2_used = os.path.basename( current_score_file ).split(".")[0].split("_")[-1]
    pgs_results = pgs_results.add_suffix("_" + min_r2_used)

    tables_to_merge.append( pgs_results )

# Column-wise merge, aligned on the index (subject id)
sea_merged = pd.concat( tables_to_merge, axis=1)
sea_merged.head()

Unnamed: 0,dbGaP_SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,wGRS49_r05,PGS000349_r05,PGS000018_r05,PGS000667_r05,PGS000889_r05,wGRS49_r08,PGS000349_r08,PGS000018_r08,PGS000667_r08,PGS000889_r08
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,...,-0.014892,0.876993,0.7106,2.38521,1.331384,0.500438,1.211101,0.464124,0.0,0.397157
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,...,-0.1194,1.052222,1.198771,1.64905,0.9482,0.174197,0.624084,0.458905,0.0,0.37311
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,...,0.137079,0.729801,0.746956,1.44351,1.508553,0.213983,0.41831,0.430099,0.0,0.814898
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,...,-0.301765,0.705801,1.184374,16.642935,0.264086,-0.009379,0.304889,0.743781,0.0,-0.341113
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,...,-0.19394,0.98057,0.744978,3.18974,-0.079091,0.330856,0.491178,0.349139,0.0,0.201303


In [None]:
# Assign case/control classes to samples using top quartile rule splitting by sex

# Phenotype of interest
phenotype = "cr"

# Subset cohort
sea_merged_whites = sea_merged[ sea_merged["race"] == "White" ]

# Calculate thresholds (75th percentile of the phenotype) for each sex
is_male = sea_merged_whites["sex"] == "Male"
is_female = sea_merged_whites["sex"] == "Female"
male_threshold = sea_merged_whites.loc[ is_male, phenotype ].quantile(0.75)
female_threshold = sea_merged_whites.loc[ is_female, phenotype ].quantile(0.75)
print( "Thresholds to be used for case/control definition: male = {0} and female = {1}".format( male_threshold, female_threshold) )

# Stop here when a threshold is zero. An exception is raised instead of calling
# sys.exit(): `sys` is not imported by this notebook, so the original call would
# have failed with a NameError rather than with the intended message.
if male_threshold == 0 or female_threshold == 0:
    raise ValueError("Male ({0}) and/or Female ({1}) thresholds equals to zero!".format(male_threshold, female_threshold))

# Subjects strictly above the threshold of their own sex are cases; everybody
# else (including rows with a missing phenotype) stays a control
above_own_threshold = (
    ( is_male & ( sea_merged_whites[phenotype] > male_threshold ) )
    | ( is_female & ( sea_merged_whites[phenotype] > female_threshold ) )
)

# .assign returns a new frame, so the subset taken above is not modified in place
sea_merged_whites = sea_merged_whites.assign( Case=above_own_threshold )

sea_merged_whites.head()

In [None]:
# Standardize every raw score and fit one logistic model per score:
#   Case_recoded ~ <standardized score> + age + sex
import numpy as np
from sklearn import preprocessing
import statsmodels.api as sm

standardize_scores = []
logit_models_dict = {}
summary_records = {}

# Recode dependent variable to 0 and 1 to fit glm.
# Assigning the converted column directly avoids the chained
# `df["col"].replace(..., inplace=True)` pattern, which does not write back to
# the frame when pandas Copy-on-Write is active.
sea_merged_whites["Case_recoded"] = sea_merged_whites["Case"].astype(int)

# Normalize all scores with mean = 0 and SD = 1
for grs in ["wGRS49","PGS000349","PGS000018", "PGS000667", "PGS000889"]:
    for threshold in ["r0", "r03", "r05", "r08"]:
        raw_score_name = grs + "_" + threshold
        standardize_score_name = raw_score_name + "_standardize"
        standardize_scores.append(standardize_score_name)

        # Standardize raw scores using method scale
        sea_merged_whites[ standardize_score_name ] = preprocessing.scale( sea_merged_whites[ raw_score_name ] )

        # Fit a logistic model using standardize scores and save it on dictionary
        fitted_model = sm.formula.glm(
            "Case_recoded ~ " + standardize_score_name + " + age + sex",
            family=sm.families.Binomial(),
            data=sea_merged_whites).fit()
        logit_models_dict[ standardize_score_name ] = fitted_model

        # conf_int() has one row per model term; column 0 is the lower bound, 1 the upper
        score_conf_int = fitted_model.conf_int().loc[ standardize_score_name ]

        # NOTE: "beta" holds the exponentiated coefficient (i.e. the odds ratio of
        # the score term), not the raw coefficient; the key name is kept unchanged
        # so existing consumers of models_summaries keep working.
        summary_records[ standardize_score_name ] = {
            "beta": np.exp( fitted_model.params[ standardize_score_name ] ),
            "score_pvalue": fitted_model.pvalues[ standardize_score_name ],
            "conf_interval_lower": np.exp( score_conf_int[0] ),
            "conf_interval_upper": np.exp( score_conf_int[1] ),
        }

# One row per model, one column per statistic. Built in a single step instead of
# growing a DataFrame with pd.concat on every iteration.
models_summaries = pd.DataFrame.from_dict( summary_records, orient="index" )

# To confirm that all standardized scores have mean = 0 and SD = 1, inspect:
# sea_merged_whites[ standardize_scores ].describe().transpose()

In [None]:
# Look at one fitted model: print the p-value of its standardized score term.
# The full fit is available through selected_model.summary(); odds ratio and
# confidence interval through np.exp() of selected_model.params / .conf_int().
model_name = "wGRS49_r03_standardize"
selected_model = logit_models_dict[model_name]
selected_score_pvalue = selected_model.pvalues[model_name]
print( selected_score_pvalue )

In [None]:
# Plot, for every score in score_list, a boxplot of the raw score by case status
# (row 0) and histograms of the raw score by case status for males (row 1) and
# females (row 2)

sns.set_style('whitegrid')

score_list = ["wGRS49_r0", "wGRS49_r03", "wGRS49_r05", "wGRS49_r08"]
# score_list = ["PGS000349_r0", "PGS000349_r03", "PGS000349_r05", "PGS000349_r08"]
# score_list = ["PGS000018_r0", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]

# The per-sex subsets and their titles do not depend on the score: build them once
males = sea_merged_whites[ sea_merged_whites["sex"] == "Male" ]
females = sea_merged_whites[ sea_merged_whites["sex"] == "Female" ]
sex_panels = [
    ( males, 'Males (n={0})'.format( len(males) ) ),
    ( females, 'Females (n={0})'.format( len(females) ) ),
]

# Materialize the reversed palette as a list: a bare reversed() iterator can only
# be consumed once, but the palette is needed for every boxplot
boxplot_palette = list( reversed( sns.color_palette(n_colors=2) ) )

# One column per score (instead of a hard-coded 4) so the grid always matches
# score_list; squeeze=False keeps axs two-dimensional even for a single score
fig, axs = plt.subplots(
    nrows=3,
    ncols=len(score_list),
    figsize=(5*len(score_list),12),
    squeeze=False,
)
for i, score in enumerate(score_list):
    sns.boxplot(
        x = "Case",
        y = score,
        data = sea_merged_whites,
        palette = boxplot_palette,
        width=0.4,
        fliersize=2,
        ax=axs[0,i],
    )
    for row, (subset, panel_title) in enumerate(sex_panels, start=1):
        sns.histplot(
            x = score,
            data = subset,
            hue="Case",
            hue_order=[True, False],
            kde=True,
            ax=axs[row,i]
        ).set(title=panel_title)

plt.subplots_adjust(top=1.25)

# # Plot only boxplots

# fig, axs = plt.subplots(ncols=len(score_list), figsize=(5*len(score_list),4))
# for i in range(0, len(score_list)):
#     sns.boxplot(
#         x = "Case",
#         y = score_list[i],
#         data = sea_merged_whites,
#         hue="sex",
#         hue_order=['Male','Female'],
#         width=0.4,
#         fliersize=2,
#         ax=axs[i],
#     )

In [None]:
# Quick check of a covariate: distribution of age in cases versus controls
age_boxplot_options = dict(
    data = sea_merged_whites,
    x = "Case",
    y = "age",
    fliersize=2,
    width=0.4,
)
sns.boxplot( **age_boxplot_options )

Export dataset to CSV files

In [None]:
import os

# Folder that receives the CSV exports; created on demand so the cell also works
# on a fresh checkout where it does not exist yet
export_dir = "exports"
os.makedirs( export_dir, exist_ok=True )

# Export full dataset
sea_merged_whites.to_csv( os.path.join( export_dir, "sea_whites_phenotypes_scores_full.csv" ) )

# Export only subset of columns Tim asked
select_columns = ["dbGaP_SubjID","sex","race","age","agex2","bmi","cr","wGRS49_r05","PGS000349_r05","PGS000018_r05","wGRS49_r03","PGS000349_r03","PGS000018_r03","wGRS49_r0","PGS000349_r0","PGS000018_r0","wGRS49_r08","PGS000349_r08","PGS000018_r08", "Case"]
sea_merged_whites[select_columns].to_csv( os.path.join( export_dir, "sea_whites_phenotypes_scores_column_subset.csv" ) )

In [None]:
import seaborn as sns

# Scatter of the standardized wGRS49 score (r03 threshold) against case status.
# The standardized columns added to sea_merged_whites carry the "_standardize"
# suffix; a "wGRS49_r03_scaled" column is not created by the cells of this
# notebook, so referencing it made the plot fail.
sns.scatterplot(
    x = "wGRS49_r03_standardize",
    y = "Case",
    data = sea_merged_whites
    # Optional styling for dense plots:
    # ,
    # alpha=0.4,
    # s=6
    )



Test logistic regression in <code>R</code>

In [None]:
import scipy.stats as stats

# 2x2 contingency table with hard-coded counts, assembled row by row
first_row = [840, 51663]
second_row = [32, 5053]
table = [first_row, second_row]
print(table)

# Fisher's exact test on the table; the result unpacks into odds ratio and p-value
fisher_result = stats.fisher_exact(table)
oddsratio, pvalue = fisher_result
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# Preview the first five rows of the analysis table
sea_merged_whites.head(n=5)

In [None]:
from scipy import stats

# For every score / trait-flag pair, compare the raw score between the subjects
# flagged for the trait and the remaining subjects with scipy.stats.ttest_ind
# (default options), printing one tab-separated line per pair.
# NOTE(review): the "*_quartile4" columns are read as boolean masks but are not
# created by any cell visible in this notebook - confirm where they come from.
score_columns = ["wGRS49_r03", "wGRS49_r05", "wGRS49_r08", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]
trait_flag_columns = ["tf_quartile4", "tr_quartile4", "af_quartile4", "ar_quartile4", "cf_quartile4", "cr_quartile4"]

for score in score_columns:
    for traits in trait_flag_columns:
        cases = sea_merged_whites[traits]
        flagged_scores = sea_merged_whites[cases][score]
        remaining_scores = sea_merged_whites[~cases][score]
        print( score , "\t", traits, "\t", stats.ttest_ind( flagged_scores, remaining_scores ) )

In [None]:
# Load a single pgs-calc score file and preview it

current_score_file = "./apply_grs/wGRS49_PGS000349_PGS000018_r0.scores.txt"

# Column suffix taken from the file name: the text before its first "." is cut
# at the last "_" and the trailing piece is kept
min_r2_used = os.path.basename( current_score_file ).split(".", 1)[0].rsplit("_", 1)[-1]

pgs_results = (
    pd.read_table( current_score_file, sep = "," )
    # keep only the part of the sample name before the first underscore
    .assign( sample=lambda scores: scores["sample"].str.split("_", expand = True)[0] )
    .set_index("sample")
    .add_suffix("_" + min_r2_used)
)
pgs_results.head()
    # score_name = os.path.basename( current_score_file ).split(".")[0]
    # pgs_results.rename( columns={ pgs_results.columns[0]:score_name }, inplace=True )
