Workflow used for:
  1. Read phenotypic information and consolidate it with raw PGS scores from **apply_grs.ipynb**
  1. Standardize raw scores and calculate odds ratios between cases and controls
  1. Generate visualization plots

In [1]:
# Import all necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import seaborn as sns
import numpy as np
import os, glob
from sklearn import preprocessing
import statsmodels.api as sm
import json

Step 1) Read phenotypic information and consolidate with raw PGS scores

In [2]:
# All relative paths used from this point on are resolved against the project folder
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/")

# Numeric codes found in the phenotype file and the labels used in this notebook
sex_race_labels = {
    "sex": { 1:"Male", 2:"Female" },
    "race": { 1:"White", 2:"Black" }
    }

# Read phenotypes (lines starting with '#' are skipped, 'seaid' becomes the index)
# and recode sex and race in a single chain
sea_phenotypes = pd.read_table(
    "data_preparation_to_imputation/86679/NHLBI/SEA_Herrington/phs000349v1/p1/phenotype/phs000349.v1.pht002191.v1.p1.c1.SEA_Phase2_Subject_Phenotypes.GRU.txt",
    index_col="seaid",
    comment="#"
).replace( sex_race_labels )

# Print counts by race and sex, then show the first lines of the dataframe
subjects_per_group = sea_phenotypes.groupby(["race","sex"])["sex"].count()
print( subjects_per_group )
sea_phenotypes.head()

race   sex   
Black  Female     92
       Male      412
White  Female    128
       Male      436
Name: sex, dtype: int64


Unnamed: 0_level_0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,cr,rltotal,rlmean
seaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,3.3,0.0,0.0,0.0
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,3.4,0.6,0.6,0.2
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,4.0,0.0,0.0,0.0
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,1.7,1.0,15.8,5.266667
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,0.0,0.0,0.0,0.0


In [3]:
# Read files with raw scores of multiple GRSs and merge them with the phenotypes

# The phenotype table goes first; one score table per file is appended below
li = []
li.append(sea_phenotypes)

os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs/raw_scores_20220425/")


def _panel_and_min_r2(file_name):
    """Return (reference_panel_name, min_r2_used) parsed from a score/info file name.

    The panel is the text before the first "_" of the base name; the r2 label is
    the last "_"-separated token of the text before the first ".".
    """
    base_name = os.path.basename( file_name )
    return base_name.split("_")[0], base_name.split(".")[0].split("_")[-1]


# Loop over info files and save info as a dictionary keyed by <panel>_<threshold>_<scoreid>.
# Files are visited in sorted order so that results do not depend on filesystem order.
scores_info_dict = {}
for current_info_file in sorted( glob.glob( "*.info.txt" ) ):

    reference_panel_name, min_r2_used = _panel_and_min_r2( current_info_file )

    # The context manager closes the file even if the JSON cannot be parsed
    with open( current_info_file ) as info_file:
        data = json.load( info_file )

    for score_result in data:
        key = "_".join( (reference_panel_name, min_r2_used, score_result["name"]) )
        scores_info_dict[key] = score_result

# Loop over scores files, load all into memory and save them to list object 'li'
for current_score_file in sorted( glob.glob( "*.scores.txt" ) ):

    grs_results = pd.read_table( current_score_file, sep = "," )

    # Keep only the part of the sample id before the first "_" and use it as index
    grs_results["sample"] = grs_results["sample"].str.split("_", expand = True)[0]
    grs_results = grs_results.set_index("sample")

    reference_panel_name, min_r2_used = _panel_and_min_r2( current_score_file )

    # Add prefix to column names. Names should match pattern <hrc|topmed>_<threshold>_<scoreid>
    grs_results = grs_results.add_prefix(reference_panel_name + "_" + min_r2_used + "_")

    li.append( grs_results )

# Consolidate data into a dataframe (aligned on the sample index) and print first lines
sea_dataset_full = pd.concat( li, axis=1 )
sea_dataset_full.head()

Unnamed: 0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,topmed_r03_wGRS49,topmed_r03_PGS000349,topmed_r03_PGS000018,topmed_r03_PGS000667,topmed_r03_PGS000889,hrc_r03_wGRS49,hrc_r03_PGS000349,hrc_r03_PGS000018,hrc_r03_PGS000667,hrc_r03_PGS000889
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,...,0.107235,0.82235,0.548667,10.961485,0.925061,0.12172,1.000436,0.621893,7.995635,0.875832
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,...,-0.040811,0.916436,1.148271,23.910234,0.824932,0.060812,1.088792,1.469894,6.819585,1.232285
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,...,0.232942,0.573412,0.454624,3.480256,1.476948,0.267241,0.780331,0.787814,5.86047,1.8821
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,...,-0.275371,0.624512,1.062907,23.32543,-0.077072,-0.274384,0.671544,1.177668,18.352831,0.648143
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,...,-0.093052,0.847566,0.645384,5.053835,-0.506533,-0.068343,0.763513,0.723225,19.727756,0.095139


In [4]:
# Assign case/control classes to samples using the top-quartile rule within each race/sex group

# Phenotype of interest
phenotype = "cr"

# Calculate thresholds for each race and sex
print( "Thresholds to be used for case/control definition:" )
thresholds = {}
for race in ["White", "Black"]:
    for sex in ["Male", "Female"]:

        # Define Q3 as threshold and save it in thresholds dictionary
        key = race + "_" + sex
        in_group = ( sea_dataset_full["race"] == race ) & ( sea_dataset_full["sex"] == sex )
        thresholds[ key ] = sea_dataset_full.loc[ in_group, phenotype ].quantile(0.75)

        # Print values and warnings, if needed
        note = ""
        if thresholds[ key ] == 0:
            note = "(WARNING: This group has Q3 equals zero. Considering only non-zeros as CASE group!!)"
        print( " - {0} = {1:.4f} {2}".format( key, thresholds[key], note ) )

# Look up the threshold of each sample from its "<race>_<sex>" key. Samples whose
# race/sex is missing or not covered above get NaN instead of raising an error.
group_key = sea_dataset_full["race"].astype(str) + "_" + sea_dataset_full["sex"].astype(str)
sample_threshold = group_key.map( thresholds )

without_threshold = int( sample_threshold.isna().sum() )
if without_threshold > 0:
    print( "WARNING: {0} sample(s) have no usable race/sex threshold and are kept as Case=False".format( without_threshold ) )

# A sample is a case when its phenotype is strictly above its group threshold.
# Comparisons involving NaN evaluate to False, so those samples stay controls.
sea_dataset_full = sea_dataset_full.assign( Case=( sea_dataset_full[phenotype] > sample_threshold ) )

# Print summary of Case/Controls for each subgroup
print( sea_dataset_full.groupby(["race","sex","Case"])["Case"].count() )
sea_dataset_full.head()

Thresholds to be used for case/control definition:
 - White_Male = 1.6000 
 - White_Female = 0.4250 
 - Black_Male = 1.0000 
race   sex     Case 
Black  Female  False     73
               True      19
       Male    False    311
               True     101
White  Female  False     96
               True      32
       Male    False    328
               True     108
Name: Case, dtype: int64


Unnamed: 0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,topmed_r03_PGS000349,topmed_r03_PGS000018,topmed_r03_PGS000667,topmed_r03_PGS000889,hrc_r03_wGRS49,hrc_r03_PGS000349,hrc_r03_PGS000018,hrc_r03_PGS000667,hrc_r03_PGS000889,Case
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,...,0.82235,0.548667,10.961485,0.925061,0.12172,1.000436,0.621893,7.995635,0.875832,False
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,...,0.916436,1.148271,23.910234,0.824932,0.060812,1.088792,1.469894,6.819585,1.232285,False
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,...,0.573412,0.454624,3.480256,1.476948,0.267241,0.780331,0.787814,5.86047,1.8821,False
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,...,0.624512,1.062907,23.32543,-0.077072,-0.274384,0.671544,1.177668,18.352831,0.648143,False
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,...,0.847566,0.645384,5.053835,-0.506533,-0.068343,0.763513,0.723225,19.727756,0.095139,False


Step 2) **Standardize** raw scores and calculate **Odds Ratios** between cases and controls

In [53]:
# Standardize raw scores and calculate the odds ratio (OR) of each score, separately for each race

standardize_scores = []   # names of the standardized score columns, in fitting order
logit_models_dict = {}    # fitted model for each standardized score name
summary_records = []      # one dictionary of results per fitted model

for race in ["White", "Black"]:

    # Subset cohort between whites and blacks to run logistic regression individually
    race_rows = sea_dataset_full[ sea_dataset_full["race"] == race ]

    # Create a dependent variable named "Case_recoded" based on column "Case", coding it as 0 and 1 to fit glm
    sea_dataset_subset = race_rows.assign( Case_recoded=race_rows["Case"].astype(int) )

    # Iterate over the scores described in the info dictionary (keys follow
    # <panel>_<threshold>_<scoreid>). Using the keys directly, in sorted order, avoids
    # panel/threshold/score combinations that were never computed and keeps the run
    # order deterministic. It also leaves the `thresholds` dictionary of the
    # case/control cell untouched.
    for raw_score_name in sorted( scores_info_dict ):

        # maxsplit=2 keeps any further "_" inside the score id
        reference_panel_id, r2_threshold, grs = raw_score_name.split("_", 2)

        standardize_score_name = "_".join( (race, raw_score_name) )
        standardize_scores.append( standardize_score_name )

        # Standardize raw scores within this race subset (mean = 0 and SD = 1) using method scale
        sea_dataset_subset[ standardize_score_name ] = preprocessing.scale( sea_dataset_subset[ raw_score_name ] )

        # Fit a logistic model using standardized scores, adjusting for age and sex, and save it on dictionary
        fitted_model = sm.formula.glm(
            "Case_recoded ~ " + standardize_score_name + " + age + sex",
            family=sm.families.Binomial(),
            data=sea_dataset_subset).fit()
        logit_models_dict[ standardize_score_name ] = fitted_model

        # Confidence interval of the coefficients (column 0 = lower, column 1 = upper), computed once
        coefficient_interval = fitted_model.conf_int()

        # Create dictionary with score performance; exponentiating moves the
        # coefficient and its interval from the log-odds scale to the odds-ratio scale
        model_summary = {
                "model_ref": raw_score_name,
                "race": race,
                "reference_panel": reference_panel_id,
                "threshold": r2_threshold,
                "GRS": grs,
                "odds_ratio": np.exp( fitted_model.params[ standardize_score_name ] ),
                "score_pvalue": fitted_model.pvalues[ standardize_score_name ],
                "conf_interval_lower": np.exp( coefficient_interval.loc[ standardize_score_name, 0 ] ),
                "conf_interval_upper": np.exp( coefficient_interval.loc[ standardize_score_name, 1 ] )
            }
        # Add additional info from info files (coverage, etc)
        model_summary.update( scores_info_dict[raw_score_name] )

        summary_records.append( model_summary )

# Build the summary table in one step (one row per model, indexed by standardized score
# name) so numeric columns keep numeric dtypes, then sort by odds ratio
models_summaries = pd.DataFrame( summary_records, index=standardize_scores ).sort_values("odds_ratio", ascending=False)
models_summaries

Unnamed: 0,model_ref,race,reference_panel,threshold,GRS,odds_ratio,score_pvalue,conf_interval_lower,conf_interval_upper,name,...,variantsUsed,variantsIgnored,coverage,variantsSwitched,variantsMultiAllelic,variantsAlleleMissmatch,r2Filtered,notFound,filtered,coverageLabel
White_topmed_r0_PGS000889,topmed_r0_PGS000889,White,topmed,r0,PGS000889,1.3564,0.00367053,1.10427,1.66612,PGS000889,...,8749,0,0.97114,5238,130,348,0,290798284,0,high
White_hrc_r05_wGRS49,hrc_r05_wGRS49,White,hrc,r05,wGRS49,1.35076,0.00329944,1.1053,1.65073,wGRS49,...,29,0,0.591837,15,0,0,20,39117056,0,medium
White_topmed_r05_wGRS49,topmed_r05_wGRS49,White,topmed,r05,wGRS49,1.34401,0.00387992,1.09965,1.64267,wGRS49,...,40,0,0.816327,22,0,0,9,290807462,0,high
White_hrc_r03_PGS000889,hrc_r03_PGS000889,White,hrc,r03,PGS000889,1.34293,0.00404121,1.09838,1.64193,PGS000889,...,5275,0,0.585526,3010,0,20,3756,39108054,0,medium
White_hrc_r03_wGRS49,hrc_r03_wGRS49,White,hrc,r03,wGRS49,1.33925,0.00429241,1.09596,1.63655,wGRS49,...,41,0,0.836735,21,0,0,8,39117056,0,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black_topmed_r03_PGS000667,topmed_r03_PGS000667,Black,topmed,r03,PGS000667,0.961742,0.725307,0.773675,1.19552,PGS000667,...,27,0,0.627907,0,1,1,17,290807465,0,medium
Black_topmed_r0_PGS000667,topmed_r0_PGS000667,Black,topmed,r0,PGS000667,0.939581,0.580842,0.753112,1.17222,PGS000667,...,41,0,0.953488,0,1,4,0,290807465,0,high
White_hrc_r03_PGS000667,hrc_r03_PGS000667,White,hrc,r03,PGS000667,0.911138,0.39621,0.734882,1.12967,PGS000667,...,18,0,0.418605,0,0,0,23,39117064,0,medium
White_hrc_r0_PGS000667,hrc_r0_PGS000667,White,hrc,r0,PGS000667,0.902715,0.331497,0.734243,1.10984,PGS000667,...,40,0,0.930233,0,0,1,0,39117064,0,high


In [54]:
# Export dataframe with summaries to an Excel workbook in the exports folder.
# The workbook is written as ".xlsx": the legacy ".xls" format depends on the xlwt
# writer, which pandas deprecated and later removed, so ".xls" fails on current pandas.

models_summaries.to_excel(
  "/labs/tassimes/rodrigoguarischi/projects/sea/exports/SEA_models_summary.xlsx",
  sheet_name="models_summary")

In [None]:
# Show the full regression summary of one fitted model, chosen by its standardized score name
model_name = "White_hrc_r03_wGRS49"
selected_model = logit_models_dict[model_name]
print( selected_model.summary() )

# Optional quick checks for the same model (p-value, odds ratio and its confidence interval):
# print( selected_model.pvalues[model_name] )
# print( np.exp( selected_model.params )[ model_name ] )
# print( np.exp( selected_model.conf_int().loc[[ model_name ]] ) )

In [None]:
# Plot, for each score, a boxplot by case status (top row) and histograms by case
# status for males (middle row) and females (bottom row)
# NOTE(review): `sea_merged_whites` and the "<score>_<r2>" column names used here are not
# created by any cell visible in this notebook - confirm where they come from before running.

sns.set_style('whitegrid')

score_list = ["wGRS49_r0", "wGRS49_r03", "wGRS49_r05", "wGRS49_r08"]
# score_list = ["PGS000349_r0", "PGS000349_r03", "PGS000349_r05", "PGS000349_r08"]
# score_list = ["PGS000018_r0", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]

# One column of panels per score. squeeze=False keeps `axs` two-dimensional even
# when score_list has a single entry, so axs[row, column] is always valid.
fig, axs = plt.subplots(nrows=3, ncols=len(score_list), figsize=(5*len(score_list),12), squeeze=False)

# First two colors of the default palette in reversed order, as a concrete list
case_palette = sns.color_palette(n_colors=2)[::-1]

for i, score_name in enumerate(score_list):
    sns.boxplot(
        x = "Case",
        y = score_name,
        data = sea_merged_whites,
        palette = case_palette,
        width=0.4,
        fliersize=2,
        ax=axs[0,i],
    )

    # Rows 1 and 2: same histogram restricted to one sex each; n is the number of rows of that sex
    for row, sex_label, title_label in [ (1, "Male", "Males"), (2, "Female", "Females") ]:
        is_sex = sea_merged_whites["sex"] == sex_label
        sns.histplot(
            x = score_name,
            data = sea_merged_whites[ is_sex ],
            hue="Case",
            hue_order=[True, False],
            kde=True,
            ax=axs[row,i]
        ).set(title='{0} (n={1})'.format( title_label, int( is_sex.sum() ) ) )

plt.subplots_adjust(top=1.25)

# # Plot only boxplots

# fig, axs = plt.subplots(ncols=len(score_list), figsize=(5*len(score_list),4))
# for i in range(0, len(score_list)):
#     sns.boxplot(
#         x = "Case",
#         y = score_list[i],
#         data = sea_merged_whites,
#         hue="sex",
#         hue_order=['Male','Female'],
#         width=0.4,
#         fliersize=2,
#         ax=axs[i],
#     )

In [None]:
# Quick check of a covariate: distribution of age for cases versus controls
age_boxplot_options = dict( x="Case", y="age", width=0.4, fliersize=2 )
sns.boxplot( data=sea_merged_whites, **age_boxplot_options )

Export dataset to CSV files

In [None]:
# Export full dataset
sea_merged_whites.to_csv("exports/sea_whites_phenotypes_scores_full.csv")

# Export only the subset of columns Tim asked for: phenotype columns, then the three
# scores at each r2 label (r05, r03, r0, r08), then the case flag
phenotype_columns = ["dbGaP_SubjID","sex","race","age","agex2","bmi","cr"]
score_columns = [
    score_id + "_" + r2_label
    for r2_label in ["r05", "r03", "r0", "r08"]
    for score_id in ["wGRS49", "PGS000349", "PGS000018"]
]
select_columns = phenotype_columns + score_columns + ["Case"]

columns_subset = sea_merged_whites[select_columns]
columns_subset.to_csv("exports/sea_whites_phenotypes_scores_column_subset.csv")

In [None]:
# Scatter of case status against the scaled wGRS49 (r03) score.
# seaborn is already imported as `sns` in the imports cell at the top of the notebook.
# Optional styling that can be passed as extra keyword arguments: alpha=0.4, s=6
scatter_options = dict( x="wGRS49_r03_scaled", y="Case" )
sns.scatterplot( data=sea_merged_whites, **scatter_options )



Scratch statistical tests in Python (Fisher's exact test and t-tests with SciPy)

In [None]:
import scipy.stats as stats

# 2x2 table with hard-coded example counts (not derived from the SEA dataset)
table = [
    [840, 51663],
    [32, 5053],
]
print(table)

# fisher_exact returns the odds ratio first and the p-value second
fisher_result = stats.fisher_exact(table)
oddsratio = fisher_result[0]
pvalue = fisher_result[1]
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# Preview the first five rows of the whites-only table
sea_merged_whites.head(n=5)

In [None]:
from scipy import stats

# Scores and case-indicator columns to compare
scores_to_test = ["wGRS49_r03", "wGRS49_r05", "wGRS49_r08", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]
quartile_traits = ["tf_quartile4", "tr_quartile4", "af_quartile4", "ar_quartile4", "cf_quartile4", "cr_quartile4"]

# For every score/trait pair, run a two-sample t-test of the score between the rows
# selected by the trait column and the rows selected by its inverse
for score in scores_to_test:
    for traits in quartile_traits:
        cases = sea_merged_whites[traits]
        selected_scores = sea_merged_whites[cases][score]
        remaining_scores = sea_merged_whites[~cases][score]
        ttest_result = stats.ttest_ind( selected_scores, remaining_scores )
        print( score , "\t", traits, "\t", ttest_result )

In [None]:
#

# Load a single scores file and tag its columns with the r2 label found in the file name
current_score_file = "./apply_grs/wGRS49_PGS000349_PGS000018_r0.scores.txt"

# r2 label = last "_"-separated token of the base name, taken before the first "."
score_file_stem = os.path.basename( current_score_file ).split(".")[0]
min_r2_used = score_file_stem.split("_")[-1]

# Read the comma-separated table, keep the part of each sample id before the first "_",
# index by sample and append "_<r2 label>" to every column name
pgs_results = (
    pd.read_table( current_score_file, sep = "," )
    .assign( sample=lambda scores: scores["sample"].str.split("_", expand = True)[0] )
    .set_index("sample")
    .add_suffix("_" + min_r2_used)
)
pgs_results.head()
    # score_name = os.path.basename( current_score_file ).split(".")[0]
    # pgs_results.rename( columns={ pgs_results.columns[0]:score_name }, inplace=True )
