Workflow used for:
  1. Load metadata about GRSs used
  1. Read phenotypic information and consolidate it with raw PGS scores from **apply_grs.ipynb**
  1. Standardize raw scores and calculate Odds Ratios between case/controls
  1. Generate visualization plots

In [1]:
# Import all necessary libraries

import glob
import gzip
import json
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn import preprocessing
from xlsxwriter.utility import xl_rowcol_to_cell

**Step 1)** Load metadata about GRSs used

In [2]:
# Directory holding one gzipped scoring file ("<score_name>.txt.gz") per GRS
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs/pgs_reference_weights/")

# Load all key/values present on the header of PGS files into a dictionary:
#   pgs_metadata[<score_name>][<header key>] = <header value>
pgs_metadata = {}
for current_score_file in glob.glob( "*.txt.gz" ):
    score_name = current_score_file.replace(".txt.gz","")
    score_metadata = {}
    with gzip.open( current_score_file,'rt') as f:
        for line in f:
            # Metadata lines start with exactly one "#"; lines starting with "##" are skipped
            if line.startswith("#") and not line.startswith("##"):
                # Drop only the leading marker so a "#" inside a value is preserved
                line = line.strip().lstrip("#")
                if "=" not in line:
                    # Not a key=value entry (e.g. a bare comment line): nothing to store
                    continue
                # Split on the first "=" only: values (citations, URLs) may contain "=" themselves
                key, value = line.split("=", 1)
                score_metadata[key] = value

    # Only scores with a catalog-style id ("PGS...") get a PGS Catalog link;
    # files without a pgs_id header get an empty link instead of raising KeyError
    pgs_id = str( score_metadata.get("pgs_id", "") )
    if pgs_id.startswith("PGS"):
        score_metadata["pgs_catalog_hyperlink"] = "https://www.pgscatalog.org/score/{0}/".format(pgs_id)
    else:
        score_metadata["pgs_catalog_hyperlink"] = ""

    pgs_metadata[score_name] = score_metadata

**Step 2)** Read phenotypic information

In [3]:
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/")

# dbGaP phenotype table, indexed by subject id ("seaid"); lines starting with "#" are ignored
phenotype_file = "data_preparation_to_imputation/86679/NHLBI/SEA_Herrington/phs000349v1/p1/phenotype/phs000349.v1.pht002191.v1.p1.c1.SEA_Phase2_Subject_Phenotypes.GRU.txt"

# Numeric codes -> labels for the sex and race columns
phenotype_recoding = {
    "sex": { 1:"male", 2:"female" },
    "race": { 1:"white", 2:"black" }
}

# Read phenotypes, recode sex and race attributes and drop the unnecessary dbGaP id column
sea_phenotypes = (
    pd.read_table( phenotype_file, index_col="seaid", comment="#" )
    .replace( phenotype_recoding )
    .drop( columns="dbGaP SubjID" )
)

# Print counts by race and sex, then display the dataframe
print( sea_phenotypes.groupby(["race","sex"])["sex"].count() )
sea_phenotypes

race   sex   
black  female     92
       male      412
white  female    128
       male      436
Name: sex, dtype: int64


Unnamed: 0_level_0,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,cr,rltotal,rlmean
seaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
pd10016,female,white,29,841,19.9,6.7,0.0,10.7,0.0,3.3,0.0,0.0,0.000000
pd10018,male,white,30,900,28.3,27.3,0.0,22.0,0.0,3.4,0.6,0.6,0.200000
pd10023,male,white,27,729,21.7,27.3,0.0,55.0,0.0,4.0,0.0,0.0,0.000000
pd10028,male,black,31,961,29,16.0,0.0,36.8,14.8,1.7,1.0,15.8,5.266667
pd10031,male,white,28,784,21.5,20.0,0.0,16.7,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
pd12871,male,white,25,625,26,15.0,0.0,9.0,0.0,0.0,0.0,0.0,0.000000
pd12872,male,black,26,676,18.9,53.3,0.0,55.0,0.0,0.0,0.0,0.0,0.000000
pd12873,male,black,17,289,21.9,18.0,0.0,19.0,0.0,3.0,0.0,0.0,0.000000
pd12875,male,black,23,529,22.7,17.7,0.0,32.3,0.0,2.3,0.0,0.0,0.000000


**Step 3)** Load **PCA results** into a dataframe

In [4]:
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/pca_analysis/")

# One tab-separated eigenvector table per group, each re-indexed by sample id ("IID")
eigenvec_files = {
    "whites": "pca_whites.eigenvec",
    "blacks": "pca_blacks.eigenvec",
}
pca_results = {
    group: pd.read_table( eigenvec_file, sep = "\t" ).set_index("IID")
    for group, eigenvec_file in eigenvec_files.items()
}

# Stack both groups row-wise and discard the family id column
pca_results["all"] = pd.concat(
    [pca_results["whites"], pca_results["blacks"]],
    axis=0
).drop("#FID", axis = 1)

# Put the principal components next to the phenotypes (aligned on the sample index)
sea_dataset_full = pd.concat( [sea_phenotypes, pca_results["all"]], axis = 1 )
sea_dataset_full

Unnamed: 0,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
pd10016,female,white,29,841,19.9,6.7,0.0,10.7,0.0,3.3,...,-0.006825,0.018891,-0.003455,-0.007874,0.026135,0.019208,0.004665,0.008936,-0.040799,0.000123
pd10018,male,white,30,900,28.3,27.3,0.0,22.0,0.0,3.4,...,-0.008184,0.002899,-0.001910,0.026848,0.019029,0.011589,-0.001970,-0.029758,-0.008889,0.037244
pd10023,male,white,27,729,21.7,27.3,0.0,55.0,0.0,4.0,...,0.021547,-0.017545,-0.011910,0.006369,0.021369,-0.028954,0.028462,0.003554,-0.003874,0.003383
pd10028,male,black,31,961,29,16.0,0.0,36.8,14.8,1.7,...,-0.059962,0.025734,0.003848,-0.019902,-0.055411,0.004292,0.046521,0.005009,-0.061828,-0.018428
pd10031,male,white,28,784,21.5,20.0,0.0,16.7,0.0,0.0,...,-0.003675,-0.002468,0.035139,-0.068820,0.086355,-0.012242,-0.072977,-0.018035,-0.090649,0.143556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pd12871,male,white,25,625,26,15.0,0.0,9.0,0.0,0.0,...,-0.013646,-0.009487,0.000942,0.008001,0.004313,-0.014603,0.003558,0.020290,-0.006042,0.016711
pd12872,male,black,26,676,18.9,53.3,0.0,55.0,0.0,0.0,...,-0.012626,0.019843,0.022616,0.054914,-0.068902,-0.012065,0.007603,-0.044239,-0.019055,-0.036954
pd12873,male,black,17,289,21.9,18.0,0.0,19.0,0.0,3.0,...,0.014341,0.017928,0.073089,-0.008301,0.052788,0.054721,0.008493,-0.021289,-0.013251,-0.009285
pd12875,male,black,23,529,22.7,17.7,0.0,32.3,0.0,2.3,...,-0.057942,0.152120,-0.119545,0.073026,0.134121,0.099255,-0.018629,0.017820,-0.017232,0.075466


**Step 4)** Consolidate raw **PGS scores**

In [5]:
# Read files with raw scores of multiple GRSs and merge them into sea_dataset_full

os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs/raw_scores_20220525_wrong_name/")

# Loop over info files and save info as a dictionary:
#   scores_info_dict[<panel>_<minr2>_<score name>][<whites|blacks>] = metrics of that score
scores_info_dict = {}
for current_info_file in glob.glob( "*.info.txt" ):

    # Parse filename to get info about GRSs.
    # Format should be: <hrc|topmed>_<whites|blacks>_<8-digits_date>_multiGRS_<minr2_used>.info.txt
    info_basename = os.path.basename( current_info_file )
    min_r2_used = info_basename.split(".")[0].split("_")[-1]
    reference_panel_name = info_basename.split("_")[0]
    race = info_basename.split("_")[1]

    # Context manager guarantees the handle is closed even if the JSON is malformed
    with open( current_info_file ) as f:
        data = json.load(f)

    # Load metrics on a dictionary of dictionaries using panel, r2 and PGS name and each race as keys
    for score_result in data:
        key = "_".join( (reference_panel_name, min_r2_used, score_result["name"]) )
        scores_info_dict.setdefault( key, {} )[race] = score_result

# Loop over scores files and collect one dataframe per set of score columns
score_frames = []
for current_score_file in glob.glob( "*.scores.txt" ):

    grs_results = pd.read_table( current_score_file, sep = "," )
    # Keep only the part of the sample id that precedes the first "_"
    grs_results["sample"] = grs_results["sample"].str.split("_", expand = True)[0]
    grs_results = grs_results.set_index("sample")

    scores_basename = os.path.basename( current_score_file )
    min_r2_used = scores_basename.split(".")[0].split("_")[-1]
    reference_panel_name = scores_basename.split("_")[0]

    # Add prefix to column names. Names should match pattern <hrc|topmed>_<threshold>_<scoreid>
    grs_results = grs_results.add_prefix(reference_panel_name + "_" + min_r2_used + "_")

    # If the same set of scores was already loaded for the other race, stack the new
    # subjects below it; otherwise register the dataframe as a new set of score columns
    for i, loaded_scores in enumerate( score_frames ):
        if( set( grs_results.columns ) == set( loaded_scores.columns ) ):
            score_frames[i] = pd.concat( [loaded_scores, grs_results], axis=0 )
            break
    else:
        score_frames.append( grs_results )

# Re-running this cell must not duplicate score columns: drop any score column that a
# previous run already merged into sea_dataset_full before concatenating again
already_merged = [
    column
    for frame in score_frames
    for column in frame.columns
    if column in sea_dataset_full.columns
]
li = [ sea_dataset_full.drop( columns=already_merged ) ] + score_frames

# Consolidate data into a dataframe and display it
sea_dataset_full = pd.concat( li, axis=1 )
sea_dataset_full

Unnamed: 0,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,...,hrc_r03_PGS001105,hrc_r03_PGS000349,hrc_r03_PGS001917,hrc_r03_PGS001818,hrc_r03_TEMprsCatherine,hrc_r03_HDL20201014Shoa,hrc_r03_PGS000957,hrc_r03_logTG20201014Shoa,hrc_r03_PGS000889,hrc_r03_PGS000018
pd10016,female,white,29,841,19.9,6.7,0.0,10.7,0.0,3.3,...,3.841305,0.873616,0.005746,-0.101349,16.285275,0.168135,-0.201919,-2.372354,0.921467,0.711844
pd10018,male,white,30,900,28.3,27.3,0.0,22.0,0.0,3.4,...,-0.416213,0.917900,-0.006679,0.706644,-4.972381,0.430336,0.826526,-0.133204,1.041347,1.533552
pd10023,male,white,27,729,21.7,27.3,0.0,55.0,0.0,4.0,...,1.079553,0.556460,0.011725,0.684267,18.696551,0.173749,0.751608,-0.524123,1.782580,0.996043
pd10028,male,black,31,961,29,16.0,0.0,36.8,14.8,1.7,...,2.323256,0.562552,0.020173,0.003794,18.868259,-0.199715,0.788126,5.545249,0.709041,1.059956
pd10031,male,white,28,784,21.5,20.0,0.0,16.7,0.0,0.0,...,1.167806,0.691087,-0.010445,-0.145423,13.379956,0.244205,0.582374,0.165059,-0.173128,0.845935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pd12871,male,white,25,625,26,15.0,0.0,9.0,0.0,0.0,...,3.000878,0.350081,0.016940,-0.224118,25.951367,0.291442,0.519859,-1.189634,1.192794,0.805912
pd12872,male,black,26,676,18.9,53.3,0.0,55.0,0.0,0.0,...,1.970732,1.308783,0.027034,0.586234,31.842752,-0.102002,0.845895,6.761289,0.627752,1.013249
pd12873,male,black,17,289,21.9,18.0,0.0,19.0,0.0,3.0,...,2.690796,0.956890,0.024645,0.041306,16.919972,-0.009641,0.574422,5.649729,1.058504,1.243416
pd12875,male,black,23,529,22.7,17.7,0.0,32.3,0.0,2.3,...,2.639480,-0.035250,0.011434,-0.065502,38.277627,-0.139036,0.946164,3.113457,-0.735947,0.465990


**Step 5)** Assign **case/control** status to samples

In [6]:
# Assign case/control classes to samples using top quartile rule splitting by race and sex

# Phenotype of interest
phenotype = "cr"

# Calculate thresholds for each race and sex
print( "Thresholds to be used for case/control definition:" )
thresholds = {}
for race in ["white", "black"]:
    for sex in ["male", "female"]:

        # Define Q3 as threshold and save it in thresholds dictionary
        key = race + "_" + sex
        in_group = ( sea_dataset_full["race"] == race ) & ( sea_dataset_full["sex"] == sex )
        thresholds[ key ] = sea_dataset_full.loc[ in_group, phenotype ].quantile(0.75)

        # Print values and warnings, if needed
        note = ""
        if( thresholds[ key ] == 0 ):
            note = "(WARNING: This group has Q3 equals zero. Considering only non-zeros as CASE group!!)"
        print( " - {0} = {1:.4f} {2}".format( key, thresholds[key], note ) )

# Look up, for every subject at once, the threshold of its "<race>_<sex>" group.
# Subjects whose group has no threshold (e.g. missing race/sex) map to NaN.
group_keys = sea_dataset_full["race"].astype(str) + "_" + sea_dataset_full["sex"].astype(str)
group_thresholds = group_keys.map( thresholds )

# Column "Case" is True for subjects strictly above their group threshold.
# Comparisons against NaN are False, so subjects without a threshold or without a
# phenotype value stay in the control group instead of raising an error.
sea_dataset_full = sea_dataset_full.assign( Case = sea_dataset_full[phenotype] > group_thresholds )

# Print summary of Case/Controls for each subgroup
print( sea_dataset_full.groupby(["race","sex","Case"])["Case"].count() )
sea_dataset_full

Thresholds to be used for case/control definition:
 - white_male = 1.6000 
 - white_female = 0.4250 
 - black_male = 1.0000 
race   sex     Case 
black  female  False     73
               True      19
       male    False    311
               True     101
white  female  False     96
               True      32
       male    False    328
               True     108
Name: Case, dtype: int64


Unnamed: 0,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,...,hrc_r03_PGS000349,hrc_r03_PGS001917,hrc_r03_PGS001818,hrc_r03_TEMprsCatherine,hrc_r03_HDL20201014Shoa,hrc_r03_PGS000957,hrc_r03_logTG20201014Shoa,hrc_r03_PGS000889,hrc_r03_PGS000018,Case
pd10016,female,white,29,841,19.9,6.7,0.0,10.7,0.0,3.3,...,0.873616,0.005746,-0.101349,16.285275,0.168135,-0.201919,-2.372354,0.921467,0.711844,False
pd10018,male,white,30,900,28.3,27.3,0.0,22.0,0.0,3.4,...,0.917900,-0.006679,0.706644,-4.972381,0.430336,0.826526,-0.133204,1.041347,1.533552,False
pd10023,male,white,27,729,21.7,27.3,0.0,55.0,0.0,4.0,...,0.556460,0.011725,0.684267,18.696551,0.173749,0.751608,-0.524123,1.782580,0.996043,False
pd10028,male,black,31,961,29,16.0,0.0,36.8,14.8,1.7,...,0.562552,0.020173,0.003794,18.868259,-0.199715,0.788126,5.545249,0.709041,1.059956,False
pd10031,male,white,28,784,21.5,20.0,0.0,16.7,0.0,0.0,...,0.691087,-0.010445,-0.145423,13.379956,0.244205,0.582374,0.165059,-0.173128,0.845935,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pd12871,male,white,25,625,26,15.0,0.0,9.0,0.0,0.0,...,0.350081,0.016940,-0.224118,25.951367,0.291442,0.519859,-1.189634,1.192794,0.805912,False
pd12872,male,black,26,676,18.9,53.3,0.0,55.0,0.0,0.0,...,1.308783,0.027034,0.586234,31.842752,-0.102002,0.845895,6.761289,0.627752,1.013249,False
pd12873,male,black,17,289,21.9,18.0,0.0,19.0,0.0,3.0,...,0.956890,0.024645,0.041306,16.919972,-0.009641,0.574422,5.649729,1.058504,1.243416,False
pd12875,male,black,23,529,22.7,17.7,0.0,32.3,0.0,2.3,...,-0.035250,0.011434,-0.065502,38.277627,-0.139036,0.946164,3.113457,-0.735947,0.465990,False


**Step 6)** **Standardize** raw scores and calculate **Odds Ratios** between case/controls

In [7]:
# Standardize raw scores and calculate OR
#
# For each race and each (reference panel, r2 threshold, GRS) combination found in
# scores_info_dict: standardize the raw score within the race, fit a logistic model
# Case ~ score + age + sex + PC1..PC5 and record the odds ratio per SD of the score.

li = []
standardize_scores = []
logit_models_dict = {}

# Get list of panels, thresholds and GRSs used from info dictionary.
# Keys are "<panel>_<threshold>_<score name>"; maxsplit=2 keeps score names that
# themselves contain "_" in one piece.
# NOTE(review): `thresholds` rebinds the name used for the Q3 dictionary in Step 5.
reference_panels = set( [ key.split("_", 2)[0] for key in scores_info_dict.keys() ] )
thresholds = set( [ key.split("_", 2)[1] for key in scores_info_dict.keys() ] )
grss = set( [ key.split("_", 2)[2] for key in scores_info_dict.keys() ] )

# Normalize all scores with mean = 0 and SD = 1
for race in ["white", "black"]:

    # Info files label the groups in plural form ("whites" / "blacks")
    race_label = race + "s"

    # Subset cohort between whites and blacks to run logistic regression individually
    sea_dataset_subset = sea_dataset_full[ sea_dataset_full["race"] == race ]

    # Create a dependent variable named "Case_recoded" based on column "Case", coding it as 0 and 1 to fit glm
    sea_dataset_subset = sea_dataset_subset.assign( Case_recoded=sea_dataset_subset["Case"].astype(int) )

    # Iterate over the combinations that actually exist (sorted for a reproducible order)
    # instead of the full panel x threshold x GRS product, which may include combinations
    # that were never scored
    for raw_score_name in sorted( scores_info_dict ):

        reference_panel_id, threshold, grs = raw_score_name.split("_", 2)

        if( race_label not in scores_info_dict[raw_score_name] or raw_score_name not in sea_dataset_subset.columns ):
            print( "Skipping {0} for {1}: no info entry or no raw score column available".format( raw_score_name, race_label ) )
            continue

        standardize_score_name = "_".join( (race, reference_panel_id, threshold, grs) )
        standardize_scores.append(standardize_score_name)

        # Standardize raw scores using method scale
        sea_dataset_subset[ standardize_score_name ] = preprocessing.scale( sea_dataset_subset[ raw_score_name ] )

        # Fit a logistic model using standardized scores and save it on dictionary
        fitted_model = sm.formula.glm(
            "Case_recoded ~ " + standardize_score_name + " + age + sex + PC1 + PC2 + PC3 + PC4 + PC5",
            family=sm.families.Binomial(),
            data=sea_dataset_subset).fit()
        logit_models_dict[ standardize_score_name ] = fitted_model

        # Confidence bounds of the coefficients (column 0 = lower, column 1 = upper)
        coefficient_bounds = fitted_model.conf_int()

        # Create dictionary with scores performances (coefficients are exponentiated to odds ratios)
        model_summary_series = {
                "model_ref": raw_score_name,
                "race": race,
                "reference_panel": reference_panel_id,
                "threshold": threshold,
                "GRS": grs,
                "odds_ratio": np.exp( fitted_model.params[ standardize_score_name ] ),
                "score_pvalue": fitted_model.pvalues[ standardize_score_name ],
                "conf_interval_lower": np.exp( coefficient_bounds.loc[ standardize_score_name, 0 ] ),
                "conf_interval_upper": np.exp( coefficient_bounds.loc[ standardize_score_name, 1 ] )
            }
        # Add additional info from info files (coverage, etc)
        model_summary_series.update( scores_info_dict[raw_score_name][race_label] )

        # Add metadata about the GRSs to the output
        model_summary_series.update( pgs_metadata[grs] )

        # Transform dictionary on a pandas Series and name it as <standardize_score_name>
        li.append( pd.Series( data=model_summary_series, name=standardize_score_name ) )

# Concatenate results for all models tested and transpose object to make visualization easier
models_summaries = pd.concat( li, axis=1 ).transpose().sort_values("odds_ratio", ascending=False)

# Move trait_mapped column to first position to facilitate analysis of excel results
columns_order = ["trait_mapped"] + models_summaries.columns.drop("trait_mapped").tolist()
models_summaries = models_summaries.reindex(columns=columns_order)
models_summaries

Unnamed: 0,trait_mapped,model_ref,race,reference_panel,threshold,GRS,odds_ratio,score_pvalue,conf_interval_lower,conf_interval_upper,...,pgs_name,trait_reported,trait_efo,weight_type,genome_build,variants_number,pgp_id,citation,pgs_catalog_hyperlink,license
black_topmed_r0_logTG20201014Shoa,triglyceride measurement,topmed_r0_logTG20201014Shoa,black,topmed,r0,logTG20201014Shoa,1.53941,0.021245,1.06647,2.22207,...,logTG20201014Shoa,Triglycerides,EFO_0004530,NR,hg19,30071,,Ask Shoa,,
black_topmed_r03_logTG20201014Shoa,triglyceride measurement,topmed_r03_logTG20201014Shoa,black,topmed,r03,logTG20201014Shoa,1.51839,0.0269178,1.04886,2.1981,...,logTG20201014Shoa,Triglycerides,EFO_0004530,NR,hg19,30071,,Ask Shoa,,
white_topmed_r0_PGS000018,coronary artery disease,topmed_r0_PGS000018,white,topmed,r0,PGS000018,1.48183,0.00113608,1.16933,1.87785,...,metaGRS_CAD,Coronary artery disease,EFO_0001645,NR,hg19,1745180,PGP000007,Inouye M et al. J Am Coll Cardiol (2018). doi:...,https://www.pgscatalog.org/score/PGS000018/,
white_hrc_r0_TEMprsCatherine,Coronary artery disease,hrc_r0_TEMprsCatherine,white,hrc,r0,TEMprsCatherine,1.43058,0.0156741,1.06998,1.91271,...,TEMprsCatherine,Coronary artery disease,EFO_0001645,NR,hg19,538096,,Ask Shoa or Catherine,,
black_topmed_r05_logTG20201014Shoa,triglyceride measurement,topmed_r05_logTG20201014Shoa,black,topmed,r05,logTG20201014Shoa,1.42706,0.0545909,0.993022,2.05081,...,logTG20201014Shoa,Triglycerides,EFO_0004530,NR,hg19,30071,,Ask Shoa,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
black_hrc_r08_PGS001979,triglyceride measurement,hrc_r08_PGS001979,black,hrc,r08,PGS001979,0.834608,0.128296,0.661142,1.05359,...,portability-PLR_log_triglycerides,Triglycerides,EFO_0004530,beta,GRCh37,71203,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS001979/,
black_hrc_r05_PGS001357,type 2 diabetes mellitus,hrc_r05_PGS001357,black,hrc,r05,PGS001357,0.834141,0.134904,0.657636,1.05802,...,T2D_AnnoPred_PRS,Type 2 diabetes,MONDO_0005148,beta,hg19,2996761,PGP000252,Ye Y et al. Circ Genom Precis Med (2021). doi:...,https://www.pgscatalog.org/score/PGS001357/,
black_hrc_r08_PGS001105,body fat percentage,hrc_r08_PGS001105,black,hrc,r08,PGS001105,0.820428,0.101502,0.647382,1.03973,...,GBE_INI23127,Body fat percentage (trunk fat),EFO_0007800,NR,GRCh37,25651,PGP000244,Tanigawa Y et al. PLoS Genet (2022). doi:10.13...,https://www.pgscatalog.org/score/PGS001105/,
black_hrc_r08_PGS001900,diastolic blood pressure,hrc_r08_PGS001900,black,hrc,r08,PGS001900,0.820027,0.101062,0.646885,1.03951,...,portability-PLR_diastolic_BP,"Diastolic blood pressure, automated reading",EFO_0006336,beta,GRCh37,66335,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS001900/,


**Step 7)** Export dataframe with summaries as an **xls** file

In [8]:
# Export dataframe with summaries on a xls file on the exports folder
#
# NOTE(review): the xlsxwriter engine produces xlsx content while the path ends in
# ".xls"; the file name is kept unchanged here, confirm whether it should be ".xlsx".

# The context manager saves and closes the workbook on exit (ExcelWriter.save() is
# no longer available in recent pandas versions)
with pd.ExcelWriter(
    path = "/labs/tassimes/rodrigoguarischi/projects/sea/exports/SEA_models_summary.xls",
    engine = 'xlsxwriter'
    ) as writer:

    models_summaries.to_excel(writer, sheet_name='models_summary')

    # Get access to the workbook and sheet
    workbook = writer.book
    worksheet = writer.sheets['models_summary']

    # Sheet layout (0-based): row 0 is the header, column 0 is the dataframe index,
    # so data columns occupy columns 1..len(columns)
    n_models = len( models_summaries.index )
    last_column = len( models_summaries.columns )

    # Automatically add filters to every column, including the last one
    worksheet.autofilter( 0, 0, n_models, last_column )

    # Add a percent format with 2 decimal points
    percent_fmt = workbook.add_format({'num_format': '0.00%', 'align': 'left'})

    # Add a left-alignment format
    left_alignment_fmt = workbook.add_format({'align': 'left'})

    # Format the columns by width and include number formats
    # NOTE(review): column letters below are hard-coded for the current column layout
    default_column_width = 11
    worksheet.set_column('A:A', 35, left_alignment_fmt)                         # Model full name
    worksheet.set_column('B:B', 45, left_alignment_fmt)                         # Trait mapped
    worksheet.set_column('C:C', 30, left_alignment_fmt)                         # Model ref
    worksheet.set_column('D:O', default_column_width, left_alignment_fmt)
    worksheet.set_column('P:P', default_column_width, percent_fmt)              # Coverage
    worksheet.set_column('Q:AF', default_column_width, left_alignment_fmt)
    worksheet.set_column('AG:AG', 70, left_alignment_fmt)                       # Citation
    worksheet.set_column('AH:AH', 40, left_alignment_fmt)                       # PGS catalog link
    worksheet.set_column('AI:AI', default_column_width, left_alignment_fmt)     # License

    # Transform cells of the PGS catalog link column on hyperlinks.
    # The column position is looked up by name (+1 for the index column).
    hyperlink_column = models_summaries.columns.get_loc("pgs_catalog_hyperlink") + 1
    for row_number, hyperlink in enumerate( models_summaries["pgs_catalog_hyperlink"], start=1 ):

        # Scores without a PGS Catalog entry have an empty link: leave the cell as plain text
        if( isinstance( hyperlink, str ) and hyperlink ):
            worksheet.write_url( row_number, hyperlink_column, hyperlink )

    ## Add colors to cells of coverage labels to easy visualization

    # Create a red, yellow and green fills with dark text
    red_format = workbook.add_format({'bg_color':   '#FFC7CE', 'font_color': '#9C0006'})
    yellow_format = workbook.add_format({'bg_color':   '#FFEB9C', 'font_color': '#9C6500'})
    green_format = workbook.add_format({'bg_color':   '#C6EFCE', 'font_color': '#006100'})

    # Get location of coverage label cells
    cells_location = 'W1:W' + str( n_models + 1 )

    worksheet.conditional_format( cells_location, {'type': 'cell', 'criteria': 'equal to', 'value': '"high"', 'format': green_format} )
    worksheet.conditional_format( cells_location, {'type': 'cell', 'criteria': 'equal to', 'value': '"medium"', 'format': yellow_format} )
    worksheet.conditional_format( cells_location, {'type': 'cell', 'criteria': 'equal to', 'value': '"low"', 'format': red_format} )
    worksheet.conditional_format( cells_location, {'type': 'cell', 'criteria': 'equal to', 'value': '"zero"', 'format': red_format} )

In [None]:
# Plain export of the summaries dataframe to the exports folder (no column formatting).
# NOTE(review): this writes to the same file name as the formatted Step 7 export, so
# running it replaces that workbook - confirm whether this cell is still needed.

summary_export_path = "/labs/tassimes/rodrigoguarischi/projects/sea/exports/SEA_models_summary.xls"
models_summaries.to_excel( summary_export_path, sheet_name="models_summary" )

In [None]:
# Print the full regression summary of one fitted model.
# Keys of logit_models_dict are "<race>_<panel>_<threshold>_<GRS>" with the race in
# lowercase ("white" / "black"), so a capitalized "White_..." key can never be found.
# NOTE(review): confirm that the wGRS49 score is among the scores loaded in Step 4.
model_name = "white_hrc_r03_wGRS49"
print( logit_models_dict[model_name].summary() )
# print( logit_models_dict[model_name].pvalues[model_name] )
# print( np.exp( logit_models_dict[model_name].params )[ model_name ] )
# print( np.exp( logit_models_dict[model_name].conf_int().loc[[ model_name ]] ) )

In [None]:
# Plot boxplots and histograms of raw scores splitting by sex.
# Layout: one column per score; row 0 = boxplot by Case, row 1 = histogram for
# males, row 2 = histogram for females.
# NOTE(review): `sea_merged_whites` is not created by any cell shown above, and the
# score names / "Male"/"Female" labels do not follow the naming used in Steps 2-4;
# confirm this cell still matches the current dataset before relying on it.

sns.set_style('whitegrid')

score_list = ["wGRS49_r0", "wGRS49_r03", "wGRS49_r05", "wGRS49_r08"]
# score_list = ["PGS000349_r0", "PGS000349_r03", "PGS000349_r05", "PGS000349_r08"]
# score_list = ["PGS000018_r0", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]

# Number of columns follows score_list; squeeze=False keeps axs two-dimensional
# even when a single score is plotted
fig, axs = plt.subplots(nrows=3, ncols=len(score_list), figsize=(5*len(score_list),12), squeeze=False)

# Default two-color palette in reversed order, materialized as a list so it can be
# reused for every boxplot
case_palette = list( reversed( sns.color_palette(n_colors=2) ) )

# (value in column "sex", title template) for histogram rows 1 and 2
histogram_rows = [ ("Male", 'Males (n={0})'), ("Female", 'Females (n={0})') ]

for i, score in enumerate( score_list ):
    sns.boxplot(
        x = "Case",
        y = score,
        data = sea_merged_whites,
        palette = case_palette,
        width=0.4,
        fliersize=2,
        ax=axs[0,i],
    )
    for row, (sex_label, title_template) in enumerate( histogram_rows, start=1 ):
        is_sex = sea_merged_whites["sex"] == sex_label
        sns.histplot(
            x = score,
            data = sea_merged_whites[ is_sex ],
            hue="Case",
            hue_order=[True, False],
            kde=True,
            ax=axs[row,i]
        ).set(title=title_template.format( is_sex.sum() ) )

plt.subplots_adjust(top=1.25)

# # Plot only boxplots

# fig, axs = plt.subplots(ncols=len(score_list), figsize=(5*len(score_list),4))
# for i in range(0, len(score_list)):
#     sns.boxplot(
#         x = "Case",
#         y = score_list[i],
#         data = sea_merged_whites,
#         hue="sex",
#         hue_order=['Male','Female'],
#         width=0.4,
#         fliersize=2,
#         ax=axs[i],
#     )

In [None]:
# Test variables: distribution of age in controls vs cases
age_boxplot_options = dict( x = "Case", y = "age", width=0.4, fliersize=2 )
sns.boxplot( data = sea_merged_whites, **age_boxplot_options )

Export dataset to CSV files

In [None]:
# Earlier cells change the working directory with os.chdir, so a relative "exports/"
# path would resolve against whichever directory was visited last. Use the same
# absolute exports folder as the Step 7 export instead.
exports_dir = "/labs/tassimes/rodrigoguarischi/projects/sea/exports/"

# Export full dataset
sea_merged_whites.to_csv( os.path.join( exports_dir, "sea_whites_phenotypes_scores_full.csv" ) )

# Export only subset of columns Tim asked
select_columns = ["dbGaP_SubjID","sex","race","age","agex2","bmi","cr","wGRS49_r05","PGS000349_r05","PGS000018_r05","wGRS49_r03","PGS000349_r03","PGS000018_r03","wGRS49_r0","PGS000349_r0","PGS000018_r0","wGRS49_r08","PGS000349_r08","PGS000018_r08", "Case"]
sea_merged_whites[select_columns].to_csv( os.path.join( exports_dir, "sea_whites_phenotypes_scores_column_subset.csv" ) )

In [None]:
# Scatter of the scaled wGRS49 (r03) score against case status.
# seaborn is already imported as `sns` in the imports cell at the top of the notebook,
# so it is not re-imported here.
# Optional styling that was tried for dense plots: alpha=0.4, s=6
sns.scatterplot(
    x = "wGRS49_r03_scaled",
    y = "Case",
    data = sea_merged_whites
    )



Test logistic regression in <code>R</code>

In [None]:
# Fisher's exact test on a fixed 2x2 contingency table.
# `stats` (scipy.stats) comes from the imports cell at the top of the notebook.
# NOTE(review): the counts are hard-coded and their origin is not recorded in this
# notebook - document where they come from.
table = [[840,51663],[32,5053]]
print(table)

# fisher_exact returns the odds ratio of the table and the p-value of the test
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# Preview the first rows of the merged whites dataset.
# NOTE(review): `sea_merged_whites` is not assigned in any cell shown above - confirm
# where it is created, otherwise this cell fails on a fresh kernel.
sea_merged_whites.head()

In [None]:
# Two-sample t-test of each raw score between samples flagged in a trait's
# "<trait>_quartile4" column and the remaining samples.
# `stats` (scipy.stats) comes from the imports cell at the top of the notebook.
# NOTE(review): assumes the *_quartile4 columns hold booleans - confirm.
for score in ["wGRS49_r03", "wGRS49_r05", "wGRS49_r08", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]:
    for trait in ["tf_quartile4", "tr_quartile4", "af_quartile4", "ar_quartile4", "cf_quartile4", "cr_quartile4"]:
        # Mask selecting the flagged samples; its negation selects the comparison group
        cases = sea_merged_whites[trait]
        print( score , "\t", trait, "\t", stats.ttest_ind( sea_merged_whites[cases][score], sea_merged_whites[~cases][score] ) )

In [None]:
# Load one raw-scores file and tag every score column with the r2 threshold
# encoded at the end of the file name (e.g. "..._r0.scores.txt" -> suffix "_r0").
# Alternative considered earlier: rename the first column after the file name instead.

current_score_file = "./apply_grs/wGRS49_PGS000349_PGS000018_r0.scores.txt"

# Threshold label = last "_"-separated field of the file name before the first "."
file_stem = os.path.basename( current_score_file ).split(".")[0]
min_r2_used = file_stem.split("_")[-1]

# Comma-separated table; sample ids are cut at the first "_" and used as index
pgs_results = pd.read_table( current_score_file, sep = "," )
pgs_results["sample"] = pgs_results["sample"].str.split("_", expand = True)[0]
pgs_results = pgs_results.set_index("sample").add_suffix("_" + min_r2_used)
pgs_results.head()
