Workflow used for:
  1. Load metadata about GRSs used
  1. Read phenotypic information and consolidate it with raw PGS scores from **apply_grs.ipynb**
  1. Standardize raw scores and calculate Odds Ratios between case/controls
  1. Generate visualization plots

In [1]:
# Import all necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import seaborn as sns
import numpy as np
import os, glob
from sklearn import preprocessing
import statsmodels.api as sm
import json
from xlsxwriter.utility import xl_rowcol_to_cell

**Step 1)** Load metadata about GRSs used

In [2]:
import glob
import pandas as pd
import gzip

os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs/pgs_reference_weights/")

# Load all key/values present on the header of PGS files into a dictionary.
# Result layout: pgs_metadata[<score name>][<header key>] = <header value>, plus a
# derived "pgs_catalog_hyperlink" entry (empty string when the id is not a PGS Catalog id).
pgs_metadata = {}
for current_score_file in glob.glob( "*.txt.gz" ):
    score_name = current_score_file.replace(".txt.gz","")
    score_metadata = {}
    pgs_metadata[score_name] = score_metadata
    with gzip.open( current_score_file, 'rt' ) as f:
        for line in f:
            if not line.strip():
                continue
            if not line.startswith("#"):
                # Header comments are expected to precede the variant table, so stop at the
                # first data line instead of decompressing the whole scoring file
                break
            if line.startswith("##"):
                # Double-hash lines are not key=value pairs and are ignored
                continue
            # Drop only the leading '#' and split on the first '=' so that values
            # containing '#' or '=' (e.g. URLs in citations) are kept intact
            key, separator, value = line.strip()[1:].partition("=")
            if separator:
                score_metadata[key] = value

    # Only ids starting with "PGS" get a link to the PGS Catalog; anything else
    # (including a missing pgs_id) gets an empty hyperlink
    pgs_id = str( score_metadata.get("pgs_id", "") )
    if pgs_id.startswith("PGS"):
        score_metadata["pgs_catalog_hyperlink"] = "https://www.pgscatalog.org/score/{0}/".format(pgs_id)
    else:
        score_metadata["pgs_catalog_hyperlink"] = ""

**Step 2)** Read phenotypic information and consolidate with raw PGS scores

In [3]:
# Work from the project root so the phenotype path below resolves
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/")

# Code -> label maps applied to the "sex" and "race" columns
phenotype_recoding = {
    "sex": { 1:"male", 2:"female" },
    "race": { 1:"white", 2:"black" }
    }

# Read phenotypes (indexed by seaid, '#' comment lines skipped) and recode sex and race
sea_phenotypes = pd.read_table(
    "data_preparation_to_imputation/86679/NHLBI/SEA_Herrington/phs000349v1/p1/phenotype/phs000349.v1.pht002191.v1.p1.c1.SEA_Phase2_Subject_Phenotypes.GRU.txt",
    index_col="seaid",
    comment="#"
    ).replace( phenotype_recoding )

# Print counts by race and sex, then preview the first lines of the dataframe
subjects_per_group = sea_phenotypes.groupby(["race","sex"])["sex"].count()
print( subjects_per_group )
sea_phenotypes.head()

race   sex   
black  female     92
       male      412
white  female    128
       male      436
Name: sex, dtype: int64


Unnamed: 0_level_0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,cf,cr,rltotal,rlmean
seaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
pd10016,445969,female,white,29,841,19.9,6.7,0.0,10.7,0.0,3.3,0.0,0.0,0.0
pd10018,445970,male,white,30,900,28.3,27.3,0.0,22.0,0.0,3.4,0.6,0.6,0.2
pd10023,445971,male,white,27,729,21.7,27.3,0.0,55.0,0.0,4.0,0.0,0.0,0.0
pd10028,445972,male,black,31,961,29.0,16.0,0.0,36.8,14.8,1.7,1.0,15.8,5.266667
pd10031,445973,male,white,28,784,21.5,20.0,0.0,16.7,0.0,0.0,0.0,0.0,0.0


In [4]:
# Read files with raw scores of multiple GRSs

# 'li' collects the dataframes that are joined column-wise at the end of the cell;
# the phenotype table is always the first element
li = [sea_phenotypes]

os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/apply_grs/raw_scores_20220504/")

# Loop over info files and save info as a dictionary of dictionaries:
# scores_info_dict[<panel>_<minr2>_<score name>][<race label from filename>] = metrics
scores_info_dict = {}
for current_info_file in glob.glob( "*.info.txt" ):

    # Parse filename to get info about GRSs.
    # Format should be: <hrc|topmed>_<whites|blacks>_<8-digits_date>_multiGRS_<minr2_used>.info.txt
    info_basename = os.path.basename( current_info_file )
    min_r2_used = info_basename.split(".")[0].split("_")[-1]
    reference_panel_name = info_basename.split("_")[0]
    race = info_basename.split("_")[1]

    # Context manager guarantees the handle is closed even if the JSON is malformed
    with open( current_info_file ) as f:
        data = json.load(f)

    # Load metrics using panel, r2 and PGS name as the outer key and race as the inner key
    for score_result in data:
        key = "_".join( (reference_panel_name, min_r2_used, score_result["name"]) )
        scores_info_dict.setdefault( key, {} )[race] = score_result

# Loop over scores files, load all into memory and save it to list object 'li'
for current_score_file in glob.glob( "*.scores.txt" ):

    grs_results = pd.read_table( current_score_file, sep = "," )
    # Keep only the part of the sample id before the first underscore so it can be used as index
    grs_results["sample"] = grs_results["sample"].str.split("_", expand = True)[0]
    grs_results = grs_results.set_index("sample")

    score_basename = os.path.basename( current_score_file )
    min_r2_used = score_basename.split(".")[0].split("_")[-1]
    reference_panel_name = score_basename.split("_")[0]

    # Add prefix to column names. Names should match pattern <hrc|topmed>_<threshold>_<scoreid>
    grs_results = grs_results.add_prefix(reference_panel_name + "_" + min_r2_used + "_")

    # Test if this set of scores already exists in li.
    # If the other race was already loaded, append subjects to the dataframe. Otherwise, append dataframe to li
    new_score_set = True
    score_columns = set( grs_results.columns )
    for i in range( len(li) ):
        if( score_columns == set( li[i].columns ) ):
            li[i] = pd.concat( [li[i], grs_results], axis=0 )
            new_score_set = False
            # Matching sets are always merged, so at most one element of li can match
            break

    if( new_score_set ):
        li.append( grs_results )

# Consolidate data into a dataframe and print first lines
sea_dataset_full = pd.concat( li, axis=1 )
sea_dataset_full.head()

Unnamed: 0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,topmed_r03_PGS000013,topmed_r03_PGS001357,topmed_r03_PGS002114,topmed_r03_PGS001105,topmed_r03_PGS000349,topmed_r03_PGS001917,topmed_r03_PGS001818,topmed_r03_PGS000957,topmed_r03_PGS000889,topmed_r03_PGS000018
pd10016,445969,female,white,29,841,19.9,6.7,0.0,10.7,0.0,...,0.195099,-0.03217,2.662745,4.381828,0.899946,0.011109,-0.147684,0.027173,0.945526,0.63451
pd10018,445970,male,white,30,900,28.3,27.3,0.0,22.0,0.0,...,0.181852,-0.23224,1.028924,1.048539,1.140902,-0.013622,0.644547,0.874353,0.87019,1.261631
pd10023,445971,male,white,27,729,21.7,27.3,0.0,55.0,0.0,...,0.151067,-0.191068,3.098229,1.721006,0.694613,0.001444,0.470915,0.735025,1.365498,0.623044
pd10028,445972,male,black,31,961,29.0,16.0,0.0,36.8,14.8,...,-0.016966,-0.251807,3.16572,4.340293,0.627491,0.029261,0.090178,1.278612,-0.102817,1.027068
pd10031,445973,male,white,28,784,21.5,20.0,0.0,16.7,0.0,...,0.047663,-0.204522,3.074076,2.298869,1.016425,0.006443,-0.178284,0.897139,-0.176216,0.740625


In [5]:
# Assign case/control classes to samples using top quartile rule splitting by sex

# Phenotype of interest
phenotype = "cr"

# Calculate thresholds for each race and sex
print( "Thresholds to be used for case/control definition:" )
thresholds = {}
for race in ["white", "black"]:
    for sex in ["male", "female"]:

        # Define Q3 as threshold and save it in thresholds dictionary
        key = race + "_" + sex
        in_group = ( sea_dataset_full["race"] == race ) & ( sea_dataset_full["sex"] == sex )
        thresholds[ key ] = sea_dataset_full.loc[ in_group, phenotype ].quantile(0.75)

        # Print values and warnings, if needed
        note = ""
        if( thresholds[ key ] == 0 ):
            note = "(WARNING: This group has Q3 equals zero. Considering only non-zeros as CASE group!!)"
        print( " - {0} = {1:.4f} {2}".format( key, thresholds[key], note ) )

# Look up each subject's threshold through its "<race>_<sex>" key. Subjects whose
# race/sex is missing or not in the dictionary get a NaN threshold.
subject_group_key = sea_dataset_full["race"].astype(str) + "_" + sea_dataset_full["sex"].astype(str)
subject_threshold = subject_group_key.map( thresholds )

# A subject is a Case only when strictly above its group threshold. Comparisons
# against NaN (missing phenotype or threshold) evaluate to False, i.e. Control.
sea_dataset_full = sea_dataset_full.assign( Case=sea_dataset_full[phenotype] > subject_threshold )

# Print summary of Case/Controls for each subgroup
print( sea_dataset_full.groupby(["race","sex","Case"])["Case"].count() )
sea_dataset_full.head()

Thresholds to be used for case/control definition:
 - white_male = 1.6000 
 - white_female = 0.4250 
 - black_male = 1.0000 
race   sex     Case 
black  female  False     73
               True      19
       male    False    311
               True     101
white  female  False     96
               True      32
       male    False    328
               True     108
Name: Case, dtype: int64


Unnamed: 0,dbGaP SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,topmed_r03_PGS001357,topmed_r03_PGS002114,topmed_r03_PGS001105,topmed_r03_PGS000349,topmed_r03_PGS001917,topmed_r03_PGS001818,topmed_r03_PGS000957,topmed_r03_PGS000889,topmed_r03_PGS000018,Case
pd10016,445969,female,white,29,841,19.9,6.7,0.0,10.7,0.0,...,-0.03217,2.662745,4.381828,0.899946,0.011109,-0.147684,0.027173,0.945526,0.63451,False
pd10018,445970,male,white,30,900,28.3,27.3,0.0,22.0,0.0,...,-0.23224,1.028924,1.048539,1.140902,-0.013622,0.644547,0.874353,0.87019,1.261631,False
pd10023,445971,male,white,27,729,21.7,27.3,0.0,55.0,0.0,...,-0.191068,3.098229,1.721006,0.694613,0.001444,0.470915,0.735025,1.365498,0.623044,False
pd10028,445972,male,black,31,961,29.0,16.0,0.0,36.8,14.8,...,-0.251807,3.16572,4.340293,0.627491,0.029261,0.090178,1.278612,-0.102817,1.027068,False
pd10031,445973,male,white,28,784,21.5,20.0,0.0,16.7,0.0,...,-0.204522,3.074076,2.298869,1.016425,0.006443,-0.178284,0.897139,-0.176216,0.740625,False


**Step 3)** **Standardize** raw scores and calculate **Odds Ratios** between case/controls

In [6]:
# Standardize raw scores and calculate OR

li = []
standardize_scores = []
logit_models_dict = {}

# Get list of panels, thresholds and GRSs used from info dictionary.
# Keys follow <panel>_<threshold>_<GRS name>; splitting at most twice keeps GRS names
# that themselves contain underscores intact.
parsed_info_keys = [ key.split('_', 2) for key in scores_info_dict.keys() ]
reference_panels = set( fields[0] for fields in parsed_info_keys )
thresholds = set( fields[1] for fields in parsed_info_keys )
grss = set( fields[2] for fields in parsed_info_keys )

# Normalize all scores with mean = 0 and SD = 1
for race in ["white", "black"]:

    # Subset cohort between whites and blacks to run logistic regression individually.
    # A dependent variable named "Case_recoded" is derived from column "Case",
    # coding it as 0 and 1 to fit the glm. assign() returns a new dataframe, so the
    # columns added below never touch sea_dataset_full.
    sea_dataset_subset = sea_dataset_full[ sea_dataset_full["race"] == race ]
    sea_dataset_subset = sea_dataset_subset.assign( Case_recoded=sea_dataset_subset["Case"].astype(int) )

    for reference_panel_id in reference_panels:

        for threshold in thresholds:

            for grs in grss:

                raw_score_name = "_".join( (reference_panel_id, threshold, grs) )
                standardize_score_name = "_".join( (race, reference_panel_id, threshold, grs) )
                standardize_scores.append(standardize_score_name)

                # Standardize raw scores using method scale (within the current race subset)
                sea_dataset_subset[ standardize_score_name ] = preprocessing.scale( sea_dataset_subset[ raw_score_name ] )

                # Fit a logistic model using standardize scores, adjusting for age and sex,
                # and save it on dictionary
                fitted_model = sm.formula.glm(
                    "Case_recoded ~ " + standardize_score_name + " + age + sex",
                    family=sm.families.Binomial(),
                    data=sea_dataset_subset).fit()
                logit_models_dict[ standardize_score_name ] = fitted_model

                # Confidence interval table is computed once; columns 0 and 1 hold the bounds
                conf_interval = fitted_model.conf_int()

                # Create dictionary with scores performances (coefficients are exponentiated
                # to express them as odds ratios)
                model_summary_series = {
                        "model_ref": raw_score_name,
                        "race": race,
                        "reference_panel": reference_panel_id,
                        "threshold": threshold,
                        "GRS": grs,
                        "odds_ratio": np.exp( fitted_model.params[ standardize_score_name ] ),
                        "score_pvalue": fitted_model.pvalues[ standardize_score_name ],
                        "conf_interval_lower": np.exp( conf_interval[0][ standardize_score_name ] ),
                        "conf_interval_upper": np.exp( conf_interval[1][ standardize_score_name ] )
                    }
                # Add additional info from info files (coverage, etc).
                # Info files are keyed by the plural race label ("whites"/"blacks")
                model_summary_series.update( scores_info_dict[raw_score_name][race + "s"] )

                # Add metadata about the GRSs to the output
                model_summary_series.update( pgs_metadata[grs] )

                # Transform dictionary on a pandas Series and name it as <standardize_score_name>
                model_summary_series = pd.Series(
                    data=model_summary_series,
                    name=standardize_score_name
                )

                li.append(model_summary_series)

# Concatenate results for all models tested and transpose object to make visualization easier
models_summaries = pd.concat( li, axis=1 ).transpose().sort_values("odds_ratio", ascending=False)
models_summaries

Unnamed: 0,model_ref,race,reference_panel,threshold,GRS,odds_ratio,score_pvalue,conf_interval_lower,conf_interval_upper,name,...,trait_reported,trait_mapped,trait_efo,weight_type,genome_build,variants_number,pgp_id,citation,pgs_catalog_hyperlink,license
white_hrc_r03_PGS000013,hrc_r03_PGS000013,white,hrc,r03,PGS000013,1.43122,0.000648759,1.16472,1.75871,PGS000013,...,Coronary artery disease,coronary artery disease,EFO_0001645,NR,hg19,6630150,PGP000006,Khera AV et al. Nat Genet (2018). doi:10.1038/...,https://www.pgscatalog.org/score/PGS000013/,Freely available to the academic community for...
white_topmed_r0_PGS000013,topmed_r0_PGS000013,white,topmed,r0,PGS000013,1.41677,0.000907223,1.15324,1.74052,PGS000013,...,Coronary artery disease,coronary artery disease,EFO_0001645,NR,hg19,6630150,PGP000006,Khera AV et al. Nat Genet (2018). doi:10.1038/...,https://www.pgscatalog.org/score/PGS000013/,Freely available to the academic community for...
white_topmed_r03_PGS000013,topmed_r03_PGS000013,white,topmed,r03,PGS000013,1.40907,0.00108849,1.14702,1.73098,PGS000013,...,Coronary artery disease,coronary artery disease,EFO_0001645,NR,hg19,6630150,PGP000006,Khera AV et al. Nat Genet (2018). doi:10.1038/...,https://www.pgscatalog.org/score/PGS000013/,Freely available to the academic community for...
black_hrc_r08_wGRS49,hrc_r08_wGRS49,black,hrc,r08,wGRS49,1.40481,0.00165781,1.13668,1.73619,wGRS49,...,Coronary artery disease,coronary artery disease,EFO_0001645,beta,hg19,49,wGRS49,Circ Cardiovasc Genet. 2015;8:803-811. DOI: 10...,,
white_hrc_r0_PGS000013,hrc_r0_PGS000013,white,hrc,r0,PGS000013,1.40474,0.00113998,1.14467,1.72389,PGS000013,...,Coronary artery disease,coronary artery disease,EFO_0001645,NR,hg19,6630150,PGP000006,Khera AV et al. Nat Genet (2018). doi:10.1038/...,https://www.pgscatalog.org/score/PGS000013/,Freely available to the academic community for...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
black_hrc_r08_PGS002009,hrc_r08_PGS002009,black,hrc,r08,PGS002009,0.839041,0.100641,0.680446,1.0346,PGS002009,...,"Systolic blood pressure, automated reading",systolic blood pressure,EFO_0006335,beta,GRCh37,68449,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS002009/,
black_hrc_r03_PGS002037,hrc_r03_PGS002037,black,hrc,r03,PGS002037,0.838776,0.0937109,0.682909,1.03022,PGS002037,...,Tobacco use disorder,nicotine dependence,EFO_0003768,beta,GRCh37,847691,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS002037/,
black_hrc_r08_PGS001900,hrc_r08_PGS001900,black,hrc,r08,PGS001900,0.823963,0.0713973,0.667562,1.01701,PGS001900,...,"Diastolic blood pressure, automated reading",diastolic blood pressure,EFO_0006336,beta,GRCh37,66335,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS001900/,
black_hrc_r05_PGS002037,hrc_r05_PGS002037,black,hrc,r05,PGS002037,0.818149,0.0560459,0.665912,1.00519,PGS002037,...,Tobacco use disorder,nicotine dependence,EFO_0003768,beta,GRCh37,847691,PGP000263,Privé F et al. Am J Hum Genet (2022). doi:10.1...,https://www.pgscatalog.org/score/PGS002037/,


In [7]:
# Export dataframe with summaries on a xls file on the exports folder
# NOTE(review): the file is written by the xlsxwriter engine but named ".xls";
# confirm whether the extension should be ".xlsx" before changing the path.

# The context manager saves and closes the workbook on exit
# (ExcelWriter.save() is no longer available in pandas 2.x)
with pd.ExcelWriter(
    path = "/labs/tassimes/rodrigoguarischi/projects/sea/exports/SEA_models_summary.xls",
    engine = 'xlsxwriter'
    ) as writer:

    models_summaries.to_excel(writer, sheet_name='models_summary')

    # Get access to the workbook and sheet
    workbook = writer.book
    worksheet = writer.sheets['models_summary']

    # Sheet layout: row 0 is the header and column 0 (A) holds the dataframe index,
    # so data column k of the dataframe lands in sheet column k + 1
    last_row = len( models_summaries.index )
    last_column = len( models_summaries.columns )
    hyperlink_column = models_summaries.columns.get_loc("pgs_catalog_hyperlink") + 1

    # Automatically add filters to all written columns
    worksheet.autofilter( 0, 0, last_row, last_column )

    # Add a percent format with 2 decimal points
    percent_fmt = workbook.add_format({'num_format': '0.00%', 'align': 'left'})

    # Add a left-alignment format
    left_alignment_fmt = workbook.add_format({'align': 'left'})

    # Set all columns to left alignment
    worksheet.set_column('A:AH', 15, left_alignment_fmt )

    # Format the columns by width and include number formats
    # NOTE(review): the column letters below assume a fixed column order in
    # models_summaries — confirm they still point at the intended columns.
    worksheet.set_column('A:B', 30, left_alignment_fmt)
    worksheet.set_column('Z:AA', 40, left_alignment_fmt)
    worksheet.set_column('AG:AG', 70, left_alignment_fmt)
    worksheet.set_column('AH:AH', 40, left_alignment_fmt)

    # Format coverage column as percent
    worksheet.set_column('O:O', 15, percent_fmt)

    # Rewrite each PGS Catalog link as a clickable URL in the hyperlink column.
    # Row numbers start at 1 because row 0 is the header.
    for row_number, hyperlink in enumerate( models_summaries["pgs_catalog_hyperlink"], start=1 ):

        # Scores without a catalog entry carry an empty hyperlink; leave those cells as written
        if isinstance(hyperlink, str) and hyperlink:
            worksheet.write_url( row_number, hyperlink_column, hyperlink )

In [None]:
# Plain (unformatted) export of the summaries dataframe to the exports folder.
# It targets the same file path as the formatted xlsxwriter export in the cell above.
models_summaries.to_excel(
    excel_writer="/labs/tassimes/rodrigoguarischi/projects/sea/exports/SEA_models_summary.xls",
    sheet_name="models_summary",
)

In [None]:
# Print the full regression summary of one fitted model.
# Keys of logit_models_dict follow <race>_<panel>_<threshold>_<GRS> with the race
# in lowercase ("white"/"black"), so the key must be lowercase too.
model_name = "white_hrc_r03_wGRS49"
print( logit_models_dict[model_name].summary() )
# print( logit_models_dict[model_name].pvalues[model_name] )
# print( np.exp( logit_models_dict[model_name].params )[ model_name ] )
# print( np.exp( logit_models_dict[model_name].conf_int().loc[[ model_name ]] ) )

In [None]:
# Plot boxplots (row 1) and per-sex histograms (rows 2 and 3) of raw scores, one column per score
# NOTE(review): sea_merged_whites is not created by any cell shown in this notebook, and the
# column names / sex labels used here ("wGRS49_r0", "Male") differ from the ones built in the
# cells above — confirm where this dataframe comes from before running.

sns.set_style('whitegrid')

score_list = ["wGRS49_r0", "wGRS49_r03", "wGRS49_r05", "wGRS49_r08"]
# score_list = ["PGS000349_r0", "PGS000349_r03", "PGS000349_r05", "PGS000349_r08"]
# score_list = ["PGS000018_r0", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]

# Default two-colour palette in reversed order, materialised as a list so it can be
# reused for every boxplot (a reversed() iterator would be exhausted after first use)
case_palette = sns.color_palette(n_colors=2)[::-1]

# One grid column per score; squeeze=False keeps axs two-dimensional even for a single score
fig, axs = plt.subplots(nrows=3, ncols=len(score_list), figsize=(5*len(score_list),12), squeeze=False)
for i, score in enumerate(score_list):
    sns.boxplot(
        x = "Case",
        y = score,
        data = sea_merged_whites,
        palette = case_palette,
        width=0.4,
        fliersize=2,
        ax=axs[0,i],
    )

    # Same histogram for each sex: grid row 1 for males, grid row 2 for females
    for grid_row, (sex_label, title_label) in enumerate( [("Male", "Males"), ("Female", "Females")], start=1 ):
        sex_subset = sea_merged_whites[ sea_merged_whites["sex"] == sex_label ]
        sns.histplot(
            x = score,
            data = sex_subset,
            hue="Case",
            hue_order=[True, False],
            kde=True,
            ax=axs[grid_row,i]
        ).set(title='{0} (n={1})'.format( title_label, len(sex_subset) ) )

plt.subplots_adjust(top=1.25)

# # Plot only boxplots

# fig, axs = plt.subplots(ncols=len(score_list), figsize=(5*len(score_list),4))
# for i in range(0, len(score_list)):
#     sns.boxplot(
#         x = "Case",
#         y = score_list[i],
#         data = sea_merged_whites,
#         hue="sex",
#         hue_order=['Male','Female'],
#         width=0.4,
#         fliersize=2,
#         ax=axs[i],
#     )

In [None]:
# Test variables: age distribution split by the Case flag
sns.boxplot( data = sea_merged_whites, x = "Case", y = "age", width=0.4, fliersize=2 )

Export dataset to CSV files

In [None]:
# Write the complete dataframe (all columns) to CSV; the path is relative to the current working directory
sea_merged_whites.to_csv("exports/sea_whites_phenotypes_scores_full.csv")

# Write a second CSV restricted to the subset of columns Tim asked for
select_columns = [
    "dbGaP_SubjID", "sex", "race", "age", "agex2", "bmi", "cr",
    "wGRS49_r05", "PGS000349_r05", "PGS000018_r05",
    "wGRS49_r03", "PGS000349_r03", "PGS000018_r03",
    "wGRS49_r0", "PGS000349_r0", "PGS000018_r0",
    "wGRS49_r08", "PGS000349_r08", "PGS000018_r08",
    "Case",
]
sea_merged_whites.loc[:, select_columns].to_csv("exports/sea_whites_phenotypes_scores_column_subset.csv")

In [None]:
import seaborn as sns

# Scatter the Case flag against the scaled wGRS49 (r03) score
sns.scatterplot(
    data = sea_merged_whites,
    x = "wGRS49_r03_scaled",
    y = "Case"
    # Optional marker styling, currently disabled:
    # ,
    # alpha=0.4,
    # s=6
    )



Test logistic regression in <code>R</code>

In [None]:
import scipy.stats as stats

# table = df.groupby(level="Cancer").sum().values

# Hard-coded 2x2 contingency table used to try out Fisher's exact test
table = [
    [840, 51663],
    [32, 5053],
]
print(table)

# fisher_exact returns the odds ratio first and the p-value second
fisher_result = stats.fisher_exact(table)
oddsratio, pvalue = fisher_result
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# Preview the first rows of sea_merged_whites
# NOTE(review): sea_merged_whites is not assigned in any cell shown above — confirm its origin
sea_merged_whites.head()

In [None]:
from scipy import stats

# For every score / top-quartile flag pair, compare the score between flagged and
# unflagged subjects with scipy's ttest_ind and print one result line per pair
candidate_scores = ["wGRS49_r03", "wGRS49_r05", "wGRS49_r08", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]
quartile_flag_columns = ["tf_quartile4", "tr_quartile4", "af_quartile4", "ar_quartile4", "cf_quartile4", "cr_quartile4"]

for score in candidate_scores:
    for traits in quartile_flag_columns:
        # The flag column is used as a mask: flagged rows vs its complement
        cases = sea_merged_whites[traits]
        flagged_scores = sea_merged_whites.loc[cases, score]
        unflagged_scores = sea_merged_whites.loc[~cases, score]
        print( score , "\t", traits, "\t", stats.ttest_ind( flagged_scores, unflagged_scores ) )

In [None]:
#

# Load a single raw-scores file and tag its score columns with the min r2 taken from the file name
current_score_file = "./apply_grs/wGRS49_PGS000349_PGS000018_r0.scores.txt"

# The min r2 label is the last underscore-separated token before the first dot of the file name
min_r2_used = os.path.basename( current_score_file ).split(".")[0].split("_")[-1]

pgs_results = pd.read_table( current_score_file, sep = "," )
pgs_results = (
    pgs_results
    # Keep only the part of the sample id before the first underscore
    .assign( sample=pgs_results["sample"].str.split("_", expand = True)[0] )
    .set_index("sample")
    .add_suffix("_" + min_r2_used)
)
pgs_results.head()
    # score_name = os.path.basename( current_score_file ).split(".")[0]
    # pgs_results.rename( columns={ pgs_results.columns[0]:score_name }, inplace=True )
