In [1]:
# Import all necessary libraries

import glob
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the subject phenotype table (indexed by "seaid") and turn the numeric
# sex / race codes into readable labels.

phenotype_path = "./raw_files/SEA/SEA_Phase2_Subject_Phenotypes.txt"

# Numeric code -> label, applied only to the column of the same name
sex_recode = {1: "Male", 2: "Female"}
race_recode = {1: "White", 2: "Black"}

sea_phenotypes = (
    pd.read_csv(phenotype_path, sep="\t", index_col="seaid")
    .replace({"sex": sex_recode, "race": race_recode})
)

print( "Type: ", type(sea_phenotypes) )
print( sea_phenotypes.head() )

# Number of subjects in every race x sex combination
sea_phenotypes.groupby(["race", "sex"])["sex"].count()

Type:  <class 'pandas.core.frame.DataFrame'>
         dbGaP_SubjID     sex   race  age  agex2   bmi    tf   tr    af    ar  \
seaid                                                                           
pd10016        445969  Female  White   29    841  19.9   6.7  0.0  10.7   0.0   
pd10018        445970    Male  White   30    900  28.3  27.3  0.0  22.0   0.0   
pd10023        445971    Male  White   27    729  21.7  27.3  0.0  55.0   0.0   
pd10028        445972    Male  Black   31    961    29  16.0  0.0  36.8  14.8   
pd10031        445973    Male  White   28    784  21.5  20.0  0.0  16.7   0.0   

          cf   cr  rltotal    rlmean  
seaid                                 
pd10016  3.3  0.0      0.0  0.000000  
pd10018  3.4  0.6      0.6  0.200000  
pd10023  4.0  0.0      0.0  0.000000  
pd10028  1.7  1.0     15.8  5.266667  
pd10031  0.0  0.0      0.0  0.000000  


race   sex   
Black  Female     92
       Male      412
White  Female    128
       Male      436
Name: sex, dtype: int64

In [3]:
# Read files with raw scores of multiple PRSs and join them, column-wise, to the
# phenotype table. Each score file contributes its columns suffixed with the last
# "_"-separated token of the file name (e.g. "..._r05.scores.txt" -> "_r05").

# Frames to be concatenated side by side on the sample ID index; phenotypes go first
li = [sea_phenotypes]

# score_file_pattern = "./apply_grs/results_genotypes/*.scores.txt"
score_file_pattern = "./apply_grs/results_dosage/*.scores.txt"

# glob returns paths in arbitrary (filesystem) order; sorting makes the column
# order of the merged frame reproducible between runs and machines.
score_files = sorted( glob.glob( score_file_pattern ) )

# Without this check an empty match would silently produce a frame with no score
# columns, and the failure would only surface later as a KeyError.
if not score_files:
    raise FileNotFoundError( "No score files found matching: " + score_file_pattern )

for current_score_file in score_files:
    pgs_results = pd.read_table( current_score_file, sep = "," )

    # Keep only the part of the sample name before the first "_" so that it can be
    # aligned with the phenotype index
    pgs_results["sample"] = pgs_results["sample"].str.split("_", expand = True)[0]
    pgs_results = pgs_results.set_index("sample")

    # Last "_" token of the file name (before the first ".") is used as column suffix
    min_r2_used = os.path.basename( current_score_file ).split(".")[0].split("_")[-1]
    pgs_results = pgs_results.add_suffix("_" + min_r2_used)

    li.append( pgs_results )

# axis=1 aligns on the index and keeps the union of sample IDs (missing values become NaN)
sea_merged = pd.concat( li, axis=1)
sea_merged.head()

Unnamed: 0,dbGaP_SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,wGRS49_r05,PGS000349_r05,PGS000018_r05,PGS000667_r05,PGS000889_r05,wGRS49_r08,PGS000349_r08,PGS000018_r08,PGS000667_r08,PGS000889_r08
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,...,0.156018,0.415888,0.352879,0.0,0.430879,0.25144,0.777171,0.223938,0.0,0.391506
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,...,0.121146,0.327556,0.619952,0.0,0.390182,0.205458,0.621752,0.271482,0.0,0.165293
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,...,0.086578,-0.043722,0.221966,0.0,0.709424,0.114574,0.282849,0.044667,0.0,0.229231
pd10028,445972,Male,Black,31,961,29.0,16.0,0.0,36.8,14.8,...,,,,,,,,,,
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,...,0.034744,0.255067,0.390702,0.0,-0.253576,0.026546,0.553965,0.125797,0.0,-0.210833


In [4]:
# Assign case/control classes to samples using top quartile rule splitting by sex:
# a subject is a case when its phenotype value is strictly above the 75th
# percentile computed among subjects of the same sex.

# Phenotype of interest
phenotype = "cr"

# Subset cohort
sea_merged_whites = sea_merged[ sea_merged["race"] == "White" ]

# Calculate thresholds (75th percentile of the phenotype) for each sex
male_threshold = sea_merged_whites.loc[ sea_merged_whites["sex"] == "Male", phenotype ].quantile(0.75)
female_threshold = sea_merged_whites.loc[ sea_merged_whites["sex"] == "Female", phenotype ].quantile(0.75)
print( "Thresholds to be used for case/control definition: male = {0} and female = {1}".format( male_threshold, female_threshold) )

if male_threshold == 0 or female_threshold == 0:
    # SystemExit is exactly what sys.exit() raises; raising it directly keeps the
    # intended "stop here" behaviour without requiring an `import sys`.
    raise SystemExit("Male or Female threshold equals to zero!")

# Per-subject threshold selected by sex; any other sex value maps to NaN
threshold_by_sex = sea_merged_whites["sex"].map( {"Male": male_threshold, "Female": female_threshold} )

# Add a boolean column called Case. Comparisons involving NaN (missing phenotype or
# unmapped sex) evaluate to False, so those subjects stay in the control group.
sea_merged_whites = sea_merged_whites.assign( Case = sea_merged_whites[phenotype] > threshold_by_sex )

sea_merged_whites.head()

Thresholds to be used for case/control definition: male = 1.6 and female = 0.42500000000000004


Unnamed: 0,dbGaP_SubjID,sex,race,age,agex2,bmi,tf,tr,af,ar,...,PGS000349_r05,PGS000018_r05,PGS000667_r05,PGS000889_r05,wGRS49_r08,PGS000349_r08,PGS000018_r08,PGS000667_r08,PGS000889_r08,Case
pd10016,445969,Female,White,29,841,19.9,6.7,0.0,10.7,0.0,...,0.415888,0.352879,0.0,0.430879,0.25144,0.777171,0.223938,0.0,0.391506,False
pd10018,445970,Male,White,30,900,28.3,27.3,0.0,22.0,0.0,...,0.327556,0.619952,0.0,0.390182,0.205458,0.621752,0.271482,0.0,0.165293,False
pd10023,445971,Male,White,27,729,21.7,27.3,0.0,55.0,0.0,...,-0.043722,0.221966,0.0,0.709424,0.114574,0.282849,0.044667,0.0,0.229231,False
pd10031,445973,Male,White,28,784,21.5,20.0,0.0,16.7,0.0,...,0.255067,0.390702,0.0,-0.253576,0.026546,0.553965,0.125797,0.0,-0.210833,False
pd10037,445974,Male,White,24,576,23.0,21.0,0.0,10.0,0.0,...,0.204,0.346151,0.0,0.484111,-0.001591,0.62315,0.20324,0.0,0.290753,False


In [None]:
# Plot boxplots and histograms of raw scores, splitting by sex.
# Row 0: boxplot of each score for controls vs. cases (both sexes together).
# Row 1: histogram of each score among males, coloured by case status.
# Row 2: histogram of each score among females, coloured by case status.

sns.set_style('whitegrid')

score_list = ["wGRS49_r0", "wGRS49_r03", "wGRS49_r05", "wGRS49_r08"]
# score_list = ["PGS000349_r0", "PGS000349_r03", "PGS000349_r05", "PGS000349_r08"]
# score_list = ["PGS000018_r0", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]

n_scores = len(score_list)

# Row masks and subsets are the same for every score, so build them once
is_male = sea_merged_whites["sex"] == "Male"
is_female = sea_merged_whites["sex"] == "Female"
males = sea_merged_whites[ is_male ]
females = sea_merged_whites[ is_female ]

# First two colours of the current palette in reverse order, as a concrete list
# (not a one-shot iterator) so it can be reused for every subplot
case_palette = sns.color_palette(n_colors=2)[::-1]

# One column per score; squeeze=False keeps `axs` two-dimensional even when
# score_list holds a single entry
fig, axs = plt.subplots(nrows=3, ncols=n_scores, figsize=(5*n_scores, 12), squeeze=False)
for i, score in enumerate(score_list):
    sns.boxplot(
        x = "Case",
        y = score,
        data = sea_merged_whites,
        palette = case_palette,
        width=0.4,
        fliersize=2,
        ax=axs[0,i],
    )
    sns.histplot(
        x = score,
        data = males,
        hue="Case",
        hue_order=[True, False],
        kde=True,
        ax=axs[1,i]
    ).set(title='Males (n={0})'.format( is_male.sum() ) )
    sns.histplot(
        x = score,
        data = females,
        hue="Case",
        hue_order=[True, False],
        kde=True,
        ax=axs[2,i]
    ).set(title='Females (n={0})'.format( is_female.sum() ) )

plt.subplots_adjust(top=1.25)

# # Plot only boxplots

# fig, axs = plt.subplots(ncols=len(score_list), figsize=(5*len(score_list),4))
# for i in range(0, len(score_list)):
#     sns.boxplot(
#         x = "Case",
#         y = score_list[i],
#         data = sea_merged_whites,
#         hue="sex",
#         hue_order=['Male','Female'],
#         width=0.4,
#         fliersize=2,
#         ax=axs[i],
#     )

In [None]:
# Quick look at a covariate: distribution of "age" for each value of "Case"
sns.boxplot(
    data = sea_merged_whites,
    x = "Case",
    y = "age",
    fliersize=2,
    width=0.4
)

Export dataset to CSV files

In [5]:
# Make sure the output directory exists; to_csv does not create missing
# directories and would fail on a fresh checkout otherwise.
os.makedirs("exports", exist_ok=True)

# Export full dataset
sea_merged_whites.to_csv("exports/sea_whites_phenotypes_scores_full.csv")

# Export only the subset of columns Tim asked for
select_columns = [
    "dbGaP_SubjID", "sex", "race", "age", "agex2", "bmi", "cr",
    "wGRS49_r05", "PGS000349_r05", "PGS000018_r05",
    "wGRS49_r03", "PGS000349_r03", "PGS000018_r03",
    "wGRS49_r0", "PGS000349_r0", "PGS000018_r0",
    "wGRS49_r08", "PGS000349_r08", "PGS000018_r08",
    "Case",
]
sea_merged_whites[select_columns].to_csv("exports/sea_whites_phenotypes_scores_column_subset.csv")

Test logistic regression in <code>R</code>

In [None]:
from scipy import stats

# Fisher's exact test on a hard-coded 2x2 contingency table.
# table = df.groupby(level="Cancer").sum().values
table = [
    [840, 51663],
    [32, 5053],
]
print(table)

# fisher_exact yields the odds ratio first and the p-value second
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# Preview the first rows of the case/control-annotated frame
sea_merged_whites.head()

In [None]:
from scipy import stats

# Two-sample t-test of every score between the subjects flagged by a trait column
# and the remaining subjects. One line is printed per (score, trait) pair.
#
# NOTE(review): the "*_quartile4" columns are not created in any cell visible in
# this notebook; they are assumed to be boolean columns of sea_merged_whites —
# confirm where they come from before running.
scores_to_test = ["wGRS49_r03", "wGRS49_r05", "wGRS49_r08", "PGS000018_r03", "PGS000018_r05", "PGS000018_r08"]
traits_to_test = ["tf_quartile4", "tr_quartile4", "af_quartile4", "ar_quartile4", "cf_quartile4", "cr_quartile4"]

for score in scores_to_test:
    for trait in traits_to_test:
        cases = sea_merged_whites[trait]
        result = stats.ttest_ind(
            sea_merged_whites.loc[cases, score],
            sea_merged_whites.loc[~cases, score],
            # Score columns can hold NaN for subjects without a computed score; the
            # default policy ("propagate") would turn the whole result into NaN.
            nan_policy="omit",
        )
        print( score , "\t", trait, "\t", result )

In [None]:
# Load one score file on its own and preview it, using the same sample-name
# clean-up and column suffixing that is applied when all score files are merged.

current_score_file = "./apply_grs/wGRS49_PGS000349_PGS000018_r0.scores.txt"

# Suffix = last "_"-separated token of the file name, ignoring everything after the first "."
min_r2_used = os.path.basename( current_score_file ).split(".")[0].split("_")[-1]

pgs_results = pd.read_csv( current_score_file, sep = "," )

# Sample names keep only the text before their first "_"
pgs_results["sample"] = pgs_results["sample"].str.split("_", expand = True)[0]

pgs_results = (
    pgs_results
    .set_index("sample")
    .add_suffix("_" + min_r2_used)
)
pgs_results.head()
# score_name = os.path.basename( current_score_file ).split(".")[0]
# pgs_results.rename( columns={ pgs_results.columns[0]:score_name }, inplace=True )
