# PRS-PheWAS result

In [1]:
import pandas as pd

# Load the multiple-testing-corrected PRS-PheWAS results and keep only the
# rows flagged "YES" in the 'Bonferroni significant' column.
PRSPhewas_file = "MetS_noUKB_PRSPheWAS_results.MultipleCorrection.csv"
df_prsphewas = pd.read_csv(PRSPhewas_file, sep=",", index_col=False)
is_bonferroni_hit = df_prsphewas['Bonferroni significant'] == "YES"
df_prsphewas = df_prsphewas[is_bonferroni_hit]
n_significant = len(df_prsphewas)
print('Number of significant phenotypes from PRS-PheWAS: {}'.format(n_significant))


Number of significant phenotypes from PRS-PheWAS: 350


In [2]:
# Sanity check: display any significant phenotype whose ICD10 field is missing.
# The matching step later splits this field as a string, so missing values
# would break it.
missing_icd10 = df_prsphewas['ICD10'].isna()
df_prsphewas[missing_icd10]

Unnamed: 0,ICD10,ICD10 Category,Phecode,Phenotype,n_obs,Beta,SE,Beta_CI_lower,Beta_CI_upper,Z-score,P-value,OR,OR_CI_lower,OR_CI_upper,Bonferroni P-value,Bonferroni significant,FDR P-value,FDR significant


In [3]:
# Keep only the phenotype-identifying columns; the association statistics are
# not used by the cells that follow.
prsphewas_columns = ['ICD10', 'ICD10 Category', 'Phecode', 'Phenotype']
df_prsphewas = df_prsphewas[prsphewas_columns]
df_prsphewas

Unnamed: 0,ICD10,ICD10 Category,Phecode,Phenotype
0,"E104,E11,E110,E111,E112,E113,E114,E115,E116,E1...",endocrine/metabolic,250.20,Type 2 diabetes
1,"E10,E100,E101,E102,E103,E104,E105,E106,E107,E1...",endocrine/metabolic,250.00,Diabetes mellitus
2,"I10,I11,I110,I119,I12,I120,I129,I13,I130,I131,...",circulatory system,401.00,Hypertension
3,"I10,401,4010,4011,4019",circulatory system,401.10,Essential hypertension
4,"E66,E660,E661,E662,E668,E669",endocrine/metabolic,278.10,Obesity
...,...,...,...,...
345,"K920,5780",digestive,578.10,Hematemesis
346,"A021,A207,A392,A393,A394,A40,A400,A401,A402,A4...",infectious diseases,38.10,Gram negative septicemia
347,"X71,X710,X711,X712,X713,X714,X715,X716,X717,X7...",mental disorders,297.00,Suicidal ideation or attempt
348,"N10,N11,N110,N111,N118,N119,N12,N136,N151,N159...",genitourinary,590.00,Pyelonephritis


# UKB Neale lab Metadata

In [4]:
# Read the both-sexes phenotype table (tab-separated) and remember how many
# phenotypes it holds before any filtering.
Neale_phenotype_file = "phenotypes.both_sexes.v2.tsv"
df_ukb_meta = pd.read_csv(Neale_phenotype_file, sep="\t", index_col=False)
n_phenotype_initial = df_ukb_meta.shape[0]
print("Number of phenotypes in UKB metadata: {:,}".format(n_phenotype_initial))

Number of phenotypes in UKB metadata: 4,359


In [5]:
# Restrict the UKB metadata to phenotypes whose 'source' is 'icd10' and report
# how many rows were dropped relative to the unfiltered table.
is_icd10_source = df_ukb_meta['source'].isin(['icd10'])
df_ukb_meta = df_ukb_meta[is_icd10_source]
n_phenotype_icd10 = len(df_ukb_meta)
# NOTE(review): "pheonotypes" below is a typo for "phenotypes"; it is left
# untouched so the message still matches the recorded cell output.
print("Number of pheonotypes excluded: {:,}".format(n_phenotype_initial - n_phenotype_icd10))
print("Remaining: {:,}".format(n_phenotype_icd10))
df_ukb_meta

Number of pheonotypes excluded: 3,726
Remaining: 633


Unnamed: 0,phenotype,description,variable_type,source,n_non_missing,n_missing,n_controls,n_cases,PHESANT_transformation,notes
3165,L72,Diagnoses - main ICD10: L72 Follicular cysts o...,binary,icd10,361194,0,354550.0,6644.0,,
3166,L73,Diagnoses - main ICD10: L73 Other follicular d...,binary,icd10,361194,0,360948.0,246.0,,
3167,L71,Diagnoses - main ICD10: L71 Rosacea,binary,icd10,361194,0,361085.0,109.0,,
3168,K44,Diagnoses - main ICD10: K44 Diaphragmatic hernia,binary,icd10,361194,0,353152.0,8042.0,,
3169,K46,Diagnoses - main ICD10: K46 Unspecified abdomi...,binary,icd10,361194,0,361088.0,106.0,,
...,...,...,...,...,...,...,...,...,...,...
3793,M60,Diagnoses - main ICD10: M60 Myositis,binary,icd10,361194,0,361086.0,108.0,,
3794,M62,Diagnoses - main ICD10: M62 Other disorders of...,binary,icd10,361194,0,360851.0,343.0,,
3795,M65,Diagnoses - main ICD10: M65 Synovitis and teno...,binary,icd10,361194,0,358382.0,2812.0,,
3796,M66,Diagnoses - main ICD10: M66 Spontaneous ruptur...,binary,icd10,361194,0,360806.0,388.0,,


# Match phenotype from PRS-PheWAS to UKB Neale metadata


In [6]:

# For every significant PRS-PheWAS phenotype, look up the UKB metadata rows
# whose 'phenotype' code appears in that phenotype's comma-separated ICD10 list.
#   df  : one row per (UKB phenotype, PRS-PheWAS phenotype) match, with the UKB
#         columns first and the PRS-PheWAS columns after them.
#   df2 : PRS-PheWAS phenotypes for which no UKB phenotype matched.
matched_chunks = []
unmatched_chunks = []

# Walk the rows by position rather than re-selecting them by their ICD10
# string, so two phenotypes that share an identical code list are each handled
# exactly once and can never be misaligned with the UKB hits.
for row_pos in range(len(df_prsphewas)):
    prs_row = df_prsphewas.iloc[[row_pos]]  # one-row DataFrame, dtypes preserved
    # Strip stray whitespace so a list written as "E11, E66" still matches.
    icd10_code_list = [code.strip() for code in prs_row['ICD10'].iloc[0].split(",")]
    ukb_hits = df_ukb_meta[df_ukb_meta['phenotype'].isin(icd10_code_list)].reset_index(drop=True)

    if ukb_hits.empty:
        unmatched_chunks.append(prs_row)
        continue

    # Repeat the PRS-PheWAS row once per UKB hit so both frames share the same
    # 0..k-1 index and line up row-for-row when joined side by side.
    prs_repeated = prs_row.iloc[[0] * len(ukb_hits)].reset_index(drop=True)
    matched_chunks.append(pd.concat([ukb_hits, prs_repeated], axis=1))

# Concatenate once at the end (no quadratic growth inside the loop) and fall
# back to empty frames so later cells never receive None.
if matched_chunks:
    df = pd.concat(matched_chunks)
else:
    df = pd.DataFrame(columns=list(df_ukb_meta.columns) + list(df_prsphewas.columns))

if unmatched_chunks:
    df2 = pd.concat(unmatched_chunks)
else:
    df2 = df_prsphewas.iloc[0:0]



In [7]:
# Summarise the matching step: distinct PRS-PheWAS phenotypes with at least one
# UKB match, then the number of phenotypes with none.
n_found_in_ukb = len(df['Phenotype'].unique())
print("Number of unique phenotypes initially found in UKB: {}".format(n_found_in_ukb))
n_missing_in_ukb = len(df2)
print("Number of phenotypes not found in UKB: {}".format(n_missing_in_ukb))

Number of unique phenotypes initially found in UKB: 176
Number of phenotypes not found in UKB: 174


In [8]:
# Write the matched table as a tab-separated file without the index column.
# NOTE(review): presumably this is the file that gets annotated by hand before
# the manual-check file is read back in - confirm.
metadata_v1_file = "metadata_v1.tsv"
df.to_csv(metadata_v1_file, sep="\t", index=False)

# Manually check the matched phenotypes


In [9]:
# Read the manually checked table; the next cell filters on its 'Include' column.
file = "metadata_v1_manual_check.tsv.txt"
df3 = pd.read_csv(file, index_col=False, sep="\t")

In [10]:
# Keep the rows marked for inclusion, then split them by whether their UKB
# 'description' occurs more than once among the kept rows.
df4 = df3[df3['Include'] == "y"]

shares_description = df4.duplicated('description', keep=False)
df5 = df4[shares_description]      # every row whose description is repeated
df5_1 = df4[~shares_description]   # rows whose description is unique

# Only the repeated-description rows are written out.
df5.to_csv("metadata_v2.tsv", sep="\t", index=False)


In [11]:
# Read the second manually checked table and keep the rows marked "y" in its
# 'Include2' column.
file2 = "metadata_v2_manual_check.tsv.txt"
df6 = (
    pd.read_csv(file2, sep="\t", index_col=False)
    .loc[lambda checked: checked['Include2'] == "y"]
)


In [12]:
# Stack the unique-description rows on top of the rows retained from the
# second manual check.
frames_to_keep = [df5_1, df6]
df7 = pd.concat(frames_to_keep)

In [13]:
# Select the columns for the final table and give them presentation labels.
# The mapping is ordered: keys are the current column names, values the labels
# they receive, in the order the columns should appear.
# NOTE(review): this cell cannot be re-run on its own, because the original
# column names no longer exist once it has executed.
column_labels = {
    'Phenotype': 'Phenotype from PRS-PheWAS',
    'phenotype': 'ICD10',
    'ICD10 Category': 'ICD10 Category',
    'Phecode': 'PheCode',
    'description': 'Phenotype description from Neale lab',
    'variable_type': 'Type',
    'n_non_missing': 'N total',
    'n_cases': 'N case',
    'n_controls': 'N control',
}
df7 = df7[list(column_labels.keys())]
df7.columns = list(column_labels.values())
df7

Unnamed: 0,Phenotype from PRS-PheWAS,ICD10,ICD10 Category,PheCode,Phenotype description from Neale lab,Type,N total,N case,N control
0,Type 2 diabetes,E11,endocrine/metabolic,250.20,Diagnoses - main ICD10: E11 Non-insulin-depend...,binary,361194,705,360489
5,Essential hypertension,I10,circulatory system,401.10,Diagnoses - main ICD10: I10 Essential (primary...,binary,361194,787,360407
6,Obesity,E66,endocrine/metabolic,278.10,Diagnoses - main ICD10: E66 Obesity,binary,361194,353,360841
13,"Other chronic ischemic heart disease, unspecified",I25,circulatory system,411.80,Diagnoses - main ICD10: I25 Chronic ischaemic ...,binary,361194,12769,348425
14,Angina pectoris,I20,circulatory system,411.30,Diagnoses - main ICD10: I20 Angina pectoris,binary,361194,6246,354948
...,...,...,...,...,...,...,...,...,...
27,"Pulmonary embolism and infarction, acute",I26,circulatory system,415.11,Diagnoses - main ICD10: I26 Pulmonary embolism,binary,361194,2118,359076
29,Other acute and subacute forms of ischemic hea...,I24,circulatory system,411.90,Diagnoses - main ICD10: I24 Other acute ischae...,binary,361194,487,360707
31,Hypertensive chronic kidney disease,I12,circulatory system,401.22,Diagnoses - main ICD10: I12 Hypertensive renal...,binary,361194,176,361018
33,Hypovolemia,E86,endocrine/metabolic,276.50,Diagnoses - main ICD10: E86 Volume depletion,binary,361194,190,361004


In [14]:
# Remove the phenotypes already included in gSEM.
phenotypes_to_exclude = ['Type 2 diabetes', 'Essential hypertension', 'Obesity', 'Type 1 diabetes']
is_excluded = df7['Phenotype from PRS-PheWAS'].isin(phenotypes_to_exclude)
df8 = df7[~is_excluded]

In [15]:
# Save the retained phenotypes as a tab-separated file without the index column.
supplementary_file = "TSMR_metadata_for_supple.tsv"
df8.to_csv(supplementary_file, sep="\t", index=False)

In [16]:
# Report how many rows remain after the gSEM phenotypes were removed.
n_retained = len(df8)
print("Number of phenotypes from PRS-PheWAS retained: {}".format(n_retained))

Number of phenotypes from PRS-PheWAS retained: 132


# GWAS summary statistics download information

In [17]:
import pandas as pd

# Reload the supplementary metadata table written earlier in this notebook.
# NOTE(review): `file`, `df`, `file2` and `df2` are rebound here and no longer
# refer to the objects of the same names created in earlier cells.
file = "TSMR_metadata_for_supple.tsv"
df = pd.read_csv(file, index_col=False, sep="\t")

# Load the comma-separated GWAS file manifest.
file2 = "UKBB_GWAS_Imputed_v3-File_Manifest_Release_20180731-Manifest_201807.csv"
df2 = pd.read_csv(file2, index_col=False, sep=",")

In [18]:
# Pick the manifest rows for the retained phenotypes (matched on the phenotype
# description) that are labelled 'both_sexes', keeping only the columns needed
# to download the summary statistics.
wanted_descriptions = df['Phenotype description from Neale lab'].tolist()
in_scope = df2['Phenotype Description'].isin(wanted_descriptions) & (df2['Sex'] == 'both_sexes')

# Select rows and columns in one .loc call and derive the cleaned 'File' column
# with .assign, so no column is mutated on a frame obtained by slicing.
# The vectorised, literal (regex=False) replace removes ".tsv.bgz" and passes
# missing values through instead of raising AttributeError as a per-element
# Python lambda would.
df3 = (
    df2.loc[in_scope, ['Phenotype Description', 'File', 'wget command']]
    .assign(File=lambda manifest: manifest['File'].str.replace(".tsv.bgz", "", regex=False))
)

In [19]:
# Attach the download information to each retained phenotype. The left join
# keeps every row of `df`; phenotypes with no manifest entry get missing
# 'File' / 'wget command' values. The right-hand join key is dropped afterwards
# because it only repeats the left-hand description column.
df4 = pd.merge(
    df,
    df3,
    how="left",
    left_on="Phenotype description from Neale lab",
    right_on="Phenotype Description",
).drop(columns=['Phenotype Description'])

df4.to_csv("TSMR_metadata_for_analysis.tsv", sep="\t", index=False)