In [1]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.discrete.discrete_model import Logit
from sklearn.metrics import roc_auc_score

In [2]:
# LDL, Height, BMI, diabetes, renal failure, heart attack, asthma, non-melanoma skin cancer
pheno_list = ['INI50','INI21001','HC221','HC294','HC326','HC382','cancer1060']
phenos = pd.read_table('/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.phe',
                       usecols=['IID','age','sex','PC1','PC2','PC3','PC4']+pheno_list, 
                       index_col=['IID'],
                       na_values=-9)
phenos[[i for i in phenos.columns if 'INI' not in i]] -= 1
phenos.loc[~phenos.index.duplicated(keep='first')]
phenos.describe()

Unnamed: 0,age,sex,PC1,PC2,PC3,PC4,HC294,INI50,cancer1060,HC326,HC382,HC221,INI21001
count,488377.0,488377.0,488377.0,488377.0,488377.0,488377.0,488374.0,500060.0,488374.0,488374.0,488374.0,488374.0,499520.0
mean,64.454925,-0.542335,-2.123313,-0.640132,-1.089341,-0.870598,0.021469,168.419126,0.046743,0.036075,0.129935,0.065823,27.433331
std,8.112788,0.498205,54.305102,27.858077,14.891779,10.403127,0.144942,9.276846,0.211088,0.186477,0.336233,0.247972,4.8013
min,45.0,-1.0,-20.2699,-283.317,-144.258,-108.573,0.0,75.0,0.0,0.0,0.0,0.0,12.1212
25%,58.0,-1.0,-14.3352,1.44508,-3.67377,-1.910279,0.0,161.5,0.0,0.0,0.0,0.0,24.1415
50%,66.0,-1.0,-13.1619,2.64877,-2.506,0.21096,0.0,168.0,0.0,0.0,0.0,0.0,26.7433
75%,71.0,0.0,-11.7735,3.76299,-1.248864,2.57551,0.0,175.0,0.0,0.0,0.0,0.0,29.9041
max,82.0,0.0,418.396,85.1124,97.8226,36.1031,1.0,209.0,1.0,1.0,1.0,1.0,74.6837


In [3]:
a,b = 'white_british', 'non_british_white'
with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+a+'.phe', 'r') as f:
    pop = {int(line.split()[0]) for line in f}

with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+b+'.phe', 'r') as f:
    pop2 = {int(line.split()[0]) for line in f}

In [4]:
z='all_beta_center_p001_20190530_500PCs'
npz = np.load('/oak/stanford/groups/mrivas/projects/degas-risk/datasets/all_pop/'+z+'.npz')
scores = pd.read_table('/oak/stanford/groups/mrivas/projects/degas-risk/scorefiles/'+z+'_full.profile',
                       sep='\s+',
                       index_col='IID')
scores = scores.loc[scores.index.isin(phenos.index),:]
phenos = phenos.loc[phenos.index.isin(scores.index),:]

scores.iloc[:,:10].head()

Unnamed: 0_level_0,SCORE_PC1,SCORE_PC2,SCORE_PC3,SCORE_PC4,SCORE_PC5,SCORE_PC6,SCORE_PC7,SCORE_PC8,SCORE_PC9,SCORE_PC10
IID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1,103.765,-56.8411,-329.433,72.466,-226.16,43.4152,77.9104,7.44528,-39.1203,-52.6195
-2,-936.599,99.5094,-47.4708,-63.9602,-111.88,-53.3234,-3.12852,-38.4587,114.53,289.305
-3,-330.688,175.759,201.225,177.994,21.2985,-16.4617,88.9027,58.5945,-54.6371,-2.89849
-4,-188.023,58.2651,-250.153,-23.0189,-110.555,57.3824,33.2555,-51.5851,131.004,176.632
-5,24.3653,-7.05218,60.2319,74.4917,-301.166,142.04,-56.6653,-18.1794,61.7944,159.031


In [5]:
for p,name in zip([pop,pop2],['White British:','Non-British White:']):
    print(name)
    for phe in pheno_list:
        print(phe),
        pcvec = npz['V'][np.where(npz['label_phe_code'] == phe),:].flatten()
        score = scores.loc[scores.index.isin(p),:].sort_index().dot(pcvec).dropna()
        if 'INI' in phe:
            pheno = ols(phe+'~age+sex+PC1+PC2+PC3+PC4',
                        data=phenos.loc[phenos.index.isin(p),:]).fit().resid.sort_index().dropna()
            pheno = pheno.subtract(pheno.mean()).divide(pheno.std())
        else:
            pheno = phenos.loc[phenos.index.isin(p),phe].sort_index().dropna()
        data = pd.concat([score,pheno], axis=1)
        if 'INI' in phe:
            print('r^2={:.4}'.format(data.corr(method='spearman').iloc[0,1]**2))
        else:
            print('AUC={:.4}'.format(roc_auc_score(data.iloc[:,1].astype(bool), data.iloc[:,0])))
    print('')

White British:
INI50 r^2=0.2129
INI21001 r^2=0.1402
HC221 AUC=0.7357
HC294 AUC=0.6774
HC326 AUC=0.8682
HC382 AUC=0.6392
cancer1060 AUC=0.7659

Non-British White:
INI50 r^2=0.1732
INI21001 r^2=0.05086
HC221 AUC=0.5895
HC294 AUC=0.5432
HC326 AUC=0.5759
HC382 AUC=0.5726
cancer1060 AUC=0.6012

