In [1]:
import numpy as np
import pandas as pd
from statsmodels.api import OLS
from statsmodels.discrete.discrete_model import Logit
from sklearn.metrics import roc_auc_score

  from pandas.core import datetools


In [2]:
# Height, BMI, diabetes, renal failure, asthma
phenos = pd.read_table('/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.phe',
                       usecols=['IID','INI50','INI21001','HC221','HC294','HC326','HC382'], 
                       index_col=['IID'],
                       na_values=-9)
phenos[[i for i in phenos.columns if 'INI' not in i]] -= 1
phenos.loc[~phenos.index.duplicated(keep='first')]
phenos.describe()

Unnamed: 0,HC294,INI50,HC326,HC382,HC221,INI21001
count,488374.0,500060.0,488374.0,488374.0,488374.0,499520.0
mean,0.021469,168.419126,0.036075,0.129935,0.065823,27.433331
std,0.144942,9.276846,0.186477,0.336233,0.247972,4.8013
min,0.0,75.0,0.0,0.0,0.0,12.1212
25%,0.0,161.5,0.0,0.0,0.0,24.1415
50%,0.0,168.0,0.0,0.0,0.0,26.7433
75%,0.0,175.0,0.0,0.0,0.0,29.9041
max,1.0,209.0,1.0,1.0,1.0,74.6837


In [3]:
scores = pd.read_table('/oak/stanford/groups/mrivas/projects/degas-risk/scorefiles/'+
                       'all_z_nonCenter_p001_20190530_500PCs_full.profile', 
                       sep='\s+',
                       index_col='IID')
scores = scores.loc[scores.index.isin(phenos.index),:]
phenos = phenos.loc[phenos.index.isin(scores.index),:]

scores.iloc[:,:10].head()

Unnamed: 0_level_0,SCORE_PC1,SCORE_PC2,SCORE_PC3,SCORE_PC4,SCORE_PC5,SCORE_PC6,SCORE_PC7,SCORE_PC8,SCORE_PC9,SCORE_PC10
IID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1,3.52598,-2608.22,-804.918,-647.092,-255.543,-1501.01,-1583.02,416.229,-265.432,-446.139
-2,10448.6,-1554.15,-5631.39,310.692,8414.69,11085.9,-2808.07,1471.44,-1688.91,1013.97
-3,3094.03,1988.0,-2476.2,-1808.0,1098.6,1295.63,-1305.19,1420.98,-942.426,-43.5174
-4,3151.35,-3356.36,-4772.61,-134.581,5393.84,6618.12,-992.219,-431.341,-1104.9,980.058
-5,-1120.47,71.5653,185.472,-690.771,2846.27,-2517.28,-931.419,-1551.34,655.466,612.066


In [4]:
z=np.load('/oak/stanford/groups/mrivas/projects/degas-risk/datasets/all_pop/all_z_nonCenter_p001_20190530_500PCs.npz')
print('\n'.join([i for i in z]))

label_var
D
label_var_minor_allele
variance_explained
U
variance_explained_ratio
V
label_phe_code


In [5]:
a,b = 'white_british', 'non_british_white'
with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+a+'.phe', 'r') as f:
    pop = {int(line.split()[0]) for line in f}

with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+b+'.phe', 'r') as f:
    pop2 = {int(line.split()[0]) for line in f}

In [6]:
for phe in map(str,phenos.columns):
    print(phe),
    pcvec = z['V'][np.where(z['label_phe_code'] == phe),:].flatten()
    score = scores.loc[scores.index.isin(pop),:].sort_index().dot(pcvec).dropna()
    pheno = phenos.loc[phenos.index.isin(pop),phe].sort_index().dropna()
    data = pd.concat([score,pheno], axis=1)
    if 'INI' in phe:
        print('r^2={:.4}'.format(data.corr().iloc[0,1]))
    else:
        print('AUC={:.4}'.format(roc_auc_score(data.iloc[:,1].astype(bool), data.iloc[:,0])))

HC294 AUC=0.551
INI50 r^2=0.3083
HC326 AUC=0.5744
HC382 AUC=0.5484
HC221 AUC=0.5622
INI21001 r^2=0.348


In [7]:
for phe in map(str,phenos.columns):
    print(phe),
    pcvec = z['V'][np.where(z['label_phe_code'] == phe),:].flatten()
    score = scores.loc[scores.index.isin(pop2),:].sort_index().dot(pcvec).dropna()
    pheno = phenos.loc[phenos.index.isin(pop2),phe].sort_index().dropna()
    data = pd.concat([score,pheno], axis=1)
    if 'INI' in phe:
        print('r^2={:.4}'.format(data.corr().iloc[0,1]**2))
    else:
        print('AUC={:.4}'.format(roc_auc_score(data.iloc[:,1].astype(bool), data.iloc[:,0])))

HC294 AUC=0.5506
INI50 r^2=0.08007
HC326 AUC=0.544
HC382 AUC=0.5356
HC221 AUC=0.5509
INI21001 r^2=0.05354
