In [1]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.discrete.discrete_model import Logit
from sklearn.metrics import roc_auc_score

In [2]:
# LDL, Height, BMI, diabetes, renal failure, heart attack, asthma, non-melanoma skin cancer
# pheno_list = ['INI50','INI21001','HC221','HC294','HC326','HC382','cancer1060']
pheno_list=['BIN_FC1002247']
phenos = pd.read_table('/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.phe',
                       usecols=['IID','age','sex','PC1','PC2','PC3','PC4']+pheno_list, 
                       index_col=['IID'],
                       na_values=-9)
phenos[[i for i in phenos.columns if 'INI' not in i]] -= 1
phenos.loc[~phenos.index.duplicated(keep='first')]
phenos.describe()

Unnamed: 0,age,sex,PC1,PC2,PC3,PC4,BIN_FC1002247
count,488377.0,488377.0,488377.0,488377.0,488377.0,488377.0,479457.0
mean,64.454925,-0.542335,-2.123313,-0.640132,-1.089341,-0.870598,0.266737
std,8.112788,0.498205,54.305102,27.858077,14.891779,10.403127,0.442254
min,45.0,-1.0,-20.2699,-283.317,-144.258,-108.573,0.0
25%,58.0,-1.0,-14.3352,1.44508,-3.67377,-1.910279,0.0
50%,66.0,-1.0,-13.1619,2.64877,-2.506,0.21096,0.0
75%,71.0,0.0,-11.7735,3.76299,-1.248864,2.57551,1.0
max,82.0,0.0,418.396,85.1124,97.8226,36.1031,1.0


In [3]:
a,b = 'white_british', 'non_british_white'
with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+a+'.phe', 'r') as f:
    pop = {int(line.split()[0]) for line in f}

with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification/ukb24983_'+b+'.phe', 'r') as f:
    pop2 = {int(line.split()[0]) for line in f}

In [None]:
z='all_beta_center_p001_20190530_500PCs'
npz = np.load('/oak/stanford/groups/mrivas/projects/degas-risk/datasets/all_pop/tsvd/'+z+'.npz')
scores = pd.read_table('/oak/stanford/groups/mrivas/projects/degas-risk/scorefiles/'+z+'_full.profile',
                       sep='\s+',
                       index_col='IID')
scores = scores.loc[scores.index.isin(phenos.index),:]
phenos = phenos.loc[phenos.index.isin(scores.index),:]

scores.iloc[:,:10].head()

In [None]:
for p,name in zip([pop,pop2],['White British:','Non-British White:']):
    print(name)
    for phe in pheno_list:
        print(phe),
        pcvec = npz['V'][np.where(npz['label_phe_code'] == phe),:].flatten()
        score = scores.loc[scores.index.isin(p),:].sort_index().dot(pcvec).dropna()
        if 'INI' in phe:
            pheno = ols(phe+'~age+sex+PC1+PC2+PC3+PC4',
                        data=phenos.loc[phenos.index.isin(p),:]).fit().resid.sort_index().dropna()
            pheno = pheno.subtract(pheno.mean()).divide(pheno.std())
        else:
            pheno = phenos.loc[phenos.index.isin(p),phe].sort_index().dropna()
        data = pd.concat([score,pheno], axis=1)
        if 'INI' in phe:
            print('r^2={:.4}'.format(data.corr(method='spearman').iloc[0,1]**2))
        else:
            print('AUC={:.4}'.format(roc_auc_score(data.iloc[:,1].astype(bool), data.iloc[:,0])))
    print('')