```
python /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/compute_r_or_auc.py -i /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/4_score/Alkaline_phosphatase.sscore -o /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/5_eval/Alkaline_phosphatase.sscore.eval -k /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/1_split/Alkaline_phosphatase.test -p /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/0_input/Alkaline_phosphatase.phe -t qt -c /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv -b /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/3_snpnet/Alkaline_phosphatase.covars.tsv.gz
```

In [20]:
!cat /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/compute_r_or_auc.py

_README_ = '''
-------------------------------------------------------------------------
compute_r_or_auc.py

Evaluation script for PRS score by computing r or AUC

Author: Yosuke Tanigawa (ytanigaw@stanford.edu)
Date: 2019/02/25 (updated on 2019/3/19)
-------------------------------------------------------------------------
'''


import argparse, os, collections


import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score

from rivaslab_PRS_misc import pd_read_csv_usecols_by_key

class PRS_eval:
    '''
    This class computes R or AUC for multiple PRS models
    '''
    
    ######################################################
    # Constructor
    ######################################################
    
    def __init__(self, phe_type, seed_file=None):
        '''
        Constructor for PRS_eval class
        
        phe_type: [ binary | bin | linear | qt ]
        seed_file: optional.

In [21]:
import argparse, os, collections

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score

import sys
sys.path.append(os.path.abspath(os.path.dirname('/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/compute_r_or_auc.py')))
from rivaslab_PRS_misc import pd_read_csv_usecols_by_key


In [None]:
/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/4_score/Alkaline_phosphatase.sscore
-o /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/5_eval/Alkaline_phosphatase.sscore.eval 
-k /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/1_split/Alkaline_phosphatase.test
-p /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/0_input/Alkaline_phosphatase.phe 
-t qt 
-c /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv 
-b /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/3_snpnet/Alkaline_phosphatase.covars.tsv.gz





In [7]:
in_score='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/4_score/Alkaline_phosphatase.sscore'
phe='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/0_input/Alkaline_phosphatase.phe'
phe_type='qt'
covar_phe='/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv'
betas='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/3_snpnet/Alkaline_phosphatase.covars.tsv.gz'
out_file='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/5_eval/Alkaline_phosphatase.sscore.eval'
keep='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/1_split/Alkaline_phosphatase.test'
seed_file='/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/rand.seed.txt'


In [9]:
class PRS_eval:
    '''
    This class computes R or AUC for multiple PRS models
    '''
    
    ######################################################
    # Constructor
    ######################################################
    
    def __init__(self, phe_type, seed_file=None):
        '''
        Constructor for PRS_eval class
        
        phe_type: [ binary | bin | linear | qt ]
        seed_file: optional. used as a seed for logistic regression
        '''
        self._set_consts()        
        self.metrics = collections.OrderedDict()
        self.phe_type = phe_type                
        assert(self._is_bin() or self._is_qt())
        self.seed_file = seed_file
        self.seed = self._read_seed()
    
    ######################################################    
    # Private funcs
    ######################################################    
            
    def _set_consts(self):
        self.const = collections.OrderedDict()
        self.const['phe_type_bin'] = set(['binary', 'bin'])
        self.const['phe_type_qt']  = set(['linear', 'qt'])
        
    def _is_bin(self):
        return (self.phe_type in self.const['phe_type_bin'])

    def _is_qt(self):
        return (self.phe_type in self.const['phe_type_qt'])
            
    def _read_seed(self):
        if self._is_bin() and self.seed_file is not None:
            return int(np.loadtxt(self.seed_file))
        else: 
            return None
            
    def _compute_r(self, x, y):
        lm = LinearRegression().fit(x, y)    
        return np.corrcoef(y, lm.predict(x))[0, 1]

    def _compute_auc(self, x, y, seed=None):
        lm = LogisticRegression(
            random_state=seed, solver='lbfgs'
        ).fit(x, y)
        roc_auc = roc_auc_score(y, lm.predict_log_proba(x)[:, 1])
        return max(roc_auc, 1 - roc_auc)

    ######################################################    
    # public funcs
    ######################################################    
    
    def compute_r_or_auc(self, model_name, x, y):
        if self._is_qt():
            self.metrics[model_name] = self._compute_r(x, y)

        elif self._is_bin():
            print(collections.Counter(y))        
            self.metrics[model_name] = self._compute_auc(x, y, self.seed)
        
    def get_metrics_str(self):
        return ['{}\t{:.6e}'.format(x[0], x[1]) for x in self.metrics.items()]
    
    def format_metrics(self, info_str=None):
        metrics_str = self.get_metrics_str()
        if info_str is None:
            return '\n'.join(metrics_str)
        else:
            return '\n'.join(['{}\t{}'.format(str(info_str), x) for x in metrics_str])

In [10]:
def read_data_for_eval(in_score, phe, covar_phe, keep=None):
    print(in_score, phe, covar_phe, keep)
    df = pd_read_csv_usecols_by_key(
        in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
    ).merge(
        pd.read_csv(
            phe, sep='\t', usecols=[1,2], names=['IID', 'phe']
        ),
        on='IID'
    ).merge(
        pd.read_csv(covar_phe, sep='\t'),
        on='IID'
    )
    df_non_missing=df[df['phe'] != 9]
    if keep is None:
        return df_non_missing
    else:
        return df_non_missing.merge(
            pd.read_csv(
                keep, sep='\t', usecols=[0], names=['IID']
            ),
            on='IID',
            how='inner'
        )
    

In [11]:
def compute_score_for_covariates(df, betas, center=False, Z=False):
    betas_df=pd.read_csv(betas, compression='gzip', sep='\t')
    covar_mat = df[list(betas_df['ID'])].values
    
    betas_vec = np.array(betas_df['BETA'])[:,np.newaxis]
    
    if 'mean' in set(betas_df.columns) and (center or Z):
        covar_mat_centered = (covar_mat - np.array(betas_df['mean'])[np.newaxis, :])
        if 'Z' in set(betas_df.columns) and Z:
            covar_mat_Z = (covar_mat_centered / np.array(betas_df['std'])[np.newaxis, :])
            return np.dot(covar_mat_Z, betas_vec)
        else:
            return np.dot(covar_mat_centered, betas_vec)
    else:            
        return np.dot(covar_mat, betas_vec)
    
        


In [13]:
def compute_r_or_auc_main(in_score, phe, phe_type, covar_phe, betas, keep, out_file, seed_file=None):  
    df = read_data_for_eval(in_score=in_score, phe=phe, covar_phe=covar_phe, keep=keep)
    
    PRS = collections.OrderedDict()
    
    PRS['Genotype_only'] = df[['SCORE1_SUM']].values
    PRS['Covariates_only'] = compute_score_for_covariates(df, betas)
#    PRS['Covariates_only_center'] = compute_score_for_covariates(df, betas, center=True)
#    PRS['Covariates_only_Z'] = compute_score_for_covariates(df, betas, Z=True)
    PRS['Genotype_and_covariates'] = PRS['Genotype_only'] + PRS['Covariates_only']
#    PRS['Genotype_and_covariates_center'] = PRS['Genotype_only'] + PRS['Covariates_only_center']
#    PRS['Genotype_and_covariates_Z'] = PRS['Genotype_only'] + PRS['Covariates_only_Z']

    Y  = np.array(df['phe'])
    prs_eval = PRS_eval(phe_type, seed_file)
    for k,v in PRS.items():
        prs_eval.compute_r_or_auc(k, v, Y)    

    info_str='\t'.join([in_score, phe_type])
    results_str = prs_eval.format_metrics(info_str)
    with open(out_file, 'w') as fw:        
        fw.write(results_str + '\n')        
    print(results_str)

In [14]:
df = read_data_for_eval(in_score=in_score, phe=phe, covar_phe=covar_phe, keep=keep)

/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/4_score/Alkaline_phosphatase.sscore /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/0_input/Alkaline_phosphatase.phe /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190406/1_split/Alkaline_phosphatase.test


  if (yield from self.run_code(code, result)):


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [15]:
df = pd_read_csv_usecols_by_key(
    in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
)

In [17]:
phe_df = pd.read_csv(
    phe, sep='\t', usecols=[1,2], names=['IID', 'phe']
)


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
 print(in_score, phe, covar_phe, keep)
    df = pd_read_csv_usecols_by_key(
        in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
    ).merge(
        pd.read_csv(
            phe, sep='\t', usecols=[1,2], names=['IID', 'phe']
        ),
        on='IID'
    ).merge(
        pd.read_csv(covar_phe, sep='\t'),
        on='IID'
    )
    df_non_missing=df[df['phe'] != 9]
    if keep is None:
        return df_non_missing
    else:
        return df_non_missing.merge(
            pd.read_csv(
                keep, sep='\t', usecols=[0], names=['IID']
            ),
            on='IID',
            how='inner'
        )

In [None]:
```
python /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/compute_r_or_auc.py -i /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore -o /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/5_eval/Fasting_glucose.sscore.eval -k /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test -p /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe -t qt -c /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv -b /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/3_snpnet/Fasting_glucose.covars.tsv.gz
/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test
```

In [None]:

i /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore
o /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/5_eval/Fasting_glucose.sscore.eval
k /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test
p /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe
t qt
c /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv
b /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/3_snpnet/Fasting_glucose.covars.tsv.gz
/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test





In [22]:
in_score='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore'
phe='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe'
phe_type='qt'
covar_phe='/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv'
betas='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/3_snpnet/Fasting_glucose.covars.tsv.gz'
out_file='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/5_eval/Fasting_glucose.sscore.eval'
keep='/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test'
seed_file='/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/helper/rand.seed.txt'


In [23]:
df = read_data_for_eval(in_score=in_score, phe=phe, covar_phe=covar_phe, keep=keep)

/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [24]:
def read_data_for_eval(in_score, phe, covar_phe, keep=None):
    print(in_score, phe, covar_phe, keep)
    df = pd_read_csv_usecols_by_key(
        in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
    ).merge(
        pd.read_csv(
            phe, sep='\t', usecols=[1,2], names=['IID', 'phe'], comment='#'
        ),
        on='IID'
    ).merge(
        pd.read_csv(covar_phe, sep='\t'),
        on='IID'
    )
    df_non_missing=df[df['phe'] != 9]
    if keep is None:
        return df_non_missing
    else:
        return df_non_missing.merge(
            pd.read_csv(
                keep, sep='\t', usecols=[0], names=['IID']
            ),
            on='IID',
            how='inner'
        )

In [25]:
phe_df = pd.read_csv(
            phe, sep='\t', usecols=[1,2], names=['IID', 'phe'], comment='#'
        )

In [26]:
phe_df.shape

(461984, 2)

In [27]:
score_df = pd_read_csv_usecols_by_key(
        in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
    )

In [28]:
score_df.shape

(337151, 2)

In [29]:
covar_df = pd.read_csv(covar_phe, sep='\t')

In [30]:
covar_df.shape

(461984, 129)

In [37]:
df = score_df.merge(
    phe_df,
    on='IID'
).merge(
    covar_df,
    on='IID'
)

In [41]:
def read_data_for_eval(in_score, phe, covar_phe, keep=None):
    print(in_score, phe, covar_phe, keep)
    score_df = pd_read_csv_usecols_by_key(
        in_score, cols=['IID', 'SCORE1_SUM'], sep='\t'
    )    
    phe_df = pd.read_csv(
        phe, sep='\t', usecols=[1,2], names=['IID', 'phe'], comment='#'
    )    
    covar_df = pd.read_csv(covar_phe, sep='\t')    
    df = score_df.merge(
        phe_df, on='IID'
    ).merge(
        covar_df, on='IID'
    )
    df_non_missing=df[df['phe'] != 9]
    if keep is None:
        return df_non_missing
    else:
        return df_non_missing.merge(
            pd.read_csv(
                keep, sep='\t', usecols=[0], names=['IID']
            ),
            on='IID',
            how='inner'
        )

In [42]:
df = read_data_for_eval(in_score=in_score, phe=phe, covar_phe=covar_phe, keep=keep)

/oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/4_score/Fasting_glucose.sscore /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/0_input/Fasting_glucose.phe /oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/PRS/notebook/20190406_biomarker_covar/biomarker_covar.tsv /oak/stanford/groups/mrivas/projects/PRS/private_output/snpnet_biomarker_PRS/biomarkers_20190407/1_split/Fasting_glucose.test
