# Process Plink Results

Read the PLINK results and keep only the test results for the HLA alleles.

In [11]:
import glob
import os
import sys

import pandas as pd

import cdpybio as cpb

In [29]:
# Absolute path of the directory that receives this notebook's outputs; the
# relative path is resolved against the kernel's working directory.
outdir = os.path.realpath('../output/process_plink_results')
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

In [121]:
# Trait table; the first column of the TSV becomes the row index.
traits = pd.read_table(
    '../data/traits.tsv',
    header=0,
    index_col=0,
)
# Strip the substring 'cancer' from every index label so that the codes match
# the ones that were used in the HLA analysis.
traits.index = list(map(lambda code: code.replace('cancer', ''), traits.index))

In [119]:
# Preview the first five rows of the trait table.
traits.head(n=5)

Unnamed: 0_level_0,regtype,category,numcases,phenotype
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BIN20483,logistic,BIN,2433.0,Ever_attempted_suicide
BIN20484,logistic,BIN,502618.0,Attempted_suicide_vs_all
BIN2188,logistic,BIN,109472.0,"Long-standing_illness,_disability_or_infirmity"
BIN22126,logistic,BIN,19390.0,Doctor_diagnosed_hayfever_or_allergic_rhinitis
BIN22127,logistic,BIN,10841.0,Doctor_diagnosed_asthma


In [25]:
# PLINK output files to combine: every path in the results directory whose
# name ends in 'hybrid'.
plink_pattern = '/oak/stanford/groups/mrivas/users/jolivier/repos/hla-assoc/data/PLINK_results/*hybrid'
# glob.glob() returns matches in arbitrary order; sorting keeps `fns`, `codes`
# and the row order of `results` the same from one run to the next.
fns = sorted(glob.glob(plink_pattern))
if not fns:
    # Fail here, naming the pattern, instead of letting pd.concat() raise on
    # an empty list further down.
    raise ValueError('No PLINK result files match {}'.format(plink_pattern))
# The code for a file is its base name up to the first '.'.
codes = [os.path.basename(x).split('.')[0] for x in fns]
dfs = []
for fn, code in zip(fns, codes):
    t = cpb.plink.read_logistic2(fn)
    # Tag every row with the code of the file it came from.
    t['code'] = code
    # Keep only these columns, with the code first.
    t = t[['code', 'FIRTH?', 'TEST', 'OBS_CT', 'OR', 'SE', 'T_STAT', 'P']]
    dfs.append(t)
# Stack the per-file tables row-wise into a single frame.
results = pd.concat(dfs)

In [30]:
# Write the combined results to a gzip-compressed, tab-separated file in outdir.
fn = os.path.join(outdir, 'plink_results_all.tsv.gz')
results.to_csv(path_or_buf=fn, compression='gzip', sep='\t')

In [37]:
# Filter results by allele and disease frequency
# This cell loads the HLA table, labels its rows, and drops the rows listed
# in the removal file.
hla = pd.read_table('/oak/stanford/groups/mrivas/ukbb/24983/hla/ukb_hla_v2.txt')
covar = pd.read_table('/oak/stanford/groups/mrivas/ukbb/24983/phe_qc/ukb24983_GWAS_covar.phe',
                      index_col=0)
# Label the HLA rows with the covariate table's index.
# NOTE(review): this assumes both files list the same samples in the same
# order -- confirm; only a length mismatch would raise here.
hla.index = covar.index
# The `squeeze=True` reader keyword was removed in pandas 2.0, so squeeze the
# parsed frame instead: a single data column becomes a Series, exactly as the
# keyword used to do.
remove = pd.read_table('/oak/stanford/groups/mrivas/ukbb/24983/phe_qc/ukb24983_remove.phe',
                       index_col=0, header=None).squeeze('columns')
# Drop the rows whose index label appears among the values of `remove`.
hla = hla.drop(remove)

In [82]:
# For each column of hla: the number of rows holding a value above zero
# (missing entries are treated as zero).
counts = hla.fillna(0).gt(0).sum()
# Express each count as a fraction of all rows.
freqs = counts.div(len(hla))
# Number of columns whose fraction is at least 0.001.
freqs.ge(0.001).sum()

184

In [151]:
additive_res = pd.read_csv('../manuscript/additive_assoc_adj_p_all.csv', index_col=0)

# Set iteration order is arbitrary, so sort the results: the lists and the row
# order of the table displayed below then stay the same across kernel restarts.
# Codes that have a PLINK result file and also appear in the trait table.
shared = sorted(set(codes) & set(traits.index))
# Codes in the additive results that are not among the traits with at least
# 500 cases.
missing = sorted(set(additive_res.index) - set(traits[traits['numcases'] >= 500].index))
traits.loc[missing]

Unnamed: 0,regtype,category,numcases,phenotype
HC69,logistic,HC,482.0,polycythaemia_vera
HC432,logistic,HC,487.0,mitral_valve_prolapse
HC421,logistic,HC,478.0,other_abdominal_problem
HC352,logistic,HC,474.0,systemic_lupus_erythematosis/sle
HC12,logistic,HC,492.0,testicular_problems_(not_cancer)


Some of the diseases in the additive association results have fewer than 500 cases
(474–492 in the table above) according to the counts I have from the gcorr app, so they
fail the `numcases >= 500` filter. Those counts may not be accurate.