In [1]:
import pandas as pd
from topmed_manuscript import phenotype_id_to_gene_id, top_pip_variants, format_modality
import glob
import os
import re

def parse_modality(s):
    """Extract the QTL modality token (e.g. 'cis-eqtl', 'trans-sqtl') from a string.

    The search is case-insensitive and the hyphen between the cis/trans
    prefix and the eqtl/sqtl suffix is optional. The first match is
    returned exactly as it is spelled in ``s``.

    Parameters
    ----------
    s : str
        Text to search, typically a file path.

    Returns
    -------
    str
        The matched substring.

    Raises
    ------
    ValueError
        If ``s`` contains no modality token.
    """
    RE = '((cis|trans)-?([es]qtl))'
    m = re.search(RE, s, flags=re.IGNORECASE)
    # Raise explicitly rather than `assert`: assertions are removed under
    # `python -O` and carry no hint about which input was rejected.
    if m is None:
        raise ValueError("Cannot parse modality: {}".format(s))
    return m.group(0)


def parse_tissue(s):
    """Extract a known tissue name from a string.

    Recognized names are Whole_blood, Lung, Nasal_epithelial, PBMC, Monocyte
    and T_cell (case-sensitive). The first occurrence found in ``s`` is
    returned.

    Parameters
    ----------
    s : str
        Text to search, typically a file path.

    Returns
    -------
    str
        The matched tissue name.

    Raises
    ------
    ValueError
        If ``s`` contains none of the known tissue names.
    """
    tissues = ['Whole_blood', 'Lung', 'Nasal_epithelial', 'PBMC', 'Monocyte', 'T_cell']
    RE = '(' + '|'.join(tissues) + ')'
    m = re.search(RE, s)
    # Raise explicitly rather than `assert`: assertions are removed under
    # `python -O` and carry no hint about which input was rejected.
    if m is None:
        raise ValueError("Cannot parse tissue: {}".format(s))
    return m.group(0)


def parse_maf(s):
    """Extract a MAF-threshold token ('maf001', 'maf0001' or 'maf005') from a string.

    Parameters
    ----------
    s : str
        Text to search, typically a file path.

    Returns
    -------
    str
        The first matching token found in ``s``.

    Raises
    ------
    ValueError
        If ``s`` contains none of the known MAF tokens.
    """
    mafs = ['maf001', 'maf0001', 'maf005']
    RE = '(' + '|'.join(mafs) + ')'
    m = re.search(RE, s)
    # Raise explicitly rather than `assert`: assertions are removed under
    # `python -O` and carry no hint about which input was rejected.
    if m is None:
        raise ValueError("Cannot parse MAF: {}".format(s))
    return m.group(0)


def format_maf(s):
    """Convert a MAF token into its percentage label.

    'maf001' -> '1%', 'maf0001' -> '0.1%', 'maf005' -> '5%'.
    Any other value raises ValueError.
    """
    labels = dict(maf001='1%', maf0001='0.1%', maf005='5%')
    label = labels.get(s)
    if label is None:
        raise ValueError("Cannot format MAF: {}".format(s))
    return label


# Directory that is searched for the joint-model result tables (*.tsv).
JOINT_MODELING_DIR = '../work/joint-model-with-cs-variants/results/joint'
# Prefix put in front of the names of the files this notebook writes under tables/.
PREFIX = 'rank-signals.'

In [2]:
def _read_joint_model(path):
    """Read one joint-model TSV and tag its rows with metadata from the file name.

    The first three dot-separated fields of the base name are used as
    tissue, MAF token and modality, in that order.
    """
    tissue, maf, modality = os.path.basename(path).split('.')[:3]
    return pd.read_csv(path, sep='\t').assign(tissue=tissue, maf=maf, modality=modality)


# Stack every joint-model table found in JOINT_MODELING_DIR.
joint_modeling = pd.concat([_read_joint_model(f) for f in glob.glob(f'{JOINT_MODELING_DIR}/*.tsv')])
# Use format_maf (the same labels as the credible-set table) so that an
# unexpected MAF token raises instead of silently becoming NaN and then
# dropping out of the later merge.
joint_modeling['maf'] = joint_modeling['maf'].map(format_maf)
joint_modeling['modality'] = joint_modeling['modality'].map(format_modality)
joint_modeling.head()

Unnamed: 0,variant_id,Coef.,Std.Err.,t,P>|t|,[0.025,0.975],phenotype_id,tissue,maf,modality
0,chr19_36925890_A_G,-0.685595,0.078404,-8.744372,1.226736e-16,-0.839839,-0.531351,ENSG00000250565.7,T_cell,5%,trans-eQTL
0,chr16_3276825_G_GT,0.345773,0.045786,7.551969,8.448147e-14,0.255944,0.435602,ENSG00000083814.13,Lung,5%,trans-eQTL
1,chr9_21413704_C_T,0.704024,0.037192,18.92943,3.749309e-70,0.631056,0.776993,ENSG00000088827.12,Lung,5%,trans-eQTL
2,chr1_247556467_G_A,0.712992,0.085528,8.336374,2.070856e-16,0.545192,0.880792,ENSG00000092009.10,Lung,5%,trans-eQTL
3,chr1_247556467_G_A,0.820312,0.084724,9.682125,2.1169e-21,0.654088,0.986536,ENSG00000100448.4,Lung,5%,trans-eQTL


In [3]:
# Distinct (tissue, MAF, modality) combinations present in the joint-model results.
joint_modeling.loc[:, ['tissue', 'maf', 'modality']].drop_duplicates()

Unnamed: 0,tissue,maf,modality
0,T_cell,5%,trans-eQTL
0,Lung,5%,trans-eQTL
0,Nasal_epithelial,5%,trans-eQTL
0,Monocyte,5%,trans-eQTL
0,PBMC,5%,trans-eQTL
0,Lung,5%,trans-sQTL
0,Whole_blood,5%,trans-sQTL
0,Whole_blood,5%,trans-eQTL
0,PBMC,5%,trans-sQTL
0,Monocyte,1%,cis-sQTL


In [4]:
# Load every credible-set (CS) file: cis-eQTL and cis-sQTL SuSiE output plus the
# trans-QTL SuSiE output. Tissue, MAF and modality are parsed from each file
# path and attached to that file's rows.
cs = pd.concat([
    pd.read_csv(path, sep='\t').assign(
        tissue=parse_tissue(path),
        maf=format_maf(parse_maf(path)),
        modality=format_modality(parse_modality(path)),
    )
    for pattern in (
        '../data/scan-results/joint/cis-eqtl/susie/maf00*/*.cs.txt',
        '../data/scan-results/joint/cis-sqtl/susie/maf00*/postprocessed/*.by-gene.cs.txt',
        '../data/scan-results/joint/trans-*qtl/maf005/trans-susie/*.cs.txt',
    )
    for path in glob.glob(pattern)
])
cs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,1%,cis-eQTL
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,1%,cis-eQTL
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,1%,cis-eQTL
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,1%,cis-eQTL
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,1%,cis-eQTL


In [5]:
# Distinct (tissue, MAF, modality) combinations present in the credible-set table.
cs.loc[:, ['tissue', 'maf', 'modality']].drop_duplicates()

Unnamed: 0,tissue,maf,modality
0,Lung,1%,cis-eQTL
0,Nasal_epithelial,1%,cis-eQTL
0,Whole_blood,1%,cis-eQTL
0,Monocyte,1%,cis-eQTL
0,PBMC,1%,cis-eQTL
0,T_cell,1%,cis-eQTL
0,Whole_blood,0.1%,cis-eQTL
0,Lung,1%,cis-sQTL
0,Nasal_epithelial,1%,cis-sQTL
0,Whole_blood,1%,cis-sQTL


In [6]:
# Apply top_pip_variants to each (tissue, MAF, modality) group and flatten the
# resulting index.
# NOTE(review): presumably this keeps the highest-PIP variant of each credible
# set -- confirm against topmed_manuscript.top_pip_variants.
cs = (
    cs
    .groupby(['tissue', 'maf', 'modality'])
    .apply(top_pip_variants)
    .reset_index(drop=True)
)
# Derive the gene ID from the phenotype ID.
cs = cs.assign(gene_id=cs.phenotype_id.map(phenotype_id_to_gene_id))
cs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id
0,ENSG00000107819.13,chr10_101032154_C_A,1.0,0.205267,2,Lung,1%,cis-eQTL,ENSG00000107819.13
1,ENSG00000107819.13,chr10_101035974_G_A,1.0,0.089466,1,Lung,1%,cis-eQTL,ENSG00000107819.13
2,ENSG00000107821.14,chr10_101061173_G_A,1.0,0.01433,1,Lung,1%,cis-eQTL,ENSG00000107821.14
3,ENSG00000269609.5,chr10_102437570_A_G,1.0,0.106894,3,Lung,1%,cis-eQTL,ENSG00000269609.5
4,ENSG00000156398.13,chr10_102727625_C_T,1.0,0.163439,1,Lung,1%,cis-eQTL,ENSG00000156398.13


In [7]:
# Attach the joint-model statistics to each retained variant. This is an inner
# join, so rows without a matching joint-model result are dropped.
cs = pd.merge(
    cs,
    joint_modeling,
    how='inner',
    on=['tissue', 'maf', 'modality', 'phenotype_id', 'variant_id'],
)
cs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
0,ENSG00000107819.13,chr10_101032154_C_A,1.0,0.205267,2,Lung,1%,cis-eQTL,ENSG00000107819.13,0.519421,0.04219,12.311589,6.881283e-33,0.436648,0.602195
1,ENSG00000107819.13,chr10_101035974_G_A,1.0,0.089466,1,Lung,1%,cis-eQTL,ENSG00000107819.13,-1.258614,0.062621,-20.098991,1.104974e-77,-1.381473,-1.135756
2,ENSG00000107821.14,chr10_101061173_G_A,1.0,0.01433,1,Lung,1%,cis-eQTL,ENSG00000107821.14,-1.521066,0.172906,-8.797051,4.779691e-18,-1.860298,-1.181835
3,ENSG00000269609.5,chr10_102437570_A_G,1.0,0.106894,3,Lung,1%,cis-eQTL,ENSG00000269609.5,0.935849,0.065889,14.203317,1.95173e-42,0.806578,1.06512
4,ENSG00000156398.13,chr10_102727625_C_T,1.0,0.163439,1,Lung,1%,cis-eQTL,ENSG00000156398.13,-1.279856,0.045434,-28.169447,1.881859e-134,-1.368995,-1.190716


In [8]:
# Give the regression columns plainer names and add |t| for ranking.
cs = (
    cs
    .rename(columns={'Coef.': 'slope', 'P>|t|': 'pvalue'})
    .assign(abs_t=lambda frame: frame['t'].abs())
)
cs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,slope,Std.Err.,t,pvalue,[0.025,0.975],abs_t
0,ENSG00000107819.13,chr10_101032154_C_A,1.0,0.205267,2,Lung,1%,cis-eQTL,ENSG00000107819.13,0.519421,0.04219,12.311589,6.881283e-33,0.436648,0.602195,12.311589
1,ENSG00000107819.13,chr10_101035974_G_A,1.0,0.089466,1,Lung,1%,cis-eQTL,ENSG00000107819.13,-1.258614,0.062621,-20.098991,1.104974e-77,-1.381473,-1.135756,20.098991
2,ENSG00000107821.14,chr10_101061173_G_A,1.0,0.01433,1,Lung,1%,cis-eQTL,ENSG00000107821.14,-1.521066,0.172906,-8.797051,4.779691e-18,-1.860298,-1.181835,8.797051
3,ENSG00000269609.5,chr10_102437570_A_G,1.0,0.106894,3,Lung,1%,cis-eQTL,ENSG00000269609.5,0.935849,0.065889,14.203317,1.95173e-42,0.806578,1.06512,14.203317
4,ENSG00000156398.13,chr10_102727625_C_T,1.0,0.163439,1,Lung,1%,cis-eQTL,ENSG00000156398.13,-1.279856,0.045434,-28.169447,1.881859e-134,-1.368995,-1.190716,28.169447


In [9]:
# Rank signals within each (gene, tissue, MAF, modality) group by |t| from the
# joint model: the largest |t| gets rank 1. Without a tiebreaker, tied rows
# share the average rank (the default of `rank`).
cs = cs.sort_values('abs_t', ascending=False).reset_index(drop=True)
cs['rank_no_tiebreak'] = cs.groupby(['gene_id', 'tissue', 'maf', 'modality']).abs_t.rank(ascending=False)
# handle cases where the same variant is the representative for multiple credible sets by using the CS ID as a tiebreaker
cs = cs.sort_values(['abs_t', 'cs_id'], ascending=False)
# cumcount() numbers the rows of each group 0..n-1 in the current (sorted) row
# order, so adding 1 gives a 1-based rank. It is vectorized and replaces a
# per-group Python lambda that returned range(1, n + 1).
cs['rank_tiebreak'] = cs.groupby(['gene_id', 'tissue', 'maf', 'modality']).cumcount() + 1
cs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,slope,Std.Err.,t,pvalue,[0.025,0.975],abs_t,rank_no_tiebreak,rank_tiebreak
0,ENSG00000179344.16,chr6_32667428_C_T,0.333277,0.610707,1,Whole_blood,0.1%,cis-eQTL,ENSG00000179344.16,-1.347558,0.005445,-247.496258,0.0,-1.358232,-1.336884,247.496258,1.0,1
1,ENSG00000179344.16,chr6_32667428_C_T,0.333277,0.610707,1,Whole_blood,1%,cis-eQTL,ENSG00000179344.16,-1.347558,0.005445,-247.496258,0.0,-1.358232,-1.336884,247.496258,1.0,1
2,ENSG00000013573.17,chr12_31076435_A_C,0.308937,0.538271,1,Whole_blood,1%,cis-eQTL,ENSG00000013573.17,-1.383764,0.005718,-242.012454,0.0,-1.394973,-1.372556,242.012454,1.0,1
3,ENSG00000013573.17,chr12_31076435_A_C,0.308104,0.538271,1,Whole_blood,0.1%,cis-eQTL,ENSG00000013573.17,-1.383764,0.005718,-242.012454,0.0,-1.394973,-1.372556,242.012454,1.0,1
4,ENSG00000113504.21,chr5_1104823_C_T,1.0,0.468237,1,Whole_blood,1%,cis-eQTL,ENSG00000113504.21,-1.418568,0.006626,-214.085258,0.0,-1.431558,-1.405579,214.085258,1.0,1


In [10]:
# Fraction of rows whose rank is the same with and without the tiebreaker.
# Expected to be very close to 1, because a tiebreaker is rarely needed.
cs.rank_tiebreak.astype(float).eq(cs.rank_no_tiebreak).mean()

0.9990641645764529

In [11]:
# Export the rank table; the tiebroken rank is published as `rank`.
cs_ranks = cs[['tissue', 'maf', 'modality', 'phenotype_id', 'cs_id', 'variant_id', 'rank_tiebreak']].rename(columns={'rank_tiebreak': 'rank'})
# Create the output directory if needed so the export also works on a fresh
# checkout (to_csv does not create missing directories).
os.makedirs('tables', exist_ok=True)
cs_ranks.to_csv(f'tables/{PREFIX}cis-ranks.tsv', sep='\t', index=False)

In [12]:
# for how many genes is the same variant a representative for > 1 CS?
x = cs.groupby(['tissue', 'maf', 'modality', 'variant_id', 'phenotype_id']).size().rename('n_cs').reset_index()
x.head()

Unnamed: 0,tissue,maf,modality,variant_id,phenotype_id,n_cs
0,Lung,1%,cis-eQTL,chr10_100162586_C_T,ENSG00000107566.14,1
1,Lung,1%,cis-eQTL,chr10_100188925_TAAATA_T,ENSG00000213341.11,1
2,Lung,1%,cis-eQTL,chr10_100192229_T_C,ENSG00000196072.12,1
3,Lung,1%,cis-eQTL,chr10_100227710_T_C,ENSG00000095485.18,1
4,Lung,1%,cis-eQTL,chr10_100269675_T_TC,ENSG00000095485.18,1


In [13]:
# Per (tissue, MAF, modality): fraction of variant/phenotype pairs with n_cs > 1.
x.groupby(['tissue', 'maf', 'modality']).n_cs.apply(lambda counts: counts.gt(1).mean())

tissue            maf   modality  
Lung              1%    cis-eQTL      0.000080
                        cis-sQTL      0.000000
                  5%    trans-eQTL    0.000000
                        trans-sQTL    0.000000
Monocyte          1%    cis-eQTL      0.000000
                        cis-sQTL      0.000000
                  5%    trans-eQTL    0.000000
Nasal_epithelial  1%    cis-eQTL      0.000000
                        cis-sQTL      0.000000
                  5%    trans-eQTL    0.000000
PBMC              1%    cis-eQTL      0.000145
                        cis-sQTL      0.000000
                  5%    trans-eQTL    0.000000
                        trans-sQTL    0.000000
T_cell            1%    cis-eQTL      0.000000
                        cis-sQTL      0.000000
                  5%    trans-eQTL    0.000000
Whole_blood       0.1%  cis-eQTL      0.000955
                        cis-sQTL      0.000000
                  1%    cis-eQTL      0.001320
                        c

In [14]:
# Per (tissue, MAF, modality): number of variant/phenotype pairs with n_cs > 1.
x.loc[x.n_cs.gt(1)].groupby(['tissue', 'maf', 'modality']).size()

tissue       maf   modality
Lung         1%    cis-eQTL     2
PBMC         1%    cis-eQTL     4
Whole_blood  0.1%  cis-eQTL    73
             1%    cis-eQTL    92
dtype: int64