In [1]:
import pandas as pd
import glob
import re

def phenotype_id_to_gene_id(x):
    """Extract an Ensembl gene ID (``ENSG...``) embedded in a phenotype ID.

    A versioned ID (``ENSG<digits>.<digits>``) anywhere in ``x`` is preferred;
    only if none is present is the first unversioned ID (``ENSG<digits>``)
    returned.

    Raises:
        ValueError: if ``x`` contains no ``ENSG<digits>`` substring.
    """
    # Patterns are tried from most to least specific.
    for pattern in (r'ENSG\d+\.\d+', r'ENSG\d+'):
        hit = re.search(pattern, x)
        if hit:
            return hit.group(0)
    raise ValueError(f'Not able to infer gene ID from {x}')


def format_modality(modality):
    """Normalize a free-form modality string to ``<cis|trans>-<eQTL|sQTL>``.

    Matching is case-insensitive substring matching. 'trans' takes precedence
    over 'cis', and 'eqtl' over 'sqtl', when both are present.

    Raises:
        AssertionError: if neither 'cis'/'trans' nor 'eqtl'/'sqtl' is found.
    """
    lowered = modality.lower()

    has_trans = 'trans' in lowered
    assert(has_trans or 'cis' in lowered)

    has_eqtl = 'eqtl' in lowered
    assert(has_eqtl or 'sqtl' in lowered)

    scope = 'trans' if has_trans else 'cis'
    qtl_type = 'eQTL' if has_eqtl else 'sQTL'
    return '-'.join([scope, qtl_type])


def parse_modality(s):
    """Return the first ``cis``/``trans`` + ``eqtl``/``sqtl`` token found in ``s``.

    The match is case-insensitive, allows an optional hyphen between the two
    parts, and is returned exactly as it appears in ``s``.

    Raises:
        AssertionError: if ``s`` contains no such token.
    """
    modality_pattern = re.compile('((cis|trans)-?([es]qtl))', flags=re.IGNORECASE)
    found = modality_pattern.search(s)
    assert(found is not None)
    return found.group(0)


def parse_tissue(s):
    """Return the first known tissue name that occurs in ``s`` (case-sensitive).

    Raises:
        AssertionError: if none of the known tissue names occurs in ``s``.
    """
    known_tissues = ('Whole_blood', 'Lung', 'Nasal_epithelial', 'PBMC', 'Monocyte', 'T_cell')
    tissue_pattern = re.compile('(' + '|'.join(known_tissues) + ')')
    found = tissue_pattern.search(s)
    assert(found is not None)
    return found.group(0)


def parse_maf(s):
    """Return the first MAF tag (``maf001``, ``maf0001`` or ``maf005``) in ``s``.

    Raises:
        AssertionError: if ``s`` contains none of the known MAF tags.
    """
    known_tags = ('maf001', 'maf0001', 'maf005')
    tag_pattern = re.compile('(' + '|'.join(known_tags) + ')')
    found = tag_pattern.search(s)
    assert(found is not None)
    return found.group(0)


def format_maf(s):
    """Translate a MAF tag (e.g. ``'maf001'``) into its percent label (``'1%'``).

    Raises:
        ValueError: if ``s`` is not one of the known MAF tags.
    """
    labels = {'maf001': '1%', 'maf0001': '0.1%', 'maf005': '5%'}
    label = labels.get(s)
    if label is None:
        raise ValueError("Cannot format MAF: {}".format(s))
    return label


# Filename prefix for tables written by this notebook.
PREFIX = 'summarize-cis-eqtl-coloc-with-cis-sqtl.'
# Path to a gene annotation GTF (GENCODE v30 / GRCh38 per the filename).
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
GTF = '../../manuscript-intermediate-processing/data/gtf/gencode.v30.GRCh38.ERCC.genes.collapsed_only.gtf'


In [2]:
# Collect the SuSiE credible-set tables for both modalities (cis-eQTL first,
# then the per-gene cis-sQTL tables). glob returns matches in arbitrary order,
# so each list is sorted to keep the row order of everything built from
# `to_load` reproducible across runs and machines.
cis_eqtl_cs_files = sorted(glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-eqtl/susie/maf*/*.cs.txt'))
cis_sqtl_cs_files = sorted(glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-sqtl/susie/maf*/postprocessed/*.by-gene.cs.txt'))
cs_files = cis_eqtl_cs_files + cis_sqtl_cs_files
# Fail here with a clear message rather than later inside pd.concat.
assert len(cs_files) > 0, 'No SuSiE credible-set files matched the glob patterns'

# One row per file, annotated with the tissue / modality / MAF threshold
# parsed from its path.
to_load = pd.DataFrame({'file': cs_files})
to_load['tissue'] = to_load.file.apply(parse_tissue)
to_load['modality'] = to_load.file.apply(parse_modality).map(format_modality)
to_load['maf'] = to_load.file.apply(parse_maf).map(format_maf)

In [3]:
# Read every credible-set table and tag its rows with the tissue, modality and
# MAF label parsed from the file path, then stack them into one frame.
susie = pd.concat([
    pd.read_csv(entry.file, sep='\t').assign(
        tissue=entry.tissue,
        modality=entry.modality,
        maf=entry.maf,
    )
    for entry in to_load.itertuples(index=False)
])
susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%


In [4]:
# Question for this section: how many cis-sQTL signals colocalize with at
# least one cis-eQTL? Start from the post-processed xQTL coloc results.
coloced = pd.read_csv(
    '../../manuscript-intermediate-processing/work/coloc/xqtl/joint/postprocessed.txt',
    sep='\t',
)
coloced.head()

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl1_tissue,xqtl2_tissue,xqtl1_maf,xqtl2_maf,xqtl1_phenotype,xqtl2_phenotype,xqtl1_ancestry,xqtl2_ancestry,xqtl1_gene,xqtl2_gene
0,3995,chr3_56815721_T_C,chr3_56815721_T_C,7.675906e-117,1.407465e-22,1.090742e-97,1.136868e-16,1.0,1,1,...,Whole_blood,Whole_blood,5%,5%,ENSG00000278828.1,chr22:39901330:39947093:clu_29509_+:ENSG000001...,joint,joint,ENSG00000278828.1,ENSG00000100351.16
1,9503,chr20_1482244_G_T,chr20_1482244_G_T,7.484160000000001e-159,6.427243e-55,2.328886e-107,4.092726e-15,1.0,1,1,...,Monocyte,Monocyte,1%,1%,ENSG00000196209.12,chr20:1458274:1464329:clu_22293_-:ENSG00000088...,joint,joint,ENSG00000196209.12,ENSG00000088833.17
2,6266,chr16_10923871_G_C,chr16_10923871_G_C,0.0,6.536402e-49,0.0,0.0,1.0,5,4,...,Whole_blood,Whole_blood,1%,5%,ENSG00000280153.1,ENSG00000204287.14,joint,joint,ENSG00000280153.1,ENSG00000204287.14
3,6266,chr16_10923871_G_C,chr16_10923871_G_C,0.0,4.297033e-45,0.0,0.0,1.0,5,3,...,Whole_blood,Whole_blood,1%,5%,ENSG00000280153.1,ENSG00000204257.15,joint,joint,ENSG00000280153.1,ENSG00000204257.15
4,8370,chr15_81308110_T_G,chr15_81308110_T_G,1.967001e-72,1.390803e-36,2.8285830000000003e-39,4.604317e-15,1.0,3,1,...,Whole_blood,Whole_blood,1%,1%,ENSG00000172349.17,chr15:81306545:81308605:clu_14795_+:ENSG000001...,joint,joint,ENSG00000172349.17,ENSG00000172349.17


In [5]:
# Keep only pairs where both signals are cis and both were called at the same
# MAF threshold.
both_cis = coloced.xqtl1_modality.str.contains('cis') & coloced.xqtl2_modality.str.contains('cis')
same_maf = coloced.xqtl1_maf == coloced.xqtl2_maf
coloced = coloced[both_cis & same_maf]
coloced.head()

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl1_tissue,xqtl2_tissue,xqtl1_maf,xqtl2_maf,xqtl1_phenotype,xqtl2_phenotype,xqtl1_ancestry,xqtl2_ancestry,xqtl1_gene,xqtl2_gene
1,9503,chr20_1482244_G_T,chr20_1482244_G_T,7.484160000000001e-159,6.427243e-55,2.328886e-107,4.092726e-15,1.0,1,1,...,Monocyte,Monocyte,1%,1%,ENSG00000196209.12,chr20:1458274:1464329:clu_22293_-:ENSG00000088...,joint,joint,ENSG00000196209.12,ENSG00000088833.17
4,8370,chr15_81308110_T_G,chr15_81308110_T_G,1.967001e-72,1.390803e-36,2.8285830000000003e-39,4.604317e-15,1.0,3,1,...,Whole_blood,Whole_blood,1%,1%,ENSG00000172349.17,chr15:81306545:81308605:clu_14795_+:ENSG000001...,joint,joint,ENSG00000172349.17,ENSG00000172349.17
5,1554,chr14_106522289_T_C,chr14_106522289_T_C,0.0,9.555001e-47,0.0,0.0,1.0,11,3,...,Whole_blood,Whole_blood,1%,1%,ENSG00000211959.2,chr14:106470571:106470675:clu_12792_-:ENSG0000...,joint,joint,ENSG00000211959.2,ENSG00000211964.3
6,1953,chr14_106817991_C_T,chr14_106817991_C_T,4.90207e-79,4.452413e-47,2.201983e-35,0.0,1.0,9,4,...,Whole_blood,Whole_blood,1%,1%,ENSG00000211959.2,chr14:106422018:106422102:clu_12799_-:ENSG0000...,joint,joint,ENSG00000211959.2,ENSG00000211959.2
7,43,chr2_90067545_T_C,chr2_90067545_T_C,0.0,2.4362550000000002e-60,0.0,0.0,1.0,4,2,...,Whole_blood,Whole_blood,1%,1%,ENSG00000242766.1,chr2:90154152:90154279:clu_26375_+:ENSG0000027...,joint,joint,ENSG00000242766.1,ENSG00000278857.1


In [6]:
# Sanity checks on the filtered pairs: both signals come from the same tissue,
# signal 1 is always the cis-eQTL and signal 2 is always the cis-sQTL.
assert (coloced.xqtl1_tissue == coloced.xqtl2_tissue).all()
assert coloced.xqtl1_modality.eq('ciseqtl').all()
assert coloced.xqtl2_modality.eq('cissqtl').all()

In [7]:
# Goal of the following cells: for each cis-sQTL credible set (CS), note
# whether it colocalized with a cis-eQTL CS, and vice versa.
# Re-display the SuSiE table as a reminder of its columns.
susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%


In [8]:
# Reduce the coloc results to one row per distinct pair of credible sets,
# keeping only the columns that identify each side of the pair.
pair_columns = [
    'xqtl1_tissue', 'xqtl2_tissue',
    'xqtl1_modality', 'xqtl2_modality',
    'xqtl1_cs', 'xqtl2_cs',
    'xqtl1_phenotype', 'xqtl2_phenotype',
    'xqtl1_maf', 'xqtl2_maf',
]
tmp = coloced[pair_columns].drop_duplicates()
tmp.head()

Unnamed: 0,xqtl1_tissue,xqtl2_tissue,xqtl1_modality,xqtl2_modality,xqtl1_cs,xqtl2_cs,xqtl1_phenotype,xqtl2_phenotype,xqtl1_maf,xqtl2_maf
1,Monocyte,Monocyte,ciseqtl,cissqtl,L1,L1,ENSG00000196209.12,chr20:1458274:1464329:clu_22293_-:ENSG00000088...,1%,1%
4,Whole_blood,Whole_blood,ciseqtl,cissqtl,L2,L3,ENSG00000172349.17,chr15:81306545:81308605:clu_14795_+:ENSG000001...,1%,1%
5,Whole_blood,Whole_blood,ciseqtl,cissqtl,L1,L1,ENSG00000211959.2,chr14:106470571:106470675:clu_12792_-:ENSG0000...,1%,1%
6,Whole_blood,Whole_blood,ciseqtl,cissqtl,L3,L3,ENSG00000211959.2,chr14:106422018:106422102:clu_12799_-:ENSG0000...,1%,1%
7,Whole_blood,Whole_blood,ciseqtl,cissqtl,L1,L1,ENSG00000242766.1,chr2:90154152:90154279:clu_26375_+:ENSG0000027...,1%,1%


In [9]:
def _collapse_coloc_partners(pairs, own, partner):
    """Summarize, per credible set on the `own` side, its colocalizing partners.

    Args:
        pairs: frame of credible-set pairs with `<side>_tissue`, `<side>_cs`,
            `<side>_phenotype` and `<side>_maf` columns for both sides.
        own: column prefix of the side being summarized ('xqtl1' or 'xqtl2').
        partner: column prefix of the other side.

    Returns:
        One row per (tissue, cs_id, phenotype_id, maf) on the `own` side, with
        `colocing_phenotypes` holding the comma-joined unique partner phenotype
        IDs and `cs_id` converted to an integer.
    """
    key_columns = [f'{own}_tissue', f'{own}_cs', f'{own}_phenotype', f'{own}_maf']
    summary = (
        pairs
        .groupby(key_columns)[f'{partner}_phenotype']
        .apply(lambda phenotypes: ','.join(phenotypes.unique()))
        .reset_index()
        .rename(columns={
            f'{own}_tissue': 'tissue',
            f'{own}_cs': 'cs_id',
            f'{own}_phenotype': 'phenotype_id',
            f'{own}_maf': 'maf',
            f'{partner}_phenotype': 'colocing_phenotypes',
        })
    )
    # Credible-set labels carry an 'L' (e.g. 'L3'); remove it as a literal
    # (regex=False) and cast to int.
    summary['cs_id'] = summary.cs_id.str.replace('L', '', regex=False).astype(int)
    return summary


# Signal 1 is the cis-eQTL and signal 2 the cis-sQTL, so the two summaries are
# mirror images of each other.
cis_eqtl_colocs_with_cis_sqtl = _collapse_coloc_partners(tmp, own='xqtl1', partner='xqtl2')
cis_sqtl_colocs_with_cis_eqtl = _collapse_coloc_partners(tmp, own='xqtl2', partner='xqtl1')
cis_eqtl_colocs_with_cis_sqtl.head()

Unnamed: 0,tissue,cs_id,phenotype_id,maf,colocing_phenotypes
0,Lung,1,ENSG00000001167.14,1%,chr6:41118184:41118924:clu_47354_+:ENSG0000016...
1,Lung,1,ENSG00000002016.17,1%,chr12:977106:989809:clu_11438_-:ENSG0000000201...
2,Lung,1,ENSG00000002822.15,1%,chr7:1936897:1957629:clu_48209_-:ENSG000000028...
3,Lung,1,ENSG00000002919.14,1%,chr17:48076178:48076865:clu_23046_-:ENSG000001...
4,Lung,1,ENSG00000003056.8,1%,chr12:8946387:8949488:clu_11591_-:ENSG00000003...


In [10]:
# Stack the two per-credible-set summaries, labelling each with its own modality.
x = pd.concat([
    cis_eqtl_colocs_with_cis_sqtl.assign(modality='cis-eQTL'),
    cis_sqtl_colocs_with_cis_eqtl.assign(modality='cis-sQTL'),
])
# Every credible set listed here colocalized with at least one signal of the
# other modality.
x['coloced_with_other_modality_signal'] = True

# Flag credible sets whose own gene is among the genes of their partners.
same_gene_flags = []
for own_phenotype, partner_phenotypes in zip(x.phenotype_id, x.colocing_phenotypes):
    own_gene = phenotype_id_to_gene_id(own_phenotype)
    partner_genes = [phenotype_id_to_gene_id(p) for p in partner_phenotypes.split(',')]
    same_gene_flags.append(own_gene in partner_genes)
x['coloced_with_other_modality_signal_from_same_gene'] = same_gene_flags
x.head()

Unnamed: 0,tissue,cs_id,phenotype_id,maf,colocing_phenotypes,modality,coloced_with_other_modality_signal,coloced_with_other_modality_signal_from_same_gene
0,Lung,1,ENSG00000001167.14,1%,chr6:41118184:41118924:clu_47354_+:ENSG0000016...,cis-eQTL,True,False
1,Lung,1,ENSG00000002016.17,1%,chr12:977106:989809:clu_11438_-:ENSG0000000201...,cis-eQTL,True,True
2,Lung,1,ENSG00000002822.15,1%,chr7:1936897:1957629:clu_48209_-:ENSG000000028...,cis-eQTL,True,True
3,Lung,1,ENSG00000002919.14,1%,chr17:48076178:48076865:clu_23046_-:ENSG000001...,cis-eQTL,True,True
4,Lung,1,ENSG00000003056.8,1%,chr12:8946387:8949488:clu_11591_-:ENSG00000003...,cis-eQTL,True,True


In [11]:
# Left-join the coloc flags onto the credible-set variant rows. No `on=` is
# given, so pandas joins on every column the two frames have in common;
# credible sets without a colocalization get NaN in the new columns.
susie = pd.merge(susie, x, how='left')
susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf,colocing_phenotypes,coloced_with_other_modality_signal,coloced_with_other_modality_signal_from_same_gene
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True


In [12]:
# Credible sets with no match in the left join have NaN flags; those mean
# "did not colocalize", so turn them into False.
# The explicit astype(bool) pins a real boolean dtype instead of relying on
# fillna's implicit object->bool downcast (deprecated in pandas 2.2), and
# bracket assignment is the documented way to overwrite a column.
for flag_column in [
    'coloced_with_other_modality_signal',
    'coloced_with_other_modality_signal_from_same_gene',
]:
    susie[flag_column] = susie[flag_column].fillna(False).astype(bool)
susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf,colocing_phenotypes,coloced_with_other_modality_signal,coloced_with_other_modality_signal_from_same_gene
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True


In [13]:
# Write the annotated credible-set table as a tab-separated file without the
# index column.
susie.to_csv(
    f'tables/{PREFIX}cis-eqtl-and-sqtl-overlap.tsv',
    sep='\t',
    index=False,
)