In [1]:
import re
import os
import glob

import pandas as pd
from topmed_manuscript_clean import top_pip_variants, phenotype_id_to_gene_id, format_modality, cis_eqtl_pcs, cis_sqtl_pcs

# for each non-whole blood CS, determine whether it overlaps a whole blood CS for the same gene


def compare_credible_sets(scan_1_cs, scan_2_cs, summarize=True):
    """
    Given a dataframe representing scan_1_cs and scan_2_cs (each having columns ['phenotype_id', 'variant_id', 'cs_id']; can have other columns too),
    return a dataframe showing, for each CS in scan_1_cs, whether it overlaps a CS in scan_2_cs (for the same phenotype_id)
    If summarize = False, simply returns scan_1_cs with an added column indicating whether each credible set SNP is a credible set SNP 
    for the same phenotype in the other scan
    """
    # Validate input
    if not isinstance(scan_1_cs, pd.DataFrame):
        raise TypeError('scan_1_cs must be a DataFrame')
    if not isinstance(scan_2_cs, pd.DataFrame):
        raise TypeError('scan_2_cs must be a DataFrame')
    for i in ['phenotype_id', 'variant_id', 'cs_id']:
        if not i in scan_1_cs.columns.to_list():
            raise ValueError(f'scan_1_cs must include column {i}')
        if not i in scan_2_cs.columns.to_list():
            raise ValueError(f'scan_2_cs must include column {i}')

    results = scan_1_cs.merge(scan_2_cs[['phenotype_id', 'variant_id']].drop_duplicates().assign(in_other_scan_cs=1), how='left')
    results.in_other_scan_cs = results.in_other_scan_cs.fillna(0).astype(int)
    assert(len(results) == len(scan_1_cs))

    if summarize:
        return results.groupby(['phenotype_id', 'cs_id']).in_other_scan_cs.max().reset_index()
    else:
        return results

PREFIX = 'blood-as-proxy.'

# TODO: for splicing, do we wish to require that a given intron was tested in blood and other tissue? Currently just requiring that the gene was tested in the cis-sQTL scan

In [2]:
# 1% only, as at this point in the manuscript we haven't yet done 0.1% scans
to_load = glob.glob('../data/scan-results/joint/cis-eqtl/susie/maf001/*.cs.txt') + glob.glob('../data/scan-results/joint/cis-sqtl/susie/maf001/postprocessed/*.by-gene.cs.txt')
to_load_tissues = [os.path.basename(x).split('.')[0] for x in to_load]
to_load_modalities = [format_modality(re.search('(cis-[es]qtl)', x).group(1)) for x in to_load]
susie = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], modality=modality) for f, tissue, modality in zip(to_load, to_load_tissues, to_load_modalities)])
susie['unique_cs_id'] = susie.phenotype_id + '___' + susie.cs_id.astype(str)
susie['gene_id'] = susie.phenotype_id.map(phenotype_id_to_gene_id)

CIS_EQTL_PERMUTATIONS_GLOB = glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-eqtl/permutations/maf001/*.txt.gz')
CIS_SQTL_PERMUTATIONS_GLOB = glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-sqtl/permutations/maf001/*.txt.gz')
cis_eqtl_permutations_all = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], pcs=int(os.path.basename(f).split('.')[1])) for f in CIS_EQTL_PERMUTATIONS_GLOB])
cis_sqtl_permutations_all = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], pcs=int(os.path.basename(f).split('.')[1])) for f in CIS_SQTL_PERMUTATIONS_GLOB])
cis_eqtl_permutations = cis_eqtl_permutations_all[cis_eqtl_permutations_all.tissue.map(cis_eqtl_pcs) == cis_eqtl_permutations_all.pcs]
cis_sqtl_permutations = cis_sqtl_permutations_all[cis_sqtl_permutations_all.tissue.map(cis_sqtl_pcs) == cis_sqtl_permutations_all.pcs]
permutations = pd.concat([cis_eqtl_permutations.assign(modality='cis-eQTL'), cis_sqtl_permutations.assign(modality='cis-sQTL')])
permutations['gene_id'] = permutations.phenotype_id.map(phenotype_id_to_gene_id)

In [3]:
OTHER_TISSUES = [i for i in permutations.tissue.unique() if i != 'Whole_blood']

In [4]:
# for each non-whole-blood tissue, keep genes tested in that tissue and whole blood
genes_per_modality_per_tissue = {(tissue, modality): set(df.gene_id) for (tissue, modality), df in permutations.groupby(['tissue', 'modality'])}

In [5]:
print(len(permutations))
permutations = permutations.loc[[gene_id in genes_per_modality_per_tissue[('Whole_blood', modality)] for modality, gene_id in zip(permutations.modality, permutations.gene_id)]]
print(len(permutations))

223173
192591


In [6]:
print(len(susie))
susie = susie.loc[[gene_id in genes_per_modality_per_tissue[('Whole_blood', modality)] for modality, gene_id in zip(susie.modality, susie.gene_id)]]
print(len(susie))

3357850


3063826


In [7]:
top = susie.groupby(['tissue', 'modality']).apply(top_pip_variants).reset_index(drop=True)
top.head()

  top = susie.groupby(['tissue', 'modality']).apply(top_pip_variants).reset_index(drop=True)


Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,unique_cs_id,gene_id
0,ENSG00000107819.13,chr10_101032154_C_A,1.0,0.205267,2,Lung,cis-eQTL,ENSG00000107819.13___2,ENSG00000107819.13
1,ENSG00000107819.13,chr10_101035974_G_A,1.0,0.089466,1,Lung,cis-eQTL,ENSG00000107819.13___1,ENSG00000107819.13
2,ENSG00000269609.5,chr10_102437570_A_G,1.0,0.106894,3,Lung,cis-eQTL,ENSG00000269609.5___3,ENSG00000269609.5
3,ENSG00000156398.13,chr10_102727625_C_T,1.0,0.163439,1,Lung,cis-eQTL,ENSG00000156398.13___1,ENSG00000156398.13
4,ENSG00000197142.10,chr10_112364188_C_G,1.0,0.364446,1,Lung,cis-eQTL,ENSG00000197142.10___1,ENSG00000197142.10


In [8]:
credible_set_in_blood = []
susie['phenotype_id'] = susie.gene_id
susie['cs_id'] = susie.unique_cs_id

for (tissue, modality), df in susie.groupby(['tissue', 'modality']):
    if tissue not in OTHER_TISSUES:
        continue
    tmp = compare_credible_sets(df, susie[(susie.tissue=='Whole_blood') & (susie.modality==modality)]).assign(tissue=tissue, modality=modality)
    credible_set_in_blood.append(tmp)

credible_set_in_blood = pd.concat(credible_set_in_blood).rename(columns={'phenotype_id': 'gene_id', 'cs_id': 'unique_cs_id'})
credible_set_in_blood.head()

Unnamed: 0,gene_id,unique_cs_id,in_other_scan_cs,tissue,modality
0,ENSG00000000003.14,ENSG00000000003.14___1,0,Lung,cis-eQTL
1,ENSG00000000457.14,ENSG00000000457.14___1,1,Lung,cis-eQTL
2,ENSG00000000460.17,ENSG00000000460.17___1,1,Lung,cis-eQTL
3,ENSG00000000460.17,ENSG00000000460.17___2,0,Lung,cis-eQTL
4,ENSG00000000971.15,ENSG00000000971.15___1,0,Lung,cis-eQTL


In [9]:
credible_set_in_blood.groupby(['tissue', 'modality']).in_other_scan_cs.mean()

tissue            modality
Lung              cis-eQTL    0.469268
                  cis-sQTL    0.634774
Monocyte          cis-eQTL    0.653888
                  cis-sQTL    0.775477
Nasal_epithelial  cis-eQTL    0.447487
                  cis-sQTL    0.602015
PBMC              cis-eQTL    0.656701
                  cis-sQTL    0.679758
T_cell            cis-eQTL    0.693311
                  cis-sQTL    0.773790
Name: in_other_scan_cs, dtype: float64

In [10]:
print(len(credible_set_in_blood))
credible_set_in_blood = credible_set_in_blood.merge(top)
print(len(credible_set_in_blood))

129231


129231


In [11]:
credible_set_in_blood[['phenotype_id', 'variant_id', 'pip', 'af', 'cs_id', 'tissue', 'modality', 'in_other_scan_cs']].rename(columns={'in_other_scan_cs': 'in_whole_blood'}).to_csv(f'tables/{PREFIX}credible_set_in_blood.tsv', sep='\t', index=False)