In [1]:
import pandas as pd
import numpy as np
from topmed_manuscript_clean import gtf_to_df, phenotype_id_to_gene_id, format_modality
import sys
from scipy.stats import hypergeom
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
from statsmodels.stats.multitest import multipletests
import re
import glob


def parse_modality(s):
    """
    Extract the QTL modality token (e.g. 'trans-eqtl', 'cis-sQTL') from a string.

    The search is case-insensitive and the hyphen is optional. The matched text
    is returned exactly as it appears in `s`. An AssertionError is raised when
    no modality token is present.
    """
    hit = re.compile('((cis|trans)-?([es]qtl))', re.IGNORECASE).search(s)
    assert hit is not None
    return hit.group(0)


def parse_tissue(s):
    """
    Return the first known tissue name found in `s` (case-sensitive).

    Raises an AssertionError when none of the known tissue names occurs in `s`.
    """
    known_tissues = ('Whole_blood', 'Lung', 'Nasal_epithelial', 'PBMC', 'Monocyte', 'T_cell')
    pattern = '({})'.format('|'.join(known_tissues))
    hit = re.search(pattern, s)
    assert hit is not None
    return hit.group(0)

def genrich(genes, background):
    """
    Run a g:Profiler enrichment of `genes` against a custom `background` and
    append derived statistics to the returned table.

    Uses the module-level `gp` GProfiler client (organism 'hsapiens',
    all_results=True, so non-significant terms are returned as well).

    Parameters
    ----------
    genes : list
        Query gene identifiers, passed to g:Profiler as `query`.
    background : list
        Background gene identifiers, passed to g:Profiler as `background`.

    Returns
    -------
    DataFrame
        The g:Profiler result with these added columns:
        nominal_p   hypergeometric upper-tail probability of the observed overlap
                    (falls back to g:Profiler's p_value where it evaluates to 0)
        obs_exp     observed / expected overlap
        log2_enrich log2(obs_exp)
        -log10(q)   -log10 of g:Profiler's p_value
        -log10(p)   -log10 of nominal_p

    Sanity-check warnings are written to stderr; they never raise.
    """
    enrich = gp.profile(organism='hsapiens', query=genes, all_results=True, background=background)

    # scipy's hypergeom.sf(k, M, n, N): M is the total number of objects, n the number of
    # Type I objects, N the number drawn without replacement. sf(k - 1) is P(X >= k).
    enrich['nominal_p'] = hypergeom.sf(enrich.intersection_size-1, enrich.effective_domain_size, enrich.query_size, enrich.term_size)
    # Where the tail probability evaluates to 0 (or is not > 0), use g:Profiler's p_value instead.
    enrich['nominal_p'] = np.where(enrich.nominal_p > 0, enrich.nominal_p, enrich.p_value)

    enrich['obs_exp'] = (enrich.intersection_size / enrich.term_size) / (enrich.query_size / enrich.effective_domain_size)
    enrich['log2_enrich'] = np.log2(enrich.obs_exp)
    enrich['-log10(q)'] = -1*np.log10(enrich.p_value)
    enrich['-log10(p)'] = -1*np.log10(enrich.nominal_p)

    # Sanity checks on the terms g:Profiler flags as significant. Each message ends
    # with a newline so that several warnings do not run together on stderr.
    significant = enrich[enrich.significant]
    if significant.obs_exp.min() <= 1:
        sys.stderr.write('Warning: observed / expected for significant enriched items not always > 1?\n')
        sys.stderr.flush()
    if significant.nominal_p.max() > 0.05:
        sys.stderr.write('Warning: nominal p for significant enriched items not always < 0.05?\n')
        sys.stderr.flush()
    if not (significant.nominal_p <= significant.p_value).all():
        sys.stderr.write('Warning: nominal p for significant enriched items sometimes > corrected p?\n')
        sys.stderr.flush()
    return enrich


# File-name prefix for outputs written by this notebook.
PREFIX = 'trans-go-enrichment.'

# Gene annotation used to translate gene IDs into gene names.
GTF = '../data/gtf/gencode.v30.GRCh38.ERCC.genes.collapsed_only.gtf.gz'

# Load the annotation, keep only the 'gene' feature rows, and build a gene_id -> gene_name lookup.
gtf_df = (
    gtf_to_df(GTF, parse_attributes=['gene_id', 'gene_name'])
    .loc[lambda frame: frame['feature'] == 'gene']
)
gene_id_to_gene_name = dict(zip(gtf_df['gene_id'], gtf_df['gene_name']))

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [2]:
# One row per trans-top scan result file, annotated with the tissue and modality parsed from its path.
trans_top_paths = glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/trans-*qtl/maf005/trans-top/*')
full_trans_permutation_files = pd.DataFrame({'file': trans_top_paths})
full_trans_permutation_files['tissue'] = full_trans_permutation_files['file'].map(parse_tissue)
full_trans_permutation_files['modality'] = (
    full_trans_permutation_files['file']
    .map(parse_modality)
    .map(format_modality)
)

In [3]:
def _read_trans_top(path, tissue, modality):
    """Read one tab-separated scan result file and tag its rows with tissue and modality."""
    return pd.read_csv(path, sep='\t').assign(tissue=tissue, modality=modality)


# Stack every scan result file into a single table.
full_trans_permutations = pd.concat(list(map(
    _read_trans_top,
    full_trans_permutation_files['file'],
    full_trans_permutation_files['tissue'],
    full_trans_permutation_files['modality'],
)))
full_trans_permutations.head()

Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,af,gene_mappability,gene_crossmaps_to_gene_near_variant,biotype,...,pval_perm,pval_beta,pval_beta_no_zero,qvalue,tissue,modality,gene_id,phenotypes_tested_for_gene,pval_beta_corrected_across_phenotypes,pval_beta_corrected_across_phenotypes_no_zero
0,chr14_83387951_A_T,ENSG00000000003.14,7.92475e-06,0.10193,0.022799,0.00312,0.190889,0.958185,False,protein_coding,...,1.0,0.9999982,0.9999982,0.9999999,Whole_blood,trans-eQTL,,,,
1,chr7_132431526_AG_A,ENSG00000000419.12,3.25616e-07,-0.144682,0.028295,0.004078,0.119538,1.0,False,protein_coding,...,0.469577,0.4675975,0.4675975,0.9999999,Whole_blood,trans-eQTL,,,,
2,chrX_26802742_G_A,ENSG00000000457.14,3.07577e-06,0.076452,0.016371,0.003403,0.657887,0.967695,False,protein_coding,...,0.99415,0.9940261,0.9940261,0.9999999,Whole_blood,trans-eQTL,,,,
3,chr4_106906657_G_A,ENSG00000000460.17,3.76013e-07,-0.125686,0.024712,0.004034,0.15533,0.973241,False,protein_coding,...,0.496775,0.4957646,0.4957646,0.9999999,Whole_blood,trans-eQTL,,,,
4,chr7_50330658_C_T,ENSG00000000938.13,3.5030500000000003e-25,-0.234596,0.022533,0.01669,0.235203,0.999762,False,protein_coding,...,5e-05,3.457262e-18,3.457262e-18,1.392881e-16,Whole_blood,trans-eQTL,,,,


In [4]:
# Largest reported p-value among the Whole_blood rows.
full_trans_permutations.loc[full_trans_permutations['tissue'] == 'Whole_blood', 'pval'].max()

9.98102e-06

In [5]:
# Significant trans-eQTL and trans-sQTL signals after clumping (tab-separated tables).
trans_eqtl_clumped, trans_sqtl_clumped = (
    pd.read_csv(clumped_path, sep='\t')
    for clumped_path in (
        '../work/clump-trans-variants/clump-trans-signals.significant-trans-eqtl-clumped.tsv',
        '../work/clump-trans-variants//clump-trans-signals.significant-trans-sqtl-clumped.tsv',
    )
)
trans_eqtl_clumped.head()

Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,af,gene_mappability,gene_crossmaps_to_gene_near_variant,biotype,...,beta_shape1,beta_shape2,true_df,pval_true_df,pval_perm,pval_beta,pval_beta_no_zero,qvalue,tissue,clumped_variant_id
0,chr7_50330658_C_T,ENSG00000000938.13,3.5030500000000003e-25,-0.234596,0.022533,0.01669,0.235203,0.999762,False,protein_coding,...,1.005841,979517.7,6086.276059,4.468785e-24,5e-05,3.457262e-18,3.457262e-18,1.392881e-16,Whole_blood,chr7_50342615_A_G
1,chr1_156302480_C_T,ENSG00000002330.13,1.02777e-13,0.134956,0.018106,0.008625,0.500775,0.999556,False,protein_coding,...,1.005125,1008630.0,6081.962583,3.926652e-13,5e-05,3.664337e-07,3.664337e-07,5.779733e-06,Whole_blood,chr1_156344836_A_G
2,chr6_144036619_C_A,ENSG00000004059.11,1.98404e-17,0.331091,0.038865,0.011237,0.055934,1.0,False,protein_coding,...,1.012491,958740.6,6060.399614,1.281332e-16,5e-05,9.188711e-11,9.188711e-11,2.1e-09,Whole_blood,chr6_144036619_C_A
3,chr6_122440739_T_C,ENSG00000004478.8,1.03467e-10,0.126184,0.019495,0.006518,0.706539,0.960425,False,protein_coding,...,1.003425,1105596.0,6130.610651,2.431723e-10,0.00025,0.0002609707,0.0002609707,0.002773133,Whole_blood,chr6_122440739_T_C
4,chr22_46290431_C_G,ENSG00000004799.8,7.74095e-13,0.207077,0.028839,0.008009,0.10753,1.0,False,protein_coding,...,1.012491,958740.6,6060.399614,2.939505e-12,5e-05,2.389651e-06,2.389651e-06,3.403684e-05,Whole_blood,chr22_46290431_C_G


In [6]:
# Combine both modalities into one (phenotype, tissue, clumped variant) table and attach the gene ID.
clumped_columns = ['phenotype_id', 'tissue', 'clumped_variant_id']
trans_clumped = pd.concat([
    trans_eqtl_clumped[clumped_columns].assign(modality='trans-eQTL'),
    trans_sqtl_clumped[clumped_columns].assign(modality='trans-sQTL'),
])
trans_clumped['gene_id'] = trans_clumped['phenotype_id'].map(phenotype_id_to_gene_id)
trans_clumped.head()

Unnamed: 0,phenotype_id,tissue,clumped_variant_id,modality,gene_id
0,ENSG00000000938.13,Whole_blood,chr7_50342615_A_G,trans-eQTL,ENSG00000000938.13
1,ENSG00000002330.13,Whole_blood,chr1_156344836_A_G,trans-eQTL,ENSG00000002330.13
2,ENSG00000004059.11,Whole_blood,chr6_144036619_C_A,trans-eQTL,ENSG00000004059.11
3,ENSG00000004478.8,Whole_blood,chr6_122440739_T_C,trans-eQTL,ENSG00000004478.8
4,ENSG00000004799.8,Whole_blood,chr22_46290431_C_G,trans-eQTL,ENSG00000004799.8


In [7]:
# Enrichment per clumped trans-variant. Multiple testing is handled with a Bonferroni
# correction over the GO:BP / KEGG terms that pass the term-size filter; the adjusted
# value is stored in the 'qval' column.
results = []
background_ids_by_group = {}  # (tissue, modality) -> background gene IDs with the version suffix removed
for (tissue, modality, clumped_variant_id), df in trans_clumped.groupby(['tissue', 'modality', 'clumped_variant_id']):
    genes = df['gene_id'].to_list()
    # Only Whole_blood variants with at least 10 gene_id entries are tested.
    if len(genes) < 10 or tissue != 'Whole_blood':
        continue
    print(tissue, modality, clumped_variant_id)
    group_key = (tissue, modality)
    if group_key not in background_ids_by_group:
        # The background depends only on tissue and modality, so build it once per
        # pair instead of once per variant.
        background = full_trans_permutations[(full_trans_permutations.tissue==tissue) & (full_trans_permutations.modality==modality)].phenotype_id.map(phenotype_id_to_gene_id).to_list()
        background_ids_by_group[group_key] = [i.split('.')[0] for i in background]
    go = genrich([i.split('.')[0] for i in genes], background_ids_by_group[group_key])
    # Keep GO:BP / KEGG terms with 5 to 1000 genes. Take a copy so the column
    # assignment below writes to an independent frame rather than a filtered slice.
    go = go[go.source.isin(['GO:BP', 'KEGG']) & (go.term_size>=5) & (go.term_size<=1000)].copy()
    go['qval'] = multipletests(go.nominal_p, method='bonferroni')[1]
    go = go.drop(columns=['-log10(q)', 'significant', 'p_value'])
    results.append(go.assign(tissue=tissue, modality=modality, clumped_variant_id=clumped_variant_id))
results = pd.concat(results)
results.head()

Whole_blood trans-eQTL chr10_63278270_G_C
Whole_blood trans-eQTL chr12_54292096_C_T
Whole_blood trans-eQTL chr14_35132881_T_G
Whole_blood trans-eQTL chr17_16268373_CTT_C
Whole_blood trans-eQTL chr17_28862756_A_G
Whole_blood trans-eQTL chr17_35548243_T_A
Whole_blood trans-eQTL chr17_64066984_C_A
Whole_blood trans-eQTL chr1_156344836_A_G
Whole_blood trans-eQTL chr1_158840522_C_T
Whole_blood trans-eQTL chr1_159204893_T_C
Whole_blood trans-eQTL chr2_159571135_C_T
Whole_blood trans-eQTL chr3_101391301_T_A
Whole_blood trans-eQTL chr3_56815721_T_C
Whole_blood trans-eQTL chr5_173778315_G_A
Whole_blood trans-eQTL chr6_139520516_T_C
Whole_blood trans-eQTL chr6_144036619_C_A
Whole_blood trans-eQTL chr6_154833905_C_T
Whole_blood trans-eQTL chr7_28684757_G_T
Whole_blood trans-eQTL chr7_50255709_CT_C
Whole_blood trans-eQTL chr7_50336475_T_C
Whole_blood trans-eQTL chr7_50337162_A_G
Whole_blood trans-eQTL chr7_50342615_A_G
Whole_blood trans-eQTL chr7_50360284_G_A
Whole_blood trans-eQTL chr8_60733792_C

Unnamed: 0,source,native,name,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,nominal_p,obs_exp,log2_enrich,-log10(p),qval,tissue,modality,clumped_variant_id
3,KEGG,KEGG:05418,Fluid shear stress and atherosclerosis,Fluid shear stress and atherosclerosis,102,10,2,14035,0.2,0.019608,query_1,[KEGG:00000],0.002266,27.519608,4.782388,2.644773,1.0,Whole_blood,trans-eQTL,chr10_63278270_G_C
14,KEGG,KEGG:04510,Focal adhesion,Focal adhesion,144,10,2,14035,0.2,0.013889,query_1,[KEGG:00000],0.004457,19.493056,4.284888,2.350934,1.0,Whole_blood,trans-eQTL,chr10_63278270_G_C
2738,KEGG,KEGG:04015,Rap1 signaling pathway,Rap1 signaling pathway,149,10,1,14035,0.1,0.006711,query_1,[KEGG:00000],0.101263,9.419463,3.235645,0.994548,1.0,Whole_blood,trans-eQTL,chr10_63278270_G_C
2740,GO:BP,GO:2000353,positive regulation of endothelial cell apopto...,"""Any process that activates or increases the f...",18,10,1,14035,0.1,0.055556,query_1,"[GO:0072577, GO:1904037, GO:2000351]",0.012755,77.972222,6.284888,1.894307,1.0,Whole_blood,trans-eQTL,chr10_63278270_G_C
2741,GO:BP,GO:2000351,regulation of endothelial cell apoptotic process,"""Any process that modulates the frequency, rat...",38,10,1,14035,0.1,0.026316,query_1,"[GO:0072577, GO:1904035]",0.026756,36.934211,5.206886,1.572577,1.0,Whole_blood,trans-eQTL,chr10_63278270_G_C


In [8]:
# Restrict the enrichment table to Whole_blood trans-eQTL rows.
is_whole_blood_trans_eqtl = results['tissue'].eq('Whole_blood') & results['modality'].eq('trans-eQTL')
results = results.loc[is_whole_blood_trans_eqtl]

In [9]:
# Count the variants with at least one term at qval <= 0.05, out of all tested variants.
n_groups_with_hit = results.loc[results['qval'] <= 0.05, 'clumped_variant_id'].nunique()
n_groups_tested = results['clumped_variant_id'].nunique()
print('{} of {} such groups showed significant GO:BP or KEGG pathway enrichments'.format(n_groups_with_hit, n_groups_tested))

14 of 25 such groups showed significant GO:BP or KEGG pathway enrichments


In [10]:
# Write the terms with qval <= 0.05 to the results table; 'qval' holds Bonferroni-adjusted
# p-values, so the column is renamed on export.
# (A stricter, now-disabled variant also required Whole_blood, term_size <= 1000 and
# intersection_size >= max(query_size / 10, 3).)
significant_results = results.loc[results['qval'] <= 0.05]
significant_results.rename(columns={'qval': 'bonferroni_p'}).to_csv(f'tables/{PREFIX}results.tsv', sep='\t', index=False)

In [11]:
# Number of terms with qval <= 0.05 per tissue and clumped variant.
results.loc[results['qval'] <= 0.05].groupby(['tissue', 'clumped_variant_id']).size()

tissue       clumped_variant_id  
Whole_blood  chr12_54292096_C_T       1
             chr14_35132881_T_G       1
             chr17_16268373_CTT_C    34
             chr17_35548243_T_A       5
             chr17_64066984_C_A       2
             chr1_156344836_A_G       3
             chr1_158840522_C_T       5
             chr3_101391301_T_A       1
             chr3_56815721_T_C       19
             chr6_139520516_T_C       1
             chr6_144036619_C_A       2
             chr7_50255709_CT_C       3
             chr7_50342615_A_G        1
             chr9_96430637_A_C        3
dtype: int64

In [12]:
# Terms with qval <= 0.05 for the chr17_64066984_C_A signal.
results.loc[results['qval'].le(0.05) & results['clumped_variant_id'].eq('chr17_64066984_C_A')]

Unnamed: 0,source,native,name,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,nominal_p,obs_exp,log2_enrich,-log10(p),qval,tissue,modality,clumped_variant_id
0,KEGG,KEGG:04141,Protein processing in endoplasmic reticulum,Protein processing in endoplasmic reticulum,138,13,5,14035,0.384615,0.036232,query_1,[KEGG:00000],1.032438e-07,39.116499,5.289705,6.986136,6.7e-05,Whole_blood,trans-eQTL,chr17_64066984_C_A
6,GO:BP,GO:0034976,response to endoplasmic reticulum stress,"""Any process that results in a change in state...",181,13,5,14035,0.384615,0.027624,query_1,[GO:0033554],3.995242e-07,29.823629,4.898384,6.398457,0.00026,Whole_blood,trans-eQTL,chr17_64066984_C_A


In [13]:
# First KEGG term with qval <= 0.05 for chr17_64066984_C_A; used for the manuscript sentence below.
is_ern1_kegg_hit = (
    results['qval'].le(0.05)
    & results['clumped_variant_id'].eq('chr17_64066984_C_A')
    & results['source'].eq('KEGG')
)
tmp = results.loc[is_ern1_kegg_hit].iloc[0]

In [14]:
# The enrichment query size must equal the number of distinct phenotypes clumped to this variant.
n_clumped_phenotypes = trans_eqtl_clumped.loc[
    trans_eqtl_clumped['clumped_variant_id'] == 'chr17_64066984_C_A', 'phenotype_id'
].nunique()
assert tmp.query_size == n_clumped_phenotypes

In [15]:
# Fill the manuscript sentence with the numbers from the selected KEGG term.
manuscript_sentence_template = 'For example, one cis-eVariant (chr17_64066984_C_A) for ERN1, which encodes endonuclease IRE1a, was a trans-eVariant for {} trans-eGenes. ERN1 is a regulator of the endoplasmic reticulum (ER) stress response (Shen et al., 2001; Yoshida et al., 2001), and {} of the trans-eGenes were in the "{}" KEGG pathway ({}-fold enrichment; nominal p={})'
manuscript_sentence_template.format(
    tmp.query_size,
    tmp.intersection_size,
    tmp['name'],
    round(tmp.obs_exp, 1),
    tmp.nominal_p,
)

'For example, one cis-eVariant (chr17_64066984_C_A) for ERN1, which encodes endonuclease IRE1a, was a trans-eVariant for 13 trans-eGenes. ERN1 is a regulator of the endoplasmic reticulum (ER) stress response (Shen et al., 2001; Yoshida et al., 2001), and 5 of the trans-eGenes were in the "Protein processing in endoplasmic reticulum" KEGG pathway (39.1-fold enrichment; nominal p=1.0324375176906071e-07)'

In [16]:
# All tested terms (significant or not) for the chr3_56815721_T_C signal.
results.loc[results['clumped_variant_id'] == 'chr3_56815721_T_C']

Unnamed: 0,source,native,name,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,nominal_p,obs_exp,log2_enrich,-log10(p),qval,tissue,modality,clumped_variant_id
1,KEGG,KEGG:04613,Neutrophil extracellular trap formation,Neutrophil extracellular trap formation,126,81,13,14035,0.160494,0.103175,query_1,[KEGG:00000],2.935704e-13,17.877229,4.160051,12.532288,4.879140e-10,Whole_blood,trans-eQTL,chr3_56815721_T_C
4,KEGG,KEGG:05034,Alcoholism,Alcoholism,117,81,11,14035,0.135802,0.094017,query_1,[KEGG:00000],6.223410e-11,16.290493,4.025958,10.205972,1.034331e-07,Whole_blood,trans-eQTL,chr3_56815721_T_C
17,KEGG,KEGG:05322,Systemic lupus erythematosus,Systemic lupus erythematosus,80,81,9,14035,0.111111,0.112500,query_1,[KEGG:00000],7.496328e-10,19.493056,4.284888,9.125151,1.245890e-06,Whole_blood,trans-eQTL,chr3_56815721_T_C
32,GO:BP,GO:0030168,platelet activation,"""A series of progressive, overlapping events t...",105,81,11,14035,0.135802,0.104762,query_1,"[GO:0001775, GO:0007596]",1.888524e-11,18.152263,4.182078,10.723877,3.138727e-08,Whole_blood,trans-eQTL,chr3_56815721_T_C
49,GO:BP,GO:0007596,blood coagulation,"""The sequential process in which the multiple ...",165,81,12,14035,0.148148,0.072727,query_1,"[GO:0007599, GO:0042060, GO:0050817]",1.635314e-10,12.601571,3.655532,9.786399,2.717892e-07,Whole_blood,trans-eQTL,chr3_56815721_T_C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9114,GO:BP,GO:0014812,muscle cell migration,"""The orderly movement of a muscle cell from on...",60,81,1,14035,0.012346,0.016667,query_1,[GO:0016477],2.939113e-01,2.887860,1.530001,0.531784,1.000000e+00,Whole_blood,trans-eQTL,chr3_56815721_T_C
9115,GO:BP,GO:0014823,response to activity,"""Any process that results in a change in state...",45,81,2,14035,0.024691,0.044444,query_1,[GO:0050896],2.774748e-02,7.700960,2.945038,1.556776,1.000000e+00,Whole_blood,trans-eQTL,chr3_56815721_T_C
9116,GO:BP,GO:0014831,gastro-intestinal system smooth muscle contrac...,"""A process in which force is generated within ...",7,81,1,14035,0.012346,0.142857,query_1,[GO:0006939],3.971457e-02,24.753086,4.629537,1.401050,1.000000e+00,Whole_blood,trans-eQTL,chr3_56815721_T_C
9117,GO:BP,GO:0014896,muscle hypertrophy,"""The muscle system process that results in enl...",53,81,1,14035,0.012346,0.018868,query_1,[GO:0003012],2.645962e-01,3.269276,1.708971,0.577416,1.000000e+00,Whole_blood,trans-eQTL,chr3_56815721_T_C
