In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import re
import glob
import os
import statsmodels.api as sm

# Common prefix of every table this notebook writes under tables/.
PREFIX = 'calculate-functional-enrichments-set-level-logistic-regression.'


# Tissue label -> suffix of the 'roadmap_<ID>' annotation matrix tested for it.
topmed_to_roadmap = dict(
    Whole_blood='E062',
    PBMC='E062',
    Lung='E096',
    Monocyte='E029',
    T_cell='E034',
    Nasal_epithelial='E114',
)

# Raw Roadmap chromatin-state column name -> readable label used in the
# output tables (labels are applied when the annotation matrices are loaded).
roadmap_state_names = dict([
    ('1_TssA', 'Active_TSS'),
    ('2_TssAFlnk', 'Flanking Active TSS'),
    ('3_TxFlnk', "Transcr. at gene 5' and 3'"),
    ('4_Tx', 'Strong transcription'),
    ('5_TxWk', 'Weak transcription'),
    ('6_EnhG', 'Genic enhancers'),
    ('7_Enh', 'Enhancers'),
    ('8_ZNF/Rpts', 'ZNF genes & repeats'),
    ('9_Het', 'Heterochromatin'),
    ('10_TssBiv', 'Bivalent/Poised TSS'),
    ('11_BivFlnk', 'Flanking Bivalent TSS/Enh'),
    ('12_EnhBiv', 'Bivalent Enhancer'),
    ('13_ReprPC', 'Repressed PolyComb'),
    ('14_ReprPCWk', 'Weak Repressed PolyComb'),
    ('15_Quies', 'Quiescent/Low'),
])

def format_modality(modality):
    """Normalise a free-form modality label to ``<cis|trans>-<eQTL|sQTL>``.

    Matching is a case-insensitive substring test. When both members of a
    pair occur, 'trans' wins over 'cis' and 'eqtl' wins over 'sqtl'.
    An AssertionError is raised if neither member of a pair is present.
    """
    lowered = modality.lower()
    is_trans = 'trans' in lowered
    is_eqtl = 'eqtl' in lowered
    assert(is_trans or 'cis' in lowered)
    assert(is_eqtl or 'sqtl' in lowered)
    distance = 'trans' if is_trans else 'cis'
    qtl_type = 'eQTL' if is_eqtl else 'sQTL'
    return '{}-{}'.format(distance, qtl_type)


def parse_modality(s):
    """Return the first cis/trans e/sQTL token in ``s``, exactly as written.

    The search ignores case and allows an optional hyphen between the two
    parts. An AssertionError is raised when ``s`` contains no such token.
    """
    pattern = re.compile('((cis|trans)-?([es]qtl))', re.IGNORECASE)
    found = pattern.search(s)
    assert(found is not None)
    return found.group(0)


def parse_tissue(s):
    """Return the first known tissue label that occurs in ``s``.

    An AssertionError is raised when none of the labels is found.
    """
    known_tissues = ('Whole_blood', 'Lung', 'Nasal_epithelial', 'PBMC', 'Monocyte', 'T_cell')
    pattern = '(' + '|'.join(known_tissues) + ')'
    found = re.search(pattern, s)
    assert(found is not None)
    return found.group(0)


def parse_maf(s):
    """Return the MAF-threshold token (maf001, maf0001 or maf005) found in ``s``.

    An AssertionError is raised when ``s`` contains none of the tokens.
    """
    known_thresholds = ('maf001', 'maf0001', 'maf005')
    pattern = '(' + '|'.join(known_thresholds) + ')'
    found = re.search(pattern, s)
    assert(found is not None)
    return found.group(0)


def format_maf(s):
    """Translate a MAF-threshold token into its percentage label.

    maf001 -> '1%', maf0001 -> '0.1%', maf005 -> '5%'. Any other value
    raises ValueError.
    """
    labels = dict(maf001='1%', maf0001='0.1%', maf005='5%')
    if s in labels:
        return labels[s]
    raise ValueError("Cannot format MAF: {}".format(s))


def variant_annotation_to_credible_set_annotation(credible_sets, annotation_table):
    """Collapse per-variant annotations to one row per credible set.

    Parameters
    ----------
    credible_sets : pandas.DataFrame
        Must contain the columns ``phenotype_id``, ``variant_id`` and ``cs_id``.
    annotation_table : pandas.DataFrame
        Indexed by variant ID, one column per annotation. Every variant in
        ``credible_sets`` must be present in the index; ``.loc`` raises
        KeyError otherwise.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``phenotype_id``, ``cs_id``); each annotation column holds
        the maximum over the credible set's variants (for 0/1 indicator
        columns: 1 if any member variant carries the annotation).
    """
    variant_ids = credible_sets.variant_id.unique()
    # Name the index explicitly so the merge key exists regardless of what the
    # annotation table's index was called (unnamed, 'variant_id', or other).
    per_variant = annotation_table.loc[variant_ids].rename_axis('variant_id').reset_index()
    per_set_member = (
        credible_sets[['phenotype_id', 'variant_id', 'cs_id']]
        .merge(per_variant, on='variant_id')
        .drop(columns=['variant_id'])
    )
    return per_set_member.groupby(['phenotype_id', 'cs_id']).max()


def logistic_regression(credible_sets, control_credible_sets, annotation_table, min_overlap_fraction=0.01):
    """Fit one multivariable logit of hit-vs-control status on annotations.

    Each credible set (hit, observed=1) and control credible set (observed=0)
    is one observation; predictors are the credible-set-level annotation
    columns produced by ``variant_annotation_to_credible_set_annotation``.

    Parameters
    ----------
    credible_sets, control_credible_sets : pandas.DataFrame
        Need the columns ``phenotype_id``, ``variant_id`` and ``cs_id``.
    annotation_table : pandas.DataFrame
        Variant-indexed annotation matrix.
    min_overlap_fraction : float, default 0.01
        An annotation is dropped when its column sum over hits + controls is
        below this fraction of the number of observations.

    Returns
    -------
    pandas.DataFrame
        Coefficient table from ``result.summary2().tables[1]`` without the
        'const' row, plus ``converged`` and the hit/control counts in and
        outside each annotation.
    """
    x_hit = variant_annotation_to_credible_set_annotation(credible_sets, annotation_table)
    x_control = variant_annotation_to_credible_set_annotation(control_credible_sets, annotation_table)
    x = pd.concat([x_hit.assign(observed=1), x_control.assign(observed=0)]).reset_index(drop=True)
    y = x.pop('observed')

    # drop annotations w/ no variance
    n_distinct = x.nunique()
    drop_annotations = set(n_distinct[n_distinct == 1].index)

    # drop annotations w/ few overlaps (eliminates some singular matrix issues);
    # by default at least 1% of true + control credible sets must overlap the
    # annotation (an absolute cutoff of 5 was used previously)
    threshold = len(x) * min_overlap_fraction
    annotation_totals = x.sum()
    drop_annotations |= set(annotation_totals[annotation_totals < threshold].index)

    x = x.drop(columns=list(drop_annotations))

    logit_model = sm.Logit(y, sm.add_constant(x))
    result = logit_model.fit(disp=0)
    converged = result.mle_retvals['converged']
    assert(all(result.params.index == result.pvalues.index))

    summary = result.summary2().tables[1]
    summary['converged'] = converged
    # .copy() so the count columns below are written to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning)
    summary = summary[summary.index != 'const'].copy()

    # counts are taken from the per-group matrices, restricted to the
    # annotations that survived filtering (summary.index)
    hits_in_annotation = x_hit.sum()
    hits_not_in_annotation = (x_hit == 0).sum()
    controls_in_annotation = x_control.sum()
    controls_not_in_annotation = (x_control == 0).sum()
    summary['n_hits_in_annotation'] = hits_in_annotation.loc[summary.index].to_list()
    summary['n_hits_not_in_annotation'] = hits_not_in_annotation.loc[summary.index].to_list()
    summary['n_controls_in_annotation'] = controls_in_annotation.loc[summary.index].to_list()
    summary['n_controls_not_in_annotation'] = controls_not_in_annotation.loc[summary.index].to_list()

    return summary

In [2]:
# Load every variant-by-annotation matrix, keyed by the second dot-separated
# field of its file name (annotations.<name>.txt).
ANNOTATION_FILES = glob.glob('../work/variant-annotation-matrices/annotations.*.txt')
annotations = {}
for annotation_path in ANNOTATION_FILES:
    annotation_name = os.path.basename(annotation_path).split('.')[1]
    annotation_matrix = pd.read_csv(annotation_path, sep='\t', index_col=0)
    if 'roadmap' in annotation_name:
        # rename Roadmap states
        annotation_matrix = annotation_matrix.rename(columns=roadmap_state_names)
    annotations[annotation_name] = annotation_matrix

In [3]:
# del annotations['regulatory_build']

In [4]:
# Load the matched control credible sets of the cis scans. Tissue, MAF
# threshold and modality are encoded in each file path, so they are parsed
# once per file (not once per row) and attached as constant columns.
control_credible_set_files = glob.glob('../work/control-credible-sets/cis-*/maf*/results/controls/*.control-credible-sets.tsv')
control_frames = []
for control_path in control_credible_set_files:
    control_frames.append(
        pd.read_csv(control_path, sep='\t').assign(
            tissue=os.path.basename(control_path).split('.')[0],
            # raw strings: '\d' inside a plain literal is an invalid escape sequence.
            # format_maf raises on an unknown threshold instead of yielding NaN.
            maf_threshold=format_maf(re.search(r'(maf\d+)', control_path).group(0)),
            modality=format_modality(re.search(r'(cis-[a-z]+)', control_path).group(0)),
        )
    )
control_credible_sets = pd.concat(control_frames)
control_credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,maf,cs_id,tissue,maf_threshold,modality
0,ENSG00000168350.8,chr14_69324247_T_G,1.0,0.3215,2,Lung,1%,cis-eQTL
1,ENSG00000100711.13,chr14_76492603_C_G,1.0,0.05732,1,Lung,1%,cis-eQTL
2,ENSG00000258913.1,chr14_104810307_C_T,1.0,0.03292,1,Lung,1%,cis-eQTL
3,ENSG00000179627.10,chr14_20870069_A_G,1.0,0.08056,1,Lung,1%,cis-eQTL
4,ENSG00000211923.1,chr14_105495192_G_A,1.0,0.08211,1,Lung,1%,cis-eQTL


In [5]:
# Collect the SuSiE credible-set files: cis-eQTL files sit directly in each
# maf* directory, cis-sQTL (by-gene) files in maf*/postprocessed.
cis_eqtl_cs_files = glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-eqtl/susie/maf*/*.cs.txt')
cis_sqtl_cs_files = glob.glob('../../manuscript-intermediate-processing/data/scan-results/joint/cis-sqtl/susie/maf*/postprocessed/*.by-gene.cs.txt')
# One row per file, annotated with the metadata parsed from its path.
to_load = pd.DataFrame({'file': cis_eqtl_cs_files + cis_sqtl_cs_files}).assign(
    tissue=lambda files: files.file.apply(parse_tissue),
    modality=lambda files: files.file.apply(parse_modality).map(format_modality),
    maf=lambda files: files.file.apply(parse_maf).map(format_maf),
)

In [6]:
# Read every credible-set file and tag its rows with the tissue, modality and
# MAF threshold parsed from the file path.
credible_set_frames = []
for cs_path, cs_tissue, cs_modality, cs_maf in zip(to_load.file, to_load.tissue, to_load.modality, to_load.maf):
    cs_frame = pd.read_csv(cs_path, sep='\t')
    credible_set_frames.append(cs_frame.assign(tissue=cs_tissue, modality=cs_modality, maf_threshold=cs_maf))
credible_sets = pd.concat(credible_set_frames)
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%


In [7]:
# Flag credible sets with (1) or without (0) a matching control credible set;
# the unmatched ones are removed a few cells further down.
control_keys = (
    control_credible_sets[['phenotype_id', 'cs_id', 'tissue', 'modality', 'maf_threshold']]
    .assign(has_control=1)
    .drop_duplicates()
)
credible_sets = credible_sets.merge(control_keys, how='left')
# rows without a match come back as NaN from the left join
credible_sets['has_control'] = credible_sets['has_control'].fillna(0).astype(int)
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold,has_control
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,1
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,1
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,1
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,1
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,1


In [8]:
# Number of distinct credible sets with (1) and without (0) a matched control.
credible_set_flags = credible_sets[['phenotype_id', 'cs_id', 'tissue', 'modality', 'maf_threshold', 'has_control']].drop_duplicates()
credible_set_flags['has_control'].value_counts()

1    364693
0       476
Name: has_control, dtype: int64

In [9]:
# Keep only credible sets that have a matched control; the helper flag is
# dropped once it has served as the filter.
is_matched = credible_sets['has_control'] == 1
credible_sets = credible_sets.loc[is_matched].drop(columns=['has_control'])
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%


In [10]:
# add trans-e/sQTL top hits, reshaping to resemble credible sets
input_variant_files = glob.glob('../work/control-snps/trans-*qtl/maf005/results/controls/*.controls.txt')
# Raw string so '\d' reaches the regex engine untouched; literal dots escaped.
# Groups: (1) modality directory, (2) MAF token, (3) tissue.
RE = re.compile(r'.*/work/control-snps/(.*)/(maf\d+)/results/controls/(.*)\.controls\.txt')
input_variant_frames = []
for input_variant_file in input_variant_files:
    # match each path once instead of once per extracted field
    path_modality, path_maf, path_tissue = RE.match(input_variant_file).groups()
    input_variant_frames.append(
        pd.read_csv(input_variant_file, sep='\t').assign(
            modality=format_modality(path_modality),
            tissue=path_tissue,
            # format_maf raises on an unknown threshold instead of yielding NaN
            maf=format_maf(path_maf),
        )
    )
input_variants = pd.concat(input_variant_frames)
# keep the first row per (variant, modality, tissue, maf); .copy() so the
# columns added below are written to an independent frame
input_variants = input_variants.groupby(['variant', 'modality', 'tissue', 'maf']).head(1).copy()

# each top hit becomes a single-variant pseudo credible set with a synthetic
# phenotype ID, cs_id 1 and PIP 1
input_variants['phenotype_id'] = [f'fake_{i}' for i in range(1, len(input_variants)+ 1)]
input_variants['cs_id'] = 1
input_variants['pip'] = 1.0
trans_credible_sets = (
    input_variants[['phenotype_id', 'variant', 'pip', 'variant_maf', 'cs_id', 'tissue', 'modality', 'maf']]
    .rename(columns={'variant': 'variant_id', 'variant_maf': 'af', 'maf': 'maf_threshold'})
)
# the control table names its frequency column 'maf' (not 'af')
trans_control_credible_sets = (
    input_variants[['phenotype_id', 'control_variant', 'pip', 'control_variant_maf', 'cs_id', 'tissue', 'modality', 'maf']]
    .rename(columns={'control_variant': 'variant_id', 'control_variant_maf': 'maf', 'maf': 'maf_threshold'})
)

In [11]:
# Preview the trans top-hit table, including the synthetic phenotype_id / cs_id / pip columns.
input_variants.head()

Unnamed: 0,variant,control_variant,variant_maf,control_variant_maf,modality,tissue,maf,phenotype_id,cs_id,pip
0,chr19_37475759_A_G,chr19_40989528_A_G,0.09328,0.09209,trans-eQTL,PBMC,5%,fake_1,1,1.0
5,chr19_16325451_T_C,chr19_43696515_C_A,0.1798,0.1826,trans-eQTL,PBMC,5%,fake_2,1,1.0
10,chr19_36330296_T_G,chr19_2894839_C_T,0.1846,0.1818,trans-eQTL,PBMC,5%,fake_3,1,1.0
15,chr19_6077048_A_G,chr19_28418435_C_A,0.2549,0.2549,trans-eQTL,PBMC,5%,fake_4,1,1.0
20,chr19_12477305_T_C,chr19_20056962_A_G,0.2664,0.2676,trans-eQTL,PBMC,5%,fake_5,1,1.0


In [12]:
# Append the trans pseudo credible sets (and their controls) to the cis tables.
# NOTE: not idempotent - re-running this cell appends the trans rows again.
credible_sets = pd.concat(objs=[credible_sets, trans_credible_sets], axis=0)
control_credible_sets = pd.concat(objs=[control_credible_sets, trans_control_credible_sets], axis=0)

In [13]:
# Preview the credible-set table after the trans rows were appended.
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%


In [14]:
results = []

# One model per (tissue, modality, MAF threshold) and annotation source.
for (tissue, modality, maf_threshold), df in credible_sets.groupby(['tissue', 'modality', 'maf_threshold']):
    # matrix singularity / convergence issues likely for trans groups other
    # than Whole_blood trans-eQTL, so those are not fitted
    skip_group = ('trans' in modality and tissue != 'Whole_blood') or modality == 'trans-sQTL'
    if not skip_group:
        in_same_group = (
            control_credible_sets.tissue.eq(tissue)
            & control_credible_sets.modality.eq(modality)
            & control_credible_sets.maf_threshold.eq(maf_threshold)
        )
        control_df = control_credible_sets[in_same_group]
        annotations_to_test = ['snpeff', 'regulatory_build', f'roadmap_{topmed_to_roadmap[tissue]}']
        for a in annotations_to_test:
            lr = logistic_regression(df, control_df, annotations[a])
            if not lr.converged.values[0]:
                print(tissue, modality, maf_threshold, a, 'converged: ', lr.converged.values[0])
            results.append(lr.assign(tissue=tissue, modality=modality, maf_threshold=maf_threshold, annotation=a))
results = pd.concat(results)
results.head()

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],converged,n_hits_in_annotation,n_hits_not_in_annotation,n_controls_in_annotation,n_controls_not_in_annotation,tissue,modality,maf_threshold,annotation
sequence_feature,-0.078589,0.048185,-1.630979,0.1028947,-0.17303,0.015852,True,1335,23744,838,24241,Lung,cis-eQTL,1%,snpeff
5_prime_UTR_variant,0.648394,0.039557,16.39157,2.196965e-60,0.570864,0.725923,True,3798,21281,1315,23764,Lung,cis-eQTL,1%,snpeff
3_prime_UTR_variant,0.326662,0.028733,11.369052,5.963155e-30,0.270347,0.382976,True,6034,19045,2990,22089,Lung,cis-eQTL,1%,snpeff
intron_variant,0.16629,0.027745,5.993512,2.053564e-09,0.111911,0.22067,True,20532,4547,17018,8061,Lung,cis-eQTL,1%,snpeff
downstream_gene_variant,0.272838,0.022563,12.092274,1.160295e-33,0.228615,0.31706,True,16615,8464,11696,13383,Lung,cis-eQTL,1%,snpeff


In [15]:
# Write the enrichment table; the index (annotation name) is kept as the first column.
enrichments_path = f'tables/{PREFIX}enrichments-vs-control-variants.tsv'
results.to_csv(enrichments_path, sep='\t', index=True)

In [16]:
# enrichments for cis-eQTL that are cis-sQTL or are not
cis_sharing = (
    pd.read_csv('tables/summarize-cis-eqtl-coloc-with-cis-sqtl.cis-eqtl-and-sqtl-overlap.tsv', sep='\t')
    .rename(columns={'coloced_with_other_modality_signal': 'colocs_with_other_modality'})
)
# losing trans cs here -- but that's fine as we don't use them anymore
# (the two printed counts show which modalities survive the inner join)
print(credible_sets['modality'].value_counts())
cis_sharing_by_threshold = cis_sharing.rename(columns={'maf': 'maf_threshold'})
credible_sets = credible_sets.merge(cis_sharing_by_threshold)
print(credible_sets['modality'].value_counts())

  cis_sharing = pd.read_csv('tables/summarize-cis-eqtl-coloc-with-cis-sqtl.cis-eqtl-and-sqtl-overlap.tsv', sep='\t')


cis-eQTL      2388661
cis-sQTL      1621388
trans-eQTL        700
trans-sQTL         67
Name: modality, dtype: int64
cis-eQTL    2388661
cis-sQTL    1621388
Name: modality, dtype: int64


In [17]:
# Preview the credible sets with the colocalisation columns merged in.
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold,colocing_phenotypes,colocs_with_other_modality,coloced_with_other_modality_signal_from_same_gene
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True


In [18]:
results = []

# Whole_blood only: separate models for credible sets that do / do not
# colocalise with a signal of the other modality.
coloc_group_columns = ['tissue', 'modality', 'maf_threshold', 'colocs_with_other_modality']
for (tissue, modality, maf_threshold, colocs_with_other_modality), df in credible_sets.groupby(coloc_group_columns):
    if tissue == 'Whole_blood':
        # restrict the controls to the credible sets of this group (inner
        # join on the shared key columns)
        group_keys = df[['phenotype_id', 'cs_id'] + coloc_group_columns].drop_duplicates()
        control_df = control_credible_sets.merge(group_keys)
        annotations_to_test = ['snpeff', 'regulatory_build', f'roadmap_{topmed_to_roadmap[tissue]}']
        for a in annotations_to_test:
            lr = logistic_regression(df, control_df, annotations[a])
            if not lr.converged.values[0]:
                print(tissue, modality, maf_threshold, colocs_with_other_modality, a, 'converged: ', lr.converged.values[0])
            results.append(lr.assign(tissue=tissue, modality=modality, maf_threshold=maf_threshold, colocs_with_other_modality=colocs_with_other_modality, annotation=a))
results = pd.concat(results)
results.head()

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],converged,n_hits_in_annotation,n_hits_not_in_annotation,n_controls_in_annotation,n_controls_not_in_annotation,tissue,modality,maf_threshold,colocs_with_other_modality,annotation
sequence_feature,-0.102893,0.034483,-2.983861,0.00284636,-0.170479,-0.035307,True,2414,74107,1682,74839,Whole_blood,cis-eQTL,0.1%,False,snpeff
5_prime_UTR_variant,0.891838,0.02804,31.805785,5.3843170000000003e-222,0.836881,0.946796,True,8573,67948,2266,74255,Whole_blood,cis-eQTL,0.1%,False,snpeff
3_prime_UTR_variant,0.317298,0.018947,16.746864,5.969097e-63,0.280163,0.354433,True,12203,64318,5949,70572,Whole_blood,cis-eQTL,0.1%,False,snpeff
intron_variant,-0.115431,0.015474,-7.459702,8.671868e-14,-0.145759,-0.085102,True,57413,19108,49873,26648,Whole_blood,cis-eQTL,0.1%,False,snpeff
downstream_gene_variant,0.312512,0.01216,25.699438,1.185919e-145,0.288679,0.336346,True,43676,32845,30687,45834,Whole_blood,cis-eQTL,0.1%,False,snpeff


In [19]:
# Write the colocalisation-stratified enrichments; the index (annotation name) is kept.
cis_sharing_path = f'tables/{PREFIX}cis-sharing.tsv'
results.to_csv(cis_sharing_path, sep='\t', index=True)

In [20]:
# compare Lung CS that are in blood or aren't (and same for nasal epithelial)

blood_as_proxy_path = 'tables/blood-as-proxy.credible_set_in_blood.tsv'
blood_as_proxy = pd.read_csv(blood_as_proxy_path, sep='\t')
blood_as_proxy.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,in_whole_blood
0,ENSG00000000003.14,chrX_100649875_A_G,1.0,0.518977,1,Lung,cis-eQTL,0
1,ENSG00000000457.14,chr1_169788346_A_C,0.852837,0.508908,1,Lung,cis-eQTL,1
2,ENSG00000000460.17,chr1_169757980_T_A,0.009195,0.091402,1,Lung,cis-eQTL,1
3,ENSG00000000460.17,chr1_169796758_G_A,0.057029,0.300542,2,Lung,cis-eQTL,0
4,ENSG00000000971.15,chr1_196391465_A_T,0.405031,0.560418,1,Lung,cis-eQTL,0


In [21]:
# blood_as_proxy.groupby(['tissue', 'modality', 'in_whole_blood']).size()

In [22]:
# blood_as_proxy.loc[blood_as_proxy.variant_id=='chr1_196891605_G_A',['phenotype_id', 'variant_id', 'tissue', 'modality']].values

In [23]:
# credible_sets.loc[credible_sets.variant_id=='chr1_196891605_G_A',['phenotype_id', 'variant_id', 'tissue', 'modality']].values

In [24]:
# NOTE: the cluster IDs differ between the two tables (see the prints below).
# Cause: the blood-as-proxy analysis was using EUR credible sets.
# The analyses need to be updated accordingly.
# print(credible_sets.loc[credible_sets.variant_id=='chr1_196891605_G_A',['phenotype_id', 'variant_id', 'tissue', 'modality']].values[0][0])
# print(blood_as_proxy.loc[blood_as_proxy.variant_id=='chr1_196891605_G_A',['phenotype_id', 'variant_id', 'tissue', 'modality']].values[0][0])

In [25]:
# Attach the per-credible-set in_whole_blood flag to the current credible
# sets; the inner join keeps only credible sets present in both tables.
in_whole_blood_flags = blood_as_proxy[['phenotype_id', 'cs_id', 'tissue', 'modality', 'in_whole_blood']].drop_duplicates()
blood_as_proxy = credible_sets.merge(in_whole_blood_flags)
blood_as_proxy.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold,colocing_phenotypes,colocs_with_other_modality,coloced_with_other_modality_signal_from_same_gene,in_whole_blood
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True,1
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,1
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,1
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,1
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,1


In [26]:
# blood_as_proxy.groupby(['tissue', 'modality', 'in_whole_blood']).size()

In [27]:
results = []

# Lung and Nasal_epithelial only: separate models for credible sets that are /
# are not flagged in_whole_blood.
proxy_group_columns = ['tissue', 'modality', 'maf_threshold', 'in_whole_blood']
for (tissue, modality, maf_threshold, in_whole_blood), df in blood_as_proxy.groupby(proxy_group_columns):
    if tissue in ['Lung', 'Nasal_epithelial']:
        # restrict the controls to the credible sets of this group
        group_keys = df[['phenotype_id', 'cs_id'] + proxy_group_columns].drop_duplicates()
        control_df = control_credible_sets.merge(group_keys)
        annotations_to_test = ['snpeff', 'regulatory_build', f'roadmap_{topmed_to_roadmap[tissue]}']
        # regulatory_build is not fitted in this analysis
        for a in [name for name in annotations_to_test if name != 'regulatory_build']:
            lr = logistic_regression(df, control_df, annotations[a])
            if not lr.converged.values[0]:
                print(tissue, modality, maf_threshold, in_whole_blood, a, 'converged: ', lr.converged.values[0])
            results.append(lr.assign(tissue=tissue, modality=modality, maf_threshold=maf_threshold, in_whole_blood=in_whole_blood, annotation=a))
results = pd.concat(results)
results.head()

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],converged,n_hits_in_annotation,n_hits_not_in_annotation,n_controls_in_annotation,n_controls_not_in_annotation,tissue,modality,maf_threshold,in_whole_blood,annotation
sequence_feature,0.092739,0.07401,1.25306,0.2101839,-0.052318,0.237795,True,588,10275,328,10535,Lung,cis-eQTL,1%,0,snpeff
5_prime_UTR_variant,0.461738,0.061782,7.473646,7.800276e-14,0.340647,0.582828,True,1389,9474,570,10293,Lung,cis-eQTL,1%,0,snpeff
3_prime_UTR_variant,0.364939,0.043569,8.376027,5.4744430000000007e-17,0.279544,0.450333,True,2586,8277,1279,9584,Lung,cis-eQTL,1%,0,snpeff
intron_variant,0.304533,0.042868,7.103917,1.212693e-12,0.220512,0.388553,True,8978,1885,7313,3550,Lung,cis-eQTL,1%,0,snpeff
downstream_gene_variant,0.339265,0.034059,9.960947,2.2590000000000003e-23,0.272509,0.40602,True,7217,3646,5047,5816,Lung,cis-eQTL,1%,0,snpeff


In [28]:
# Write the blood-as-proxy enrichments; the index (annotation name) is kept.
blood_as_proxy_results_path = f'tables/{PREFIX}blood-as-proxy.tsv'
results.to_csv(blood_as_proxy_results_path, sep='\t', index=True)

In [29]:
# primary vs secondary
# 'rank' is renamed to 'rnk', which does not collide with DataFrame.rank on attribute access
cs_ranks = (
    pd.read_csv('tables/rank-signals.cis-ranks.tsv', sep='\t')
    .rename(columns={'variant_id': 'variant', 'rank': 'rnk'})
)
#cs_ranks.modality = cs_ranks.modality.map({'cis-eQTL': 'cis-eqtl', 'cis-sQTL': 'cis-sqtl'})
#tissue_and_variant_to_min_rank = cs_ranks.groupby(['tissue', 'variant', 'modality', 'maf']).rnk.min().rename('min_rank').reset_index()
#tissue_and_variant_to_max_rank = cs_ranks.groupby(['tissue', 'variant', 'modality', 'maf']).rnk.max().rename('max_rank').reset_index()
#tissue_and_variant_to_rank = tissue_and_variant_to_max_rank.merge(tissue_and_variant_to_min_rank)
#tissue_and_variant_to_rank.head()
cs_ranks.head()

Unnamed: 0,tissue,maf,modality,phenotype_id,cs_id,variant,rnk
0,Whole_blood,0.1%,cis-eQTL,ENSG00000179344.16,1,chr6_32667428_C_T,1
1,Whole_blood,1%,cis-eQTL,ENSG00000179344.16,1,chr6_32667428_C_T,1
2,Whole_blood,1%,cis-eQTL,ENSG00000013573.17,1,chr12_31076435_A_C,1
3,Whole_blood,0.1%,cis-eQTL,ENSG00000013573.17,1,chr12_31076435_A_C,1
4,Whole_blood,1%,cis-eQTL,ENSG00000113504.21,1,chr5_1104823_C_T,1


In [30]:
# TODO: validate this merge
rank_lookup = cs_ranks[['tissue', 'maf', 'modality', 'phenotype_id', 'cs_id', 'rnk']].rename(columns={'maf': 'maf_threshold'})
cs_ranks = credible_sets.merge(rank_lookup)
# rank 1 is labelled primary, every other rank secondary
cs_ranks['primary_secondary'] = ['primary' if signal_rank == 1 else 'secondary' for signal_rank in cs_ranks['rnk']]
cs_ranks.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold,colocing_phenotypes,colocs_with_other_modality,coloced_with_other_modality_signal_from_same_gene,rnk,primary_secondary
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True,1,primary
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,2,secondary
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,2,secondary
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,2,secondary
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True,2,secondary


In [31]:
results = []

# Whole_blood only: separate models for primary and secondary signals.
for (tissue, modality, maf_threshold, primary_secondary), df in cs_ranks.groupby(['tissue', 'modality', 'maf_threshold', 'primary_secondary']):
    if tissue != 'Whole_blood':
        continue
    # restrict the controls to the credible sets of this group
    control_df = control_credible_sets.merge(df[['phenotype_id', 'cs_id', 'tissue', 'modality', 'maf_threshold', 'primary_secondary']].drop_duplicates())
    annotations_to_test = ['snpeff', 'regulatory_build', f'roadmap_{topmed_to_roadmap[tissue]}']
    for a in annotations_to_test:
        lr = logistic_regression(df, control_df, annotations[a])
        if not lr.converged.values[0]:
            # report this loop's own grouping variable; the message used to
            # print in_whole_blood, a stale leftover of an earlier cell's loop
            print(tissue, modality, maf_threshold, primary_secondary, a, 'converged: ', lr.converged.values[0])
        results.append(lr.assign(tissue=tissue, modality=modality, maf_threshold=maf_threshold, primary_secondary=primary_secondary, annotation=a))
results = pd.concat(results)
results.head()

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],converged,n_hits_in_annotation,n_hits_not_in_annotation,n_controls_in_annotation,n_controls_not_in_annotation,tissue,modality,maf_threshold,primary_secondary,annotation
sequence_feature,-0.007981,0.075604,-0.105564,0.9159281,-0.156163,0.140201,True,552,17842,346,18048,Whole_blood,cis-eQTL,0.1%,primary,snpeff
5_prime_UTR_variant,1.056094,0.057625,18.327005,5.039021000000001e-75,0.943151,1.169037,True,2536,15858,506,17888,Whole_blood,cis-eQTL,0.1%,primary,snpeff
3_prime_UTR_variant,0.353018,0.039567,8.922056,4.577124999999999e-19,0.275469,0.430568,True,3053,15341,1362,17032,Whole_blood,cis-eQTL,0.1%,primary,snpeff
intron_variant,-0.281058,0.032653,-8.607325,7.478543e-18,-0.345058,-0.217059,True,13589,4805,11629,6765,Whole_blood,cis-eQTL,0.1%,primary,snpeff
downstream_gene_variant,0.332567,0.025395,13.095649,3.4867910000000003e-39,0.282794,0.382341,True,10460,7934,6982,11412,Whole_blood,cis-eQTL,0.1%,primary,snpeff


In [32]:
# Write the primary/secondary enrichments; the index (annotation name) is kept.
primary_secondary_path = f'tables/{PREFIX}primary_secondary.tsv'
results.to_csv(primary_secondary_path, sep='\t', index=True)

In [33]:
# rare vs common
maf_breakdown = pd.read_csv('tables/cis-rare-variants-summary.maf-breakdown.tsv', sep='\t')
# keep the rows of the 0.1% threshold; the now-constant column is dropped
mafs = maf_breakdown.loc[maf_breakdown['maf'] == '0.1%'].drop(columns=['maf'])
mafs.head()



Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,gene_id,unique_cs_id,AFR_MAF,AMR_MAF,EUR_MAF,variant_maf,max_maf,driven_by
2,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
3,ENSG00000107566.14,chr10_100278884_G_A,0.967864,0.363108,2,Whole_blood,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
5,chr10:100275493:100280124:clu_5973_-:ENSG00000...,chr10_100278884_G_A,1.0,0.363108,7,Whole_blood,cis-sQTL,ENSG00000196072.12,chr10:100275493:100280124:clu_5973_-:ENSG00000...,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
7,ENSG00000235823.2,chr10_100365714_A_G,1.0,0.179966,1,Whole_blood,cis-eQTL,ENSG00000235823.2,ENSG00000235823.2___L1,0.241398,0.097561,0.17918,0.179966,0.241398,"AFR,AMR,EUR"
13,ENSG00000107937.19,chr10_1009926_A_T,1.0,0.874574,1,Whole_blood,cis-eQTL,ENSG00000107937.19,ENSG00000107937.19___L1,0.24086,0.041667,0.098423,0.125426,0.24086,"AFR,EUR"


In [34]:
# break up by modality, rare, common
def make_label(maf_series):
    """Classify a collection of MAFs against the 1% cutoff.

    Returns 'rare_in_all' when every value is below 0.01, 'common_in_all'
    when every value is at least 0.01 and 'rare_in_some' when the values
    straddle the cutoff. ValueError is raised if none of the three tests
    holds.
    """
    highest = max(maf_series)
    if highest < 0.01:
        return 'rare_in_all'
    lowest = min(maf_series)
    if lowest >= 0.01:
        return 'common_in_all'
    if lowest < 0.01 and highest >= 0.01:
        return 'rare_in_some'
    raise ValueError('This should not occur')

# Three labellings against the 1% cutoff: by the maximum MAF (max_maf), by the
# overall variant MAF (variant_maf), and jointly over the AFR/AMR/EUR MAFs.
rare_in_all_labels = {True: 'rare_in_all', False: 'not_rare_in_all'}
common_labels = {True: 'common', False: 'rare'}
population_maf_columns = ['AFR_MAF', 'AMR_MAF', 'EUR_MAF']
mafs['rare_in_all'] = mafs['max_maf'].lt(0.01).map(rare_in_all_labels)
mafs['common'] = mafs['variant_maf'].ge(0.01).map(common_labels)
mafs['frequency_class'] = mafs[population_maf_columns].apply(make_label, axis=1)
mafs.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,gene_id,unique_cs_id,AFR_MAF,AMR_MAF,EUR_MAF,variant_maf,max_maf,driven_by,rare_in_all,common,frequency_class
2,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR",not_rare_in_all,common,common_in_all
3,ENSG00000107566.14,chr10_100278884_G_A,0.967864,0.363108,2,Whole_blood,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR",not_rare_in_all,common,common_in_all
5,chr10:100275493:100280124:clu_5973_-:ENSG00000...,chr10_100278884_G_A,1.0,0.363108,7,Whole_blood,cis-sQTL,ENSG00000196072.12,chr10:100275493:100280124:clu_5973_-:ENSG00000...,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR",not_rare_in_all,common,common_in_all
7,ENSG00000235823.2,chr10_100365714_A_G,1.0,0.179966,1,Whole_blood,cis-eQTL,ENSG00000235823.2,ENSG00000235823.2___L1,0.241398,0.097561,0.17918,0.179966,0.241398,"AFR,AMR,EUR",not_rare_in_all,common,common_in_all
13,ENSG00000107937.19,chr10_1009926_A_T,1.0,0.874574,1,Whole_blood,cis-eQTL,ENSG00000107937.19,ENSG00000107937.19___L1,0.24086,0.041667,0.098423,0.125426,0.24086,"AFR,EUR",not_rare_in_all,common,common_in_all


In [35]:
# Preview the credible-set table before it is merged with the frequency classes.
credible_sets.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,modality,maf_threshold,colocing_phenotypes,colocs_with_other_modality,coloced_with_other_modality_signal_from_same_gene
0,ENSG00000112685.14,chr6_692805_C_G,1.0,0.682804,1,Lung,cis-eQTL,1%,chr6:637861:693019:clu_45255_-:ENSG00000112685.14,True,True
1,ENSG00000112685.14,chr6_506781_A_G,0.043571,0.578234,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
2,ENSG00000112685.14,chr6_518078_G_A,0.038384,0.595662,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
3,ENSG00000112685.14,chr6_518480_A_T,0.071511,0.594888,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True
4,ENSG00000112685.14,chr6_528568_T_C,0.058727,0.595275,2,Lung,cis-eQTL,1%,chr6:532610:549175:clu_45252_-:ENSG00000112685.14,True,True


In [36]:
# Attach the frequency class to the credible sets of the 0.1% threshold; the
# two printed counts show whether the inner join dropped any rows.
credible_sets_maf0001 = credible_sets[credible_sets.maf_threshold == '0.1%']
print(credible_sets_maf0001.modality.value_counts())
frequency_classes = mafs[['phenotype_id', 'cs_id', 'tissue', 'modality', 'frequency_class']].drop_duplicates()
mafs = credible_sets_maf0001.merge(frequency_classes)
print(mafs.modality.value_counts())

cis-eQTL    531239
cis-sQTL    331781
Name: modality, dtype: int64
cis-eQTL    531239
cis-sQTL    331781
Name: modality, dtype: int64


In [37]:
results = []

# Whole_blood only: separate models per frequency class, using the controls
# of the 0.1% MAF threshold.
frequency_group_columns = ['tissue', 'modality', 'frequency_class']
for (tissue, modality, frequency_class), df in mafs.groupby(frequency_group_columns):
    if tissue == 'Whole_blood':
        # restrict the controls to the credible sets of this group
        group_keys = df[['phenotype_id', 'cs_id'] + frequency_group_columns].drop_duplicates()
        controls_maf0001 = control_credible_sets[control_credible_sets.maf_threshold == '0.1%']
        control_df = controls_maf0001.merge(group_keys)
        annotations_to_test = ['snpeff', 'regulatory_build', f'roadmap_{topmed_to_roadmap[tissue]}']
        for a in annotations_to_test:
            lr = logistic_regression(df, control_df, annotations[a])
            if not lr.converged.values[0]:
                print(tissue, modality, frequency_class, a, 'converged: ', lr.converged.values[0])
            results.append(lr.assign(tissue=tissue, modality=modality, frequency_class=frequency_class, annotation=a))
results = pd.concat(results)
results.head()

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],converged,n_hits_in_annotation,n_hits_not_in_annotation,n_controls_in_annotation,n_controls_not_in_annotation,tissue,modality,frequency_class,annotation
sequence_feature,-0.10295,0.047722,-2.15728,0.03098384,-0.196483,-0.009416,True,1268,41610,873,42005,Whole_blood,cis-eQTL,common_in_all,snpeff
5_prime_UTR_variant,0.783749,0.039572,19.805706,2.658115e-87,0.70619,0.861309,True,4053,38825,1165,41713,Whole_blood,cis-eQTL,common_in_all,snpeff
3_prime_UTR_variant,0.376418,0.026146,14.396637,5.43234e-47,0.325172,0.427663,True,6556,36322,3019,39859,Whole_blood,cis-eQTL,common_in_all,snpeff
intron_variant,-0.09286,0.021402,-4.33894,1.431715e-05,-0.134806,-0.050914,True,31723,11155,26913,15965,Whole_blood,cis-eQTL,common_in_all,snpeff
downstream_gene_variant,0.344206,0.016254,21.176851,1.561257e-99,0.312349,0.376063,True,23840,19038,16164,26714,Whole_blood,cis-eQTL,common_in_all,snpeff


In [38]:
# Write the frequency-class enrichments; the index (annotation name) is kept.
frequency_class_path = f'tables/{PREFIX}frequency_class.tsv'
results.to_csv(frequency_class_path, sep='\t', index=True)