In [1]:
import os
import glob
import numpy as np
import pandas as pd
import re

PREFIX = 'cis-rare-variants-summary.'

def phenotype_id_to_gene_id(x):
    # first, try to match with a version
    ENSEMBL_RE_WITH_VERSION = r'ENSG\d+\.\d+'
    ENSEMBL_RE_WITHOUT_VERSION = r'ENSG\d+'
    with_version = re.search(ENSEMBL_RE_WITH_VERSION, x)
    without_version = re.search(ENSEMBL_RE_WITHOUT_VERSION, x)
    if with_version:
        return with_version.group(0)
    elif without_version:
        return without_version.group(0)
    else:
        raise ValueError(f'Not able to infer gene ID from {x}')


def compare_credible_sets(scan_1_cs, scan_2_cs, summarize=True):
    """
    Given a dataframe representing scan_1_cs and scan_2_cs (each having columns ['phenotype_id', 'variant_id', 'cs_id']; can have other columns too),
    return a dataframe showing, for each CS in scan_1_cs, whether it overlaps a CS in scan_2_cs (for the same phenotype_id)
    If summarize = False, simply returns scan_1_cs with an added column indicating whether each credible set SNP is a credible set SNP 
    for the same phenotype in the other scan
    """
    # Validate input
    if not isinstance(scan_1_cs, pd.DataFrame):
        raise TypeError('scan_1_cs must be a DataFrame')
    if not isinstance(scan_2_cs, pd.DataFrame):
        raise TypeError('scan_2_cs must be a DataFrame')
    for i in ['phenotype_id', 'variant_id', 'cs_id']:
        if not i in scan_1_cs.columns.to_list():
            raise ValueError(f'scan_1_cs must include column {i}')
        if not i in scan_2_cs.columns.to_list():
            raise ValueError(f'scan_2_cs must include column {i}')

    results = scan_1_cs.merge(scan_2_cs[['phenotype_id', 'variant_id']].drop_duplicates().assign(in_other_scan_cs=1), how='left')
    results.in_other_scan_cs = results.in_other_scan_cs.fillna(0).astype(int)
    assert(len(results) == len(scan_1_cs))

    if summarize:
        return results.groupby(['phenotype_id', 'cs_id']).in_other_scan_cs.max().reset_index()
    else:
        return results


def top_pip_variants(cs):
    """
    Given a dataframe representing CS (having columns ['phenotype_id', 'variant_id', 'cs_id', 'pip']; can have other columns too),
    return a dataframe containing only the top PIP variant per credible set (credible set defined by phenotype_id + csID pair)
    """
    # Validate input
    if not isinstance(cs, pd.DataFrame):
        raise TypeError('cs must be a DataFrame')
    for i in ['phenotype_id', 'variant_id', 'cs_id', 'pip']:
        if not i in cs.columns.to_list():
            raise ValueError(f'cs must include column {i}')

    return cs.sort_values(['pip', 'variant_id'], ascending=[False, True]).groupby(['phenotype_id', 'cs_id']).head(1)


CIS_EQTL_PCS = {
    'Lung': 75,
    'Monocyte': 30,
    'Nasal_epithelial': 30,
    'PBMC': 30,
    'T_cell': 30,
    'Whole_blood': 100,
}

CIS_SQTL_PCS = {
    'Lung': 10,
    'Monocyte': 10,
    'Nasal_epithelial': 10,
    'PBMC': 10,
    'T_cell': 10,
    'Whole_blood': 10,
}

METADATA = '../data/metadata/metadata.tm.txt'

In [2]:
# load in primary hits
CIS_EQTL_PERMUTATIONS_GLOB = glob.glob('../data/scan-results/joint/cis-eqtl/permutations/maf*/Whole_blood*.txt.gz')
CIS_SQTL_PERMUTATIONS_GLOB = glob.glob('../data/scan-results/joint/cis-sqtl/permutations/maf*/Whole_blood*.txt.gz')
cis_eqtl_permutations_all = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], pcs=int(os.path.basename(f).split('.')[1]), maf='1%' if 'maf001' in f else '0.1%') for f in CIS_EQTL_PERMUTATIONS_GLOB])
cis_sqtl_permutations_all = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], pcs=int(os.path.basename(f).split('.')[1]), maf='1%' if 'maf001' in f else '0.1%') for f in CIS_SQTL_PERMUTATIONS_GLOB])
cis_eqtl_permutations_all.head()

Unnamed: 0,phenotype_id,num_var,beta_shape1,beta_shape2,true_df,pval_true_df,variant_id,tss_distance,ma_samples,ma_count,...,pval_nominal,slope,slope_se,pval_perm,pval_beta,qval,pval_nominal_threshold,tissue,pcs,maf
0,ENSG00000285578.1,4927,1.02051,994.008,5807.91,5.1210500000000005e-54,chr6_184740_C_T,401,351,366,...,1.18064e-59,-0.892507,0.054226,0.0001,4.6934300000000005e-52,2.13694e-51,4.2e-05,Whole_blood,0,1%
1,ENSG00000112679.14,5553,1.04192,1091.76,5735.69,4.43673e-16,chr6_250373_T_TGAG,-41257,592,623,...,7.4885e-18,0.377827,0.043769,0.0001,1.44902e-13,3.23532e-13,4.2e-05,Whole_blood,0,1%
2,ENSG00000137265.15,6093,1.04731,1128.12,5717.04,1.0101300000000001e-18,chr6_396321_C_T,4569,1464,1642,...,7.09508e-21,0.250111,0.026594,0.0001,2.19204e-16,5.31697e-16,4.1e-05,Whole_blood,0,1%
3,ENSG00000112685.14,7690,1.03145,1525.45,5758.99,4.654259999999999e-50,chr6_692805_C_G,-334,3847,4871,...,1.00474e-55,-0.293773,0.018501,0.0001,2.4761099999999997e-48,1.07111e-47,2.9e-05,Whole_blood,0,1%
4,ENSG00000272463.1,7808,1.02158,1816.81,5904.27,0.0,chr6_711150_A_C,-255,3861,4826,...,0.0,0.810795,0.016028,0.0001,0.0,0.0,2.3e-05,Whole_blood,0,1%


In [3]:
cis_eqtl_permutations = cis_eqtl_permutations_all[cis_eqtl_permutations_all.tissue.map(CIS_EQTL_PCS) == cis_eqtl_permutations_all.pcs]
cis_sqtl_permutations = cis_sqtl_permutations_all[cis_sqtl_permutations_all.tissue.map(CIS_SQTL_PCS) == cis_sqtl_permutations_all.pcs]

In [4]:
print('With the lower MAF threshold, the average number of variants tested against each gene more than doubled ({:,} vs {:,} for the cis-eQTL scan)'.format(
    round(cis_eqtl_permutations[(cis_eqtl_permutations.maf=='1%')].num_var.mean()),
    round(cis_eqtl_permutations[(cis_eqtl_permutations.maf=='0.1%')].num_var.mean())
))

With the lower MAF threshold, the average number of variants tested against each gene more than doubled (7,866 vs 17,195 for the cis-eQTL scan)


In [5]:
metadata = pd.read_csv(METADATA, sep='\t')
sample_size_dict = metadata[metadata.used_for_scan].groupby('tissue').size().to_dict()

In [6]:
CIS_EQTL_SUSIE_GLOB = glob.glob('../data/scan-results/joint/cis-eqtl/susie/maf*/Whole_blood*.cs.txt')
CIS_SQTL_SUSIE_GLOB = glob.glob('../data/scan-results/joint/cis-sqtl/susie/maf*/postprocessed/Whole_blood*.by-gene.cs.txt')
cis_eqtl_susie = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], maf='1%' if 'maf001' in f else '0.1%') for f in CIS_EQTL_SUSIE_GLOB])
cis_sqtl_susie = pd.concat([pd.read_csv(f, sep='\t').assign(tissue=os.path.basename(f).split('.')[0], maf='1%' if 'maf001' in f else '0.1%') for f in CIS_SQTL_SUSIE_GLOB])
cis_susie = pd.concat([cis_eqtl_susie.assign(modality='cis-eQTL'), cis_sqtl_susie.assign(modality='cis-sQTL')])
cis_susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality
0,ENSG00000285578.1,chr6_180573_C_A,0.274643,0.027967,1,Whole_blood,1%,cis-eQTL
1,ENSG00000285578.1,chr6_184740_C_T,0.725334,0.028355,1,Whole_blood,1%,cis-eQTL
2,ENSG00000285578.1,chr6_153439_T_A,1.0,0.678416,2,Whole_blood,1%,cis-eQTL
3,ENSG00000285578.1,chr6_196027_C_T,0.17046,0.066006,3,Whole_blood,1%,cis-eQTL
4,ENSG00000285578.1,chr6_197265_T_C,0.829534,0.073365,3,Whole_blood,1%,cis-eQTL


In [7]:
cis_susie['gene_id'] = cis_susie.phenotype_id.map(phenotype_id_to_gene_id)
cis_susie['unique_cs_id'] = cis_susie.phenotype_id + '___L' + cis_susie.cs_id.astype(str)
cis_susie.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,unique_cs_id
0,ENSG00000285578.1,chr6_180573_C_A,0.274643,0.027967,1,Whole_blood,1%,cis-eQTL,ENSG00000285578.1,ENSG00000285578.1___L1
1,ENSG00000285578.1,chr6_184740_C_T,0.725334,0.028355,1,Whole_blood,1%,cis-eQTL,ENSG00000285578.1,ENSG00000285578.1___L1
2,ENSG00000285578.1,chr6_153439_T_A,1.0,0.678416,2,Whole_blood,1%,cis-eQTL,ENSG00000285578.1,ENSG00000285578.1___L2
3,ENSG00000285578.1,chr6_196027_C_T,0.17046,0.066006,3,Whole_blood,1%,cis-eQTL,ENSG00000285578.1,ENSG00000285578.1___L3
4,ENSG00000285578.1,chr6_197265_T_C,0.829534,0.073365,3,Whole_blood,1%,cis-eQTL,ENSG00000285578.1,ENSG00000285578.1___L3


In [8]:
len(cis_eqtl_permutations[(cis_eqtl_permutations.maf=='0.1%') & (cis_eqtl_permutations.qval<=0.05)])

19394

In [9]:
'While the number of cis-e/sGenes showed little change compared to a 0.01 MAF threshold ({:,} and {:,} cis-eGenes at 0.01 and 0.001 MAF thresholds, respectively, and {:,} vs {:,} cis-sGenes), the number of total cis-eQTL signals detected increased by {}% (from {:,} to {:,}) and the number of cis-sQTL signals increased by {}% (from {:,} to {:,})'.format(
    len(cis_eqtl_permutations[(cis_eqtl_permutations.maf=='1%') & (cis_eqtl_permutations.qval<=0.05)]),
    len(cis_eqtl_permutations[(cis_eqtl_permutations.maf=='0.1%') & (cis_eqtl_permutations.qval<=0.05)]),
    len(cis_sqtl_permutations[(cis_sqtl_permutations.maf=='1%') & (cis_sqtl_permutations.qval<=0.05)]),
    len(cis_sqtl_permutations[(cis_sqtl_permutations.maf=='0.1%') & (cis_sqtl_permutations.qval<=0.05)]),
    round(100*(cis_susie[(cis_susie.modality=='cis-eQTL') & (cis_susie.maf=='0.1%')].unique_cs_id.nunique() - cis_susie[(cis_susie.modality=='cis-eQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique()) / cis_susie[(cis_susie.modality=='cis-eQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique(), 1),
    cis_susie[(cis_susie.modality=='cis-eQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique(),
    cis_susie[(cis_susie.modality=='cis-eQTL') & (cis_susie.maf=='0.1%')].unique_cs_id.nunique(),
    round(100*(cis_susie[(cis_susie.modality=='cis-sQTL') & (cis_susie.maf=='0.1%')].unique_cs_id.nunique() - cis_susie[(cis_susie.modality=='cis-sQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique()) / cis_susie[(cis_susie.modality=='cis-sQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique(), 1),
    cis_susie[(cis_susie.modality=='cis-sQTL') & (cis_susie.maf=='1%')].unique_cs_id.nunique(),
    cis_susie[(cis_susie.modality=='cis-sQTL') & (cis_susie.maf=='0.1%')].unique_cs_id.nunique()
)

'While the number of cis-e/sGenes showed little change compared to a 0.01 MAF threshold (19,465 and 19,394 cis-eGenes at 0.01 and 0.001 MAF thresholds, respectively, and 8,795 vs 8,873 cis-sGenes), the number of total cis-eQTL signals detected increased by 9.7% (from 69,766 to 76,545) and the number of cis-sQTL signals increased by 9.0% (from 35,770 to 39,001)'

In [10]:
# # find signals specific to either scan, or shared
# cis_eqtl_signals_at_maf_1 = cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-eQTL')].merge(compare_credible_sets(cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-eQTL')], cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-eQTL')]))
# cis_eqtl_signals_at_maf_01 = cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-eQTL')].merge(compare_credible_sets(cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-eQTL')], cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-eQTL')]))
# cis_sqtl_signals_at_maf_1 = cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-sQTL')].merge(compare_credible_sets(cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-sQTL')], cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-sQTL')]))
# cis_sqtl_signals_at_maf_01 = cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-sQTL')].merge(compare_credible_sets(cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-sQTL')], cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-sQTL')]))
# cis_sqtl_signals_at_maf_1.head()

In [11]:
# how many CS at 1% persist (by overlap)?
persist_by_overlap_ciseqtl = compare_credible_sets(cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-eQTL')], cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-eQTL')])

cissqtl_maf1_as_gene = cis_susie.loc[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-sQTL'),['gene_id', 'variant_id', 'unique_cs_id']].rename(columns={'gene_id': 'phenotype_id', 'unique_cs_id': 'cs_id'})
cissqtl_maf01_as_gene = cis_susie.loc[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-sQTL'),['gene_id', 'variant_id', 'unique_cs_id']].rename(columns={'gene_id': 'phenotype_id', 'unique_cs_id': 'cs_id'})
persist_by_overlap_cissqtl = compare_credible_sets(cissqtl_maf1_as_gene, cissqtl_maf01_as_gene)
print(persist_by_overlap_ciseqtl.in_other_scan_cs.mean())
print(persist_by_overlap_cissqtl.in_other_scan_cs.mean())

0.7936530688300891
0.8597428012300811


In [12]:
top_pip_variants_ciseqtl_maf1 = top_pip_variants(cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-eQTL')])
top_pip_variants_ciseqtl_maf01 = top_pip_variants(cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-eQTL')])
top_pip_variants_cissqtl_maf1 = top_pip_variants(cis_susie[(cis_susie.maf=='1%') & (cis_susie.modality=='cis-sQTL')])
top_pip_variants_cissqtl_maf01 = top_pip_variants(cis_susie[(cis_susie.maf=='0.1%') & (cis_susie.modality=='cis-sQTL')])

In [13]:
# another way to look at this:
# for each signal w/ MAF < 1%; what is the best proxy MAF > 1% SNP? Was it a (now lost) CS?
# but will not deal with this for now

In [14]:
PLINK_MAC_FILES = glob.glob('../work/ancestry-allele-counts-75-AMR-50/cs-variants/results/mac-per-chrom/*')
mac = pd.concat([pd.read_csv(f, delim_whitespace=True).assign(tissue=os.path.basename(f).split('.')[0], ancestry=os.path.basename(f).split('.')[1]) for f in PLINK_MAC_FILES])
mac['ref_allele'] = mac.SNP.str.split('_', expand=True)[2]
mac['alt_allele'] = mac.SNP.str.split('_', expand=True)[3]
assert(all(((mac.ref_allele == mac.A1) | (mac.alt_allele == mac.A1))))
assert(all(((mac.ref_allele == mac.A2) | (mac.alt_allele == mac.A2))))
mac['ref_allele_count'] = np.where(mac.ref_allele == mac.A1, mac.C1, mac.C2)
mac['alt_allele_count'] = np.where(mac.alt_allele == mac.A1, mac.C1, mac.C2)
mac.head()

Unnamed: 0,CHR,SNP,A1,A2,C1,C2,G0,tissue,ancestry,ref_allele,alt_allele,ref_allele_count,alt_allele_count
0,21,chr21_5090094_C_T,T,C,0,2,0,T_cell,EAS,C,T,2,0
1,21,chr21_5090518_G_A,A,G,0,2,0,T_cell,EAS,G,A,2,0
2,21,chr21_5090561_G_A,A,G,0,2,0,T_cell,EAS,G,A,2,0
3,21,chr21_5090588_C_T,T,C,0,2,0,T_cell,EAS,C,T,2,0
4,21,chr21_5090666_C_A,A,C,0,2,0,T_cell,EAS,C,A,2,0


In [15]:
ancestry_mafs = mac[['SNP', 'ref_allele_count', 'alt_allele_count', 'tissue', 'ancestry']].rename(columns={'SNP': 'ID'})
ancestry_mafs.head()

Unnamed: 0,ID,ref_allele_count,alt_allele_count,tissue,ancestry
0,chr21_5090094_C_T,2,0,T_cell,EAS
1,chr21_5090518_G_A,2,0,T_cell,EAS
2,chr21_5090561_G_A,2,0,T_cell,EAS
3,chr21_5090588_C_T,2,0,T_cell,EAS
4,chr21_5090666_C_A,2,0,T_cell,EAS


In [16]:
whole_blood_ancestry_mafs = ancestry_mafs[(ancestry_mafs.tissue=='Whole_blood') & (ancestry_mafs.ancestry.isin(['AMR', 'AFR', 'EUR']))]
whole_blood_ancestry_mafs['af'] = whole_blood_ancestry_mafs.alt_allele_count / whole_blood_ancestry_mafs[['alt_allele_count', 'ref_allele_count']].sum(axis=1)
whole_blood_ancestry_mafs['maf'] = np.minimum(whole_blood_ancestry_mafs.af, 1-whole_blood_ancestry_mafs.af)
whole_blood_ancestry_mafs = whole_blood_ancestry_mafs[['ID', 'ancestry', 'maf']].pivot(index='ID', columns='ancestry', values='maf')
whole_blood_ancestry_mafs = whole_blood_ancestry_mafs.rename(columns=lambda x: x + '_MAF')
whole_blood_ancestry_mafs['variant_id'] = whole_blood_ancestry_mafs.index.to_list()
whole_blood_ancestry_mafs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  whole_blood_ancestry_mafs['af'] = whole_blood_ancestry_mafs.alt_allele_count / whole_blood_ancestry_mafs[['alt_allele_count', 'ref_allele_count']].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  whole_blood_ancestry_mafs['maf'] = np.minimum(whole_blood_ancestry_mafs.af, 1-whole_blood_ancestry_mafs.af)


ancestry,AFR_MAF,AMR_MAF,EUR_MAF,variant_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr10_100004256_T_A,0.405914,0.381098,0.379968,chr10_100004256_T_A
chr10_100006605_T_C,0.137097,0.059959,0.08123,chr10_100006605_T_C
chr10_100007810_T_A,0.000538,0.0,0.001735,chr10_100007810_T_A
chr10_100007968_C_T,0.137634,0.059959,0.081073,chr10_100007968_C_T
chr10_100014566_A_G,0.170968,0.10874,0.028707,chr10_100014566_A_G


In [17]:
top_pip_variants_combined = pd.concat([top_pip_variants_ciseqtl_maf1, top_pip_variants_ciseqtl_maf01, top_pip_variants_cissqtl_maf1, top_pip_variants_cissqtl_maf01])
top_pip_variants_combined = top_pip_variants_combined.merge(whole_blood_ancestry_mafs)
top_pip_variants_combined.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,unique_cs_id,AFR_MAF,AMR_MAF,EUR_MAF
0,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479
1,ENSG00000107566.14,chr10_100278884_G_A,0.964002,0.363108,2,Whole_blood,1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479
2,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,0.1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479
3,ENSG00000107566.14,chr10_100278884_G_A,0.967864,0.363108,2,Whole_blood,0.1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479
4,chr10:100286332:100286605:clu_5974_-:ENSG00000...,chr10_100278884_G_A,0.973766,0.363108,1,Whole_blood,1%,cis-sQTL,ENSG00000196072.12,chr10:100286332:100286605:clu_5974_-:ENSG00000...,0.23172,0.231707,0.444479


In [18]:
top_pip_variants_combined['variant_maf'] = np.minimum(top_pip_variants_combined.af, 1 - top_pip_variants_combined.af)
top_pip_variants_combined['max_maf'] = top_pip_variants_combined[['EUR_MAF', 'AFR_MAF', 'AMR_MAF']].max(axis=1)
top_pip_variants_combined.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,unique_cs_id,AFR_MAF,AMR_MAF,EUR_MAF,variant_maf,max_maf
0,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479
1,ENSG00000107566.14,chr10_100278884_G_A,0.964002,0.363108,2,Whole_blood,1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479
2,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,0.1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479
3,ENSG00000107566.14,chr10_100278884_G_A,0.967864,0.363108,2,Whole_blood,0.1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479
4,chr10:100286332:100286605:clu_5974_-:ENSG00000...,chr10_100278884_G_A,0.973766,0.363108,1,Whole_blood,1%,cis-sQTL,ENSG00000196072.12,chr10:100286332:100286605:clu_5974_-:ENSG00000...,0.23172,0.231707,0.444479,0.363108,0.444479


In [19]:
# try to determine which ancestries drive signals
ANCESTRY_MAF_COLS = ['AFR_MAF', 'AMR_MAF', 'EUR_MAF']
top_pip_variants_combined['driven_by'] = top_pip_variants_combined[ANCESTRY_MAF_COLS].apply(lambda x: ','.join(x[x/max(x)>=0.2].index.to_list()).replace('_MAF', ''), axis=1)
top_pip_variants_combined.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id,tissue,maf,modality,gene_id,unique_cs_id,AFR_MAF,AMR_MAF,EUR_MAF,variant_maf,max_maf,driven_by
0,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
1,ENSG00000107566.14,chr10_100278884_G_A,0.964002,0.363108,2,Whole_blood,1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
2,ENSG00000196072.12,chr10_100278884_G_A,1.0,0.363108,1,Whole_blood,0.1%,cis-eQTL,ENSG00000196072.12,ENSG00000196072.12___L1,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
3,ENSG00000107566.14,chr10_100278884_G_A,0.967864,0.363108,2,Whole_blood,0.1%,cis-eQTL,ENSG00000107566.14,ENSG00000107566.14___L2,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"
4,chr10:100286332:100286605:clu_5974_-:ENSG00000...,chr10_100278884_G_A,0.973766,0.363108,1,Whole_blood,1%,cis-sQTL,ENSG00000196072.12,chr10:100286332:100286605:clu_5974_-:ENSG00000...,0.23172,0.231707,0.444479,0.363108,0.444479,"AFR,AMR,EUR"


In [20]:
top_pip_variants_combined.groupby(['maf', 'modality']).variant_maf.apply(lambda x: (x<0.01).mean())

maf   modality
0.1%  cis-eQTL    0.211862
      cis-sQTL    0.187226
1%    cis-eQTL    0.000000
      cis-sQTL    0.000000
Name: variant_maf, dtype: float64

In [21]:
len(top_pip_variants_combined[(top_pip_variants_combined.maf=='0.1%') & (top_pip_variants_combined.modality=='cis-eQTL')])

76545

In [22]:
print('The top posterior inclusion probability (PIP) variant in {:,} cis-eQTL and {:,} cis-sQTL credible sets had MAF < 0.01'.format(
    (top_pip_variants_combined[(top_pip_variants_combined.maf=='0.1%') & (top_pip_variants_combined.modality=='cis-eQTL')].variant_maf<0.01).sum(),
    (top_pip_variants_combined[(top_pip_variants_combined.maf=='0.1%') & (top_pip_variants_combined.modality=='cis-sQTL')].variant_maf<0.01).sum()
))

The top posterior inclusion probability (PIP) variant in 16,217 cis-eQTL and 7,302 cis-sQTL credible sets had MAF < 0.01


In [23]:
print('{:,} cis-eQTL and {:,} cis-sQTL signals had MAF < 0.01 in all three well-represented ancestries'.format(
    (top_pip_variants_combined[(top_pip_variants_combined.maf=='0.1%') & (top_pip_variants_combined.modality=='cis-eQTL')].max_maf<0.01).sum(),
    (top_pip_variants_combined[(top_pip_variants_combined.maf=='0.1%') & (top_pip_variants_combined.modality=='cis-sQTL')].max_maf<0.01).sum()
))

7,479 cis-eQTL and 3,290 cis-sQTL signals had MAF < 0.01 in all three well-represented ancestries


In [24]:
top_pip_variants_combined.to_csv(f'tables/{PREFIX}maf-breakdown.tsv', sep='\t', index=False)