In [1]:
import glob
import pandas as pd

PREFIX = 'signal-in-topmed-gtex-eqtlgen-direct.'

def compare_credible_sets(scan_1_cs, scan_2_cs, summarize=True):
    """
    Given a dataframe representing scan_1_cs and scan_2_cs (each having columns ['phenotype_id', 'variant_id', 'cs_id']; can have other columns too),
    return a dataframe showing, for each CS in scan_1_cs, whether it overlaps a CS in scan_2_cs (for the same phenotype_id)
    If summarize = False, simply returns scan_1_cs with an added column indicating whether each credible set SNP is a credible set SNP 
    for the same phenotype in the other scan
    """
    # Validate input
    if not isinstance(scan_1_cs, pd.DataFrame):
        raise TypeError('scan_1_cs must be a DataFrame')
    if not isinstance(scan_2_cs, pd.DataFrame):
        raise TypeError('scan_2_cs must be a DataFrame')
    for i in ['phenotype_id', 'variant_id', 'cs_id']:
        if not i in scan_1_cs.columns.to_list():
            raise ValueError(f'scan_1_cs must include column {i}')
        if not i in scan_2_cs.columns.to_list():
            raise ValueError(f'scan_2_cs must include column {i}')

    results = scan_1_cs.merge(scan_2_cs[['phenotype_id', 'variant_id']].drop_duplicates().assign(in_other_scan_cs=1), how='left')
    results.in_other_scan_cs = results.in_other_scan_cs.fillna(0).astype(int)
    assert(len(results) == len(scan_1_cs))

    if summarize:
        return results.groupby(['phenotype_id', 'cs_id']).in_other_scan_cs.max().reset_index()
    else:
        return results


USE_LD = True

topmed_whole_blood_ciseqtl = pd.read_csv('../data/scan-results/joint/cis-eqtl/susie/maf001/Whole_blood.100.cs.txt', sep='\t')
gtex_whole_blood_ciseqtl = pd.read_parquet('/net/topmed11/working/porchard/gtex-preprocessing/data/gtex/GTEx_v8_finemapping_SuSiE/GTEx_v8.Whole_Blood.eQTL.SuSiE_summary.parquet')
topmed_whole_blood_ciseqtl.phenotype_id = topmed_whole_blood_ciseqtl.phenotype_id.str.split('.', expand=True)[0]
gtex_whole_blood_ciseqtl.phenotype_id = gtex_whole_blood_ciseqtl.phenotype_id.str.split('.', expand=True)[0]
gtex_whole_blood_ciseqtl.variant_id = gtex_whole_blood_ciseqtl.variant_id.str.replace('_b38', '')
topmed_whole_blood_ciseqtl.head()

Unnamed: 0,phenotype_id,variant_id,pip,af,cs_id
0,ENSG00000285578,chr6_180573_C_A,0.274643,0.027967,1
1,ENSG00000285578,chr6_184740_C_T,0.725334,0.028355,1
2,ENSG00000285578,chr6_153439_T_A,1.0,0.678416,2
3,ENSG00000285578,chr6_196027_C_T,0.17046,0.066006,3
4,ENSG00000285578,chr6_197265_T_C,0.829534,0.073365,3


In [2]:
# reshape the eQTLGen primary signals as if they were credible sets
eqtlgen_whole_blood_ciseqtl = pd.read_csv('/net/topmed11/working/porchard/eqtlgen-preprocessing/work/top-hit-per-gene-after-lifting/top-per-gene.txt', sep='\t')
eqtlgen_whole_blood_ciseqtl = eqtlgen_whole_blood_ciseqtl[eqtlgen_whole_blood_ciseqtl.FDR<=0.05]
eqtlgen_whole_blood_ciseqtl = eqtlgen_whole_blood_ciseqtl[['Gene', 'SNP']]
eqtlgen_whole_blood_ciseqtl.columns = ['phenotype_id', 'variant_id']
eqtlgen_whole_blood_ciseqtl['cs_id'] = 1
eqtlgen_whole_blood_ciseqtl.head()


Unnamed: 0,phenotype_id,variant_id,cs_id
0,ENSG00000187642,chr1_953778_G_C,1
1,ENSG00000187608,chr1_1015336_A_T,1
2,ENSG00000187583,chr1_953778_G_C,1
3,ENSG00000188290,chr1_995543_A_G,1
4,ENSG00000188157,chr1_1002736_T_G,1


In [3]:
# reshape the DIRECT conditional hits as if they were credible sets
direct_whole_blood_ciseqtl = pd.read_csv('/net/topmed11/working/porchard/direct-preprocessing/work/lift-and-tabix/cis-eqtl-significant/results/tabixed/direct.txt.gz', sep='\t')
direct_whole_blood_ciseqtl = direct_whole_blood_ciseqtl[['GeneID', 'SNPid', 'DiscoveryOrder']]
direct_whole_blood_ciseqtl.GeneID = direct_whole_blood_ciseqtl.GeneID.str.split('.', expand=True)[0]
assert(direct_whole_blood_ciseqtl.groupby(['GeneID', 'DiscoveryOrder']).size().max() == 1)
direct_whole_blood_ciseqtl = direct_whole_blood_ciseqtl.rename(columns={'DiscoveryOrder': 'cs_id', 'GeneID': 'phenotype_id', 'SNPid': 'variant_id'})
direct_whole_blood_ciseqtl.head()

  direct_whole_blood_ciseqtl = pd.read_csv('/net/topmed11/working/porchard/direct-preprocessing/work/lift-and-tabix/cis-eqtl-significant/results/tabixed/direct.txt.gz', sep='\t')


Unnamed: 0,phenotype_id,variant_id,cs_id
0,ENSG00000272512,chr1_633714_A_G,6
1,ENSG00000235373,chr1_757414_CA_C,2
2,ENSG00000160087,chr1_771398_G_A,3
3,ENSG00000237491,chr1_791564_C_G,2
4,ENSG00000228327,chr1_800909_T_A,1


In [4]:
len(direct_whole_blood_ciseqtl)

59828

In [5]:
ld_buddies = pd.concat([pd.read_csv(f, delim_whitespace=True) for f in glob.glob('../work/eqtlgen-and-direct-ld-buddies/results/ld/*')])
ld_buddies = ld_buddies[['SNP_A', 'SNP_B', 'R2']].drop_duplicates()
r2_threshold = 0.8
assert(r2_threshold >= ld_buddies.R2.min())
ld_buddies = ld_buddies[ld_buddies.R2>=r2_threshold]
ld_buddies = ld_buddies[ld_buddies.SNP_A!=ld_buddies.SNP_B]
ld_buddies_dict = {variant: set() for variant in (ld_buddies.SNP_A.to_list() + ld_buddies.SNP_B.to_list())} # variant --> ld_buddies
for row in ld_buddies.itertuples():
    ld_buddies_dict[row.SNP_A].add(row.SNP_B)
    ld_buddies_dict[row.SNP_B].add(row.SNP_A)

In [6]:
def expand_cs(cs, ld_buddies_dict):
    new_cs = []
    for phenotype_id, variant_id, cs_id in zip(cs.phenotype_id, cs.variant_id, cs.cs_id):
        new_cs.append([phenotype_id, variant_id, cs_id])
        if variant_id in ld_buddies_dict:
            for buddy in ld_buddies_dict[variant_id]:
                new_cs.append([phenotype_id, buddy, cs_id])
    return pd.DataFrame(new_cs, columns=['phenotype_id', 'variant_id', 'cs_id']) 

eqtlgen_whole_blood_ciseqtl_expanded = expand_cs(eqtlgen_whole_blood_ciseqtl, ld_buddies_dict)
direct_whole_blood_ciseqtl_expanded = expand_cs(direct_whole_blood_ciseqtl, ld_buddies_dict)

In [7]:
assert(compare_credible_sets(eqtlgen_whole_blood_ciseqtl_expanded, eqtlgen_whole_blood_ciseqtl).in_other_scan_cs.min() == 1)
assert(compare_credible_sets(eqtlgen_whole_blood_ciseqtl, eqtlgen_whole_blood_ciseqtl_expanded).in_other_scan_cs.min() == 1)
assert(compare_credible_sets(direct_whole_blood_ciseqtl_expanded, direct_whole_blood_ciseqtl).in_other_scan_cs.min() == 1)
assert(compare_credible_sets(direct_whole_blood_ciseqtl, direct_whole_blood_ciseqtl_expanded).in_other_scan_cs.min() == 1)

In [8]:
if USE_LD:
    combined = pd.concat([topmed_whole_blood_ciseqtl.drop(columns=['pip', 'af']).assign(src='TOPMed'), gtex_whole_blood_ciseqtl.drop(columns=['pip', 'af']).assign(src='GTEx'), eqtlgen_whole_blood_ciseqtl_expanded.assign(src='eQTLGen'), direct_whole_blood_ciseqtl_expanded.assign(src='DIRECT')])
else:
    combined = pd.concat([topmed_whole_blood_ciseqtl.drop(columns=['pip', 'af']).assign(src='TOPMed'), gtex_whole_blood_ciseqtl.drop(columns=['pip', 'af']).assign(src='GTEx'), eqtlgen_whole_blood_ciseqtl.assign(src='eQTLGen'), direct_whole_blood_ciseqtl.assign(src='DIRECT')])
combined.head()

Unnamed: 0,phenotype_id,variant_id,cs_id,src
0,ENSG00000285578,chr6_180573_C_A,1,TOPMed
1,ENSG00000285578,chr6_184740_C_T,1,TOPMed
2,ENSG00000285578,chr6_153439_T_A,2,TOPMed
3,ENSG00000285578,chr6_196027_C_T,3,TOPMed
4,ENSG00000285578,chr6_197265_T_C,3,TOPMed


In [9]:
comparisons = [] # source 1, source 2, fraction from source 1 in source 2
comparisons_against_all = [] # source, fraction from source 1 in any of the other sources
comparisons_against_all_full = [] # source, fraction from source 1 in any of the other sources
sources = combined.src.unique()

for src1 in sources:
    source_1_data = combined[combined.src==src1]
    other_source_data = combined[combined.src!=src1]
    comparisons_against_all.append([src1, compare_credible_sets(source_1_data, other_source_data).in_other_scan_cs.mean()])
    comparisons_against_all_full.append(compare_credible_sets(source_1_data, other_source_data).assign(src=src1))
    for src2, source_2_data in other_source_data.groupby('src'):
        src1_in_src2 = compare_credible_sets(source_1_data, source_2_data)
        comparisons.append([src1, src2, src1_in_src2.in_other_scan_cs.mean()])
    
comparisons = pd.DataFrame(comparisons, columns=['source_1', 'source_2', 'fraction_cs_in_source_2'])
comparisons_against_all = pd.DataFrame(comparisons_against_all, columns=['source', 'fraction_cs_in_other_sources'])

In [10]:
comparisons_against_all_full = pd.concat(comparisons_against_all_full)
comparisons_against_all_full.head()

Unnamed: 0,phenotype_id,cs_id,in_other_scan_cs,src
0,ENSG00000000003,1,1,TOPMed
1,ENSG00000000003,2,1,TOPMed
2,ENSG00000000419,1,0,TOPMed
3,ENSG00000000419,2,0,TOPMed
4,ENSG00000000457,1,1,TOPMed


In [11]:
print(comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts())
print(len(comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs))

0    51037
1    18729
Name: in_other_scan_cs, dtype: int64
69766


In [12]:
comparisons_against_all_full.loc[comparisons_against_all_full.src=='TOPMed',['phenotype_id', 'cs_id', 'in_other_scan_cs']].to_csv(f'{PREFIX}topmed-signal-in-gtex-eqtlgen-direct.tsv', sep='\t', index=False)

In [13]:
comparisons.pivot(index='source_1', columns='source_2', values='fraction_cs_in_source_2').fillna(1)

source_2,DIRECT,GTEx,TOPMed,eQTLGen
source_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DIRECT,1.0,0.081066,0.204469,0.084124
GTEx,0.407709,1.0,0.646079,0.28241
TOPMed,0.17381,0.107201,1.0,0.095806
eQTLGen,0.287455,0.187614,0.388238,1.0


In [14]:
comparisons_against_all['fraction_not_in_other_sources'] = 1 - comparisons_against_all.fraction_cs_in_other_sources
comparisons_against_all

Unnamed: 0,source,fraction_cs_in_other_sources,fraction_not_in_other_sources
0,TOPMed,0.268455,0.731545
1,GTEx,0.75658,0.24342
2,eQTLGen,0.512745,0.487255
3,DIRECT,0.244284,0.755716


In [15]:
len(eqtlgen_whole_blood_ciseqtl)

16987

In [16]:
colocs = pd.read_csv('../work/coloc/panukbb/joint/postprocessed.txt', sep='\t')
colocs = colocs[colocs.gwas_ancestry=='EUR']
#colocs = colocs[(colocs.xqtl_maf!='0.1%')]
colocs = colocs[(colocs.xqtl_tissue=='Whole_blood') & (colocs.xqtl_maf=='1%') & (colocs.xqtl_modality=='ciseqtl')]
colocs.xqtl_phenotype = colocs.xqtl_phenotype.str.split('.', expand=True)[0]
colocs.head()

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_modality,xqtl_tissue,xqtl_maf,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal
9,2162,chr1_111197723_C_G,chr1_111197723_C_G,0.0,2.663501e-14,0.0,6.366463e-15,1.0,4,1,...,ciseqtl,Whole_blood,1%,ENSG00000162777,joint,ENSG00000162777.17,chr1_110947722_111447723,EUR,biomarkers-30750-both_sexes-irnt,biomarkers-30750-both_sexes-irnt___EUR___chr1_...
26,6415,chr15_63049797_T_C,chr15_63049797_T_C,0.0,3.198827e-177,0.0,0.0,1.0,4,1,...,ciseqtl,Whole_blood,1%,ENSG00000140416,joint,ENSG00000140416.21,chr15_62392636_63970660,EUR,continuous-30100-both_sexes-irnt,continuous-30100-both_sexes-irnt___EUR___chr15...
30,2026,chr1_159205564_G_A,chr1_159205564_G_A,1.501781e-170,9.969781000000001e-52,3.0126649999999996e-122,0.0,1.0,2,2,...,ciseqtl,Whole_blood,1%,ENSG00000213088,joint,ENSG00000213088.11,chr1_158955703_159455704,EUR,continuous-30130-both_sexes-irnt,continuous-30130-both_sexes-irnt___EUR___chr1_...
33,1562,chr6_396321_C_T,chr6_396321_C_T,0.0,9.685085000000001e-299,6.2542389999999995e-167,0.0,1.0,1,1,...,ciseqtl,Whole_blood,1%,ENSG00000137265,joint,ENSG00000137265.15,chr6_146320_646321,EUR,continuous-1757-both_sexes,continuous-1757-both_sexes___EUR___chr6_146320...
43,3868,chr7_28237488_C_G,chr7_28237488_C_G,0.0,1.1979830000000001e-60,0.0,4.638423e-14,1.0,1,2,...,ciseqtl,Whole_blood,1%,ENSG00000153814,joint,ENSG00000153814.13,chr7_27987487_28934757,EUR,continuous-30180-both_sexes-irnt,continuous-30180-both_sexes-irnt___EUR___chr7_...


In [17]:
print(len(colocs))
for_merge = comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed']
for_merge['xqtl_phenotype'] = for_merge.phenotype_id
for_merge['xqtl_cs'] = 'L' + for_merge.cs_id.astype(str)
for_merge = for_merge[['xqtl_cs', 'xqtl_phenotype', 'in_other_scan_cs']]
colocs = colocs.merge(for_merge)
print(len(colocs))
#print(len(for_merge))

8641
8641


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_merge['xqtl_phenotype'] = for_merge.phenotype_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_merge['xqtl_cs'] = 'L' + for_merge.cs_id.astype(str)


In [18]:
colocs.head()

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_tissue,xqtl_maf,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs
0,2162,chr1_111197723_C_G,chr1_111197723_C_G,0.0,2.663501e-14,0.0,6.366463e-15,1.0,4,1,...,Whole_blood,1%,ENSG00000162777,joint,ENSG00000162777.17,chr1_110947722_111447723,EUR,biomarkers-30750-both_sexes-irnt,biomarkers-30750-both_sexes-irnt___EUR___chr1_...,1
1,2160,chr1_111197723_C_G,chr1_111197723_C_G,0.0,2.076595e-11,0.0,1.13032e-11,1.0,4,2,...,Whole_blood,1%,ENSG00000162777,joint,ENSG00000162777.17,chr1_110943971_111443972,EUR,continuous-30080-both_sexes-irnt,continuous-30080-both_sexes-irnt___EUR___chr1_...,1
2,6415,chr15_63049797_T_C,chr15_63049797_T_C,0.0,3.198827e-177,0.0,0.0,1.0,4,1,...,Whole_blood,1%,ENSG00000140416,joint,ENSG00000140416.21,chr15_62392636_63970660,EUR,continuous-30100-both_sexes-irnt,continuous-30100-both_sexes-irnt___EUR___chr15...,1
3,5194,chr15_63049797_T_C,chr15_63049797_T_C,0.0,5.783741e-110,0.0,0.0,1.0,4,2,...,Whole_blood,1%,ENSG00000140416,joint,ENSG00000140416.21,chr15_62799038_65581301,EUR,continuous-30080-both_sexes-irnt,continuous-30080-both_sexes-irnt___EUR___chr15...,1
4,2026,chr1_159205564_G_A,chr1_159205564_G_A,1.501781e-170,9.969781000000001e-52,3.0126649999999996e-122,0.0,1.0,2,2,...,Whole_blood,1%,ENSG00000213088,joint,ENSG00000213088.11,chr1_158955703_159455704,EUR,continuous-30130-both_sexes-irnt,continuous-30130-both_sexes-irnt___EUR___chr1_...,1


In [19]:
len(colocs[['gwas_cs', 'gwas_signal']].drop_duplicates())

5872

In [20]:
colocs[['xqtl_phenotype', 'xqtl_cs', 'in_other_scan_cs']].drop_duplicates().in_other_scan_cs.value_counts()

0    2760
1    2369
Name: in_other_scan_cs, dtype: int64

In [21]:
"For example, in whole blood we detect {:,} total cis-eQTL signals  for 19,468 genes, {:,} ({}%) of which were not detected in GTEx, DIRECT, or eQTLGen. Of these {:,}, {:,} ({}%) colocalize with at least one PanUKBB GWAS signal, emphasizing their potential biological relevance and usefulness in downstream analysis".format(
len(topmed_whole_blood_ciseqtl[['phenotype_id', 'cs_id']].drop_duplicates()),
comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts()[0],
round(100*comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts()[0] / comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts().sum(), 1),
comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts()[0],
colocs[['xqtl_phenotype', 'xqtl_cs', 'in_other_scan_cs']].drop_duplicates().in_other_scan_cs.value_counts()[0],
round(100*colocs[['xqtl_phenotype', 'xqtl_cs', 'in_other_scan_cs']].drop_duplicates().in_other_scan_cs.value_counts()[0] / comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts()[0], 1)


)


'For example, in whole blood we detect 69,766 total cis-eQTL signals  for 19,468 genes, 51,037 (73.2%) of which were not detected in GTEx, DIRECT, or eQTLGen. Of these 51,037, 2,760 (5.4%) colocalize with at least one PanUKBB GWAS signal, emphasizing their potential biological relevance and usefulness in downstream analysis'

In [22]:
comparisons_against_all_full[comparisons_against_all_full.src=='TOPMed'].in_other_scan_cs.value_counts()[0]

51037

In [23]:
colocs.head()

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_tissue,xqtl_maf,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs
0,2162,chr1_111197723_C_G,chr1_111197723_C_G,0.0,2.663501e-14,0.0,6.366463e-15,1.0,4,1,...,Whole_blood,1%,ENSG00000162777,joint,ENSG00000162777.17,chr1_110947722_111447723,EUR,biomarkers-30750-both_sexes-irnt,biomarkers-30750-both_sexes-irnt___EUR___chr1_...,1
1,2160,chr1_111197723_C_G,chr1_111197723_C_G,0.0,2.076595e-11,0.0,1.13032e-11,1.0,4,2,...,Whole_blood,1%,ENSG00000162777,joint,ENSG00000162777.17,chr1_110943971_111443972,EUR,continuous-30080-both_sexes-irnt,continuous-30080-both_sexes-irnt___EUR___chr1_...,1
2,6415,chr15_63049797_T_C,chr15_63049797_T_C,0.0,3.198827e-177,0.0,0.0,1.0,4,1,...,Whole_blood,1%,ENSG00000140416,joint,ENSG00000140416.21,chr15_62392636_63970660,EUR,continuous-30100-both_sexes-irnt,continuous-30100-both_sexes-irnt___EUR___chr15...,1
3,5194,chr15_63049797_T_C,chr15_63049797_T_C,0.0,5.783741e-110,0.0,0.0,1.0,4,2,...,Whole_blood,1%,ENSG00000140416,joint,ENSG00000140416.21,chr15_62799038_65581301,EUR,continuous-30080-both_sexes-irnt,continuous-30080-both_sexes-irnt___EUR___chr15...,1
4,2026,chr1_159205564_G_A,chr1_159205564_G_A,1.501781e-170,9.969781000000001e-52,3.0126649999999996e-122,0.0,1.0,2,2,...,Whole_blood,1%,ENSG00000213088,joint,ENSG00000213088.11,chr1_158955703_159455704,EUR,continuous-30130-both_sexes-irnt,continuous-30130-both_sexes-irnt___EUR___chr1_...,1


In [24]:
colocs[colocs.xqtl_gene=='ENSG00000137507.11']

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_tissue,xqtl_maf,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs
1831,2942,chr11_76582483_G_T,chr11_76582714_G_A,6.5235000000000005e-81,6.7862959999999995e-56,6.895906e-28,0.005184,0.994816,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76074762_76832714,EUR,categorical-6152-both_sexes-100,categorical-6152-both_sexes-100___EUR___chr11_...,0
1832,2934,chr11_76582483_G_T,chr11_76582714_G_A,3.905879e-125,4.0632259999999996e-100,7.5088500000000005e-28,0.005823,0.994177,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76071628_76832714,EUR,continuous-30210-both_sexes-irnt,continuous-30210-both_sexes-irnt___EUR___chr11...,0
1833,1496,chr11_76582483_G_T,chr11_76582714_G_A,1.961336e-30,2.040348e-05,1.044228e-27,0.008881,0.991099,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76332713_76832714,EUR,categorical-20003-both_sexes-1140881856,categorical-20003-both_sexes-1140881856___EUR_...,0
1834,1675,chr11_76582483_G_T,chr11_76582761_G_C,1.15404e-31,1.20053e-06,1.952618e-27,0.018349,0.981649,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76338386_76838387,EUR,categorical-20003-both_sexes-1140861998,categorical-20003-both_sexes-1140861998___EUR_...,0
1835,1852,chr11_76582483_G_T,chr11_76584554_G_A,1.063491e-35,1.106334e-10,2.4045500000000003e-27,0.02306,0.97694,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76334553_76834554,EUR,continuous-135-both_sexes,continuous-135-both_sexes___EUR___chr11_763345...,0
1836,1842,chr11_76582483_G_T,chr11_76570549_C_T,3.314461e-30,3.447983e-05,5.979166e-27,0.060321,0.939644,2,1,...,Whole_blood,1%,ENSG00000137507,joint,ENSG00000137507.11,chr11_76334553_76834554,EUR,continuous-AG-both_sexes-irnt,continuous-AG-both_sexes-irnt___EUR___chr11_76...,0


In [26]:
gwas_traits = pd.read_csv('/net/topmed11/working/porchard/panukbb-finemapping/work/selected-traits/manifest.EUR.txt', sep='\t').rename(columns={'trait_id': 'gwas_trait'})
gwas_traits.head()

Unnamed: 0,trait_type,phenocode,coding,modifier,description,description_more,coding_description,category,n_cases_EUR,n_controls_EUR,sldsc_25bin_h2_observed_EUR,sldsc_25bin_h2_observed_se_EUR,sldsc_25bin_h2_liability_EUR,sldsc_25bin_h2_liability_se_EUR,sldsc_25bin_h2_z_EUR,lambda_gc_EUR,aws_path,aws_path_tabix,gwas_trait
0,biomarkers,30600,,irnt,Albumin,,,Biological samples > Assay results > Blood ass...,367192.0,,0.1448,0.0087,0.1448,0.0087,16.644,1.368,s3://pan-ukb-us-east-1/sumstats_flat_files/bio...,s3://pan-ukb-us-east-1/sumstats_flat_files_tab...,biomarkers-30600-both_sexes-irnt
1,biomarkers,30610,,irnt,Alkaline phosphatase,,,Biological samples > Assay results > Blood ass...,400988.0,,0.2047,0.0171,0.2047,0.0171,11.971,1.6682,s3://pan-ukb-us-east-1/sumstats_flat_files/bio...,s3://pan-ukb-us-east-1/sumstats_flat_files_tab...,biomarkers-30610-both_sexes-irnt
2,biomarkers,30620,,irnt,Alanine aminotransferase,,,Biological samples > Assay results > Blood ass...,400822.0,,0.1235,0.0091,0.1235,0.0091,13.571,1.415,s3://pan-ukb-us-east-1/sumstats_flat_files/bio...,s3://pan-ukb-us-east-1/sumstats_flat_files_tab...,biomarkers-30620-both_sexes-irnt
3,biomarkers,30630,,irnt,Apolipoprotein A,,,Biological samples > Assay results > Blood ass...,364987.0,,0.1822,0.012,0.1822,0.012,15.183,1.6285,s3://pan-ukb-us-east-1/sumstats_flat_files/bio...,s3://pan-ukb-us-east-1/sumstats_flat_files_tab...,biomarkers-30630-both_sexes-irnt
4,biomarkers,30640,,irnt,Apolipoprotein B,,,Biological samples > Assay results > Blood ass...,399003.0,,0.102,0.0114,0.102,0.0114,8.9474,1.4551,s3://pan-ukb-us-east-1/sumstats_flat_files/bio...,s3://pan-ukb-us-east-1/sumstats_flat_files_tab...,biomarkers-30640-both_sexes-irnt


In [27]:
print(len(colocs))
colocs = colocs.merge(gwas_traits[['description', 'description_more', 'gwas_trait']])
print(len(colocs))
colocs[colocs.xqtl_gene=='ENSG00000137507.11']


8641
8641


Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs,description,description_more
2355,2934,chr11_76582483_G_T,chr11_76582714_G_A,3.905879e-125,4.0632259999999996e-100,7.5088500000000005e-28,0.005823,0.994177,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76071628_76832714,EUR,continuous-30210-both_sexes-irnt,continuous-30210-both_sexes-irnt___EUR___chr11...,0,Eosinophill percentage,"Result of ""Eosinophils Percentage"" assay, perf..."
2722,1842,chr11_76582483_G_T,chr11_76570549_C_T,3.314461e-30,3.447983e-05,5.979166e-27,0.060321,0.939644,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76334553_76834554,EUR,continuous-AG-both_sexes-irnt,continuous-AG-both_sexes-irnt___EUR___chr11_76...,0,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...
7190,2942,chr11_76582483_G_T,chr11_76582714_G_A,6.5235000000000005e-81,6.7862959999999995e-56,6.895906e-28,0.005184,0.994816,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76074762_76832714,EUR,categorical-6152-both_sexes-100,categorical-6152-both_sexes-100___EUR___chr11_...,0,"Blood clot, DVT, bronchitis, emphysema, asthma...","ACE touchscreen question ""Has a doctor ever to..."
7539,1675,chr11_76582483_G_T,chr11_76582761_G_C,1.15404e-31,1.20053e-06,1.952618e-27,0.018349,0.981649,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76338386_76838387,EUR,categorical-20003-both_sexes-1140861998,categorical-20003-both_sexes-1140861998___EUR_...,0,Treatment/medication code,Code for treatment Negative codes indicate fre...
7548,1496,chr11_76582483_G_T,chr11_76582714_G_A,1.961336e-30,2.040348e-05,1.044228e-27,0.008881,0.991099,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76332713_76832714,EUR,categorical-20003-both_sexes-1140881856,categorical-20003-both_sexes-1140881856___EUR_...,0,Treatment/medication code,Code for treatment Negative codes indicate fre...
8132,1852,chr11_76582483_G_T,chr11_76584554_G_A,1.063491e-35,1.106334e-10,2.4045500000000003e-27,0.02306,0.97694,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76334553_76834554,EUR,continuous-135-both_sexes,continuous-135-both_sexes___EUR___chr11_763345...,0,Number of self-reported non-cancer illnesses,Number of non-cancer illnesses entered


In [29]:
colocs[colocs.xqtl_gene=='ENSG00000137507.11'].gwas_signal.values

array(['continuous-30210-both_sexes-irnt___EUR___chr11_76071628_76832714',
       'continuous-AG-both_sexes-irnt___EUR___chr11_76334553_76834554',
       'categorical-6152-both_sexes-100___EUR___chr11_76074762_76832714',
       'categorical-20003-both_sexes-1140861998___EUR___chr11_76338386_76838387',
       'categorical-20003-both_sexes-1140881856___EUR___chr11_76332713_76832714',
       'continuous-135-both_sexes___EUR___chr11_76334553_76834554'],
      dtype=object)

In [30]:
colocs[(colocs.xqtl_gene=='ENSG00000137507.11') & (colocs.gwas_signal=='categorical-6152-both_sexes-100___EUR___chr11_76074762_76832714')]

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs,description,description_more
7190,2942,chr11_76582483_G_T,chr11_76582714_G_A,6.5235000000000005e-81,6.7862959999999995e-56,6.895906e-28,0.005184,0.994816,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76074762_76832714,EUR,categorical-6152-both_sexes-100,categorical-6152-both_sexes-100___EUR___chr11_...,0,"Blood clot, DVT, bronchitis, emphysema, asthma...","ACE touchscreen question ""Has a doctor ever to..."


In [31]:
colocs[(colocs.gwas_signal=='categorical-6152-both_sexes-100___EUR___chr11_76074762_76832714')]

Unnamed: 0,nsnps,hit1,hit2,PP.H0.abf,PP.H1.abf,PP.H2.abf,PP.H3.abf,PP.H4.abf,idx1,idx2,...,xqtl_phenotype,xqtl_ancestry,xqtl_gene,gwas_region,gwas_ancestry,gwas_trait,gwas_signal,in_other_scan_cs,description,description_more
7190,2942,chr11_76582483_G_T,chr11_76582714_G_A,6.5235000000000005e-81,6.7862959999999995e-56,6.895906e-28,0.005184,0.994816,2,1,...,ENSG00000137507,joint,ENSG00000137507.11,chr11_76074762_76832714,EUR,categorical-6152-both_sexes-100,categorical-6152-both_sexes-100___EUR___chr11_...,0,"Blood clot, DVT, bronchitis, emphysema, asthma...","ACE touchscreen question ""Has a doctor ever to..."


In [28]:
colocs.loc[7190]

nsnps                                                            2942
hit1                                               chr11_76582483_G_T
hit2                                               chr11_76582714_G_A
PP.H0.abf                                                         0.0
PP.H1.abf                                                         0.0
PP.H2.abf                                                         0.0
PP.H3.abf                                                    0.005184
PP.H4.abf                                                    0.994816
idx1                                                                2
idx2                                                                1
chrom                                                           chr11
region_start                                                 76075261
region_end                                                   76832430
xqtl_cs                                                            L1
gwas_cs             

In [None]:
colocs[coloced.gwas_signal=='']