In [1]:
import re

import pandas as pd

def parse_attribute(attribute_series: pd.Series, attribute_name: str) -> pd.Series:
    """
    Extract one attribute's value from the attributes column of a (GENCODE/RefSeq) GTF file.

    Input:
    * attribute_series [pd.Series]: the attributes elements (column 9 of the GTF file), one string per record
    * attribute_name [str]: the attribute key whose quoted value should be extracted (e.g. 'gene_id'). The key is matched literally and only as a whole key.

    Returns:
    * pd.Series, indexed like attribute_series, holding the first quoted value that follows the key in each record (NaN where the key is absent)

    Raises:
    * TypeError if attribute_series is not a pandas Series or attribute_name is not a string
    """
    if not isinstance(attribute_series, pd.Series):
        raise TypeError('attribute_series must be a pandas Series')
    if not isinstance(attribute_name, str):
        raise TypeError('attribute_name must be a string')

    # Escape the key so regex metacharacters in it are matched literally, and use a
    # lookbehind so the match cannot start in the middle of a longer key
    # (e.g. 'gene_id' inside 'havana_gene_id').
    pattern = rf'(?<!\w){re.escape(attribute_name)} "(.*?)"'

    # expand=False yields a Series (matching the return annotation) instead of the
    # one-column DataFrame that str.extract returns by default.
    return attribute_series.str.extract(pattern, expand=False)


def gtf_to_df(gtf: str, parse_attributes: list=None) -> pd.DataFrame:
    """
    Load a GTF file into a DataFrame, optionally adding one column per requested attribute.

    Input:
    * gtf [str]: path to the tab-separated GTF file (handed to pd.read_csv)
    * parse_attributes [list]: attribute keys to extract from the attributes column with parse_attribute; each one becomes a column named after the key. None (the default) adds no columns.

    Returns:
    * pd.DataFrame with the nine GTF columns ('chrom' through 'attributes') plus any parsed attribute columns
    """
    gtf_columns = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    # The file has no header row; comment='#' makes pandas ignore '#'-prefixed content.
    records = pd.read_csv(gtf, sep='\t', header=None, names=gtf_columns, comment='#')

    if parse_attributes is None:
        return records

    for key in parse_attributes:
        records[key] = parse_attribute(records['attributes'], key)
    return records

# Prefix prepended to the name of every output file written by this notebook.
PREFIX = 'summarize-panukbb-coloc.'

# Gene annotation GTF (GENCODE v30 / GRCh38 with ERCC, per the file name).
GTF = '../../manuscript-intermediate-processing/data/gtf/gencode.v30.GRCh38.ERCC.genes.collapsed_only.gtf.gz'

# Keep only the gene-level records, then build a gene ID -> gene name lookup.
gtf_df = (
    gtf_to_df(GTF, parse_attributes=['gene_id', 'gene_name'])
    .loc[lambda records: records['feature'] == 'gene']
)
gene_id_to_gene_name = dict(zip(gtf_df['gene_id'], gtf_df['gene_name']))

In [2]:
# Trait manifest: keep only the trait identifier and its descriptive columns.
manifest = pd.read_csv(
    '/net/topmed11/working/porchard/panukbb-finemapping/work/selected-traits/manifest.EUR.txt',
    sep='\t',
)[['trait_id', 'description', 'description_more', 'coding_description']]

# Joint coloc results, restricted to rows whose GWAS ancestry is EUR.
coloced_joint = (
    pd.read_csv('../../manuscript-intermediate-processing/work/coloc/panukbb/joint/postprocessed.txt', sep='\t')
    .loc[lambda colocs: colocs['gwas_ancestry'] == 'EUR']
)

In [3]:
# For each xQTL MAF threshold, print the number of distinct (gwas_cs, gwas_signal)
# pairs that colocalize with a cis-xQTL in whole blood.
for maf_threshold in ('0.1%', '1%'):
    whole_blood_cis = (
        coloced_joint['xqtl_modality'].str.contains('cis')
        & (coloced_joint['xqtl_tissue'] == 'Whole_blood')
        & (coloced_joint['xqtl_maf'] == maf_threshold)
    )
    print(len(coloced_joint.loc[whole_blood_cis, ['gwas_cs', 'gwas_signal']].drop_duplicates()))

7260
7241


In [4]:
# Discard the rows from the 0.1% xQTL MAF threshold; everything below uses what remains.
coloced_joint = coloced_joint.loc[coloced_joint['xqtl_maf'].ne('0.1%')]

In [5]:
# Number of distinct (gwas_signal, gwas_cs) pairs left after the MAF filter.
coloced_joint.loc[:, ['gwas_signal', 'gwas_cs']].drop_duplicates().shape[0]

10611

In [6]:
# Count colocs per (tissue, MAF, modality, gene, GWAS trait) combination so that
# gene/trait pairs with more than one coloc can be identified.
# Every combination is kept here, including those with a single coloc.
x = (
    coloced_joint
    .groupby(['xqtl_tissue', 'xqtl_maf', 'xqtl_modality', 'xqtl_gene', 'gwas_trait'])
    .size()
    .reset_index(name='n_colocs')
)
x.head()

Unnamed: 0,xqtl_tissue,xqtl_maf,xqtl_modality,xqtl_gene,gwas_trait,n_colocs
0,Lung,1%,ciseqtl,ENSG00000001461.17,categorical-1747-both_sexes-4,1
1,Lung,1%,ciseqtl,ENSG00000001561.7,biomarkers-30770-both_sexes-irnt,1
2,Lung,1%,ciseqtl,ENSG00000002726.20,biomarkers-30630-both_sexes-irnt,1
3,Lung,1%,ciseqtl,ENSG00000002726.20,biomarkers-30680-both_sexes-irnt,1
4,Lung,1%,ciseqtl,ENSG00000002726.20,biomarkers-30760-both_sexes-irnt,1


In [7]:
# Distribution of the number of colocs per gene/trait combination.
x['n_colocs'].value_counts()

1    42775
2      609
3       44
4        6
Name: n_colocs, dtype: int64

In [8]:
# Attach the trait descriptions (the merge joins on the shared column, 'gwas_trait'
# once 'trait_id' is renamed), put the combinations with the most colocs first,
# and translate gene IDs into gene names.
x = (
    x
    .merge(manifest.rename(columns={'trait_id': 'gwas_trait'}))
    .sort_values('n_colocs', ascending=False)
    .assign(gene_name=lambda colocs: colocs['xqtl_gene'].map(gene_id_to_gene_name))
)
x.head()

Unnamed: 0,xqtl_tissue,xqtl_maf,xqtl_modality,xqtl_gene,gwas_trait,n_colocs,description,description_more,coding_description,gene_name
16029,Lung,1%,cissqtl,ENSG00000257017.8,biomarkers-30690-both_sexes-irnt,4,Cholesterol,,,HP
14895,Whole_blood,1%,cissqtl,ENSG00000244682.7,continuous-AG-both_sexes-irnt,4,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,,FCGR2C
16691,Lung,1%,cissqtl,ENSG00000257017.8,continuous-LDLC-both_sexes-medadj_irnt,4,"LDL direct, adjusted by medication",LDLC (30780) adjusted by the use of cholestero...,,HP
18350,Lung,1%,cissqtl,ENSG00000257017.8,biomarkers-30640-both_sexes-irnt,4,Apolipoprotein B,,,HP
25742,Whole_blood,1%,ciseqtl,ENSG00000172216.5,continuous-30130-both_sexes-irnt,4,Monocyte count,"Result of ""Monocytes Number"" assay, performed ...",,CEBPB


In [9]:
# Fix the column order, then tidy the names: strip the 'xqtl_' prefix and mark
# the trait description columns with a 'gwas_' prefix.
x = (
    x.loc[:, [
        'xqtl_tissue', 'xqtl_maf', 'xqtl_modality', 'xqtl_gene', 'gene_name',
        'n_colocs', 'gwas_trait', 'description', 'description_more', 'coding_description',
    ]]
    .rename(columns=lambda name: name.replace('xqtl_', ''))
    .rename(columns=lambda name: 'gwas_' + name if 'description' in name else name)
)
x

Unnamed: 0,tissue,maf,modality,gene,gene_name,n_colocs,gwas_trait,gwas_description,gwas_description_more,gwas_coding_description
16029,Lung,1%,cissqtl,ENSG00000257017.8,HP,4,biomarkers-30690-both_sexes-irnt,Cholesterol,,
14895,Whole_blood,1%,cissqtl,ENSG00000244682.7,FCGR2C,4,continuous-AG-both_sexes-irnt,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,
16691,Lung,1%,cissqtl,ENSG00000257017.8,HP,4,continuous-LDLC-both_sexes-medadj_irnt,"LDL direct, adjusted by medication",LDLC (30780) adjusted by the use of cholestero...,
18350,Lung,1%,cissqtl,ENSG00000257017.8,HP,4,biomarkers-30640-both_sexes-irnt,Apolipoprotein B,,
25742,Whole_blood,1%,ciseqtl,ENSG00000172216.5,CEBPB,4,continuous-30130-both_sexes-irnt,Monocyte count,"Result of ""Monocytes Number"" assay, performed ...",
...,...,...,...,...,...,...,...,...,...,...
14549,Whole_blood,1%,ciseqtl,ENSG00000132170.21,PPARG,1,continuous-AG-both_sexes-irnt,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,
14550,Whole_blood,1%,ciseqtl,ENSG00000132185.16,FCRLA,1,continuous-AG-both_sexes-irnt,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,
14551,Whole_blood,1%,ciseqtl,ENSG00000132475.10,H3F3B,1,continuous-AG-both_sexes-irnt,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,
14552,Whole_blood,1%,ciseqtl,ENSG00000133030.21,MPRIP,1,continuous-AG-both_sexes-irnt,Albumin/Globulin ratio,Albumin (30600) / (Total Protein [30860] - Alb...,


In [10]:
# Write the gene/trait combinations that have more than one coloc to a CSV table.
x.loc[x['n_colocs'] > 1].to_csv(
    f'tables/{PREFIX}multicolocs.csv',
    doublequote=True,
    index=False,
)

In [11]:
# Number of gene/trait combinations with more than one coloc.
x.loc[x['n_colocs'] > 1].shape[0]

659

In [12]:
# Same count, explicitly excluding the 0.1% MAF threshold.
x.loc[x['n_colocs'].gt(1) & x['maf'].ne('0.1%')].shape[0]

659

In [13]:
# Break the multi-coloc combinations down by tissue, MAF threshold and modality.
(
    x.loc[x['n_colocs'] > 1]
    .groupby(['tissue', 'maf', 'modality'])
    .size()
)

tissue            maf  modality 
Lung              1%   ciseqtl       43
                       cissqtl       51
Monocyte          1%   ciseqtl       17
                       cissqtl       10
Nasal_epithelial  1%   ciseqtl       17
                       cissqtl       25
PBMC              1%   ciseqtl       43
                       cissqtl       51
T_cell            1%   ciseqtl        7
                       cissqtl       16
Whole_blood       1%   ciseqtl      186
                       cissqtl      133
                  5%   transeqtl     59
                       transsqtl      1
dtype: int64