# Enrichment Analysis on LCI Predictions

Analyzes the enrichment of proteins predicted by LCI

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))

import numpy as np
import matplotlib.pyplot as plt 
import torch
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm_notebook as tqdm
import goatools
from goatools.base import download_go_basic_obo, download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.go_enrichment import GOEnrichmentStudy


from dpp.methods.lci.lci_method import LCIModule
from dpp.data.network import PPINetwork
from dpp.util import Params
from dpp.data.associations import load_diseases

## Loading Data
Load disease associations and protein-protein interaction network.

In [2]:
# load diseases
diseases_dict = load_diseases("../data/associations/disgenet-associations.csv", exclude_splits=['none'])

In [3]:
# load network
network = PPINetwork("../data/networks/bio-pathways-network.txt")
n = len(network)

## Load Predictions
Load predictions from a disease protein prediction method.

In [4]:
predictions= pd.read_csv("../experiments/bio-pathways/dpp_predict/lci/predictions.csv", index_col=0)

## Load Enrichment Analysis
Prepare an enrichment study

In [5]:
# load gene ontology
obodag = GODag("../data/go/go-basic.obo")

../data/go/go-basic.obo: fmt(1.2) rel(2019-01-19) 47,379 GO Terms


In [6]:
geneid2go = read_ncbi_gene2go("../data/go/gene2go.txt", taxids=[9606])

  20,385 items READ: ../data/go/gene2go.txt


In [7]:
goeaobj = GOEnrichmentStudy(network.get_proteins(), # List of mouse protein-coding genes
                            geneid2go, # geneid/GO associations
                            obodag, # Ontologies
                            propagate_counts = True,
                            alpha = 0.05, # default significance cut-off
                            methods = ['fdr_bh']) # defult multipletest correction method

fisher module not installed.  Falling back on scipy.stats.fisher_exact


Propagating term counts to parents ..


 76% 16,420 of 21,557 population items found in association


## Perform Enrichment Analysis
Perform an enrichment analysis on one disease. 

In [15]:
disease_id = "C1862314"

In [16]:
# run disease enrichment
disease_proteins = set(diseases_dict[disease_id].proteins)
disease_results = goeaobj.run_study(disease_proteins)

100%     23 of     23 study items found in association
100%     23 of     23 study items found in population(21557)
Calculating 21,968 uncorrected p-values using fisher_scipy_stats
  21,968 GO terms are associated with 16,420 of 21,557 population items
   1,564 GO terms are associated with     23 of     23 study items
     334 GO terms found significant (< 0.05=alpha) after multitest correction: statsmodels fdr_bh


In [18]:
# run prediction enrichment
pred_proteins = set(map(int, predictions.loc[disease_id].sort_values(ascending=False).index[:len(disease_proteins)]))
pred_results = goeaobj.run_study(pred_proteins)

 78%     18 of     23 study items found in association
100%     23 of     23 study items found in population(21557)
Calculating 21,968 uncorrected p-values using fisher_scipy_stats
  21,968 GO terms are associated with 16,420 of 21,557 population items
   1,263 GO terms are associated with     18 of     23 study items
     257 GO terms found significant (< 0.05=alpha) after multitest correction: statsmodels fdr_bh


In [19]:
k = 10
disease_top_k = sorted(disease_results, key=lambda x: x.p_fdr_bh)[:k]
disease_significant = [r for r in disease_results if r.p_fdr_bh < 0.05]

pred_top_k = sorted(pred_results, key=lambda x: x.p_fdr_bh)[:k]
pred_significant = [r for r in pred_results if r.p_fdr_bh < 0.05]

In [23]:
intersection = set([result.goterm.name for result in disease_top_k]) & set([result.goterm.name for result in pred_top_k])
union = set([result.goterm.name for result in disease_top_k]) | set([result.goterm.name for result in pred_top_k])

print("Jaccard Similarity: {}".format(1.0*len(intersection)/len(union)))

Jaccard Similarity: 0.25


In [22]:
intersection = set([result.goterm.name for result in pred_significant]) & set([result.goterm.name for result in disease_significant])
union = set([result.goterm.name for result in pred_significant]) | set([result.goterm.name for result in disease_significant])

print("Jaccard Similarity: {}".format(1.0*len(intersection)/len(union)))

Jaccard Similarity: 0.26282051282051283


In [None]:
# compute jaccard similarity for all diseases
# find ten diseases 

In [29]:
x = [{"disease": {"x": 1, "y": 2, "z": 3}, 
     "pred": {"x": 3, "y": 2, "z": 3}},
     {"disease": {"x": 5, "y": 6, "z": 7}, 
     "pred": {"x": 5, "y": 7, "z": 6}}]
dict_of_df = {k: pd.DataFrame(v) for k,v in x.items()}
dpd.concat(dict_of_df, axis=1)

AttributeError: 'list' object has no attribute 'items'

In [31]:
diseases_dict

{'C1842060': <dpp.data.associations.Disease at 0x10d0f36d8>,
 'C2239176': <dpp.data.associations.Disease at 0x10d0f36a0>,
 'C0149721': <dpp.data.associations.Disease at 0x10d0f32b0>,
 'C0020428': <dpp.data.associations.Disease at 0x10d0f3780>,
 'C0235153': <dpp.data.associations.Disease at 0x10d0f3748>,
 'C1527311': <dpp.data.associations.Disease at 0x13aee6cf8>,
 'C0079744': <dpp.data.associations.Disease at 0x10d0f35f8>,
 'C0024667': <dpp.data.associations.Disease at 0x10d0f3940>,
 'C0018800': <dpp.data.associations.Disease at 0x10d0f32e8>,
 'C0002514': <dpp.data.associations.Disease at 0x10d0f3588>,
 'C0043352': <dpp.data.associations.Disease at 0x10d0f3550>,
 'C0013182': <dpp.data.associations.Disease at 0x10d0f3978>,
 'C0018824': <dpp.data.associations.Disease at 0x10d0f35c0>,
 'C0332606': <dpp.data.associations.Disease at 0x10d0f3630>,
 'C1842170': <dpp.data.associations.Disease at 0x10d0f3668>,
 'C0917798': <dpp.data.associations.Disease at 0x10d0f34e0>,
 'C0017639': <dpp.data.a