# Check for significance in BMI subsystems

Try integrating mouse KO data (from MGI)

3/23/21: switch to sampled gene sets results

6/14/21: add enrichment for just seed genes

6/29/21: update to be more consistent with NetColoc notebooks

8/9/21: add Brittany's BMI genes

In [1]:
# load required packages

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import random

from IPython.display import display

import getpass
import ndex2

import json
import cdapsutil

# need ddot to parse the ontology
import ddot
from ddot import Ontology

# annotate the clusters
# gprofiler prelim annotation
from gprofiler import GProfiler
gp = GProfiler("MyToolName/0.1")

from statsmodels.stats import contingency_tables

import requests

# find human orthologs of mouse genes
import mygene
mg = mygene.MyGeneInfo()

from scipy.stats import hypergeom
from scipy.stats import norm

# latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

sns.set(font_scale=1.4)

sns.set_style('white')

sns.set_style("ticks", {"xtick.major.size": 15, "ytick.major.size": 15})
plt.rcParams['svg.fonttype'] = 'none'

import sys

# % matplotlib inline

DISCLAIMIER: cdapsutil is experimental and may contain errors and interfaces may change


# Load interactome

In [2]:

# NDEx coordinates of the interactome; no user/password is supplied
interactome_uuid = '4de852d9-9908-11e9-bcaf-0ac135e8bacf'  # for PCNet
# interactome_uuid = '275bd84e-3d18-11e8-a935-0ac135e8bacf'  # for STRING high confidence
ndex_server = 'public.ndexbio.org'
ndex_user, ndex_password = None, None

# fetch the network as NiceCX and convert it straight to a networkx graph
G_int = ndex2.create_nice_cx_from_server(
    ndex_server,
    username=ndex_user,
    password=ndex_password,
    uuid=interactome_uuid,
).to_networkx()
nodes = list(G_int.nodes())

# PCNet appears to contain some self edges; remove them
G_int.remove_edges_from(nx.selfloop_edges(G_int))

# report interactome size for diagnostic purposes
print('number of nodes:', len(G_int.nodes()), sep='\n')
print('\nnumber of edges:', len(G_int.edges()), sep='\n')

number of nodes:
18820

number of edges:
2693109


In [3]:
# list of interactome node names
nodes = list(G_int.nodes())

# First load overlap network computed in significance_network_overlap_sampling_210519.ipynb

In [4]:

# same public NDEx server, no user/password supplied
ndex_server = 'public.ndexbio.org'
ndex_user, ndex_password = None, None

# keep the NiceCX object: it is passed to community detection later
G_overlap_cx = ndex2.create_nice_cx_from_server(
    ndex_server,
    username=ndex_user,
    password=ndex_password,
    uuid='e8cc9239-d91a-11eb-b666-0ac135e8bacf',
)

# networkx copy of the overlap network, with its size printed for diagnostics
G_overlap = G_overlap_cx.to_networkx()
print('number of nodes:', len(G_overlap.nodes()), sep='\n')
print('\nnumber of edges:', len(G_overlap.edges()), sep='\n')

number of nodes:
657

number of edges:
5287


In [5]:
# one row per node of the overlap network, one column per node attribute
node_df = pd.DataFrame.from_dict(
    {gene: attrs for gene, attrs in G_overlap.nodes(data=True)},
    orient='index',
)
node_df.head(5)

Unnamed: 0,z_hBMI,hBMI_seeds,rBMI_seeds,z_both,z_rBMI
EMILIN2,3.299327450717328,0,0,4.319544181058447,1.3092196047771212
BACE2,2.8994208805031514,1,0,5.1352520072145325,1.7711302425066986
ABCF3,1.342286612722842,0,0,3.16588457673184,2.358575692198711
EIF2AK4,6.620334089952722,0,1,61.32723979498262,9.263466006655952
HSP90AB1,2.4374615254638914,0,1,25.352273034423177,10.401096702274389


In [6]:
# compute cosine similarity transformation
from netcoloc import network_colocalization

In [7]:
# show the help for transform_edges (trailing `?` is IPython syntax, not plain Python)
network_colocalization.transform_edges?

In [8]:
# transform the overlap network's edges with netcoloc, using a 0.98 edge-weight threshold
G_overlap_cosSim = network_colocalization.transform_edges(
    G_overlap,
    edge_weight_threshold=0.98,
)
print(G_overlap_cosSim.number_of_nodes())
print(G_overlap_cosSim.number_of_edges())

computing the adjacency matrix...
computing the cosine similarity...


  dist = 1.0 - uv / np.sqrt(uu * vv)


rank transforming...
number of transformed edges returned = 
71
657
71


  sim_rank.values[[np.arange(sim_rank.shape[0])]*2] = 0


In [9]:
# size of the transformed network: node count, then edge count
print(G_overlap_cosSim.number_of_nodes())
print(G_overlap_cosSim.number_of_edges())

657
71


In [None]:
# upload G_cosSim to ndex
print(G_overlap_cosSim.number_of_nodes())
print(G_overlap_cosSim.number_of_edges())
G_overlap_cosSim_cx = ndex2.create_nice_cx_from_networkx(G_overlap_cosSim)
G_overlap_cosSim_cx.set_name('rat_human_BMI_CosSim98')

# copy every node_df column onto the matching CX node;
# only the *_name columns are typed as strings, everything else as double
string_columns = ('d1_name', 'd2_name')
for node_id, node in G_overlap_cosSim_cx.get_nodes():
    for row, value in node_df.loc[node['n']].items():
        data_type = 'string' if row in string_columns else 'double'
        G_overlap_cosSim_cx.set_node_attribute(node_id, row, value, type=data_type)

# apply a template style (2cbed84b-e5c3-11eb-b666-0ac135e8bacf)
G_overlap_cosSim_cx.apply_template('ndexbio.org', '2cbed84b-e5c3-11eb-b666-0ac135e8bacf')

# credentials are prompted for interactively so they never live in the notebook
SERVER = input('NDEx server (probably ndexbio.org): ')
USERNAME = input('NDEx user name: ')
PASSWORD = getpass.getpass('NDEx password: ')
network_uuid_NetColoc_CosSim = G_overlap_cosSim_cx.upload_to(SERVER, USERNAME, PASSWORD)

657
71


# Build multiscale systems map



In [None]:
# Run HiDeF on CDAPS REST service
cd = cdapsutil.CommunityDetection()
G_hier = cd.run_community_detection(
    G_overlap_cx,
    algorithm='hidefv1.1beta',
    arguments={'--maxres': '10'},
)

In [None]:
# Summary of the hierarchy returned by community detection
print('Hierarchy name: {}'.format(G_hier.get_name()))
print('# nodes: {}'.format(len(G_hier.get_nodes())))
print('# edges: {}'.format(len(G_hier.get_edges())))

In [None]:
# convert the hierarchy from NiceCX to networkx (G_hier is rebound to the networkx object)
G_hier = G_hier.to_networkx(mode='default')
nodes = G_hier.nodes()

# report hierarchy size for diagnostic purposes
print('number of nodes:', len(G_hier.nodes()), sep='\n')
print('\nnumber of edges:', len(G_hier.edges()), sep='\n')

In [None]:
# tabulate the hierarchy's node attributes: one row per system
hier_df = pd.DataFrame.from_dict(
    {system: attrs for system, attrs in G_hier.nodes(data=True)},
    orient='index',
)
hier_df['system_ID'] = hier_df.index.tolist()

# these two columns are cast element-wise to int
for int_col in ['CD_MemberList_Size', 'HiDeF_persistence']:
    hier_df[int_col] = [int(v) for v in hier_df[int_col].tolist()]
hier_df.head()

# add fraction rat/human seeds

In [None]:
# Count human/rat seed genes per system and keep only systems containing at least one seed.
# Column naming in this cell: d1 = human (H) seeds, d2 = rat (R) seeds.

# index systems by name so they can be selected by name below
hier_df.index=hier_df['name']

# seed genes present in the overlap network. The flag is coerced to a number so the
# comparison works whether the attribute is stored as the string '1' or the number 1.
H_seeds_overlap = node_df[pd.to_numeric(node_df['hBMI_seeds'],errors='coerce')==1].index.tolist()
print(len(H_seeds_overlap))

R_seeds_overlap = node_df[pd.to_numeric(node_df['rBMI_seeds'],errors='coerce')==1].index.tolist()
print(len(R_seeds_overlap))

# sets give the same unique-overlap counts as np.intersect1d and also handle empty seed lists
H_seed_set, R_seed_set = set(H_seeds_overlap), set(R_seeds_overlap)

num_d1_seeds, num_d2_seeds = [],[]
frac_d1_seeds, frac_d2_seeds=[],[]
systems_keep = []
for c in hier_df.index.tolist():
    system_genes = hier_df['CD_MemberList'].loc[c].split(' ')
    system_gene_set = set(system_genes)
    num_H_temp = len(system_gene_set & H_seed_set)
    num_R_temp = len(system_gene_set & R_seed_set)
    # keep the system if it has at least 1 seed gene from EITHER species
    # (previously the human count was added to itself, so rat-only systems were dropped)
    if (num_H_temp+num_R_temp)>0:
        systems_keep.append(c)
        num_d1_seeds.append(num_H_temp)
        num_d2_seeds.append(num_R_temp)

        # builtin float: the np.float alias no longer exists in current NumPy
        frac_d1_seeds.append(num_H_temp/float(len(system_genes)))
        frac_d2_seeds.append(num_R_temp/float(len(system_genes)))

# remainder after subtracting both seed fractions
frac_no_seeds = np.subtract(1.0,np.add(frac_d1_seeds,frac_d2_seeds))

# restrict hier_df to the kept systems and attach the seed statistics
hier_df = hier_df.loc[systems_keep]
hier_df['num_d1_seeds']=num_d1_seeds
hier_df['num_d2_seeds']=num_d2_seeds
hier_df['frac_d1_seeds']=frac_d1_seeds
hier_df['frac_d2_seeds']=frac_d2_seeds
hier_df['frac_no_seeds']=frac_no_seeds
print(len(hier_df))

hier_df.head()
    

In [None]:
# prune G_hier --> only keep systems whose name is listed in systems_keep
nkeep = [n for n in G_hier.nodes() if G_hier.nodes[n]['name'] in systems_keep]

G_hier = nx.subgraph(G_hier, nkeep)
print(G_hier.number_of_nodes())
print(G_hier.number_of_edges())

# System validation with mammalian phenotype ontology + mouse KO data

In [None]:
# download MGI phenotype data
url = 'http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt'
r = requests.get(url,allow_redirects=True)
r.raise_for_status() # stop on an HTTP error instead of saving and parsing an error page
with open('MGI_PhenoGenoMP.rpt','wb') as mgi_file: # context manager closes the file handle
    mgi_file.write(r.content)

# parse the downloaded MGI phenotype data (the report is read with explicit column names)
mgi_df = pd.read_csv('MGI_PhenoGenoMP.rpt',sep='\t',
                    names=['MGI_Allele_Accession_ID','Allele symbol','involves','MP','PMID','MGI_marker_accession_ID'])
# extract gene names: the part of the allele symbol before the first '<'
gene_name = [a.split('<')[0] for a in mgi_df['Allele symbol'].tolist()]
mgi_df['gene_name']=gene_name
mgi_df.index=mgi_df['gene_name']
display(mgi_df.head())

# map mouse genes to human orthologs
mouse_genes = list(np.unique(mgi_df['gene_name']))
# mouse_genes = [g for g in mouse_genes if (not g.startswith('+'))]
mg_mapped = mg.querymany(mouse_genes,as_dataframe=True,species=['mouse','human'],scopes='symbol',fields='symbol')

# drop genes with no human ortholog (row counts printed before/after each filter)
print(len(mg_mapped))
mg_mapped = mg_mapped.dropna(subset=['symbol'])
print(len(mg_mapped))
# drop duplicates: keep the first hit per queried symbol
mg_mapped = mg_mapped[~mg_mapped.index.duplicated(keep='first')]
print(len(mg_mapped))

# attach the mapped symbol to every MGI row; unmapped genes become NaN
mgi_df['human_ortholog']=mgi_df['gene_name'].map(dict(mg_mapped['symbol']))
mgi_df.head()

In [None]:
# download the mammalian phenotype ontology, parse with ddot
url = 'http://www.informatics.jax.org/downloads/reports/MPheno_OBO.ontology'
r = requests.get(url,allow_redirects=True)
r.raise_for_status() # stop on an HTTP error instead of saving and parsing an error page
with open('MPheno_OBO.ontology','wb') as obo_file: # context manager closes the file handle
    obo_file.write(r.content)
ddot.parse_obo('MPheno_OBO.ontology',
               'parsed_mp.txt',
              'id2name_mp.txt',
              'id2namespace_mp.txt',
              'altID_mp.txt')


# term id -> description lookup table
MP2desc = pd.read_csv('id2name_mp.txt',sep='\t',
                      names=['MP','description'],index_col='MP')

MP2desc=MP2desc.loc[MP2desc.index.dropna()] # drop NAN from index
print(len(MP2desc))


display(MP2desc.head())

# parent/child table written by parse_obo
hierarchy = pd.read_table('parsed_mp.txt',
                          sep='\t',
                          header=None,
                          names=['Parent', 'Child', 'Relation', 'Namespace'])

display(hierarchy.head())

MPO = Ontology.from_table(
    table=hierarchy,
    parent='Parent',
    child='Child',
    add_root_name='MP:00SUPER',
    ignore_orphan_terms=True)
#MPO.clear_node_attr()
#MPO.clear_edge_attr()

# add description to node attribute (only for terms that occur in the hierarchy table)
terms_keep = list(np.unique(hierarchy['Parent'].tolist()+hierarchy['Child'].tolist()))
MPO.node_attr=MP2desc.loc[terms_keep]

MPO

In [None]:
# MPO term ids whose branches are tested for enrichment below
MP_focal_list = [
    'MP:0002089',
    'MP:0002069',
    'MP:0003633',
    'MP:0001186',
]

In [None]:
from netcoloc import validation

# netcoloc's packaged root-level MPO enrichment, run verbosely on the focal terms
validation.MPO_enrichment_root(
    hier_df, MPO, mgi_df, MP_focal_list, G_int,
    verbose=True,
)

In [None]:
# netcoloc's packaged MPO enrichment across the hierarchy, for the same focal terms
full_results = validation.MPO_enrichment_full(
    hier_df, MPO, mgi_df, MP_focal_list, G_int,
)

In [None]:
# preview the first rows of the MPO_enrichment_full result
full_results.head()

In [None]:
# Odds-ratio enrichment of each focal MPO branch within the root system
OR_p_list, OR_CI_list, log_OR_list = [], [], []
num_genes_in_term_list = []
MP_keep_list = []

# the root system is taken to be the one with the most member genes
hier_df.index = hier_df['name']
root_node = hier_df['CD_MemberList_Size'].sort_values(ascending=False).index.tolist()[0]

# interactome gene names form the background for every test
G_int_nodes = list(G_int.nodes())

# NOTE: a negative-control phenotype (abnormal mature B cell morphology, MP:0008171) was considered;
# negative controls are tough here because we're dealing with development... which impacts almost everything
for MP_focal in MP_focal_list:
    MP_desc_focal = dict(MP2desc['description'])[MP_focal]

    # a term with children is expanded to its whole branch; a leaf term is tested alone
    if len(MPO.parent_2_child[MP_focal]) > 0:
        MPO_focal = MPO.focus(MP_focal, verbose=False)
        focal_terms = MPO_focal.terms
    else:
        focal_terms = [MP_focal]

    # genes in the root system vs. interactome genes annotated to the branch (via human ortholog)
    focal_genes = hier_df['CD_MemberList'].loc[root_node].split(' ')
    mgi_temp = mgi_df[mgi_df['MP'].isin(focal_terms)].dropna(subset=['human_ortholog'])
    mgi_genes = list(np.intersect1d(list(np.unique(mgi_temp['human_ortholog'])), G_int_nodes))

    # only test terms with more than 10 and fewer than 20000 annotated interactome genes
    if not (len(mgi_genes) > 10 and len(mgi_genes) < 20000):
        continue

    print('\n' + MP_desc_focal)
    print('number of genes in root node = ' + str(len(focal_genes)))
    print('number of genes in focal MPO term = ' + str(len(mgi_genes)))

    # 2x2 table: rows = in term / not in term, columns = in root / not in root
    q00 = len(np.intersect1d(mgi_genes, focal_genes))
    print('number overlapping genes = ' + str(q00))
    q01 = len(mgi_genes) - q00
    q10 = len(focal_genes) - q00
    q11 = len(G_int_nodes) - q00 - q01 - q10
    table_temp = [[q00, q01], [q10, q11]]

    CT = contingency_tables.Table2x2(table_temp)
    OR_p_temp = CT.log_oddsratio_pvalue()
    OR_CI_temp = CT.log_oddsratio_confint()
    log_OR_temp = CT.log_oddsratio
    for stat in (OR_p_temp, OR_CI_temp, log_OR_temp):
        print(stat)

    OR_p_list.append(OR_p_temp)
    OR_CI_list.append(OR_CI_temp)
    log_OR_list.append(log_OR_temp)
    num_genes_in_term_list.append(len(mgi_genes))
    MP_keep_list.append(MP_focal)

# split the (lower, upper) CI pairs into two parallel sequences
OR_CI_lower, OR_CI_upper = zip(*OR_CI_list)

# one row per tested term
root_KO_df = pd.DataFrame(
    {
        'OR_p': OR_p_list,
        'log_OR': log_OR_list,
        'log_OR_CI_lower': OR_CI_lower,
        'log_OR_CI_upper': OR_CI_upper,
        'num_genes_in_term': num_genes_in_term_list,
    },
    index=MP_keep_list,
)

# Test enrichment for all systems

In [None]:
# Test every system in hier_df for enrichment of each focal MPO branch.
# Output: MP_full_results_df, one row per system and four columns per term
# (-log10 odds-ratio p, log odds ratio, number of overlapping genes, their ids).
MP_focal_top = root_KO_df.head(10).index.tolist() # record the top 10 overall

MP_full_results_df=pd.DataFrame(index=hier_df.index.tolist())

# background: all genes in PCnet (computed once instead of once per system)
G_int_node_set = set(G_int.nodes())
M = len(G_int_node_set)
MP_desc_lookup = dict(MP2desc['description'])

for MP_focal in MP_focal_top:
    MP_desc_focal = MP_desc_lookup[MP_focal]
    print(MP_desc_focal)

    # focus the hierarchy on one branch, and look up all terms within that branch
    if len(MPO.parent_2_child[MP_focal])>0:
        MPO_focal = MPO.focus(MP_focal)
        focal_terms = MPO_focal.terms
    else: # if the term has no children, just look at that term
        focal_terms=[MP_focal] # must be list-like: isin() rejects a bare string

    # The term's gene set does not depend on the system, so build it once per term.
    # Rows without a human ortholog are dropped, then genes are matched by the
    # upper-cased mouse symbol held in the index.
    # NOTE(review): the root-node test matches on the human_ortholog column instead;
    # confirm which identifier is intended here.
    mgi_temp = mgi_df[mgi_df['MP'].isin(focal_terms)]
    mgi_temp = mgi_temp.dropna(subset=['human_ortholog'])
    mgi_symbol_set = {g.upper() for g in mgi_temp.index.tolist()}
    mgi_genes = sorted(mgi_symbol_set & G_int_node_set) # only keep genes in PCnet
    mgi_gene_set = set(mgi_genes)
    N = len(mgi_genes)

    hyper_p_list = []
    num_genes_list = []
    genes_id_list = []

    OR_p_list,OR_CI_list,log_OR_list=[],[],[]
    for focal_cluster in hier_df.index.tolist():
        mFocal_genes = hier_df['CD_MemberList'].loc[focal_cluster].split(' ')
        mFocal_set = set(mFocal_genes)

        if mFocal_set & mgi_symbol_set:
            overlap_genes = sorted(mFocal_set & mgi_gene_set)
            x = len(overlap_genes)
            n = len(mFocal_genes)

            # sf(k) is P(X > k), so P(X >= x) is sf(x-1)
            hyper_p_list.append(hypergeom.sf(x-1,M,n,N))
            num_genes_list.append(x)
            genes_id_list.append(' '.join(overlap_genes))

            # 2x2 table: rows = in term / not in term, columns = in system / not in system
            q00 = x
            q01 = len(mgi_genes)-q00
            q10 = len(mFocal_genes)-q00
            q11 = M-q00-q01-q10

            table_temp = [[q00,q01],[q10,q11]]

            CT= contingency_tables.Table2x2(table_temp)
            OR_p_temp = CT.log_oddsratio_pvalue()
            OR_CI_temp = CT.log_oddsratio_confint()
            log_OR_temp = CT.log_oddsratio

            OR_p_list.append(OR_p_temp)
            OR_CI_list.append(OR_CI_temp)
            log_OR_list.append(log_OR_temp)

        else:
            # no overlap at all: record neutral values so every list stays aligned with hier_df
            hyper_p_list.append(1)
            num_genes_list.append(0)
            genes_id_list.append('')

            OR_p_list.append(1)
            OR_CI_list.append(0)
            log_OR_list.append(0)

    MP_focal_df = pd.DataFrame({MP_desc_focal+':-log(OR_p)':-np.log10(OR_p_list),
                                MP_desc_focal+':log_OR':log_OR_list,
                                MP_desc_focal+':num_genes':num_genes_list,
                                MP_desc_focal+':gene_ids':genes_id_list},index=hier_df.index.tolist())

    # skip the join if this term's columns are already present
    if MP_desc_focal+':-log(OR_p)' not in MP_full_results_df.columns.tolist():
        MP_full_results_df=MP_full_results_df.join(MP_focal_df)


# gprofiler annotation of clusters

In [None]:

# print out GO pathways and pick one display name per system

# hier_df = hier_df.sort_values('name')

# ....this needs to be much more elegant...

system_name_list = []
for p in hier_df.index.tolist():
    focal_genes = hier_df['CD_MemberList'].loc[p].split(' ')
    print(p)
    print(len(focal_genes))

    # systems with 2 or fewer genes are not profiled; they keep their own name
    if len(focal_genes) <= 2:
        system_name_list.append(p)
        continue

    gp_temp = pd.DataFrame(gp.profile(focal_genes, significance_threshold_method='fdr',
                                      sources=['GO:BP'], no_evidences=False))
    if len(gp_temp) > 0:  # make sure data is not empty
        # make sure terms are specific, overlap with at least 3 genes,
        # and pass a stringent pvalue threshold
        is_specific = gp_temp['term_size'] < 1000
        is_supported = gp_temp['intersection_size'] >= 3
        is_significant = gp_temp['p_value'] < 1E-3
        gp_temp = gp_temp[is_specific & is_supported & is_significant]

        # try sorting by precision + recall
        #gp_temp['prec_plus_recall']=gp_temp['precision']+gp_temp['recall']
        gp_temp = gp_temp.sort_values('p_value', ascending=True)

        # print any term whose name mentions dopamine
        gp_temp.index = gp_temp['name']
        for t in gp_temp.index.tolist():
            if 'dopamin' in t.lower():
                print(gp_temp.loc[t][['p_value', 'intersections']])

        gp_cols_keep = ['description', 'name', 'p_value', 'precision', 'recall',
                        'query_size', 'effective_domain_size', 'intersection_size', 'native', 'source',
                        'term_size', 'intersections']

        # with more than one surviving term, name the system after the top one and save the table
        if len(gp_temp) > 1:
            system_name_list.append(gp_temp['name'].iloc[0])
            gp_temp[gp_cols_keep].to_csv('../tables/rat_human_BMI_hierarchy_GO/'+str(p)+'.csv', sep=',',
                                         index=False)
        else:
            system_name_list.append(p)
    else:
        system_name_list.append(p)

    display(gp_temp.head())

# Load annotated hierarchy to ndex

In [None]:
# Collect the per-system annotations that will be written onto the hierarchy nodes
d1='rat'
d2='human'
# add the best gprofiler annotation (system_name_list is in hier_df row order)
MP_full_results_df['gprofiler_name']=pd.Series(system_name_list,index=hier_df.index.tolist())
# don't annotate the root node: give it a fixed title instead.
# Single .loc[row, column] assignment writes to the frame itself; the previous
# chained df[col].loc[row] = ... form can end up writing to a temporary copy.
MP_full_results_df.loc[root_node,'gprofiler_name']=d1+'-'+d2+'BMI systems map'

# also add the frac_seeds/num_seeds data here
MP_full_results_df=MP_full_results_df.join(hier_df[['num_d1_seeds','num_d2_seeds','frac_d1_seeds','frac_d2_seeds','frac_no_seeds']],
                                          how='left')

MP_full_results_df.head()

In [None]:
# Convert G_hier to nice cx network.
# The 'name' attribute is remembered, removed from the graph before conversion,
# and written back as each CX node's 'n' field afterwards.
node_id_to_node_name = {
    node_id: attrs['name']
    for node_id, attrs in G_hier.nodes(data=True)
    if 'name' in attrs
}
for node_id in list(G_hier.nodes()):
    G_hier.nodes[node_id].pop('name')

G_hier_cx = ndex2.create_nice_cx_from_networkx(G_hier)

for node_id, node in G_hier_cx.get_nodes():
    node['n'] = node_id_to_node_name[node_id]

In [None]:
# Name the hierarchy network and copy every MP_full_results_df column onto its nodes.
G_hier_cx.set_name('rat_human_BMI_systems_map') 
for node_id, node in G_hier_cx.get_nodes():
    # results row for this node, looked up by the node's 'r' field
    # (presumably the system name used as the MP_full_results_df index -- verify)
    data = MP_full_results_df.loc[node['r']]
    #print(data)
    for row, value in data.items():
        # gene-id lists and the gprofiler name are stored as strings, everything else as doubles
        if (row.find('gene_ids')>-1) or (row=='gprofiler_name'):
            data_type = "string"
            value=str(value)
        else:
            data_type = "double"
            value = str(value) # nice cx can only accept strings as values...
            if value=='inf': # check if inf, set to -1 if so
                value='-1'
            
        #print(value)
        #print(type(value))
        G_hier_cx.set_node_attribute(node_id, row, value, type=data_type)
        
# some CDAPs properties were corrupted on networkx conversion. Fix them here
# (the three listed attributes are re-set with type 'double', overwriting the existing entry)
for node_id, node in G_hier_cx.get_nodes():
    for i in np.arange(len(G_hier_cx.nodeAttributes[node_id])):
        dict_temp = G_hier_cx.nodeAttributes[node_id][i]
        if dict_temp['n'] in ['CD_MemberList_Size','CD_MemberList_LogSize','HiDeF_persistence']:
            G_hier_cx.set_node_attribute(node_id, dict_temp['n'], dict_temp['v'], type='double',overwrite=True)
            
# apply a template style (36041bac-d2e3-11eb-b666-0ac135e8bacf)
G_hier_cx.apply_template('ndexbio.org','36041bac-d2e3-11eb-b666-0ac135e8bacf')

In [None]:
#Upload to NDEx
# Server, user name and password are prompted for interactively (the password via
# getpass, so it is not echoed); the value returned by upload_to is kept in network_uuid.

SERVER = input('NDEx server (probably ndexbio.org): ')
USERNAME = input('NDEx user name: ')
PASSWORD = getpass.getpass('NDEx password: ')
network_uuid = G_hier_cx.upload_to(SERVER, USERNAME, PASSWORD)

# Make the enrichment figure (redundant with above... integrate later)

In [None]:
# set some parameters
# ratThresh is used in the next cell to build the rat seed-gene file name
ratThresh='relaxed' # relaxed or stringent

num_reps=1000

# set type of gene mapper
mapper = 'PASCAL' # can be PASCAL or PREDIXCAN

adj_type = 'bonf' # can be bonf or BH
# bonf_p = .05/len(BMI_GIANT_pascal) #0.25/len(BMI_GIANT_pascal)
# print(bonf_p)

excl_rat_seeds = False # if true, exclude rat seed genes from relevant phenotype

In [None]:
# read in rat BMI seed genes for the chosen threshold (gene symbols are in the column named '0')
rat_seed_path = 'seed_genes/ratBMI_seed_'+ratThresh+'.txt'
rat_bmi_genes = pd.read_csv(rat_seed_path, sep='\t')['0'].tolist()
print(len(rat_bmi_genes))

In [None]:
# read in human BMI seed genes: PASCAL gene scores, Bonferroni-thresholded
# (previously loaded from a precomputed seed file:)
# h_bmi_genes=pd.read_csv('seed_genes/humanBMI_seed_'+mapper+'_'+adj_type+'_exclRatSeeds'+str(excl_rat_seeds)+ratThresh+'.txt',
#                                      sep='\t')['0'].tolist()
# NOTE(review): absolute, user-specific path -- this cell only runs on the original author's machine
giant_pascal_path = '/Users/brinrosenthal/Documents/CCBB_tickets_data/GIANT_genomics/BMI/GIANT_BMI_pascal.sum.genescores.txt'
BMI_GIANT_pascal = pd.read_csv(giant_pascal_path, sep='\t')
BMI_GIANT_pascal.index = BMI_GIANT_pascal['gene_symbol']

# restrict to genes present in the interactome before computing the Bonferroni cutoff
genes_in_interactome = list(np.intersect1d(BMI_GIANT_pascal.index.tolist(), list(G_int.nodes)))
BMI_GIANT_pascal = BMI_GIANT_pascal.loc[genes_in_interactome]
bonf_p = .05/len(BMI_GIANT_pascal)

# seed genes are those passing the cutoff
h_bmi_genes = BMI_GIANT_pascal.index[BMI_GIANT_pascal['pvalue'] < bonf_p].tolist()

print(len(h_bmi_genes))



In [None]:
# preview the PASCAL rows of the human seed genes
BMI_GIANT_pascal.loc[h_bmi_genes].head()

In [None]:
from statsmodels.stats import contingency_tables
from scipy.stats import hypergeom

In [None]:
# Odds-ratio enrichment of each focal MPO branch in the root system (input to the figure below).
# MP:0001186 pigmentation phenotype
# MP:0001533 skeletal phenotype
MP_focal_list = ['MP:0002089','MP:0002069','MP:0003633','MP:0001186']
OR_p_list,OR_CI_list,log_OR_list = [],[],[]
# the root system is taken to be the one with the most member genes
root_node = hier_df['CD_MemberList_Size'].sort_values(ascending=False).head(1).index.tolist()[0]
interactome_genes = list(G_int.nodes()) # background gene set, built once
for MP_focal in MP_focal_list:
    MP_desc_focal = dict(MP2desc['description'])[MP_focal]
    print(MP_desc_focal)

    # focus the hierarchy on one branch, and look up all terms within that branch
    if len(MPO.parent_2_child[MP_focal])>0:
        MPO_focal = MPO.focus(MP_focal)
        focal_terms = MPO_focal.terms
    else: # if the term has no children, just look at that term
        focal_terms=[MP_focal] # must be list-like: isin() rejects a bare string


    # check enrichment in root node
    focal_genes = hier_df['CD_MemberList'].loc[root_node].split(' ')
    print(len(focal_genes))

    # genes annotated to the branch, matched by upper-cased mouse symbol;
    # counts are printed before and after restricting to the interactome
    mgi_temp = mgi_df[mgi_df['MP'].isin(focal_terms)]
    mgi_genes = list(np.unique(mgi_temp['gene_name']))
    mgi_genes = [g.upper() for g in mgi_genes]
    print(len(mgi_genes))
    mgi_genes = list(np.intersect1d(mgi_genes,interactome_genes))
    print(len(mgi_genes))

    # 2x2 table: rows = in term / not in term, columns = in root / not in root
    q00 = len(np.intersect1d(mgi_genes,focal_genes))
    q01 = len(mgi_genes)-q00

    q10 = len(focal_genes)-q00
    q11 = len(interactome_genes)-q00-q01-q10

    table_temp = [[q00,q01],[q10,q11]]
    print(table_temp)

    CT= contingency_tables.Table2x2(table_temp)
    OR_p_temp = CT.log_oddsratio_pvalue()
    OR_CI_temp = CT.log_oddsratio_confint()
    log_OR_temp = CT.log_oddsratio
    print(OR_p_temp)
    print(OR_CI_temp)
    print(log_OR_temp)
    
    OR_p_list.append(OR_p_temp)
    OR_CI_list.append(OR_CI_temp)
    log_OR_list.append(log_OR_temp)
    
    # hypergeometric p-value for the same overlap; sf(k) is P(X > k), so P(X >= q00) is sf(q00-1)
    print(hypergeom.sf(q00-1,len(interactome_genes),len(focal_genes),len(mgi_genes)))
    
# split the (lower, upper) CI pairs into two parallel sequences
OR_CI_lower, OR_CI_upper = zip(*OR_CI_list)

In [None]:
# back-transform from the log scale, then print the CI bounds and the
# distances from the odds ratio to each bound (the error-bar lengths)
odds_ratio = np.exp(log_OR_list)
ci_lower, ci_upper = np.exp(OR_CI_lower), np.exp(OR_CI_upper)
print(ci_lower)
print(ci_upper)
print(odds_ratio - ci_lower)
print(ci_upper - odds_ratio)

In [None]:
plt.figure(figsize=(5,5))

# odds ratios with asymmetric error bars taken from the back-transformed CI
term_positions = np.arange(len(MP_focal_list))
odds_ratio = np.exp(log_OR_list)
err_below = odds_ratio - np.exp(OR_CI_lower)
err_above = np.exp(OR_CI_upper) - odds_ratio
plt.errorbar(term_positions, odds_ratio, yerr=[err_below, err_above], color='k', fmt='o')
plt.plot(term_positions, odds_ratio, 'o', color='k')
plt.ylim([.2,2.8])

# reference line at OR = 1 (no enrichment)
plt.plot([-.5,3.5],[1,1],'--',color='gray')

# label each position with its MPO term description
term_labels = MP2desc['description'].loc[MP_focal_list].tolist()
tmp=plt.xticks(term_positions, term_labels, rotation='vertical')

plt.ylabel('OR +- 95% CI')


# ax = plt.gca()

# ax.semilogy()
# # plt.yticks(np.arange(.5, 3, .5))
# plt.yticks([0.8,1.0,1.2,1.4,1.6,2.0,2.4])
# from matplotlib.ticker import ScalarFormatter
# ax.yaxis.set_major_formatter(ScalarFormatter())
# ax.yaxis.set_minor_formatter(ScalarFormatter())

# plt.savefig('../manuscript/figures/figure5/mouseKO_enrichment.png',dpi=300,bbox_inches='tight')
# plt.savefig('../manuscript/figures/figure5/mouseKO_enrichment.svg',dpi=300,bbox_inches='tight')


### Compare to enrichment for just seed genes (rat and human)


In [None]:
# number of human, then rat, BMI seed genes
print(len(h_bmi_genes))
print(len(rat_bmi_genes))

In [None]:
# Human seed genes restricted to the interactome. Defined here, with the same expression
# the seed-enrichment cell below uses, so that this cell also runs on a fresh kernel
# (the name was previously only created by that later cell).
focal_genes_human = list(np.intersect1d(h_bmi_genes, list(G_int.nodes())))
BMI_GIANT_pascal.loc[focal_genes_human].head()

In [None]:
def _log_odds_enrichment(term_genes, query_genes, n_background):
    """Print and return the 2x2 log-odds-ratio enrichment of query_genes in term_genes.

    term_genes, query_genes: lists of gene symbols, both already restricted to the background.
    n_background: total number of background genes.
    Returns (p-value, (CI lower, CI upper), log odds ratio) as given by Table2x2.
    """
    # 2x2 table: rows = in term / not in term, columns = in query set / not in query set
    q00 = len(set(term_genes) & set(query_genes))
    q01 = len(term_genes)-q00
    q10 = len(query_genes)-q00
    q11 = n_background-q00-q01-q10
    table_temp = [[q00,q01],[q10,q11]]
    print(table_temp)

    CT = contingency_tables.Table2x2(table_temp)
    OR_p_temp = CT.log_oddsratio_pvalue()
    OR_CI_temp = CT.log_oddsratio_confint()
    log_OR_temp = CT.log_oddsratio
    print(OR_p_temp)
    print(OR_CI_temp)
    print(log_OR_temp)
    return OR_p_temp, OR_CI_temp, log_OR_temp


# Same enrichment test as for the root system, but applied to the raw seed gene sets.
MP_focal_list = ['MP:0002089','MP:0002069','MP:0003633','MP:0001186']
OR_p_list_hseed,OR_CI_list_hseed,log_OR_list_hseed = [],[],[]
OR_p_list_rseed,OR_CI_list_rseed,log_OR_list_rseed = [],[],[]

# background and seed sets do not depend on the term, so build them once
interactome_genes = list(G_int.nodes())
focal_genes_human = h_bmi_genes #BMI_GIANT_pascal.sort_values('pvalue',ascending=True).head(1000).index.tolist()
focal_genes_human = list(np.intersect1d(focal_genes_human,interactome_genes)) # only use genes in interactome
focal_genes_rat = rat_bmi_genes
focal_genes_rat = list(np.intersect1d(focal_genes_rat,interactome_genes)) # only use genes in interactome

for MP_focal in MP_focal_list:
    MP_desc_focal = dict(MP2desc['description'])[MP_focal]
    print(MP_desc_focal)

    # focus the hierarchy on one branch, and look up all terms within that branch
    if len(MPO.parent_2_child[MP_focal])>0:
        MPO_focal = MPO.focus(MP_focal)
        focal_terms = MPO_focal.terms
    else: # if the term has no children, just look at that term
        focal_terms=[MP_focal] # must be list-like: isin() rejects a bare string

    # genes annotated to the branch, matched by upper-cased mouse symbol;
    # counts are printed before and after restricting to the interactome
    mgi_temp = mgi_df[mgi_df['MP'].isin(focal_terms)]
    mgi_genes = list(np.unique(mgi_temp['gene_name']))
    mgi_genes = [g.upper() for g in mgi_genes]
    print(len(mgi_genes))
    mgi_genes = list(np.intersect1d(mgi_genes,interactome_genes))
    print(len(mgi_genes))

    # check enrichment in human BMI seed genes
    print(len(focal_genes_human))
    OR_p_temp, OR_CI_temp, log_OR_temp = _log_odds_enrichment(mgi_genes,focal_genes_human,len(interactome_genes))
    OR_p_list_hseed.append(OR_p_temp)
    OR_CI_list_hseed.append(OR_CI_temp)
    log_OR_list_hseed.append(log_OR_temp)

    # check enrichment in rat BMI seed genes
    print(len(focal_genes_rat))
    OR_p_temp, OR_CI_temp, log_OR_temp = _log_odds_enrichment(mgi_genes,focal_genes_rat,len(interactome_genes))
    OR_p_list_rseed.append(OR_p_temp)
    OR_CI_list_rseed.append(OR_CI_temp)
    log_OR_list_rseed.append(log_OR_temp)

# split the (lower, upper) CI pairs into two parallel sequences per species
OR_CI_lower_hseed, OR_CI_upper_hseed = zip(*OR_CI_list_hseed)
OR_CI_lower_rseed, OR_CI_upper_rseed = zip(*OR_CI_list_rseed)

In [None]:
# plot all together: one horizontal error bar per term for each of the three gene sets
plt.figure(figsize=(5,5))

term_positions = np.arange(len(MP_focal_list))

# (log OR, log CI lower, log CI upper, vertical offset, color, legend label)
enrichment_series = [
    # NetColoc subgraph
    (log_OR_list, OR_CI_lower, OR_CI_upper, -.2, 'blue', 'rat-human BMI network'),
    # r seeds
    (log_OR_list_rseed, OR_CI_lower_rseed, OR_CI_upper_rseed, 0, 'k', 'rat BMI seed genes'),
    # h seeds
    (log_OR_list_hseed, OR_CI_lower_hseed, OR_CI_upper_hseed, .2, '#E221D9', 'human BMI seed genes'),
]
for log_or, log_ci_lo, log_ci_hi, offset, color, label in enrichment_series:
    odds = np.exp(log_or)
    y_pos = term_positions + offset
    plt.errorbar(odds, y_pos,
                 xerr=[odds - np.exp(log_ci_lo), np.exp(log_ci_hi) - odds],
                 color=color, fmt='o', label=label)
    plt.plot(odds, y_pos, 'o', color=color)

# plt.ylim([.2,2.8])
plt.gca().invert_yaxis()

# reference line at OR = 1 (no enrichment)
plt.plot([1,1],[-.5,3.5],'--',color='gray')
plt.legend(bbox_to_anchor=(1.1, 1.05))

term_labels = MP2desc['description'].loc[MP_focal_list].tolist()
tmp=plt.yticks(term_positions, term_labels, rotation='horizontal')
plt.xlabel('OR +- 95% CI')

# plt.savefig('../manuscript/figures/figure5/mouseKO_enrichment.png',dpi=300,bbox_inches='tight')
# plt.savefig('../manuscript/figures/figure5/mouseKO_enrichment.svg',dpi=300,bbox_inches='tight')

# Load BMI genes called from Brittany's pipeline

In [None]:
# load the gene table from Brittany's pipeline; the CSV's unnamed first column becomes the index
BMI_brittany = pd.read_csv('../data/from_brittany/gene_list_bothsources_forBrin.csv',index_col='Unnamed: 0')
print(len(BMI_brittany))
BMI_brittany.head()

In [None]:
# keep rows with genotype_pval below 1E-4
BMI_brit_sig = BMI_brittany[BMI_brittany['genotype_pval']<1E-4]

print(len(BMI_brit_sig))
BMI_brit_sig.head()

In [None]:
# ----------- get gene mapping info from brittany -----------

In [None]:
# all gene symbols in the table, upper-cased; then how many of them are in the interactome
BMI_all_genes = [g.upper() for g in np.unique(BMI_brittany['gene_symbol'])]
print(len(BMI_all_genes))
n_all_in_interactome = len(np.intersect1d(BMI_all_genes, list(G_int.nodes())))
print(n_all_in_interactome)

In [None]:
# significant gene symbols, upper-cased; then how many of them are in the interactome
BMI_sig_genes = [g.upper() for g in np.unique(BMI_brit_sig['gene_symbol'])]
print(len(BMI_sig_genes))
n_sig_in_interactome = len(np.intersect1d(BMI_sig_genes, list(G_int.nodes())))
print(n_sig_in_interactome)

In [None]:
# significant genes that are also nodes of the overlap network: count, then names
sig_in_overlap_network = np.intersect1d(BMI_sig_genes, node_df.index.tolist())
print(len(sig_in_overlap_network))
print(sig_in_overlap_network)

In [None]:
# number of all table genes that are also nodes of the overlap network
print(len(np.intersect1d(BMI_all_genes,node_df.index.tolist())))