# Setup

In [5]:
import pandas as pd
import numpy as np

from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import hypergeom
from statsmodels.stats import contingency_tables

import getpass
import ndex2

# need ddot to parse the ontology
import ddot
from ddot import Ontology

import networkx as nx
import requests

import random

from os.path import exists
import mygene
mg = mygene.MyGeneInfo()

import matplotlib.pyplot as plt
from matplotlib.patches import BoxStyle as bx
from matplotlib.patches import Patch
import seaborn as sns

import matplotlib as mpl
# Render figure text with matplotlib's own text engine rather than LaTeX.
mpl.rc('text', usetex = False)
# NOTE(review): this sets font.family to 'serif', but the rcParams['font.family']
# assignment a few lines below overwrites it — confirm this line is still needed.
mpl.rc('font', family = 'serif')

from matplotlib import rcParams
# Select the sans-serif family and make Arial its only candidate face.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

# Apply seaborn's theme with fonts scaled by 1.4.
# NOTE(review): sns.set() installs seaborn's default theme and may reset the font
# rcParams configured above — confirm against the installed seaborn version.
sns.set(font_scale=1.4)

sns.set_style('white')

# "ticks" is applied after 'white', with major tick length overridden to 15 on both axes.
sns.set_style("ticks", {"xtick.major.size": 15, "ytick.major.size": 15})
# Keep text as text elements in SVG output instead of converting glyphs to paths.
plt.rcParams['svg.fonttype'] = 'none'

In [6]:
import sys
# Add the parent directory to the module search path so the project modules
# imported below can be found when running from this notebook's directory.
sys.path.append("..")

# NOTE(review): star imports pull every public name from these modules into the
# notebook namespace; helpers called later (e.g. load_pcnet, load_MPO) presumably
# come from here — confirm. Underscore-prefixed names are NOT exported by
# `import *` unless the module lists them in __all__.
from analysis_functions import *
from plotting_functions import *
from updated_netcoloc_functions import *

### Analysis Functions

### Plotting Functions

## Load Data

In [7]:
# Unpack the two values returned by load_pcnet(); presumably the network's node
# identifiers and the interaction graph itself — confirm against its definition.
nodes, G_int = load_pcnet()

number of nodes:
18820

number of edges:
2693109


In [10]:
# Look up every network node in MyGene.info, matching on symbol or alias within
# human genes, and collect the current symbol for each hit as a DataFrame.
updated_pc_nodes = mg.querymany(
    nodes,
    scopes='symbol, alias',
    fields='symbol',
    species='human',
    as_dataframe=True,
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-18820...done.
Finished.
894 input query terms found dup hits:
	[('FBXO30', 2), ('ARF1', 2), ('UBE2F-SCLY', 2), ('FHL3', 2), ('UBC', 2), ('COP1', 2), ('SP3', 2), ('
159 input query terms found no hit:
	['AC022826.2', 'AL136295.5', 'MT-CO3', 'LOC105369243', 'AL136295.1', 'LOC101060399', 'AC006486.1', '
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [11]:
# Rank hits from highest to lowest "_score", then keep only the first
# (i.e. best-scoring) row for each distinct value in the "symbol" column.
updated_pc_nodes = (
    updated_pc_nodes
    .sort_values(by="_score", ascending=False)
    .drop_duplicates(subset=["symbol"])
)

In [8]:
# Load the community hierarchy table (tab-separated); the first column becomes
# the row index, which later cells use as the community identifier.
hier_df_genes = pd.read_table("BMI_hierarchy/hier_df_genes.tsv", index_col=0)

In [9]:
## Seed genes
# Both seed lists are empty here; populate them to have those genes passed as
# `exclude_genes` to the enrichment step further down.
human_seeds, rat_seeds = [], []
# Combined list: human seeds first, then rat seeds (a new list, not an alias).
all_seeds = [*human_seeds, *rat_seeds]

### Load Mouse Data

In [None]:
# Load the mouse knockout table from the MGI_PhenoGenoMP report.
# NOTE(review): map_using="mgi" presumably selects MGI-based identifier
# mapping — confirm against load_MGI_mouseKO_data.
mgi_df = load_MGI_mouseKO_data(
    map_using="mgi",
    url='http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt',
)

In [None]:
# Replace mgi_df with the result of change_symbols, which is given the
# de-duplicated MyGene lookup table; presumably this rewrites gene symbols in the
# mouse data to their current form — confirm against change_symbols.
mgi_df = change_symbols(mgi_df, updated_pc_nodes)

In [None]:
# Build the phenotype ontology object from the MPheno OBO file, supplying the
# mouse knockout table as the gene mapping.
MPO2 = load_MPO(
    mapping=mgi_df,
    use_genes=True,
    url='http://www.informatics.jax.org/downloads/reports/MPheno_OBO.ontology',
)

In [None]:
# NOTE(review): `_get_mp_graph` is underscore-prefixed, so `from module import *`
# only provides it if the defining module lists it in __all__ — confirm it is in scope.
mp_graph = _get_mp_graph()

# Depth-limited DFS from the root term "MP:0000001" yields the root followed by
# its direct neighbours. Drop the root (always first in preorder), then leave out
# the two excluded term IDs.
top_level = [
    term_id
    for term_id in list(
        nx.dfs_preorder_nodes(mp_graph, source="MP:0000001", depth_limit=1)
    )[1:]
    if term_id not in ("MP:0003012", "MP:0002873")
]

# Perform Enrichment

In [None]:
# Per-term lookups computed once from the ontology and reused for every community.
term_counts, gene_mapping, term_mapping = genes_per_node(MPO2)

# One enrichment table per community in the hierarchy index, each tagged with
# the community it belongs to in a "name" column. Seed genes are excluded.
phenotypes = [
    community_term_enrichment(
        community_id,
        hier_df_genes,
        MPO2,
        mgi_df,
        term_counts,
        gene_mapping,
        exclude_genes=all_seeds,
    ).assign(name=community_id)
    for community_id in hier_df_genes.index
]

In [None]:
# Stack the per-community tables into one frame and copy the index into an
# explicit "MP" column.
results = pd.concat(phenotypes).assign(MP=lambda frame: frame.index)

In [None]:
# Write the combined enrichment table to disk as a tab-separated file.
results.to_csv(
    path_or_buf="BMI_hierarchy/updated_enrichment_results_02_28_22.tsv",
    sep="\t",
)

## Get Gene Hits

In [None]:
# Gene hits for community C877's member list under term MP:0005451,
# printed one per line.
hits = get_gene_hits_no_annotation(
    genes=hier_df_genes.loc["C877", "CD_MemberList"],
    term="MP:0005451",
    MPO=MPO2,
    term_mapping=term_mapping,
)
for gene_hit in hits:
    print(gene_hit)

In [None]:
# Gene hits for community C877's member list under term MP:0005378;
# only the number of hits is reported.
hits = get_gene_hits_no_annotation(
    genes=hier_df_genes.loc["C877", "CD_MemberList"],
    term="MP:0005378",
    MPO=MPO2,
    term_mapping=term_mapping,
)
print(len(hits))

## Visualize Results
### Bar chart

In [None]:
# Horizontal bar comparison for community C877 (labelled "conserved BMI")
# across the top-level phenotype terms, at a 0.05 significance level.
a = plot_hbar_comparison(
    results,
    communities=["C877"],
    community_names=["conserved BMI"],
    mps=top_level,
    mp_graph=mp_graph,
    sig_level=0.05,
    sort_by="q",
    label_number="percent_community",
    color_idx=3,
    vert=12,
)

### Network Plot

In [None]:
# Draw the significance hierarchy for community C877 beneath term MP:0009642,
# using only result rows whose "observed" value is at least 2.
n = draw_significance_hierarchy(
    results.loc[results["observed"] >= 2],
    "C877",
    "MP:0009642",
    mp_graph,
    MPO2,
    hier_df_genes,
    term_mapping,
    select_on="ppv",
    size_by="obs",
    alpha_by="x",
    color_by="q",
    vert=12,
    label="leaf",
    descriptive_labels=True,
    adjust_root=0.1,
)