# Running "ELAND"
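This notebook runs the current, bare-bones version of the ELAND workflow: filtering a PANDA network, detecting communities with bihidef, running SAMBAR on those communities, and testing the communities for GO enrichment.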

In [1]:
# import libraries & scripts
import os
import pandas as pd
import numpy as np
import bihidef
from netZooPy import sambar
from goatools import obo_parser
from goatools.go_enrichment import GOEnrichmentStudy
from pybiomart import Dataset
import eland
import matplotlib.pyplot as plt
import seaborn as sns
from eland import filter_panda, process_panda 

# set working directory
# All later relative paths ("../data/...", output files) assume the kernel sits in "results".
# Guarded so that re-running this cell in the same kernel does not try to
# descend into a non-existent "results/results" directory.
if os.path.basename(os.getcwd()) != "results":
    os.chdir("results")

In [2]:
# set some params
#breast
#panda_file = "gtex_breast_panda.txt"
#panda_edgelist = "gtex_breast_edgelist.txt"
#prior_file = "motif_prior_names_2024_filtered.txt"
#edgelist_fil = "gtex_breast_edgelist_filtered.csv"
#edgelist_fil = "brca_fil_edges.csv"
#edgelist_top = "gtex_subset_first_half.csv"
#edgelist_end = "gtex_subset_end.csv"
#edgelist_end2 = "gtex_subset_end2.csv"

# Tissue whose GTEx PANDA network is analysed. The tissue-specific file names
# below are derived from it, so switching tissue (e.g. to "breast") is a
# one-line change instead of commenting blocks in and out.
tissue = "uterus"

# raw PANDA network file, input to process_panda.process_edge_list
panda_file = f"gtex_{tissue}_panda.txt"
# edge list path handed to process_edge_list and then to filter_panda.filter_edges
panda_edgelist = f"gtex_{tissue}_edgelist.txt"
# motif prior used for filtering (not tissue-specific)
prior_file = "motif_prior_names_2024_filtered.txt"
# filtered edge list written as CSV and later passed to bihidef
edgelist_fil = f"gtex_{tissue}_edgelist_filtered.csv"

In [None]:
# Peek at the first rows of the raw PANDA file to check its column layout.
# pandas has no `pd.read`, and an empty `sep` is not a valid delimiter: use
# `read_csv` with a whitespace separator and no header row.
# NOTE(review): whitespace-delimited / header-less is assumed from how the
# processing step reads its input — confirm against the actual file.
pd.read_csv(panda_file, sep=r"\s+", header=None).head()

In [7]:
# filter network
# Step 1: process the raw PANDA file; the edge-list path is passed as second argument
# (presumably the edge list is written there — confirm in process_panda).
process_panda.process_edge_list(panda_file, panda_edgelist)

# Step 2: filter the edge list using the motif prior
fil_edges = filter_panda.filter_edges(
    prior_file=prior_file,
    panda_file=panda_edgelist,
)

# Step 3: save the filtered edges as a bare CSV (no index column, no header row)
fil_edges.to_csv(edgelist_fil, index=False, header=False)

  df = pd.read_csv(input_file, delim_whitespace=True, header=None)


ValueError: The input file does not have at least four columns.

In [None]:
# run bihidef
# this will save a bunch of files in the working directory
# Input is the filtered edge list CSV written by the filtering step (no header, no index).
# NOTE(review): `maxres=5` presumably caps the resolution range that is scanned — confirm against the bihidef docs.
# NOTE(review): a later cell reads "pvg.nodes" from the working directory, presumably one of the files saved here — confirm.
bihidef.bihidef(edgelist_fil, maxres=5)

In [None]:
# bihidef for UCEC


In [None]:
# get communities
# Tab-separated node table read from the working directory
sign = pd.read_csv("pvg.nodes", sep="\t")

# Extract clusters
# First column holds the cluster label; drop its first 7 characters and
# parse the remaining '-'-separated parts as integers.
cluster_labels = sign.iloc[:, 0].astype(str).str[7:]
clusters = np.array(
    [[int(part) for part in label.split('-')] for label in cluster_labels]
)

In [None]:
# select clusters with sizes between selected range
# set range (both bounds are inclusive)
min_size = 10
max_size = 200

# second column of the node table is used as the community size
community_size = sign.iloc[:, 1]
fil_clust = sign[community_size.between(min_size, max_size)]

# write the retained communities to a GMT file
eland.gmt_from_bihidef(fil_clust, "fil_comm.gmt")

In [None]:
# run sambar with bihidef communities
# Gene sets come from the GMT file written from the size-filtered bihidef communities.
sambar.sambar(
    mut_file="../data/BRCAmutMatrixFinal.csv",
    esize_file="../data/esizef.csv",
    genes_file="../data/genes.txt",
    gmtfile="fil_comm.gmt",
)

In [None]:
# run sambar with pathways
# Same mutation, size and gene inputs as the community run; only the gene-set (GMT) file differs.
sambar.sambar(
    mut_file="../data/BRCAmutMatrixFinal.csv",
    esize_file="../data/esizef.csv",
    genes_file="../data/genes.txt",
    gmtfile="../data/c2.cp.v2024.1.Hs.symbols.gmt",
)

# GO enrichment for selected communities

In [None]:
# Load the GTEx gene annotation file (tab-separated)
anno = pd.read_csv("../data/GTEx_gene_names.txt", sep="\t")

# Column 2 of the node table: summary of the number of genes per community
community_sizes = sign.iloc[:, 1]
print(community_sizes.describe())

# Column 3 holds each community's genes as one space-separated string.
# Flatten the size-filtered communities into a single list, keeping only the
# first 15 characters of every identifier.
# NOTE(review): the 15-character cut presumably strips a version suffix from Ensembl gene IDs — confirm.
allgenes = [
    gene[:15]
    for members in fil_clust.iloc[:, 2]
    for gene in members.split(" ")
]
print(len(allgenes))  # Total genes
print(len(set(allgenes)))  # Unique genes

# Annotation rows whose second column matches one of the community genes
# NOTE(review): the subset matches on the second column while the background is
# built from the first — confirm which column holds which identifier type.
anno_sub = anno[anno.iloc[:, 1].isin(allgenes)]
# Background: every entry of the annotation's first column
background = set(anno.iloc[:, 0])

In [None]:
# get go annotation
# Ensembl human gene dataset, accessed through BioMart
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

# Query BioMart (no filters) for gene ID, GO term and HGNC symbol
biomart_attributes = ['ensembl_gene_id', 'go_id', 'hgnc_symbol']
gene_go_df = dataset.query(attributes=biomart_attributes)

# Keep only rows whose HGNC symbol is in the community gene list
# NOTE(review): `allgenes` is truncated to 15 characters upstream, which looks like
# Ensembl IDs; if so, matching on 'HGNC symbol' would select nothing and
# 'Gene stable ID' would be the column to match — confirm the identifier type.
in_communities = gene_go_df['HGNC symbol'].isin(allgenes)
gene_go_df = gene_go_df[in_communities]

# Drop rows without a GO term
gene_go_df_clean = gene_go_df.dropna(subset=['GO term accession'])

print(gene_go_df_clean.head())  # Show the gene-to-GO mappings

# Build {gene_id: {go_id1, go_id2, ...}} from the cleaned table
go_terms_by_gene = gene_go_df_clean.groupby('Gene stable ID')['GO term accession']
gene_to_go_dict = {gene_id: set(go_terms) for gene_id, go_terms in go_terms_by_gene}


In [None]:
# run go enrichment
# Parse the GO ontology file into a DAG
godag = obo_parser.GODag("../data/go-basic.obo")

# Positional arguments, in order: background gene set, gene-to-GO mapping, GO DAG
# NOTE(review): background IDs presumably need to be the same identifier type as
# the keys of `gene_to_go_dict` — confirm.
goea = GOEnrichmentStudy(
    background,
    gene_to_go_dict,
    godag,
    propagate_counts=True,
    methods=["bonferroni", "fdr_bh"],
)

# Study set: gene IDs that passed the community filter and carry a GO term
study_genes = gene_go_df_clean['Gene stable ID']
goea_results = goea.run_study(study=study_genes)

# Comparing hierarchical clustering on community scores vs pathway scores
SAMBAR already outputs some clustering, so here we only plot heatmaps with dendrograms.

In [None]:
# read in clusters
# Comma-separated cluster-group tables for the community run and the pathway run
# NOTE(review): these file names are not produced by any cell shown here —
# presumably renamed SAMBAR outputs; confirm.
comm_clust = pd.read_csv("clustergroups_comm.csv", sep=",")
path_clust = pd.read_csv("clustergroups_path.csv", sep=",")


In [None]:
# plot
# Clustered heatmap of the table read from "clustergroups_path.csv".
# NOTE(review): `comm_clust` is loaded in the previous cell but never plotted, although the
# section heading announces a community-vs-pathway comparison — confirm whether a second
# clustermap for `comm_clust` is intended.
sns.clustermap(path_clust)