# Running "ELAND"
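This notebook runs the current, bare-bones version of ELAND: it filters a PANDA network, runs bihidef on the filtered edges, selects communities within a size range, attempts to pass them to SAMBAR, and runs GO enrichment on the selected communities.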

In [1]:
# import libraries & scripts
import os
import pandas as pd
import numpy as np
import bihidef
import filter_panda
from netZooPy import sambar
#import sambar_adapted_fun
from goatools import obo_parser
from goatools.go_enrichment import GOEnrichmentStudy
from collections import defaultdict
from pybiomart import Dataset

# Set the working directory to "results" so that the relative paths used in the
# cells below ("pvg.nodes", "fil_comm.gmt", "../data/...") resolve correctly.
# The guard makes this cell safe to re-run: without it, a second execution in
# the same kernel would try to enter "results/results" and fail.
if os.path.basename(os.getcwd()) != "results":
    os.chdir("results")

In [None]:
# Filter the network: filter_panda.filter_edges is given the prior file and the
# PANDA network file and returns the filtered edges.
fil_edges = filter_panda.filter_edges(
    prior_file="../data/prior.txt",
    panda_file="../data/panda_net.txt",
)

# Persist the filtered edges as a bare CSV (no index column, no header row);
# this file is the input handed to bihidef in the next cell.
fil_edges.to_csv("../data/brca_fil_edges.csv", header=False, index=False)

In [None]:
# Run bihidef on the filtered edge list ("../data/brca_fil_edges.csv").
# This will save a bunch of files in the working directory; a later cell reads
# "pvg.nodes" from there (presumably one of these outputs -- TODO confirm).
bihidef.bihidef("../data/brca_fil_edges.csv")

In [2]:
# Load the community table from the tab-separated "pvg.nodes" file in the
# working directory.
sign = pd.read_csv("pvg.nodes", sep="\t")

# Turn the first column into numeric cluster identifiers: cast to string, drop
# the first 7 characters (assumed to be a fixed label prefix -- TODO confirm),
# then split the remainder on '-' and convert every piece to int.
cluster_labels = sign.iloc[:, 0].astype(str).str.slice(start=7)
clusters = np.array(
    [[int(part) for part in label.split('-')] for label in cluster_labels]
)

In [3]:
# Keep only the communities whose size lies inside the chosen range.
# Range bounds (both inclusive):
min_size = 10
max_size = 200

# The second column of `sign` is used as the community size.
community_sizes = sign.iloc[:, 1]
fil_clust = sign[community_sizes.between(min_size, max_size)]

# Write the retained communities as a tab-separated file for the sambar cell.
# NOTE(review): to_csv writes a header row here and the member list stays in a
# single column -- confirm that the consumer of "fil_comm.gmt" accepts this.
fil_clust.to_csv("fil_comm.gmt", index=False, sep="\t")

In [3]:
# Trying to import the bihidef output into sambar: the size-filtered
# communities written to "fil_comm.gmt" are passed as `gmtfile`, together with
# the mutation matrix, esize and genes files from ../data.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the call was interrupted before finishing. "fil_comm.gmt" is written with a
# header row and space-separated member lists in one column -- verify that
# sambar's gmt reader accepts that layout.
sambar_test = sambar.sambar(mut_file="../data/BRCAmutMatrixFinal.csv",
                            esize_file="../data/esizef.csv",
                            genes_file="../data/genes.txt",
                            gmtfile="fil_comm.gmt")

KeyboardInterrupt: 

# GO enrichment for selected communities

In [None]:
# Read the GTEx gene annotation table (tab-separated).
anno = pd.read_csv("../data/GTEx_gene_names.txt", sep="\t")

# Summary statistics of the second column of `sign` (genes per community).
print(sign.iloc[:, 1].describe())

# Gather the members of every size-filtered community. The third column holds
# a space-separated member list; each identifier is cut to its first 15
# characters (presumably to strip a version suffix -- TODO confirm).
allgenes = [
    gene[:15]
    for members in fil_clust.iloc[:, 2]
    for gene in members.split(" ")
]
print(len(allgenes))       # total genes (with repeats across communities)
print(len(set(allgenes)))  # unique genes

# Annotation rows whose second column matches a community gene, and the
# background set built from the first column of the full annotation table.
anno_sub = anno[anno.iloc[:, 1].isin(allgenes)]
background = set(anno.iloc[:, 0])  # all annotated genes as background

In [None]:
# Fetch GO annotation from the Ensembl human dataset through BioMart.
dataset = Dataset(host='http://www.ensembl.org',
                  name='hsapiens_gene_ensembl')

# One row per (gene, GO term) pair, with the HGNC symbol alongside.
biomart_attributes = ['ensembl_gene_id', 'go_id', 'hgnc_symbol']
gene_go_df = dataset.query(attributes=biomart_attributes)

# Keep only rows whose HGNC symbol is one of the community genes.
# NOTE(review): `allgenes` is matched against 'HGNC symbol' here, while the
# dictionary below is keyed by 'Gene stable ID' -- confirm which identifier
# type `allgenes` actually contains.
in_communities = gene_go_df['HGNC symbol'].isin(allgenes)
gene_go_df = gene_go_df[in_communities]

# Drop rows that carry no GO term.
gene_go_df_clean = gene_go_df.dropna(subset=['GO term accession'])

print(gene_go_df.head())  # preview of the filtered (not yet NA-dropped) mappings

# Collapse to {gene_id: {go_id, ...}} as expected for the enrichment step.
go_terms_by_gene = gene_go_df_clean.groupby('Gene stable ID')['GO term accession']
gene_to_go_dict = go_terms_by_gene.apply(set).to_dict()


In [None]:
# Run GO enrichment.
# Load the Gene Ontology DAG from the local OBO file.
godag = obo_parser.GODag("../data/go-basic.obo")

# Set up the enrichment study: `background` is the population, and
# `gene_to_go_dict` supplies the gene -> GO term associations.
goea = GOEnrichmentStudy(
    background,
    gene_to_go_dict,  # gene-to-GO mappings
    godag,
    propagate_counts=True,
    methods=["fdr_bh"],
)

# run_study() requires the study gene set as its first argument; calling it
# with no arguments raises TypeError. The study set is taken from `anno_sub`
# (annotation rows matching the community genes), using the same first column
# that defines `background`, so the study is a subset of the population.
# NOTE(review): assumes that column uses the same identifiers as the keys of
# `gene_to_go_dict` -- TODO confirm.
study_genes = set(anno_sub.iloc[:, 0])
goea_results = goea.run_study(study_genes)