In [4]:
import pandas as pd
import networkx as nx
import igraph as ig
import matplotlib.pyplot as plt
import re
from math import ceil
from matplotlib.colors import Normalize
import gseapy as gp
import matplotlib.cm as cm

def enrichr_analysis(gene_list, gene_set='GO_Biological_Process_2023', organism='mouse', pval_threshold=0.05):
    """Run an Enrichr query through gseapy and keep the significant terms.

    Parameters
    ----------
    gene_list : sequence of gene names (anything accepted by ``gp.enrichr``).
    gene_set : passed to ``gp.enrichr`` as ``gene_sets``.
    organism : passed to ``gp.enrichr`` as ``organism``.
    pval_threshold : float, default 0.05
        Rows whose 'Adjusted P-value' is greater than this are dropped.

    Returns
    -------
    pandas.DataFrame or None
        An independent frame with the columns 'Term', 'Adjusted P-value' and
        'Genes', or ``None`` when the gene list is empty or the query raised.
    """
    # Nothing to query: bail out before calling the service. The result is the
    # same `None` the failure path below returns, so callers need no new case.
    if gene_list is None or (hasattr(gene_list, '__len__') and len(gene_list) == 0):
        print("Enrichr analysis failed: gene list is empty")
        return None

    try:
        enrichr_results = gp.enrichr(
            gene_list=gene_list,
            gene_sets=gene_set,
            organism=organism
        )
        enrichr_df = enrichr_results.results
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide.
        print(f"Enrichr analysis failed: {e}")
        return None

    significant = enrichr_df[enrichr_df['Adjusted P-value'] <= pval_threshold]

    # Copy so that callers adding columns do not write into a slice of the
    # full result table.
    return significant[['Term', 'Adjusted P-value', 'Genes']].copy()

def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when the union is empty."""
    shared = len(set1.intersection(set2))
    combined = len(set1.union(set2))
    if not combined:
        # Both sets are empty; avoid dividing by zero.
        return 0
    return shared / combined

def cluster_enriched_terms_jaccard(enrichr_df, output_path='pathway_jaccard_clusters.png'):
    """Cluster enriched terms by shared genes and save a network plot.

    Terms are linked when the Jaccard similarity of their gene sets is
    positive, the resulting graph is partitioned with Leiden, and the graph is
    drawn with one colour per cluster. Node size is proportional to the number
    of genes in the term. Only the term with the most genes in each cluster is
    labelled.

    Parameters
    ----------
    enrichr_df : pandas.DataFrame
        Must provide a 'Term' column and a 'Genes' column holding
        ';'-separated gene names. The frame is not modified.
    output_path : str
        Where the PNG is written.

    Returns
    -------
    None
        The function prints progress and writes ``output_path``. It returns
        early, without writing a file, when no term has genes or when no two
        terms share a gene.
    """
    print(f"Number of pathways before filtering: {len(enrichr_df)}")

    # Keep only terms that really carry at least one gene name.
    pathway_gene_map = {
        term: genes for term, genes in _parse_gene_sets(enrichr_df).items() if genes
    }
    if not pathway_gene_map:
        print("No pathways with at least 1 gene found. Skipping further analysis.")
        return
    print(f"Pathways with >= 1 genes found: {len(pathway_gene_map)}")

    gene_counts = {term: len(genes) for term, genes in pathway_gene_map.items()}

    graph = _build_jaccard_graph(pathway_gene_map)
    if graph.number_of_edges() == 0:
        # Without edges there is nothing to cluster and the figure would be empty.
        print("No pathways share genes. Skipping clustering and plotting.")
        return

    # Nodes only enter the graph through add_edge, so every node already has
    # degree >= 1 and no further degree filtering is needed.
    nodes = list(graph.nodes())

    # NOTE(review): the Jaccard weights stored on the edges are not handed to
    # Leiden, so the partition depends on connectivity only. Confirm that this
    # is intended before passing weights, since it would change the clusters.
    ig_graph = ig.Graph.TupleList(graph.edges(data=False), directed=False)
    partition = ig_graph.community_leiden(resolution=0.7)
    term_cluster_map = dict(zip(ig_graph.vs['name'], partition.membership))

    print("Final number of confident pathways : " + str(len(pathway_gene_map)))

    pos = nx.fruchterman_reingold_layout(graph, seed=42, k=0.3)
    node_sizes = [gene_counts[node] * 100 for node in nodes]

    num_clusters = len(set(term_cluster_map.values()))
    if num_clusters <= 1:
        print("All pathways belong to the same cluster. Assigning a default color.")
        node_colors = ['red'] * len(nodes)
    else:
        cmap = plt.colormaps.get_cmap('tab10')
        # Wrap on the real palette size; indices past the end of a listed
        # colormap all resolve to its last colour.
        node_colors = [cmap(term_cluster_map[node] % cmap.N) for node in nodes]

    # One representative per cluster: the member with the most genes
    # (first one wins on ties).
    members_by_cluster = {}
    for node, cluster_id in term_cluster_map.items():
        members_by_cluster.setdefault(cluster_id, []).append(node)
    representatives = {
        max(members, key=lambda node: gene_counts.get(node, 0))
        for members in members_by_cluster.values()
    }

    # Label text: drop the " (GO:nnnn)" suffix and put one word per line.
    representative_labels = {
        node: '\n'.join(re.sub(r' \(GO:\d+\)', '', node).split())
        for node in nodes
        if node in representatives
    }

    fig, ax = plt.subplots(figsize=(12, 14))
    try:
        nx.draw_networkx_edges(graph, pos, edge_color='gray', ax=ax)
        # A single call with per-node colour and size lists avoids passing a
        # lone RGBA tuple as the colour argument for a one-node scatter.
        nx.draw_networkx_nodes(
            graph, pos, nodelist=nodes,
            node_color=node_colors, node_size=node_sizes, ax=ax
        )
        nx.draw_networkx_labels(
            graph, pos, labels=representative_labels,
            font_size=12,
            bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3', alpha=0.7),
            ax=ax
        )

        ax.set_title('Pathway Analysis - Jaccard Similarity Clusters', fontsize=16)
        fig.savefig(output_path, format='png', bbox_inches='tight', dpi=300)
        print(f"Graph saved to {output_path}")
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)


def _parse_gene_sets(enrichr_df):
    """Map each term to the set of non-empty names in its ';'-separated 'Genes' field."""
    return {
        term: {gene for gene in genes.split(';') if gene}
        for term, genes in zip(enrichr_df['Term'], enrichr_df['Genes'])
    }


def _build_jaccard_graph(pathway_gene_map):
    """Build an undirected graph with an edge (weight = Jaccard) between terms sharing genes."""
    graph = nx.Graph()
    terms = list(pathway_gene_map)
    # Jaccard similarity is symmetric, so each unordered pair is visited once.
    for position, term_a in enumerate(terms):
        for term_b in terms[position + 1:]:
            similarity = jaccard_similarity(pathway_gene_map[term_a], pathway_gene_map[term_b])
            if similarity > 0:
                graph.add_edge(term_a, term_b, weight=similarity)
    return graph
    
def compare_enriched_terms(gene_list1, gene_list2, gene_set='GO_Biological_Process_2023', organism='mouse', output_path='compared_pathways_clusters.png'):
    """Plot the enriched terms that gene_list1 has and gene_list2 does not.

    Both lists are sent through enrichr_analysis. Terms present in both
    results are discarded, and what remains of the first result is handed to
    cluster_enriched_terms_jaccard, which writes the figure to output_path.
    Returns None in every case.
    """
    results = [
        enrichr_analysis(genes, gene_set, organism)
        for genes in (gene_list1, gene_list2)
    ]
    if any(result is None for result in results):
        print("Enrichr analysis failed for one or both gene lists.")
        return

    first, second = results

    # Terms enriched for both lists carry no list-specific information.
    shared_terms = set(first['Term']).intersection(set(second['Term']))
    is_shared = first['Term'].isin(shared_terms)
    first_only = first[~is_shared]

    print(f"Removed {len(shared_terms)} common pathways.")
    print(f"Remaining pathways for Gene List 1: {len(first_only)}")

    if not first_only.empty:
        cluster_enriched_terms_jaccard(first_only, output_path=output_path)
        return

    print("No unique pathways left for plotting.")
    return

In [5]:
import os
import pandas as pd

def open_matlab_outputfile(fname):
    """Read a summary spreadsheet and keep its confident genes.

    The file is treated as a DV summary when its base name contains "dv"
    (case-insensitive), otherwise as a DE summary.

    Returns a tuple ``(filtered_frame, gene_column, kind)`` where ``kind`` is
    "dv" or "de".
    """
    table = pd.read_excel(fname)

    # Only the file name, not its directory, decides the file type.
    short_name = os.path.basename(fname)
    is_dv = "dv" in short_name.lower()

    if not is_dv:
        # DE summary: adjusted p-value below 0.05 and |log2FC| of at least 0.7.
        print(f"Processing DE file: {short_name}")
        significant = table['p_val_adj'] < 0.05
        large_change = table['abs_log2FC'] >= 0.7
        kept = table[significant & large_change]
        genes = kept['gene']
        print(f"Number of confident genes (DE): {len(genes)}")
        return kept, genes, "de"

    # DV summary: p-value below 0.05 and a positive distance difference.
    print(f"Processing DV file: {short_name}")
    distance_column = "DiffDist"
    significant = table["pval"] < 0.05
    positive_distance = table[distance_column] > 0
    kept = table[significant & positive_distance]
    genes = kept["gene"]
    print(f"Number of confident genes (DV): {len(genes)}")
    return kept, genes, "dv"

def open_gl(fname):
    """Read a header-less CSV file and return its first column as a list."""
    gene_table = pd.read_csv(fname, header=None)
    first_column = gene_table[0]
    return first_column.to_list()

In [6]:
# Load the two gene lists (first column of each header-less file).
fname = "qubo_features.txt"
gl1 = open_gl(fname)
fname = "lasso_features.txt"
gl2 = open_gl(fname)

# Terms enriched for both lists are removed; the terms left for gl1 (the list
# read from qubo_features.txt) are clustered and saved to output_path.
compare_enriched_terms(gl1, gl2, gene_set='GO_Biological_Process_2023', organism='human', output_path='qubo_only_vs_lasso_clustered_pathways.png' )

Removed 16 common pathways.
Remaining pathways for Gene List 1: 22
Number of pathways before filtering: 22
Pathways with >= 1 genes found: 22
Final number of confident pathways : 22


  node_collection = ax.scatter(


Graph saved to qubo_only_vs_lasso_clustered_pathways.png


In [None]:
# compare_enriched_terms() plots the terms that are unique to its FIRST gene
# list, so the list read from qubo_features.txt has to be gl1 for the figure
# to match its "qubo_only_vs_fittree" file name.
fname = "qubo_features.txt"
gl1 = open_gl(fname)
fname = "fittree_features.txt"
gl2 = open_gl(fname)

compare_enriched_terms(gl1, gl2, gene_set='GO_Biological_Process_2023', organism='human', output_path='qubo_only_vs_fittree_clustered_pathways.png')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ssromerogon\\Documents\\vscode_working_dir\\QUBO_Feature_Selection\\qubo_fs_matlab\\GSE134839_anticancer_drug_resistance_system\\miscelaneous\\qubo_features.txt'