In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# Root of the analysis tree; every input and output path below is derived from it.
PROJECT = Path.home() / "emt_network"

# Per-cohort logFC signature tables ("M vs E" per the file names).
TCGA_SIG_FILE = PROJECT / "resources/signatures_generated/tcga_brca_M_vs_E_logFC.tsv"
GEO_SIG_FILE  = PROJECT / "resources/signatures_generated/geo_gse96058_M_vs_E_logFC.tsv"

# Raw OmniPath interaction table; this cell only checks that it is present.
OMNIPATH_RAW = PROJECT / "data/raw/omnipath_interactions.tsv"

# parents=True so a fresh checkout (where PROJECT itself may not exist yet)
# does not fail with FileNotFoundError; exist_ok=True keeps re-runs idempotent.
RESULTS = PROJECT / "results"
RESULTS.mkdir(parents=True, exist_ok=True)

# Quick presence check of the inputs before any heavy work starts.
print("TCGA sig exists:", TCGA_SIG_FILE.exists())
print("GEO  sig exists:", GEO_SIG_FILE.exists())
print("OmniPath raw exists:", OMNIPATH_RAW.exists())


TCGA sig exists: True
GEO  sig exists: True
OmniPath raw exists: True


In [2]:
# Read each signature table with its first column as the index, then keep only
# the first remaining column so that both signatures are pandas Series.
tcga_table = pd.read_csv(TCGA_SIG_FILE, sep="\t", index_col=0)
geo_table = pd.read_csv(GEO_SIG_FILE, sep="\t", index_col=0)
tcga_sig = tcga_table.iloc[:, 0]
geo_sig = geo_table.iloc[:, 0]

# Report the size and value range of each signature.
for label, signature in (("TCGA signature:", tcga_sig), ("GEO  signature:", geo_sig)):
    print(label, signature.shape, "range:", (signature.min(), signature.max()))

tcga_sig.head()


TCGA signature: (341,) range: (np.float64(-1.2801099173553805), np.float64(2.8896115702479364))
GEO  signature: (346,) range: (np.float64(-1.5519129477245848), np.float64(2.253879986995334))


CDH3     2.889612
FUT3     2.586209
KRT7     2.585810
PSCA     2.580238
SFRP1    2.366091
Name: logFC_M_minus_E, dtype: float64

In [3]:
# Prior-knowledge network table (signed, directed OmniPath per the file name),
# stored as a tab-separated file under the project resources.
PKN_FILE = PROJECT / "resources" / "networks" / "omnipath_full_signed_directed.tsv"

pkn = pd.read_csv(PKN_FILE, sep="\t")
pkn_shape = pkn.shape
print("PKN loaded:", pkn_shape)
pkn.head()


PKN loaded: (71679, 3)


Unnamed: 0,source_genesymbol,target_genesymbol,sign
0,CALM3,TRPC1,-1
1,CALM1,TRPC1,-1
2,CALM2,TRPC1,-1
3,CAV1,TRPC1,1
4,DRD2,TRPC1,1


In [4]:
import sys
!{sys.executable} -m pip install -U corneto

import corneto as cn
print("corneto version:", cn.__version__)


corneto version: 1.0.0b7


In [5]:
from corneto.graph import Graph

# Build the (source, sign, target) tuples column-wise rather than with
# DataFrame.iterrows(), which creates a Series per row and is slow on a table
# of this size. `.tolist()` yields plain Python str/int values, so the tuples
# are the same objects the row-wise loop produced, in the same row order.
sources = pkn["source_genesymbol"].tolist()
signs = pkn["sign"].astype(int).tolist()
targets = pkn["target_genesymbol"].tolist()
edges = list(zip(sources, signs, targets))

# Later cells index `edges` by position, so this list order must not change.
G = Graph.from_tuples(edges)

print("Graph nodes:", len(G.V))
print("Graph edges:", len(G.E))


Graph nodes: 7148
Graph edges: 71679


In [6]:
import numpy as np
from corneto._data import Data

# PCST import 
try:
    from corneto.methods.pcst import PrizeCollectingSteinerTree
except ModuleNotFoundError:
    from corneto.methods.future.pcst import PrizeCollectingSteinerTree


# 1) choose high-signal genes (based on |logFC|)
def select_high_signal(sig, top_frac=0.40):
    """Keep the entries of ``sig`` whose absolute value is in the top ``top_frac`` fraction.

    NaN entries are dropped first. The cutoff is the ``1 - top_frac`` quantile
    of the absolute values; entries with ``|value| >= cutoff`` are kept with
    their original sign.

    Returns
    -------
    (selected, cutoff)
        ``selected`` is the filtered Series and ``cutoff`` the threshold as a
        Python float.
    """
    clean = sig.dropna()
    magnitude = clean.abs()
    cutoff = magnitude.quantile(1 - top_frac)
    keep = magnitude >= cutoff
    return clean[keep], float(cutoff)

# Take the top 40% of TCGA genes by |logFC|, then keep only those that are
# vertices of the prior-knowledge graph.
tcga_top, tcga_thr = select_high_signal(tcga_sig, top_frac=0.40)
graph_vertices = set(G.V)
tcga_in_graph = tcga_top.index.isin(graph_vertices)
tcga_selected = tcga_top[tcga_in_graph]

print("TCGA selected genes:", len(tcga_selected), "| threshold |logFC| >=", round(tcga_thr, 3))


# 2) build CORNETO features:
# - one uniform cost per edge
# - one prize per selected gene: |logFC| multiplied by WEIGHT_SCALE
EDGE_COST = 0.05
WEIGHT_SCALE = 20.0

# Edge features are keyed by position in `edges`; this relies on the graph
# having been built from `edges` in that same order.
edge_features = [
    {"mapping": "edge", "id": edge_id, "value": EDGE_COST}
    for edge_id in range(len(edges))
]

# Vertex prizes, one per selected TCGA gene.
prize_features = [
    {
        "mapping": "vertex",
        "id": gene,
        "role": "prize",
        "value": float(abs_logfc) * WEIGHT_SCALE,
    }
    for gene, abs_logfc in tcga_selected.abs().items()
]

features = edge_features + prize_features

D = Data.from_dict({"sample": {"features": features}})


# 3) solve PCST
# Build the optimisation problem from the graph and the feature data, then
# solve it with the scipy backend (solver output silenced).
pcst = PrizeCollectingSteinerTree(lambda_reg=0, strict_acyclic=False)
P = pcst.build(G, D)
P.solve(solver="scipy", verbosity=0)

# Diagnostics: which expressions the problem exposes, and the total of the
# `with_flow` expression.
expr_names = list(P.expr.keys())
print("P.expr keys:", expr_names)
with_flow_values = P.expr["with_flow"].value
print("with_flow sum:", float(np.sum(with_flow_values)))


# 4) extract selected edges
# Solver output is floating point, so compare against a small tolerance
# instead of exact zero; otherwise round-off noise (e.g. 1e-12) would be
# counted as a selected edge.
SELECTION_TOL = 1e-6
edge_var = P.expr["selected_prized_flow_edges_0"].value
edge_idx = np.flatnonzero(np.abs(np.asarray(edge_var)) > SELECTION_TOL)

# The edge tuples below are looked up by position in `edges`, which assumes
# the processed graph keeps the original edge order (TODO confirm against the
# CORNETO PCST implementation). Fail with a clear message if an index falls
# outside the original edge list.
if edge_idx.size and int(edge_idx.max()) >= len(edges):
    raise IndexError(
        f"Selected edge index {int(edge_idx.max())} is outside the original "
        f"edge list (n={len(edges)}); cannot map it back to a PKN edge."
    )

tcga_subG = pcst.processed_graph.edge_subgraph(edge_idx)
tcga_selected_edges = [edges[i] for i in edge_idx]  # (source, sign, target)

# Persist the selected edges so later cells can reload them from disk.
tcga_sel_df = pd.DataFrame(tcga_selected_edges, columns=["source_genesymbol", "sign", "target_genesymbol"])
tcga_sel_out = RESULTS / "tcga_pcst_selected_edges.tsv"
tcga_sel_df.to_csv(tcga_sel_out, sep="\t", index=False)

print("TCGA subnetwork edges:", len(tcga_subG.E))
print("TCGA subnetwork nodes:", len(tcga_subG.V))
print("Saved TCGA selected edges:", tcga_sel_out, "n=", tcga_sel_df.shape[0])

TCGA selected genes: 68 | threshold |logFC| >= 0.936
P.expr keys: ['_flow', '_flow_prize_pos_0', '_flow_i', '_flow_prize_neg_0', 'flow', 'with_flow', 'selected_prized_flow_edges_0']
with_flow sum: 85.0
TCGA subnetwork edges: 48
TCGA subnetwork nodes: 38
Saved TCGA selected edges: /home/sameeksha/emt_network/results/tcga_pcst_selected_edges.tsv n= 48


In [7]:
# Vertices of the TCGA PCST solution.
tcga_nodes = list(tcga_subG.V)
print("Nodes in TCGA subnetwork:", len(tcga_nodes))

# Induced subnetwork: every PKN row whose source AND target are both
# solution vertices.
node_set = set(tcga_nodes)
source_kept = pkn["source_genesymbol"].isin(node_set)
target_kept = pkn["target_genesymbol"].isin(node_set)
tcga_sub_edges_df = pkn.loc[source_kept & target_kept].copy()

print("Induced edges after filtering:", tcga_sub_edges_df.shape[0])

# Write the induced edge table next to the other results.
tcga_out = RESULTS / "tcga_induced_subnetwork.tsv"
tcga_sub_edges_df.to_csv(tcga_out, sep="\t", index=False)
print("Saved:", tcga_out)


Nodes in TCGA subnetwork: 38
Induced edges after filtering: 87
Saved: /home/sameeksha/emt_network/results/tcga_induced_subnetwork.tsv


In [8]:
import numpy as np
from corneto._data import Data

try:
    from corneto.methods.pcst import PrizeCollectingSteinerTree
except ModuleNotFoundError:
    from corneto.methods.future.pcst import PrizeCollectingSteinerTree


# NOTE: this repeats the definition from the TCGA cell; keep the two in sync.
def select_high_signal(sig, top_frac=0.40):
    """Keep the entries of ``sig`` whose absolute value is in the top ``top_frac`` fraction.

    NaN entries are dropped first. The cutoff is the ``1 - top_frac`` quantile
    of the absolute values; entries with ``|value| >= cutoff`` are kept with
    their original sign.

    Returns
    -------
    (selected, cutoff)
        ``selected`` is the filtered Series and ``cutoff`` the threshold as a
        Python float.
    """
    clean = sig.dropna()
    magnitude = clean.abs()
    cutoff = magnitude.quantile(1 - top_frac)
    keep = magnitude >= cutoff
    return clean[keep], float(cutoff)

# Take the top 40% of GEO genes by |logFC|, then keep only those that are
# vertices of the prior-knowledge graph.
geo_top, geo_thr = select_high_signal(geo_sig, top_frac=0.40)
graph_vertices = set(G.V)
geo_in_graph = geo_top.index.isin(graph_vertices)
geo_selected = geo_top[geo_in_graph]

print("GEO selected genes:", len(geo_selected), "| threshold |logFC| >=", round(geo_thr, 3))

# Same feature layout as the TCGA run: one uniform cost per edge and one
# prize per selected gene (|logFC| multiplied by WEIGHT_SCALE).
EDGE_COST = 0.05
WEIGHT_SCALE = 20.0

# Edge features are keyed by position in `edges`.
edge_features = [
    {"mapping": "edge", "id": edge_id, "value": EDGE_COST}
    for edge_id in range(len(edges))
]

# Vertex prizes, one per selected GEO gene.
prize_features = [
    {
        "mapping": "vertex",
        "id": gene,
        "role": "prize",
        "value": float(abs_logfc) * WEIGHT_SCALE,
    }
    for gene, abs_logfc in geo_selected.abs().items()
]

features = edge_features + prize_features

D = Data.from_dict({"sample": {"features": features}})

# Build the PCST problem for the GEO features and solve it with the scipy
# backend (solver output silenced).
pcst = PrizeCollectingSteinerTree(lambda_reg=0, strict_acyclic=False)
P = pcst.build(G, D)
P.solve(solver="scipy", verbosity=0)

# Diagnostic: total of the `with_flow` expression.
with_flow_values = P.expr["with_flow"].value
print("with_flow sum:", float(np.sum(with_flow_values)))

# Solver output is floating point, so compare against a small tolerance
# instead of exact zero; otherwise round-off noise (e.g. 1e-12) would be
# counted as a selected edge.
SELECTION_TOL = 1e-6
edge_var = P.expr["selected_prized_flow_edges_0"].value
edge_idx = np.flatnonzero(np.abs(np.asarray(edge_var)) > SELECTION_TOL)

# The edge tuples below are looked up by position in `edges`, which assumes
# the processed graph keeps the original edge order (TODO confirm against the
# CORNETO PCST implementation). Fail with a clear message if an index falls
# outside the original edge list.
if edge_idx.size and int(edge_idx.max()) >= len(edges):
    raise IndexError(
        f"Selected edge index {int(edge_idx.max())} is outside the original "
        f"edge list (n={len(edges)}); cannot map it back to a PKN edge."
    )

geo_subG = pcst.processed_graph.edge_subgraph(edge_idx)
geo_selected_edges = [edges[i] for i in edge_idx]  # (source, sign, target)

# Persist the selected edges so later cells can reload them from disk.
geo_sel_df = pd.DataFrame(geo_selected_edges, columns=["source_genesymbol", "sign", "target_genesymbol"])
geo_sel_out = RESULTS / "geo_pcst_selected_edges.tsv"
geo_sel_df.to_csv(geo_sel_out, sep="\t", index=False)

print("Saved GEO selected edges:", geo_sel_out, "n=", geo_sel_df.shape[0])
print("GEO subnetwork edges:", len(geo_subG.E))
print("GEO subnetwork nodes:", len(geo_subG.V))


GEO selected genes: 70 | threshold |logFC| >= 0.536
with_flow sum: 87.0
Saved GEO selected edges: /home/sameeksha/emt_network/results/geo_pcst_selected_edges.tsv n= 51
GEO subnetwork edges: 51
GEO subnetwork nodes: 38


In [9]:
# Vertices of the GEO PCST solution.
geo_nodes = list(geo_subG.V)
print("Nodes in GEO subnetwork:", len(geo_nodes))

# Induced subnetwork: every PKN row whose source AND target are both
# solution vertices.
node_set = set(geo_nodes)
geo_sub_edges_df = pkn[
    pkn["source_genesymbol"].isin(node_set) &
    pkn["target_genesymbol"].isin(node_set)
].copy()

# Label spelled "Induced" to match the wording used by the TCGA cell.
print("Induced edges after filtering:", geo_sub_edges_df.shape[0])

# Write the induced edge table next to the other results.
geo_out = RESULTS / "geo_induced_subnetwork.tsv"
geo_sub_edges_df.to_csv(geo_out, sep="\t", index=False)
print("Saved:", geo_out)


Nodes in GEO subnetwork: 38
Induce edges after filtering: 82
Saved: /home/sameeksha/emt_network/results/geo_induced_subnetwork.tsv


In [10]:
def overlap_pct(shared, union):
    """Return 100 * len(shared) / len(union), rounded to 2 decimals.

    An empty ``union`` (both networks empty) yields 0.0 instead of raising
    ZeroDivisionError.
    """
    if not union:
        return 0.0
    return round(100 * len(shared) / len(union), 2)


def edge_set(edge_df):
    """Return the rows of an edge table as a set of (source, sign, target) tuples."""
    return set(zip(edge_df["source_genesymbol"], edge_df["sign"], edge_df["target_genesymbol"]))


# nodes overlap
tcga_nodes = set(tcga_subG.V)
geo_nodes  = set(geo_subG.V)

shared_nodes = tcga_nodes & geo_nodes
all_nodes = tcga_nodes | geo_nodes

print("TCGA nodes:", len(tcga_nodes))
print("GEO nodes :", len(geo_nodes))
print("Shared nodes:", len(shared_nodes))
print("Node overlap %:", overlap_pct(shared_nodes, all_nodes))


# edges overlap for the PCST-selected edges (reloaded from the saved TSVs)
tcga_sel = pd.read_csv(RESULTS / "tcga_pcst_selected_edges.tsv", sep="\t")
geo_sel  = pd.read_csv(RESULTS / "geo_pcst_selected_edges.tsv",  sep="\t")

tcga_sel_edges = edge_set(tcga_sel)
geo_sel_edges  = edge_set(geo_sel)

shared_sel_edges = tcga_sel_edges & geo_sel_edges
all_sel_edges = tcga_sel_edges | geo_sel_edges

print("\nTCGA selected edges:", len(tcga_sel_edges))
print("GEO selected edges :", len(geo_sel_edges))
print("Shared selected edges:", len(shared_sel_edges))
print("Selected edge overlap %:", overlap_pct(shared_sel_edges, all_sel_edges))


# edges overlap for induced subnetwork
tcga_df = pd.read_csv(RESULTS / "tcga_induced_subnetwork.tsv", sep="\t")
geo_df  = pd.read_csv(RESULTS / "geo_induced_subnetwork.tsv",  sep="\t")

tcga_edges = edge_set(tcga_df)
geo_edges  = edge_set(geo_df)

shared_edges = tcga_edges & geo_edges
all_edges = tcga_edges | geo_edges

print("\nTCGA induced edges:", len(tcga_edges))
print("GEO induced edges :", len(geo_edges))
print("Shared induced edges:", len(shared_edges))
print("Induced edges overlap %:", overlap_pct(shared_edges, all_edges))

print("\nSome shared nodes:", sorted(shared_nodes)[:25])




TCGA nodes: 38
GEO nodes : 38
Shared nodes: 31
Node overlap %: 68.89

TCGA selected edges: 48
GEO selected edges : 51
Shared selected edges: 35
Selected edge overlap %: 54.69

TCGA induced edges: 87
GEO induced edges : 82
Shared induced edges: 70
Induced edges overlap %: 70.71

Some shared nodes: ['ASPH', 'CABP1', 'CALM1', 'CALM2', 'CALM3', 'CAV1', 'CNR1', 'DRD2', 'EGFR', 'GABARAP', 'HOMER1', 'ITPR1', 'ITPR3', 'MARCKS', 'MX1', 'PKD2', 'PRKG1', 'RNF24', 'SNF8', 'STIM1', 'TRPC1', 'TRPC3', 'TRPC4', 'TRPC5', 'TRPC6']


In [11]:
# Overlap percentages (shared / union). max(..., 1) guards the denominator:
# an empty union means the shared set is empty too, so the result is 0.0
# instead of a ZeroDivisionError.
node_overlap = round(100 * len(shared_nodes) / max(len(all_nodes), 1), 2)
sel_edge_overlap = round(100 * len(shared_sel_edges) / max(len(all_sel_edges), 1), 2)
induced_edge_overlap = round(100 * len(shared_edges) / max(len(all_edges), 1), 2)

# Plain-text report. A blank line separates the GEO section from the
# cross-cohort overlap lines so they are not read as part of the GEO block.
summary_text = f"""
BRCA EMT network inference with CORNETO (OmniPath signed+directed)

TCGA:
- selected_nodes={len(tcga_nodes)}
- selected_edges(PCST)={len(tcga_sel_edges)}
- induced_edges={len(tcga_edges)}

GEO:
- selected_nodes={len(geo_nodes)}
- selected_edges(PCST)={len(geo_sel_edges)}
- induced_edges={len(geo_edges)}

Shared nodes: {len(shared_nodes)}  (overlap {node_overlap}%)

Shared selected edges (PCST): {len(shared_sel_edges)}  (overlap {sel_edge_overlap}%)
Shared induced edges: {len(shared_edges)}  (overlap {induced_edge_overlap}%)

Example shared nodes (first 25):
{", ".join(sorted(shared_nodes)[:25])}
""".strip()

# Save the report alongside the other results, then echo it in the notebook.
out_path = RESULTS / "network_overlap_summary.txt"
out_path.write_text(summary_text + "\n")

print("Saved:", out_path)
print(summary_text)


Saved: /home/sameeksha/emt_network/results/network_overlap_summary.txt
BRCA EMT network inference with CORNETO (OmniPath signed+directed)

TCGA:
- selected_nodes=38
- selected_edges(PCST)=48
- induced_edges=87

GEO:
- selected_nodes=38
- selected_edges(PCST)=51
- induced_edges=82
Shared nodes: 31  (overlap 68.89%)

Shared selected edges (PCST): 35  (overlap 54.69%)
Shared induced edges: 70  (overlap 70.71%)

Example shared nodes (first 25):
ASPH, CABP1, CALM1, CALM2, CALM3, CAV1, CNR1, DRD2, EGFR, GABARAP, HOMER1, ITPR1, ITPR3, MARCKS, MX1, PKD2, PRKG1, RNF24, SNF8, STIM1, TRPC1, TRPC3, TRPC4, TRPC5, TRPC6
