In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

# Root of the project tree; every other path in this notebook hangs off it.
PROJECT = Path.home() / "emt_network"

# Mesenchymal-vs-epithelial logFC signature tables (one per cohort).
TCGA_SIG_FILE = PROJECT / "resources/signatures_generated/tcga_brca_M_vs_E_logFC.tsv"
GEO_SIG_FILE = PROJECT / "resources/signatures_generated/geo_gse96058_M_vs_E_logFC.tsv"

# Raw OmniPath interaction dump.
OMNIPATH_RAW = PROJECT / "data/raw/omnipath_interactions.tsv"

# Output directory; created on first run, left alone if it already exists.
RESULTS = PROJECT / "results"
RESULTS.mkdir(exist_ok=True)

# Report (but do not enforce) the presence of each input file.
for _label, _path in (
    ("TCGA sig exists:", TCGA_SIG_FILE),
    ("GEO  sig exists:", GEO_SIG_FILE),
    ("OmniPath raw exists:", OMNIPATH_RAW),
):
    print(_label, _path.exists())


TCGA sig exists: True
GEO  sig exists: True
OmniPath raw exists: True


In [3]:
# Read each signature table with its first column as the index, then keep the
# first remaining column as a Series (named `logFC_M_minus_E` in the preview).
tcga_table = pd.read_csv(TCGA_SIG_FILE, sep="\t", index_col=0)
geo_table = pd.read_csv(GEO_SIG_FILE, sep="\t", index_col=0)

tcga_sig = tcga_table.iloc[:, 0]
geo_sig = geo_table.iloc[:, 0]

# Quick sanity report: number of entries and the min/max of each signature.
print(f"TCGA signature: {tcga_sig.shape} range: {(tcga_sig.min(), tcga_sig.max())}")
print(f"GEO  signature: {geo_sig.shape} range: {(geo_sig.min(), geo_sig.max())}")

tcga_sig.head(5)


TCGA signature: (341,) range: (np.float64(-1.2801099173553805), np.float64(2.8896115702479364))
GEO  signature: (341,) range: (np.float64(-1.7034064890770535), np.float64(2.283572286419677))


CDH3     2.889612
FUT3     2.586209
KRT7     2.585810
PSCA     2.580238
SFRP1    2.366091
Name: logFC_M_minus_E, dtype: float64

In [4]:
# Prior-knowledge network: one row per interaction, with the columns
# source_genesymbol / target_genesymbol / sign used by the cells below.
PKN_FILE = PROJECT / "resources/networks/omnipath_full_signed_directed.tsv"

pkn = pd.read_csv(PKN_FILE, sep="\t")
print(f"PKN loaded: {pkn.shape}")
pkn.head(5)


PKN loaded: (71679, 3)


Unnamed: 0,source_genesymbol,target_genesymbol,sign
0,CALM3,TRPC1,-1
1,CALM1,TRPC1,-1
2,CALM2,TRPC1,-1
3,CAV1,TRPC1,1
4,DRD2,TRPC1,1


In [5]:
import sys
!{sys.executable} -m pip install -U corneto

import corneto as cn
print("corneto version:", cn.__version__)


Collecting corneto
  Downloading corneto-1.0.0b7-py3-none-any.whl.metadata (8.5 kB)
Downloading corneto-1.0.0b7-py3-none-any.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.5/288.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: corneto
  Attempting uninstall: corneto
    Found existing installation: corneto 1.0.0b6
    Uninstalling corneto-1.0.0b6:
      Successfully uninstalled corneto-1.0.0b6
Successfully installed corneto-1.0.0b7
corneto version: 1.0.0b7


In [6]:
from corneto.graph import Graph

# Build (source, sign, target) tuples in PKN row order.
# The columns are zipped directly instead of using DataFrame.iterrows(), which
# constructs a Series per row and is needlessly slow for a table of this size.
# Row order is preserved, which matters because later cells address edges by
# their position in `edges`.
edges = [
    (source, int(sign), target)
    for source, sign, target in zip(
        pkn["source_genesymbol"], pkn["sign"], pkn["target_genesymbol"]
    )
]

G = Graph.from_tuples(edges)

print("Graph nodes:", len(G.V))
print("Graph edges:", len(G.E))


Graph nodes: 7148
Graph edges: 71679


In [7]:
import numpy as np
from corneto._data import Data

# PCST import (version differs)
try:
    from corneto.methods.pcst import PrizeCollectingSteinerTree
except ModuleNotFoundError:
    from corneto.methods.future.pcst import PrizeCollectingSteinerTree


# 1) choose high-signal genes (based on |logFC|)
def select_high_signal(sig, top_frac=0.40):
    """Keep the entries of `sig` with the largest absolute values.

    Missing values are dropped first. The cut-off is the ``1 - top_frac``
    quantile of the absolute values, and every entry whose absolute value is
    at least that cut-off is kept (original signs are preserved).

    Parameters
    ----------
    sig : pandas Series of numeric scores.
    top_frac : fraction of the absolute-value distribution to keep from the top.

    Returns
    -------
    (selected, threshold) : the filtered Series and the cut-off as a float.
    """
    clean = sig.dropna()
    magnitude = clean.abs()
    cutoff = magnitude.quantile(1 - top_frac)
    keep = magnitude >= cutoff
    return clean[keep], float(cutoff)

tcga_selected, tcga_thr = select_high_signal(tcga_sig, top_frac=0.40)
# Only genes that are vertices of the prior-knowledge graph can carry a prize.
tcga_selected = tcga_selected[tcga_selected.index.isin(set(G.V))]

print("TCGA selected genes:", len(tcga_selected), "| threshold |logFC| >=", round(tcga_thr, 3))


# 2) build CORNETO features:
# - edge costs (small)
# - vertex weights from |logFC| (scaled so signal matters)
EDGE_COST = 0.05
WEIGHT_SCALE = 20.0
# Solver outputs are floating point, so "selected" is decided with a tolerance
# rather than an exact comparison against zero.
EDGE_SELECT_TOL = 1e-6

# edge costs: id must match edge order used when Graph was created
# we built Graph from `edges`, so edge id = index in `edges`
features = [
    {"mapping": "edge", "id": edge_id, "value": EDGE_COST}
    for edge_id in range(len(edges))
]

# vertex weights ("prize" is the CORNETO keyword; we call it "weight" in writing)
features.extend(
    {
        "mapping": "vertex",
        "id": gene,
        "role": "prize",
        "value": float(abs_logfc) * WEIGHT_SCALE,
    }
    for gene, abs_logfc in tcga_selected.abs().items()
)

D = Data.from_dict({"sample": {"features": features}})


# 3) solve PCST
pcst = PrizeCollectingSteinerTree(lambda_reg=0, strict_acyclic=False)
P = pcst.build(G, D)
P.solve(solver="scipy", verbosity=0)

print("P.expr keys:", list(P.expr.keys()))
print("with_flow sum:", float(np.sum(P.expr["with_flow"].value)))


# 4) extract selected edges (this expression name is specific to the installed
#    CORNETO version; see the printed P.expr keys)
edge_var = P.expr["selected_prized_flow_edges_0"].value
# Treat values within EDGE_SELECT_TOL of zero as "not selected" so numerical
# round-off in the solution cannot add spurious edges.
edge_idx = np.flatnonzero(np.abs(np.asarray(edge_var)) > EDGE_SELECT_TOL)

tcga_subG = pcst.processed_graph.edge_subgraph(edge_idx)

print("TCGA subnetwork edges:", len(tcga_subG.E))
print("TCGA subnetwork nodes:", len(tcga_subG.V))


TCGA selected genes: 68 | threshold |logFC| >= 0.936
P.expr keys: ['_flow', '_flow_i', '_flow_prize_pos_0', '_flow_prize_neg_0', 'flow', 'with_flow', 'selected_prized_flow_edges_0']
with_flow sum: 81.0
TCGA subnetwork edges: 47
TCGA subnetwork nodes: 36


In [8]:
# Vertices of the TCGA PCST solution.
tcga_nodes = [*tcga_subG.V]
print("Nodes in TCGA subnetwork:", len(tcga_nodes))

# Induced subnetwork: every PKN edge whose source AND target are both in the
# solution's vertex set (this can contain more edges than the PCST solution).
node_set = set(tcga_nodes)
source_in_solution = pkn["source_genesymbol"].isin(node_set)
target_in_solution = pkn["target_genesymbol"].isin(node_set)
tcga_sub_edges_df = pkn.loc[source_in_solution & target_in_solution].copy()

print("Edges after filtering:", len(tcga_sub_edges_df))

# Persist the induced edge list as a tab-separated file without the row index.
tcga_out = RESULTS / "tcga_pcst_subnetwork.tsv"
tcga_sub_edges_df.to_csv(tcga_out, sep="\t", index=False)
print("Saved:", tcga_out)


Nodes in TCGA subnetwork: 36
Edges after filtering: 75
Saved: /home/sameeksha/emt_network/results/tcga_pcst_subnetwork.tsv


In [9]:
import numpy as np
from corneto._data import Data

try:
    from corneto.methods.pcst import PrizeCollectingSteinerTree
except ModuleNotFoundError:
    from corneto.methods.future.pcst import PrizeCollectingSteinerTree


def select_high_signal(sig, top_frac=0.40):
    """Return the high-|value| entries of `sig` and the cut-off used.

    NaN entries are removed, the ``1 - top_frac`` quantile of the absolute
    values is taken as the threshold, and entries whose absolute value reaches
    that threshold are returned with their original sign.

    NOTE(review): this re-declares a function with the same name and logic as
    the one in the TCGA cell; it is kept so this cell also runs on its own.

    Returns
    -------
    (selected, threshold) : filtered pandas Series, threshold as a float.
    """
    observed = sig.dropna()
    abs_values = observed.abs()
    cutoff = float(abs_values.quantile(1 - top_frac))
    return observed.loc[abs_values.ge(cutoff)], cutoff

geo_selected, geo_thr = select_high_signal(geo_sig, top_frac=0.40)
# Only genes that are vertices of the prior-knowledge graph can carry a prize.
geo_selected = geo_selected[geo_selected.index.isin(set(G.V))]

print("GEO selected genes:", len(geo_selected), "| threshold |logFC| >=", round(geo_thr, 3))

# Same cost / scaling settings as the TCGA run so the two networks are comparable.
EDGE_COST = 0.05
WEIGHT_SCALE = 20.0
# Solver outputs are floating point, so "selected" is decided with a tolerance
# rather than an exact comparison against zero.
EDGE_SELECT_TOL = 1e-6

# Edge costs: the feature id is the edge's position in `edges`, the list the
# graph was built from.
features = [
    {"mapping": "edge", "id": edge_id, "value": EDGE_COST}
    for edge_id in range(len(edges))
]

# Vertex prizes: scaled |logFC| of each selected gene.
features.extend(
    {
        "mapping": "vertex",
        "id": gene,
        "role": "prize",
        "value": float(abs_logfc) * WEIGHT_SCALE,
    }
    for gene, abs_logfc in geo_selected.abs().items()
)

D = Data.from_dict({"sample": {"features": features}})

pcst = PrizeCollectingSteinerTree(lambda_reg=0, strict_acyclic=False)
P = pcst.build(G, D)
P.solve(solver="scipy", verbosity=0)

print("with_flow sum:", float(np.sum(P.expr["with_flow"].value)))

edge_var = P.expr["selected_prized_flow_edges_0"].value
# Treat values within EDGE_SELECT_TOL of zero as "not selected" so numerical
# round-off in the solution cannot add spurious edges.
edge_idx = np.flatnonzero(np.abs(np.asarray(edge_var)) > EDGE_SELECT_TOL)

geo_subG = pcst.processed_graph.edge_subgraph(edge_idx)

print("GEO subnetwork edges:", len(geo_subG.E))
print("GEO subnetwork nodes:", len(geo_subG.V))


GEO selected genes: 70 | threshold |logFC| >= 0.507
with_flow sum: 85.00000000000001
GEO subnetwork edges: 51
GEO subnetwork nodes: 41


In [10]:
# Vertices of the GEO PCST solution.
geo_nodes = [*geo_subG.V]
print("Nodes in GEO subnetwork:", len(geo_nodes))

# Induced subnetwork: every PKN edge whose source AND target are both in the
# solution's vertex set (this can contain more edges than the PCST solution).
node_set = set(geo_nodes)
source_in_solution = pkn["source_genesymbol"].isin(node_set)
target_in_solution = pkn["target_genesymbol"].isin(node_set)
geo_sub_edges_df = pkn.loc[source_in_solution & target_in_solution].copy()

print("Edges after filtering:", len(geo_sub_edges_df))

# Persist the induced edge list as a tab-separated file without the row index.
geo_out = RESULTS / "geo_pcst_subnetwork.tsv"
geo_sub_edges_df.to_csv(geo_out, sep="\t", index=False)
print("Saved:", geo_out)


Nodes in GEO subnetwork: 41
Edges after filtering: 96
Saved: /home/sameeksha/emt_network/results/geo_pcst_subnetwork.tsv


In [11]:
# nodes overlap
tcga_nodes = set(tcga_subG.V)
geo_nodes  = set(geo_subG.V)

shared_nodes = tcga_nodes & geo_nodes
all_nodes = tcga_nodes | geo_nodes

# Overlap is intersection over union, as a percentage. When both subnetworks
# are empty the union is empty and the ratio is undefined, so report NaN
# instead of raising ZeroDivisionError.
node_overlap_pct = (
    round(100 * len(shared_nodes) / len(all_nodes), 2) if all_nodes else float("nan")
)

print("TCGA nodes:", len(tcga_nodes))
print("GEO nodes :", len(geo_nodes))
print("Shared nodes:", len(shared_nodes))
print("Node overlap %:", node_overlap_pct)


# edges overlap (from the saved TSVs, i.e. the induced edge lists)
tcga_df = pd.read_csv(RESULTS / "tcga_pcst_subnetwork.tsv", sep="\t")
geo_df  = pd.read_csv(RESULTS / "geo_pcst_subnetwork.tsv",  sep="\t")

# An edge is identified by (source, sign, target); sets drop duplicate rows.
tcga_edges = set(zip(tcga_df["source_genesymbol"], tcga_df["sign"], tcga_df["target_genesymbol"]))
geo_edges  = set(zip(geo_df["source_genesymbol"],  geo_df["sign"],  geo_df["target_genesymbol"]))

shared_edges = tcga_edges & geo_edges
all_edges = tcga_edges | geo_edges

# Same empty-union guard as for the nodes.
edge_overlap_pct = (
    round(100 * len(shared_edges) / len(all_edges), 2) if all_edges else float("nan")
)

print("\nTCGA edges:", len(tcga_edges))
print("GEO edges :", len(geo_edges))
print("Shared edges:", len(shared_edges))
print("Edge overlap %:", edge_overlap_pct)

print("\nSome shared nodes:", sorted(shared_nodes)[:25])


TCGA nodes: 36
GEO nodes : 41
Shared nodes: 32
Node overlap %: 71.11

TCGA edges: 75
GEO edges : 96
Shared edges: 65
Edge overlap %: 61.32

Some shared nodes: ['CABP1', 'CALM1', 'CALM2', 'CALM3', 'CAV1', 'EGFR', 'FAF1', 'GABARAP', 'HOMER1', 'ITPR3', 'KIF13B', 'MDFI', 'MX1', 'NCS1', 'NHERF1', 'NTRK1', 'PIRT', 'PKD2', 'PRKACA', 'PRKG1', 'RNF24', 'SRC', 'TRPC1', 'TRPC3', 'TRPC4']


In [12]:
# Overlap percentages (intersection over union). They are computed once here,
# with a guard so an empty union yields NaN instead of a ZeroDivisionError
# while the summary is being formatted.
summary_node_pct = (
    round(100 * len(shared_nodes) / len(all_nodes), 2) if all_nodes else float("nan")
)
summary_edge_pct = (
    round(100 * len(shared_edges) / len(all_edges), 2) if all_edges else float("nan")
)

# Plain-text report; .strip() removes the leading/trailing newlines that the
# triple-quoted layout introduces.
summary_text = f"""
BRCA EMT network inference with CORNETO (OmniPath signed+directed)

TCGA: nodes={len(tcga_nodes)}, induced_edges={len(tcga_edges)}
GEO : nodes={len(geo_nodes)},  induced_edges={len(geo_edges)}

Shared nodes: {len(shared_nodes)}  (overlap {summary_node_pct}%)
Shared edges: {len(shared_edges)}  (overlap {summary_edge_pct}%)

Example shared nodes (first 25):
{", ".join(sorted(shared_nodes)[:25])}
""".strip()

# Write the report next to the subnetwork TSVs, newline-terminated.
out_path = RESULTS / "network_overlap_summary.txt"
out_path.write_text(summary_text + "\n")

print("Saved:", out_path)
print(summary_text)


Saved: /home/sameeksha/emt_network/results/network_overlap_summary.txt
BRCA EMT network inference with CORNETO (OmniPath signed+directed)

TCGA: nodes=36, induced_edges=75
GEO : nodes=41,  induced_edges=96

Shared nodes: 32  (overlap 71.11%)
Shared edges: 65  (overlap 61.32%)

Example shared nodes (first 25):
CABP1, CALM1, CALM2, CALM3, CAV1, EGFR, FAF1, GABARAP, HOMER1, ITPR3, KIF13B, MDFI, MX1, NCS1, NHERF1, NTRK1, PIRT, PKD2, PRKACA, PRKG1, RNF24, SRC, TRPC1, TRPC3, TRPC4
