## EMT contrast signatures (M vs E) and OmniPath prior network preparation

**Inputs (from Notebook 3 & 4):**

- data/processed/tcga_brca_expression_common.tsv

- data/processed/geo_brca_expression_common.tsv

- EMT score tables from Notebook 4 (TCGA + GEO)

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

PROJECT = Path.home() / "emt_network"

tcga_expr = pd.read_csv(PROJECT/"data/processed/tcga_brca_expression_common.tsv", sep="\t", index_col=0)
geo_expr  = pd.read_csv(PROJECT/"data/processed/geo_brca_expression_common.tsv",  sep="\t", index_col=0)

tcga_scores = pd.read_csv(PROJECT/"results/emt_scores_tcga.csv", index_col=0)
geo_scores  = pd.read_csv(PROJECT/"results/emt_scores_geo.csv",  index_col=0)

tcga_expr.shape, geo_expr.shape, tcga_scores.shape, geo_scores.shape


((341, 1218), (346, 3273), (1218, 6), (3273, 6))

In [2]:
def pick_extremes(scores_df: pd.DataFrame, frac: float = 0.10):
    s = scores_df["consensus_rank"].astype(float).sort_values()
    n = len(s)
    k = int(np.floor(n * frac))
    E = s.index[:k]      # lowest EMT
    M = s.index[-k:]     # highest EMT
    return list(E), list(M)

tcga_E, tcga_M = pick_extremes(tcga_scores, frac=0.10)
geo_E, geo_M   = pick_extremes(geo_scores, frac=0.10)

len(tcga_E), len(tcga_M), len(geo_E), len(geo_M)


(121, 121, 327, 327)

In [3]:
def logfc_signature(expr: pd.DataFrame, E_samples, M_samples):
    E_mean = expr[E_samples].mean(axis=1)
    M_mean = expr[M_samples].mean(axis=1)
    logfc = (M_mean - E_mean)  # since data already log-like (GEO transformed; TCGA HiSeqV2 is log2-ish)
    return logfc.sort_values(ascending=False)

tcga_logfc = logfc_signature(tcga_expr, tcga_E, tcga_M)
geo_logfc  = logfc_signature(geo_expr,  geo_E,  geo_M)

tcga_logfc.head(), tcga_logfc.tail()


(CDH3     2.889612
 FUT3     2.586209
 KRT7     2.585810
 PSCA     2.580238
 SFRP1    2.366091
 dtype: float64,
 SORD     -0.921959
 TFF1     -0.956021
 AGR2     -1.043614
 CKMT1A   -1.130734
 TFF3     -1.280110
 dtype: float64)

In [4]:
out_dir = PROJECT / "resources" / "signatures_generated"
out_dir.mkdir(parents=True, exist_ok=True)

tcga_sig_path = out_dir / "tcga_brca_M_vs_E_logFC.tsv"
geo_sig_path  = out_dir / "geo_gse96058_M_vs_E_logFC.tsv"

tcga_logfc.to_frame("logFC_M_minus_E").to_csv(tcga_sig_path, sep="\t")
geo_logfc.to_frame("logFC_M_minus_E").to_csv(geo_sig_path, sep="\t")

tcga_sig_path, geo_sig_path


(PosixPath('/home/sameeksha/emt_network/resources/signatures_generated/tcga_brca_M_vs_E_logFC.tsv'),
 PosixPath('/home/sameeksha/emt_network/resources/signatures_generated/geo_gse96058_M_vs_E_logFC.tsv'))

In [5]:
import sys
!{sys.executable} -m pip install -U omnipath




In [6]:
from pathlib import Path

PROJECT = Path.home() / "emt_network"
cache_path = PROJECT / "data" / "raw" / "omnipath_interactions.tsv"

cache_path.parent.mkdir(parents=True, exist_ok=True)
print("Will save to:", cache_path)


Will save to: /home/sameeksha/emt_network/data/raw/omnipath_interactions.tsv


In [7]:
import requests

url = "https://omnipathdb.org/interactions/?genesymbols=yes"
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(url, stream=True, timeout=180, headers=headers)
print("HTTP:", r.status_code)
r.raise_for_status()

with open(cache_path, "wb") as f:
    for chunk in r.iter_content(chunk_size=1024*1024):
        if chunk:
            f.write(chunk)

print("Saved bytes:", cache_path.stat().st_size)


HTTP: 200
Saved bytes: 4518940


In [8]:
import pandas as pd

pkn_raw = pd.read_csv(cache_path, sep="\t")
print("Shape:", pkn_raw.shape)
print("Columns:", pkn_raw.columns.tolist()[:25])
pkn_raw.head(3)


Shape: (85217, 10)
Columns: ['source', 'target', 'source_genesymbol', 'target_genesymbol', 'is_directed', 'is_stimulation', 'is_inhibition', 'consensus_direction', 'consensus_stimulation', 'consensus_inhibition']


Unnamed: 0,source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition
0,P0DP25,P48995,CALM3,TRPC1,1,0,1,1,0,1
1,P0DP23,P48995,CALM1,TRPC1,1,0,1,1,0,1
2,P0DP24,P48995,CALM2,TRPC1,1,0,1,1,0,1


In [9]:
import numpy as np

pkn = pkn_raw[pkn_raw["is_directed"] == 1].copy()

pkn["sign"] = np.where(
    pkn["is_stimulation"] == 1, 1,
    np.where(pkn["is_inhibition"] == 1, -1, np.nan)
)

pkn = pkn.dropna(subset=["sign", "source_genesymbol", "target_genesymbol"])
pkn = pkn[["source_genesymbol", "target_genesymbol", "sign"]].drop_duplicates()
pkn["sign"] = pkn["sign"].astype(int)

print("Signed+directed gene-symbol PKN:", pkn.shape)
pkn.head()


Signed+directed gene-symbol PKN: (71679, 3)


Unnamed: 0,source_genesymbol,target_genesymbol,sign
0,CALM3,TRPC1,-1
1,CALM1,TRPC1,-1
2,CALM2,TRPC1,-1
3,CAV1,TRPC1,1
4,DRD2,TRPC1,1


In [10]:
tcga_expr = pd.read_csv(
    PROJECT / "data/processed/tcga_brca_expression_common.tsv",
    sep="\t", index_col=0
)
panel = set(tcga_expr.index)

pkn_panel = pkn[
    pkn["source_genesymbol"].isin(panel) &
    pkn["target_genesymbol"].isin(panel)
].copy()

print("Panel PKN shape:", pkn_panel.shape)
pkn_panel.head()


Panel PKN shape: (35, 3)


Unnamed: 0,source_genesymbol,target_genesymbol,sign
3,CAV1,TRPC1,1
1561,TWIST1,CDH1,-1
4493,MAF,CRYAB,1
5754,ERBB2,ERBB3,1
5939,PTK6,STAP2,1


In [11]:
# didnt use this file
from pathlib import Path 

net_dir = PROJECT / "resources" / "networks"
net_dir.mkdir(parents=True, exist_ok=True)

out_path = net_dir / "omnipath_panel_signed_directed.tsv"
pkn_panel.to_csv(out_path, sep="\t", index=False)

out_path


PosixPath('/home/sameeksha/emt_network/resources/networks/omnipath_panel_signed_directed.tsv')

In [12]:
net_dir = PROJECT / "resources" / "networks"
net_dir.mkdir(parents=True, exist_ok=True)

full_path = net_dir / "omnipath_full_signed_directed.tsv"
pkn.to_csv(full_path, sep="\t", index=False)

print("Saved:", full_path)
print("Edges:", pkn.shape[0])

Saved: /home/sameeksha/emt_network/resources/networks/omnipath_full_signed_directed.tsv
Edges: 71679
