In [6]:
from pathlib import Path
import pandas as pd

# Project root lives under the current user's home directory.
PROJECT = Path.home() / "emt_network"

# Processed expression tables (tab-separated) for the two cohorts.
tcga_path = PROJECT.joinpath("data", "processed", "tcga_brca_expression.tsv")
geo_path = PROJECT.joinpath("data", "processed", "geo_brca_expression.tsv")

# The first column of each file is used as the row index.
tcga = pd.read_csv(tcga_path, index_col=0, sep="\t")
geo = pd.read_csv(geo_path, index_col=0, sep="\t")

# Display (rows, columns) of both tables as the cell output.
tcga.shape, geo.shape


((341, 1218), (346, 3409))

In [7]:
# Total count of missing cells in each table as loaded.
print("TCGA NA:", tcga.isna().sum().sum())
print("GEO  NA:", geo.isna().sum().sum())

# Convert every column to a numeric dtype; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
tcga = tcga.apply(lambda column: pd.to_numeric(column, errors="coerce"))
geo = geo.apply(lambda column: pd.to_numeric(column, errors="coerce"))

# Re-count: any increase over the numbers above means non-numeric entries
# were coerced to NaN.
print("TCGA NA after numeric:", tcga.isna().sum().sum())
print("GEO  NA after numeric:", geo.isna().sum().sum())


TCGA NA: 0
GEO  NA: 0
TCGA NA after numeric: 0
GEO  NA after numeric: 0


In [8]:
# Report, for each gene of interest, whether it appears in the row index
# of the TCGA table and of the GEO table.
for gene in [
    "CDH1", "VIM", "EPCAM", "ZEB1",
    "ZEB2", "SNAI1", "SNAI2", "TWIST1",
]:
    in_tcga = gene in tcga.index
    in_geo = gene in geo.index
    print(gene, "TCGA:", in_tcga, "| GEO:", in_geo)


CDH1 TCGA: True | GEO: True
VIM TCGA: True | GEO: True
EPCAM TCGA: True | GEO: True
ZEB1 TCGA: True | GEO: True
ZEB2 TCGA: True | GEO: True
SNAI1 TCGA: True | GEO: True
SNAI2 TCGA: True | GEO: True
TWIST1 TCGA: True | GEO: True


In [9]:
# Row labels present in both tables, de-duplicated and sorted so the
# ordering is deterministic.
common_genes = sorted(set(tcga.index) & set(geo.index))

# Show how many labels are shared, plus the first 20 as a preview.
len(common_genes), common_genes[:20]


(341,
 ['ABCC3',
  'ABHD11',
  'AGR2',
  'AKAP12',
  'AKAP2',
  'AKR1B10',
  'AKT3',
  'ANGPTL2',
  'ANK2',
  'ANKRD22',
  'ANTXR2',
  'AP1M2',
  'AP1S2',
  'ARHGAP32',
  'ARHGAP8',
  'ASPN',
  'ATP2C2',
  'AXL',
  'AZGP1',
  'BCAS1'])

In [10]:
# Persist the shared gene list: one label per line, with a trailing newline.
out_common = PROJECT.joinpath("data", "processed", "common_genes.txt")
out_common.write_text("\n".join(common_genes) + "\n")

# Echo the destination path as the cell output.
out_common


PosixPath('/home/sameeksha/emt_network/data/processed/common_genes.txt')

In [14]:
# Restrict both tables to the shared row labels, in `common_genes` order.
tcga_common = tcga.loc[common_genes]

# Build the GEO subset directly from `geo` so this cell does not depend on a
# `geo_common` left over from an earlier kernel session (it previously read
# `geo_common` before any visible cell assigned it, which raises NameError on
# Restart & Run All).
# Columns whose name ends with "repl" are dropped — presumably replicate
# samples; confirm against the GEO sample annotation.
geo_common = geo.loc[common_genes, ~geo.columns.str.endswith("repl")]

# Display (rows, columns) of both subsets as the cell output.
tcga_common.shape, geo_common.shape


((341, 1218), (341, 3273))

In [15]:
# Write each common-gene matrix as a tab-separated file in the processed
# data directory.
for frame, filename in (
    (tcga_common, "tcga_brca_expression_common.tsv"),
    (geo_common, "geo_brca_expression_common.tsv"),
):
    frame.to_csv(PROJECT / "data" / "processed" / filename, sep="\t")
