In [1]:
import giraffe
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
census = pd.read_csv("data/onco_tsg/raw/Census_allThu Feb 2 22 18 29 2023.csv")
translate = pd.read_csv("data/onco_tsg/raw/gen_v26_mapping.csv")

df = census[census['Tier'] == 1][['Gene Symbol', 'Role in Cancer', 'Tumour Types(Somatic)']]
sign = []
for elem in census['Role in Cancer']:
    if isinstance(elem, float):
        sign.append(0)
        continue
    if 'TSG' in elem:
        sign.append(1)
    elif 'oncogene' in elem:
        sign.append(-1)
    else:
        sign.append(0)
df['sign'] = pd.DataFrame(sign)

In [3]:
def rank_val(A, B, gene):
    vec = np.sum(np.abs(A - B), axis = 1)
    return 1 - vec.argsort().argsort().loc[gene] / len(vec)

def experiment(giraffe_gtex, giraffe_tcga, tissue):
    hit = 0
    tot = 0
    scores = np.sum(giraffe_gtex - giraffe_tcga, axis = 1)
    for gene in df['Gene Symbol']:
        if isinstance(df[df['Gene Symbol'] == gene]['Tumour Types(Somatic)'].values[0], float):
            continue
        if tissue not in df[df['Gene Symbol'] == gene]['Tumour Types(Somatic)'].values[0]:
            continue
        if gene not in set(translate['gene_name']):
            continue
        translation = translate[translate['gene_name'] == gene]['gene_id'].values[0][0:15]
        if translation not in scores.index:
            continue
        gt = df[df['Gene Symbol'] == gene]['sign'].values[0]
        p = rank_val(giraffe_gtex, giraffe_tcga, translation)
        if p > 0.10 :
            continue
        #print(df[df['Gene Symbol'] == gene]['Tumour Types(Somatic)'].values[0])
        if gt == 0:
            continue
        real = scores[translation]
        tot += 1
        if real > 0 and gt == 1:
            hit += 1
            print(str(gene) + " TSG, with ranking " + str(p) + " and correct direction")
        elif real < 0 and gt == -1:
            hit += 1
            print(str(gene) + " OG, with ranking " + str(p) + " and correct direction")
        else:
            print(str(gene) + " with p-val " + str(p) + " and wrong direction")

### Breast

In [4]:
gtex = "data/breast/raw/expression.csv"
tcga = "https://granddb.s3.amazonaws.com/cancer/breast_cancer/cancer_breast_expression_tcga.csv"

In [5]:
expression_tcga = pd.read_csv(tcga, sep = ',', index_col = 0)
expression_gtex = pd.read_csv(gtex, index_col = 0, header = None)
motif = pd.read_csv("data/breast/motif.csv", index_col = 0)
ppi = pd.read_csv("data/breast/ppi_matrix.csv", index_col = 0)

In [6]:
expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)

In [7]:
print(expression_tcga.shape)
print(expression_gtex.shape)
print(motif.shape)
print(ppi.shape)

(22609, 1134)
(22609, 217)
(22609, 644)
(644, 644)


In [8]:
giraffe_gtex = giraffe.Giraffe(expression_gtex.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_gtex = pd.DataFrame(giraffe_gtex, index = motif.index, columns = motif.columns)

In [9]:
giraffe_tcga = giraffe.Giraffe(expression_tcga.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_tcga = pd.DataFrame(giraffe_tcga, index = motif.index, columns = motif.columns)

In [10]:
experiment(giraffe_gtex, giraffe_tcga, 'breast')

AKT1 with p-val 0.03149188376310319 and wrong direction
ARID1A TSG, with ranking 0.07992392410102167 and correct direction
BAP1 TSG, with ranking 0.07262594541996548 and correct direction
CDKN1B TSG, with ranking 0.05272236719890311 and correct direction
EP300 TSG, with ranking 0.04250519704542444 and correct direction
ETV6 TSG, with ranking 0.08381617939758501 and correct direction
NOTCH1 TSG, with ranking 0.04471670573665354 and correct direction
TBX3 TSG, with ranking 0.05104162059356898 and correct direction


### Skin

In [11]:
gtex = "data/onco_tsg/Skin.csv"
tcga = "https://granddb.s3.us-east-2.amazonaws.com/cancer/aggnets/expression/expression_tcga_HNSC.txt"

In [12]:
expression_tcga = pd.read_csv(tcga, sep = '\t', index_col = 0)
expression_gtex = pd.read_csv(gtex, index_col = 0, header = None)
motif = pd.read_csv("data/on", index_col = 0)
ppi = pd.read_csv("data/prostate/ppi_matrix.csv", index_col = 0)

In [13]:
expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)

In [14]:
print(expression_tcga.shape)
print(expression_gtex.shape)
print(motif.shape)
print(ppi.shape)

(29374, 500)
(29374, 661)
(29374, 644)
(644, 644)


In [15]:
giraffe_gtex = giraffe.Giraffe(expression_gtex.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_gtex = pd.DataFrame(giraffe_gtex, index = motif.index, columns = motif.columns)

In [16]:
giraffe_tcga = giraffe.Giraffe(expression_tcga.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_tcga = pd.DataFrame(-giraffe_tcga, index = motif.index, columns = motif.columns)

In [17]:
experiment(giraffe_gtex, giraffe_tcga, 'HNSC')

NFE2L2 TSG, with ranking 0.04735480356778099 and correct direction
TSC2 TSG, with ranking 0.043201470688363885 and correct direction


### Colon

In [140]:
tcga = "https://granddb.s3.amazonaws.com/cancer/colon_cancer/cancer_colon_expression_tcga.txt"
gtex = "data/colon/raw/expression.csv"

In [141]:
expression_tcga = pd.read_csv(tcga, sep = '\t', index_col = 0)
expression_gtex = pd.read_csv(gtex, index_col = 0, header = None)
motif = pd.read_csv("data/colon/motif.csv", index_col = 0)
ppi = pd.read_csv("data/colon/ppi_matrix.csv", index_col = 0)

  expression_gtex = pd.read_csv(gtex, index_col = 0, header = None)


In [142]:
genes = expression_tcga.index
genes_id = []
for gene in genes:
    if len(list(translate[translate['gene_name'] == gene]['gene_id'])) > 0:
        genes_id.append(list(translate[translate['gene_name'] == gene]['gene_id'])[0][0:15])
    else:
        genes_id.append(-1)
expression_tcga.index = genes_id

In [143]:
expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)


In [144]:
print(expression_tcga.shape)
print(expression_gtex.shape)
print(motif.shape)
print(ppi.shape)

(12383, 445)
(12383, 203)
(12383, 644)
(644, 644)


In [157]:
giraffe_gtex = giraffe.Giraffe(expression_gtex.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0])).get_regulation()
giraffe_gtex = pd.DataFrame(giraffe_gtex, index = motif.index, columns = motif.columns)

In [158]:
giraffe_tcga = giraffe.Giraffe(expression_tcga.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0])).get_regulation()
giraffe_tcga = pd.DataFrame(giraffe_tcga, index = motif.index, columns = motif.columns)

In [162]:
experiment(giraffe_gtex, giraffe_tcga, 'colon')

ERBB3 OG, with ranking 0.04425421949446817 and correct direction


### Lung

In [18]:
tcga = "https://granddb.s3.us-east-2.amazonaws.com/cancer/aggnets/expression/expression_tcga_LUAD.txt"
#tcga = "https://granddb.s3.us-east-2.amazonaws.com/cancer/aggnets/expression/expression_tcga_LUSC.txt"
gtex = "data/onco_tsg/Lung.csv"

In [19]:
expression_tcga = pd.read_csv(tcga, sep = '\t', index_col = 0)
expression_gtex = pd.read_csv(gtex, index_col = 0)
motif = pd.read_csv("data/onco_tsg/motif.csv", index_col = 0)
ppi = pd.read_csv("data/onco_tsg/ppi_matrix.csv", index_col = 0)

In [20]:
expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)

In [21]:
print(expression_tcga.shape)
print(expression_gtex.shape)
print(motif.shape)
print(ppi.shape)

(29374, 524)
(29374, 360)
(29374, 644)
(644, 644)


In [22]:
giraffe_gtex = giraffe.Giraffe(expression_gtex.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_gtex = pd.DataFrame(giraffe_gtex, index = motif.index, columns = motif.columns)

In [23]:
giraffe_tcga = giraffe.Giraffe(expression_tcga.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_tcga = pd.DataFrame(giraffe_tcga, index = motif.index, columns = motif.columns)

In [24]:
experiment(giraffe_gtex, giraffe_tcga, 'lung')

KRAS OG, with ranking 0.04207802818819362 and correct direction


### Thyroid

In [25]:
tcga = "https://granddb.s3.us-east-2.amazonaws.com/cancer/aggnets/expression/expression_tcga_THCA.txt"
gtex = "data/onco_tsg/Thyroid.csv"

In [26]:
expression_tcga = pd.read_csv(tcga, sep = '\t', index_col = 0)
expression_gtex = pd.read_csv(gtex, index_col = 0)
motif = pd.read_csv("data/onco_tsg/motif.csv", index_col = 0)
ppi = pd.read_csv("data/onco_tsg/ppi_matrix.csv", index_col = 0)

In [27]:
expression_tcga = expression_tcga.loc[expression_tcga.index.isin(expression_gtex.index),:].astype(np.float)
expression_gtex = expression_gtex.loc[expression_gtex.index.isin(expression_tcga.index),:].astype(np.float)
motif = motif.loc[motif.index.isin(expression_gtex.index), :].astype(np.float)

In [28]:
print(expression_tcga.shape)
print(expression_gtex.shape)
print(motif.shape)
print(ppi.shape)

(29374, 502)
(29374, 355)
(29374, 644)
(644, 644)


In [29]:
giraffe_gtex = giraffe.Giraffe(expression_gtex.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_gtex = pd.DataFrame(giraffe_gtex, index = motif.index, columns = motif.columns)

In [30]:
giraffe_tcga = giraffe.Giraffe(expression_tcga.to_numpy(), motif.to_numpy(), ppi.to_numpy() + np.eye(ppi.shape[0]), save_computation = True).get_regulation()
giraffe_tcga = pd.DataFrame(giraffe_tcga, index = motif.index, columns = motif.columns)

In [31]:
experiment(giraffe_gtex, giraffe_tcga, 'thyroid')

KRAS OG, with ranking 0.04963573228024787 and correct direction
NTRK1 TSG, with ranking 0.09978212024239119 and correct direction
