In [1]:
from data.pancreas import preprocessing
import giraffe 
from netZooPy.otter.otter import otter
import numpy as np
import pandas as pd

In [2]:
# When False, the following cell calls preprocessing.generate_data();
# when True that call is skipped.
cache = True

In [3]:
# Only run the preprocessing step when the `cache` flag is switched off.
if not cache:
    preprocessing.generate_data()

In [8]:
def _relabel_rows(frame, row_labels):
    """Rebuild `frame` from its raw values with `row_labels` as the row index."""
    return pd.DataFrame(frame.to_numpy(), index = row_labels, columns = frame.columns)


gtex_path = "data/pancreas/raw/xprs_panc_gtex_snail.tsv"
tcga_path = "data/pancreas/raw/xprs_panc_tcga_snail.tsv"
gtex = pd.read_csv(gtex_path, sep = '\t', index_col = 0)
tcga = pd.read_csv(tcga_path, sep = '\t', index_col = 0)

# Row labels are cut to their first 15 characters
# (presumably to drop a version suffix from the gene IDs — TODO confirm).
genes = [label[:15] for label in gtex.index]
gtex = _relabel_rows(gtex, genes)
# NOTE(review): the TCGA rows receive the labels derived from the GTEx file,
# which assumes both files list their rows in the same order — confirm.
tcga = _relabel_rows(tcga, genes)

motif = pd.read_csv("data/pancreas/motif.txt", index_col = 0)
ppi = pd.read_csv("data/pancreas/ppi.csv", index_col = 0)

In [72]:
# The PPI matrix is passed to GIRAFFE with the identity added to it.
ppi_plus_identity = ppi.to_numpy() + np.eye(ppi.shape[0])
giraffe_gtex = giraffe.Giraffe(gtex.to_numpy(), motif.to_numpy(), ppi_plus_identity).get_regulation()

# Label the result with the GTEx row labels and the motif column labels, then cache it on disk.
gtex_regulation = pd.DataFrame(giraffe_gtex, index = gtex.index, columns = motif.columns)
gtex_regulation.to_csv("data/pancreas/giraffe_gtex.csv")

In [7]:
# The PPI matrix is passed to GIRAFFE with the identity added to it.
ppi_plus_identity = ppi.to_numpy() + np.eye(ppi.shape[0])
giraffe_tcga = giraffe.Giraffe(tcga.to_numpy(), motif.to_numpy(), ppi_plus_identity).get_regulation()

# Rows are labelled with gtex.index; the loading cell gave tcga the same row labels.
tcga_regulation = pd.DataFrame(giraffe_tcga, index = gtex.index, columns = motif.columns)
tcga_regulation.to_csv("data/pancreas/giraffe_tcga.csv")

In [8]:
# OTTER inputs: transposed motif matrix, PPI plus identity, and the row-wise
# correlation matrix of the TCGA expression values.
motif_transposed = motif.to_numpy().T
ppi_plus_identity = ppi.to_numpy() + np.eye(ppi.shape[0])
tcga_correlation = np.corrcoef(tcga.to_numpy())
tcga_otter = otter(motif_transposed, ppi_plus_identity, tcga_correlation, Iter = 200)

# The saved table is labelled motif columns x gene labels (gtex.index).
tcga_otter_table = pd.DataFrame(tcga_otter, index = motif.columns, columns = gtex.index)
tcga_otter_table.to_csv("data/pancreas/tcga_otter.csv")

In [9]:
# OTTER inputs: transposed motif matrix, PPI plus identity, and the row-wise
# correlation matrix of the GTEx expression values.
motif_transposed = motif.to_numpy().T
ppi_plus_identity = ppi.to_numpy() + np.eye(ppi.shape[0])
gtex_correlation = np.corrcoef(gtex.to_numpy())
gtex_otter = otter(motif_transposed, ppi_plus_identity, gtex_correlation, Iter = 200)

# The saved table is labelled motif columns x gene labels (gtex.index).
gtex_otter_table = pd.DataFrame(gtex_otter, index = motif.columns, columns = gtex.index)
gtex_otter_table.to_csv("data/pancreas/gtex_otter.csv")

# Oncogenes vs Tumor Suppressor Genes

In [2]:
# Reload the two regulation tables written by the GIRAFFE cells above;
# the first CSV column becomes the row index.
giraffe_gtex, giraffe_tcga = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in ("data/pancreas/giraffe_gtex.csv", "data/pancreas/giraffe_tcga.csv")
)

# Annotation tables: the gene census and the gene-name / gene-id mapping.
census, translate = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/pancreas/raw/Census_allThu Feb 2 22 18 29 2023.csv",
        "data/pancreas/raw/gen_v26_mapping.csv",
    )
)

In [3]:
def _role_to_sign(role):
    """Translate one 'Role in Cancer' entry into a sign.

    Returns -1 when the text contains 'TSG', +1 when it contains 'oncogene',
    and 0 when it contains neither or the entry is missing.
    """
    # Missing entries arrive as float NaN, so anything that is not text counts as "unknown".
    if not isinstance(role, str):
        return 0
    # 'TSG' is tested first, so an entry that names both roles is scored -1.
    if 'TSG' in role:
        return -1
    if 'oncogene' in role:
        return 1
    return 0


# Select the Tier-1 rows and the three columns used downstream in one .loc call
# and take an explicit copy, so adding a column below never writes into a slice
# of `census` (which is what triggers SettingWithCopyWarning).
df = census.loc[
    census['Tier'] == 1,
    ['Gene Symbol', 'Role in Cancer', 'Tumour Types(Somatic)'],
].copy()

# Compute the sign directly on the selected rows instead of building a list for
# the whole census and relying on index alignment to pick the right entries.
df['sign'] = df['Role in Cancer'].map(_role_to_sign)

In [15]:
from scipy import spatial
def p_val(A, B, gene, n_perm=100, seed=None):
    """Permutation score comparing row `gene` of `A` with row `gene` of `B`.

    The observed statistic is the squared Euclidean distance between the two
    rows. Each permutation shuffles both rows independently and recomputes the
    distance.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Tables containing a row labelled `gene`.
    gene : label
        Row to compare.
    n_perm : int, default 100
        Number of permutations to draw.
    seed : int or None, default None
        When given, permutations come from a private RandomState seeded with
        this value, making the result reproducible. When None, the global
        numpy random state is used.

    Returns
    -------
    float
        Fraction of permutations whose distance is strictly greater than the
        observed distance. A value near 1.0 therefore means the rows are
        closer to each other than their shuffled versions are.

    Raises
    ------
    ValueError
        If `n_perm` is smaller than 1.
    """
    if n_perm < 1:
        raise ValueError("n_perm must be at least 1")
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Look the rows up once; they do not change between permutations.
    a_row = A.loc[gene].to_numpy()
    b_row = B.loc[gene].to_numpy()
    real = spatial.distance.sqeuclidean(a_row, b_row)

    cnt = 0
    for _ in range(n_perm):
        shuffled = spatial.distance.sqeuclidean(rng.permutation(a_row), rng.permutation(b_row))
        if real < shuffled:
            cnt += 1
    return cnt / n_perm

# Per-row score: sum over columns of (GTEx regulation - TCGA regulation).
scores = np.sum(giraffe_gtex - giraffe_tcga, axis = 1)

# Build the lookups once instead of re-filtering `df` and rebuilding a set of
# gene names for every gene. drop_duplicates keeps the first row per key, so a
# lookup returns the same "first match" the boolean-mask filter would.
census_by_symbol = df.drop_duplicates('Gene Symbol').set_index('Gene Symbol')
id_by_symbol = translate.drop_duplicates('gene_name').set_index('gene_name')['gene_id']

hit = 0
tot = 0
for gene in df['Gene Symbol']:
    tumour_types = census_by_symbol.at[gene, 'Tumour Types(Somatic)']
    # A missing tumour-type entry is a float NaN; skip it, and skip genes whose
    # somatic tumour types do not mention 'pancr'.
    if isinstance(tumour_types, float) or 'pancr' not in tumour_types:
        continue
    if gene not in id_by_symbol.index:
        continue
    # Same 15-character truncation that was applied to the expression row labels.
    translation = id_by_symbol.loc[gene][0:15]
    if translation not in scores.index:
        continue
    gt = census_by_symbol.at[gene, 'sign']
    if gt == 0:
        continue
    # Only genes that are actually scored pay for the permutation test.
    p = p_val(giraffe_gtex, giraffe_tcga, translation)
    real = scores[translation]
    tot += 1
    # "Correct" here means: positive score for sign +1, negative score for sign -1.
    if (real > 0 and gt == 1) or (real < 0 and gt == -1):
        hit += 1
        verdict = "correct"
    else:
        verdict = "wrong"
    print(str(gene) + " with p-val " + str(p) + " and " + verdict + " direction")
print(str(hit) + " " + str(tot))

ACVR1B with p-val 1.0 and correct direction
ACVR2A with p-val 1.0 and wrong direction
AKT2 with p-val 1.0 and correct direction
APC with p-val 1.0 and correct direction
ATRX with p-val 1.0 and correct direction
BRAF with p-val 1.0 and wrong direction
DAXX with p-val 1.0 and wrong direction
EP300 with p-val 1.0 and correct direction
FAT1 with p-val 1.0 and correct direction
FAT4 with p-val 1.0 and wrong direction
GNAS with p-val 1.0 and wrong direction
HIF1A with p-val 1.0 and correct direction
KRAS with p-val 1.0 and correct direction
MAP2K4 with p-val 1.0 and correct direction
MEN1 with p-val 1.0 and correct direction
PREX2 with p-val 1.0 and correct direction
RNF43 with p-val 1.0 and correct direction
SMAD4 with p-val 1.0 and correct direction
SND1 with p-val 1.0 and wrong direction
STK11 with p-val 1.0 and correct direction
14 20


In [20]:
from scipy import spatial
def p_val(A, B, gene):
    """Relative rank of `gene` among the row sums of `A - B`.

    Rows are ranked by their summed difference (rank 0 = smallest sum) and the
    rank of `gene` is divided by the number of rows.
    """
    row_totals = np.sum(A - B, axis = 1)
    # argsort gives the ordering; argsort of that ordering gives each row's rank.
    ordering = row_totals.argsort()
    ranks = ordering.argsort()
    return ranks.loc[gene] / len(row_totals)

# Per-row score: sum over columns of (GTEx regulation - TCGA regulation).
scores = np.sum(giraffe_gtex - giraffe_tcga, axis = 1)

# Build the lookups once instead of re-filtering `df` and rebuilding a set of
# gene names for every gene. drop_duplicates keeps the first row per key, so a
# lookup returns the same "first match" the boolean-mask filter would.
census_by_symbol = df.drop_duplicates('Gene Symbol').set_index('Gene Symbol')
id_by_symbol = translate.drop_duplicates('gene_name').set_index('gene_name')['gene_id']

hit = 0
tot = 0
for gene in df['Gene Symbol']:
    tumour_types = census_by_symbol.at[gene, 'Tumour Types(Somatic)']
    # A missing tumour-type entry is a float NaN; skip it, and skip genes whose
    # somatic tumour types do not mention 'pancr'.
    if isinstance(tumour_types, float) or 'pancr' not in tumour_types:
        continue
    if gene not in id_by_symbol.index:
        continue
    # Same 15-character truncation that was applied to the expression row labels.
    translation = id_by_symbol.loc[gene][0:15]
    if translation not in scores.index:
        continue
    gt = census_by_symbol.at[gene, 'sign']
    if gt == 0:
        continue
    # Only genes that are actually scored need their rank computed.
    p = p_val(giraffe_gtex, giraffe_tcga, translation)
    real = scores[translation]
    tot += 1
    # NOTE(review): this cell counts positive score with sign -1 (and negative
    # score with sign +1) as "correct", the opposite of the other two
    # evaluation cells in this notebook — confirm which convention is intended.
    if (real > 0 and gt == -1) or (real < 0 and gt == 1):
        hit += 1
        verdict = "correct"
    else:
        verdict = "wrong"
    print(str(gene) + " with p-val " + str(p) + " and " + verdict + " direction")
print(str(hit) + " " + str(tot))

ACVR1B with p-val 0.5911771325680846 and wrong direction
ACVR2A with p-val 0.7882511816340311 and correct direction
AKT2 with p-val 0.7322079675894666 and wrong direction
APC with p-val 0.052397029034436195 and wrong direction
ATRX with p-val 0.35471528246680173 and wrong direction
BRAF with p-val 0.09930227323880261 and correct direction
DAXX with p-val 0.879585865406257 and correct direction
EP300 with p-val 0.23128516767949583 and wrong direction
FAT1 with p-val 0.32923700202565837 and wrong direction
FAT4 with p-val 0.7361692550078776 and correct direction
GNAS with p-val 0.037722259734413684 and correct direction
HIF1A with p-val 0.7764573486383074 and wrong direction
KRAS with p-val 0.628944406932253 and wrong direction
MAP2K4 with p-val 0.14737789781679045 and wrong direction
MEN1 with p-val 0.37083051991897364 and wrong direction
PREX2 with p-val 0.8431240153049742 and wrong direction
RNF43 with p-val 0.29673643934278643 and wrong direction
SMAD4 with p-val 0.3143821742066171 a

In [25]:
def p_val(A, B, gene, n_perm=100, seed=None):
    """Two-sided permutation p-value for the summed difference of row `gene`.

    The observed statistic is (row sum of `A` at `gene`) - (row sum of `B` at
    `gene`). Under the null, the rows of `A` and of `B` are shuffled
    independently, so the row that lands on `gene` is a uniformly random row
    of each table. The null statistic is therefore drawn directly as
    (random row sum of `A`) - (random row sum of `B`). This avoids rebuilding
    two permuted tables and a full matrix difference for every permutation.

    `A` and `B` are assumed to share the same columns and to contain no
    missing values, so that sum(A - B) equals sum(A) - sum(B) row by row.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Tables containing a row labelled `gene`.
    gene : label
        Row to test.
    n_perm : int, default 100
        Number of null draws.
    seed : int or None, default None
        When given, draws come from a private RandomState seeded with this
        value. When None, the global numpy random state is used.

    Returns
    -------
    float
        Fraction of null draws whose absolute value is at least the absolute
        observed statistic.

    Raises
    ------
    ValueError
        If `n_perm` is smaller than 1.
    """
    if n_perm < 1:
        raise ValueError("n_perm must be at least 1")
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Row sums are computed once; every null draw reuses them.
    a_sums = A.sum(axis = 1)
    b_sums = B.sum(axis = 1)
    # The observed value uses the same sum(A) - sum(B) form as the null draws,
    # so a draw that picks the gene's own rows ties with it exactly.
    real = a_sums.loc[gene] - b_sums.loc[gene]

    null = rng.choice(a_sums.to_numpy(), size = n_perm) - rng.choice(b_sums.to_numpy(), size = n_perm)
    cnt = int(np.count_nonzero(np.abs(null) >= np.abs(real)))
    return cnt / n_perm

# Per-row score: sum over columns of (GTEx regulation - TCGA regulation).
scores = np.sum(giraffe_gtex - giraffe_tcga, axis = 1)

# Build the lookups once instead of re-filtering `df` and rebuilding a set of
# gene names for every gene. drop_duplicates keeps the first row per key, so a
# lookup returns the same "first match" the boolean-mask filter would.
census_by_symbol = df.drop_duplicates('Gene Symbol').set_index('Gene Symbol')
id_by_symbol = translate.drop_duplicates('gene_name').set_index('gene_name')['gene_id']

hit = 0
tot = 0
for gene in df['Gene Symbol']:
    tumour_types = census_by_symbol.at[gene, 'Tumour Types(Somatic)']
    # A missing tumour-type entry is a float NaN; skip it, and skip genes whose
    # somatic tumour types do not mention 'pancr'.
    if isinstance(tumour_types, float) or 'pancr' not in tumour_types:
        continue
    if gene not in id_by_symbol.index:
        continue
    # Same 15-character truncation that was applied to the expression row labels.
    translation = id_by_symbol.loc[gene][0:15]
    if translation not in scores.index:
        continue
    gt = census_by_symbol.at[gene, 'sign']
    if gt == 0:
        continue
    # Only genes that are actually scored pay for the permutation test.
    p = p_val(giraffe_gtex, giraffe_tcga, translation)
    real = scores[translation]
    tot += 1
    # "Correct" here means: positive score for sign +1, negative score for sign -1.
    if (real > 0 and gt == 1) or (real < 0 and gt == -1):
        hit += 1
        verdict = "correct"
    else:
        verdict = "wrong"
    print(str(gene) + " with p-val " + str(p) + " and " + verdict + " direction")
print(str(hit) + " " + str(tot))

ACVR1B with p-val 0.99 and correct direction
ACVR2A with p-val 0.91 and wrong direction
AKT2 with p-val 0.97 and correct direction
APC with p-val 0.79 and correct direction
ATRX with p-val 0.9 and correct direction
BRAF with p-val 0.82 and wrong direction
DAXX with p-val 0.89 and wrong direction
EP300 with p-val 0.9 and correct direction
FAT1 with p-val 0.95 and correct direction
FAT4 with p-val 0.93 and wrong direction
GNAS with p-val 0.74 and wrong direction
HIF1A with p-val 0.93 and correct direction
KRAS with p-val 0.99 and correct direction
MAP2K4 with p-val 0.82 and correct direction
MEN1 with p-val 0.93 and correct direction
PREX2 with p-val 0.89 and correct direction
RNF43 with p-val 0.94 and correct direction
SMAD4 with p-val 0.86 and correct direction
SND1 with p-val 0.76 and wrong direction
STK11 with p-val 0.94 and correct direction
14 20


# TF Knockdown

In [267]:
# Re-fit GIRAFFE on the GTEx expression with iterations = 50; giraffe_gtex now
# holds the raw get_regulation() result rather than the table read from CSV.
ppi_plus_identity = ppi.to_numpy() + np.eye(ppi.shape[0])
gtex_model = giraffe.Giraffe(gtex.to_numpy(), motif.to_numpy(), ppi_plus_identity, iterations = 50)
giraffe_gtex = gtex_model.get_regulation()

In [270]:
# Knockdown table: keep the target gene and the mean expression in control and treated samples.
knockdown_columns = ['Target Gene', 'Mean Expr. of Control', 'Mean Expr. of Treat']
intervention = pd.read_csv("data/pancreas/raw/ZIC2.csv")[knockdown_columns]
translate = pd.read_csv("data/pancreas/raw/gen_v26_mapping.csv")

# 'diff' is True when the control mean is strictly higher than the treated mean.
control_mean = intervention['Mean Expr. of Control']
treated_mean = intervention['Mean Expr. of Treat']
intervention['diff'] = control_mean > treated_mean

In [271]:
# Column of the regulation matrix that belongs to the knocked-down TF.
tf_index = list(motif.columns).index('ZIC2')

# One-off lookups, built once instead of once per target gene.
# setdefault keeps the first occurrence, matching "first match wins" lookups.
id_by_symbol = {}
for symbol, gene_id in zip(translate['gene_name'], translate['gene_id']):
    id_by_symbol.setdefault(symbol, gene_id)
row_of_gene = {}
for position, gene_id in enumerate(gtex.index):
    row_of_gene.setdefault(gene_id, position)

score = 0   # targets where the edge sign and the knockdown direction agree
tot = 0     # targets that could be mapped to a row of the regulation matrix
a = 0       # targets with a positive edge from the TF
b = 0       # targets with a non-positive edge from the TF
aa = 0      # positive-edge targets whose control mean exceeds the treated mean
bb = 0      # non-positive-edge targets whose control mean does not exceed it
for gene, control_higher in zip(intervention['Target Gene'], intervention['diff']):
    if gene not in id_by_symbol:
        continue
    # Same 15-character truncation that was applied to the expression row labels.
    translation = id_by_symbol[gene][0:15]
    if translation not in row_of_gene:
        continue
    tot += 1
    if giraffe_gtex[row_of_gene[translation], tf_index] > 0:
        a += 1
        if control_higher:
            aa += 1
            score += 1
    else:
        b += 1
        if not control_higher:
            bb += 1
            score += 1
# Report NaN instead of raising ZeroDivisionError when no target could be mapped.
print(score / tot if tot else float('nan'))

0.6863169280692266


In [None]:
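# Agreement scores (score / tot from the knockdown cell above), rounded to two decimals.
# ZIC2 matches the 0.686 printed above; the other two were presumably obtained by
# re-running that cell with the corresponding TF and knockdown file — TODO confirm.
# 'HOXB7': 0.67
# 'FOXO3': 0.53
# 'ZIC2': 0.69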