In [1]:
from NetworkAnalysis.UndirectedInteractionNetwork import UndirectedInteractionNetwork
from NetworkAnalysis.MultiGraph import MultiGraph
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random

In [2]:
# Initialize scaffold
GRAPH_PATH = "/home/bioit/pstrybol/ppi_network_scaffolds/reactome_2021.txt"


reactome = pd.read_csv(GRAPH_PATH, sep='\t')
colnames = ['Gene1', 'Gene2'] # Select relevant columns from the edgelist
# Initialize an UndirectedInteractionNetwork object
nw_obj = UndirectedInteractionNetwork(reactome, colnames=colnames,
                                      keeplargestcomponent=True,
                                      node_types={name: "gene" for name in pd.unique(reactome[colnames].values.ravel())})



13751 Nodes and 257496 interactions
13751 Nodes and 257496 interactions


In [3]:
# Randomly generate cell line - gene pairs
# Let's say we have 50 cell lines with each 100 dependencies

cell2dependency = {f"C{i}": random.sample(list(nw_obj.node_names), k=100) for i in range(1, 50)}
cellEdge_list = pd.DataFrame(cell2dependency).melt()

cell_node_type = {i: "gene" if i in nw_obj.node_names else "cellLine" for i in cellEdge_list.values.ravel()}

cell_nw_obj = UndirectedInteractionNetwork(cellEdge_list,
                                           keeplargestcomponent=False,
                                           node_types=cell_node_type)

4197 Nodes and 4900 interactions


In [4]:
# Combine networks into multigraph
multigraph_obj = MultiGraph(graph_dict={"scaffold": nw_obj, "cellDependencies": cell_nw_obj})
print(multigraph_obj)

13794 Nodes and 262396 interactions
          Gene_A                    Gene_B              type
0         16-5-5                     CDC42          scaffold
1         16-5-5                     PARD3          scaffold
2         16-5-5                    PARD3B          scaffold
3           A1CF                   APOBEC1          scaffold
4           A1CF                     EP300          scaffold
...          ...                       ...               ...
262391      BOD1                       C49  cellDependencies
262392       C49                      TLE5  cellDependencies
262393       C49                     RBMX2  cellDependencies
262394  ADAMTS13                       C49  cellDependencies
262395       C49  KIF5B-RET(NM_020630)_K22  cellDependencies

[262396 rows x 3 columns]


In [6]:
X_train, X_val, X_test,\
    Y_train, Y_val, Y_test = multigraph_obj.getTrainTestData(train_ratio=0.7, 
                                                                     neg_pos_ratio=5., 
                                                                     train_validation_ratio=None, 
                                                                     excluded_sets=None,
                                                                     random_state=42, 
                                                                     mode="SLT", debug_mode=True)


Starting with edge type 0: cellDependencies
Returning UndirectedInteractionNetwork object.
Continuing with Gene_A and Gene_B as columns for the nodes
4197 Nodes and 4900 interactions
Starting with edge type 1: scaffold
Returning UndirectedInteractionNetwork object.
Continuing with Gene_A and Gene_B as columns for the nodes
13751 Nodes and 257496 interactions
Size of pos test: 78717
Size of pos train: 183677
Size of pos validation: 0
Size of neg test: 393588
Size of neg train: 849407
Size of neg validation: 0
