# Basic graph handling in NetworkAnalysis

In [1]:
from NetworkAnalysis.UndirectedInteractionNetwork import UndirectedInteractionNetwork
import pandas as pd
import random

In [2]:
GRAPH_PATH = "/home/bioit/pstrybol/ppi_network_scaffolds/reactome_2021.txt"
reactome = pd.read_csv(GRAPH_PATH, sep='\t')
colnames = ['Gene1', 'Gene2'] # Select relevant columns from the edgelist
nw_obj = UndirectedInteractionNetwork(reactome, colnames=colnames, keeplargestcomponent=False)

# Print out the nodes as integers or named, same for the interactions
print(f"{nw_obj.nodes[:5]} sample of the node integers")
print(f"{nw_obj.node_names[:5]} sample of the node names")
print("-------------------------------------------------------")
print(f"{nw_obj.interactions.head()} interactions")
print("-------------------------------------------------------")
print(f"{nw_obj.getInteractionNamed().head()} named interactions")
print("-------------------------------------------------------")
print(f"{nw_obj.N_nodes} number of nodes and {nw_obj.interactions.shape[0]} edges")


13953 Nodes and 257629 interactions
[0 1 2 3 4] sample of the node integers
['16-5-5' 'A1CF' 'A2M' 'A4GNT' 'AAAS'] sample of the node names
-------------------------------------------------------
   Gene_A  Gene_B
0       0    1717
1       0    8563
2       0    8564
3       1     563
4       1    3519 interactions
-------------------------------------------------------
   Gene_A   Gene_B
0  16-5-5    CDC42
1  16-5-5    PARD3
2  16-5-5   PARD3B
3    A1CF  APOBEC1
4    A1CF    EP300 named interactions
-------------------------------------------------------
13953 number of nodes and 257629 edges


In [3]:
# Other useful statistics are:

# Degree of the network
print(nw_obj.getDegreeDF(return_names=True, set_index=True).head())
print("-------------------------------------------------------")

# The N-th order neighbors of the entire graph (gene_list=None) of only certain nodes (gene_list=list of nodes):
# Warning: This method becomes very slow on large networks
order = nw_obj.getNOrderNeighbors(order=1, include_lower_order=True, gene_list=None)
print({k:order[k] for k in list(order.keys())[:5]})
print("-------------------------------------------------------")


# Find the Geodesic distance (=shortest path distance between two nodes) between two lists of nodes
print(nw_obj.getGeodesicDistance(start_genes=['TP53', 'KRAS'], stop_genes=['ERBB2', 'BRAC1', 'GNB1']))


          Gene  Count
EP300    EP300   1087
RPS27A  RPS27A    978
GNB1      GNB1    972
UBA52    UBA52    941
PRKACA  PRKACA    920
-------------------------------------------------------
{0: [1717, 8563, 8564], 1717: [0, 15, 53, 55, 56, 57, 117, 118, 119, 120, 121, 122, 123, 133, 134, 276, 284, 427, 541, 605, 610, 627, 633, 636, 638, 650, 651, 653, 655, 660, 661, 662, 663, 666, 668, 672, 674, 680, 683, 684, 685, 721, 722, 723, 724, 725, 726, 731, 732, 946, 989, 1005, 1048, 1049, 1088, 1121, 1122, 1123, 1126, 1166, 1170, 1234, 1250, 1251, 1252, 1358, 1408, 1409, 1410, 1411, 1435, 1462, 1472, 1643, 1665, 1686, 1691, 1705, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1736, 1737, 1742, 1747, 1779, 1802, 1843, 1844, 1845, 1846, 1847, 1848, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 2063, 2070, 2071, 2133, 2473, 2521, 2522, 2525, 2528, 2576, 2590, 2674, 2718, 2881, 2882, 2883, 2999, 3023, 3098, 3107, 3108, 3109, 3114, 3115, 3116, 3117, 3184, 3232, 3240, 3241, 3242, 3243,

In [4]:
# Extract the disconnected components if there are any
print(f"The Reactome PPI graph of 2021 is {'' if nw_obj.isConnected else 'not'} connected")
print("-------------------------------------------------------")

components = nw_obj.getComponents(return_subgraphs=True, verbose=False)
print(f"{len(components)} disconnected components are found") 
# Each of the components can either be returned as a pd.DataFrame (return_subgraphs=False) or as
# object of the relevant class (return_subgraphs=True)



The Reactome PPI graph of 2021 is not connected
85 disconnected components are found


In [5]:
# If the graph is too large, or you are only interested in part of the graph we can subsample and return as class object
subsampled_ppi = nw_obj.subsetNetwork(nodes=random.sample(list(nw_obj.node_names), k=100), 
                                      inplace=False, keeplargestcomponent=False, and_or='and')
print("-------------------------------------------------------")

subsampled_ppi = nw_obj.subsetNetwork(nodes=random.sample(list(nw_obj.node_names), k=100), 
                                      inplace=False, keeplargestcomponent=False, and_or='or')


# Note that you can choose whether to subset the network with the input genes in a stringent manner (and_or='and') 
# where each node in the interaction is required to be in the input list. Or, a more general subset can be taken
# (and_or='or') where only one of the interaction partners needs to be in the input list

11 Nodes and 6 interactions
3211 Nodes and 4962 interactions


In [6]:
# The PyLouvain algorithm can be used to cluster nodes from the graph, this also be used to find initial structures
# in the graph that need to be retained in the latent space when performing Network Representaiotn Learning (NRL)
communities = nw_obj.findcommunities(verbose=True)


Object is a fully connected graph, returning object copy.
5 Nodes and 6 interactions
808 Nodes and 14686 interactions
808 Nodes and 14686 interactions
19 Nodes and 66 interactions
19 Nodes and 66 interactions
1782 Nodes and 29745 interactions
1782 Nodes and 29745 interactions
21 Nodes and 112 interactions
21 Nodes and 112 interactions
689 Nodes and 8293 interactions
689 Nodes and 8293 interactions
60 Nodes and 186 interactions
60 Nodes and 186 interactions
31 Nodes and 122 interactions
31 Nodes and 122 interactions
Object is a fully connected graph, returning object copy.
41 Nodes and 170 interactions
Object is a fully connected graph, returning object copy.
518 Nodes and 20631 interactions
645 Nodes and 9612 interactions
645 Nodes and 9612 interactions
3 Nodes and 2 interactions
3 Nodes and 2 interactions
6 Nodes and 8 interactions
6 Nodes and 8 interactions
Object is a fully connected graph, returning object copy.
14 Nodes and 58 interactions
Object is a fully connected graph, return

In [7]:
# As a convenience function, we can direcly use agglomerative clustering on the network object
nw_obj.clusterbydiffusion(kernel='LEX', alpha=0.01, nclusters=50, linkage='average', verbose=True)

Network Propagation Complete: 110 seconds




Size of the largest cluster: 13751
Size of the smallest cluster: 2
Number of clusters: 50


[['16-5-5',
  'A1CF',
  'A2M',
  'A4GNT',
  'AAAS',
  'AAG1',
  'AAK1',
  'AAMP',
  'AANAT',
  'AAR2',
  'AARS1',
  'AASDHPPT',
  'AATF',
  'AATK',
  'ABCA1',
  'ABCA12',
  'ABCA2',
  'ABCA3',
  'ABCA4',
  'ABCA7',
  'ABCB1',
  'ABCB10',
  'ABCB11',
  'ABCB4',
  'ABCB7',
  'ABCC2',
  'ABCC6',
  'ABCC8',
  'ABCC9',
  'ABCD1',
  'ABCD2',
  'ABCD3',
  'ABCD4',
  'ABCE1',
  'ABCF1',
  'ABCF2',
  'ABCF3',
  'ABCG1',
  'ABCG2',
  'ABCG4',
  'ABCG5',
  'ABCG8',
  'ABHD17A',
  'ABHD17B',
  'ABHD17C',
  'ABHD2',
  'ABHD4',
  'ABHD5',
  'ABI1',
  'ABI2',
  'ABI3',
  'ABL1',
  'ABL2',
  'ABLIM1',
  'ABLIM2',
  'ABLIM3',
  'ABR',
  'ABRAXAS1',
  'ABRAXAS2',
  'ABT1',
  'ABTB1',
  'ACAA1',
  'ACAA2',
  'ACACA',
  'ACACB',
  'ACAD9',
  'ACADL',
  'ACADM',
  'ACAN',
  'ACAP1',
  'ACAP2',
  'ACBD3',
  'ACBD5',
  'ACD',
  'ACE',
  'ACE2',
  'ACER1',
  'ACER2',
  'ACHE',
  'ACIN1',
  'ACKR1',
  'ACKR2',
  'ACKR3',
  'ACKR4',
  'ACLY',
  'ACO1',
  'ACO2',
  'ACOT1',
  'ACOT11',
  'ACOT12',
  'ACOT13',
  