# Pré-processamento das redes PPI extraídas de [NDEx network exchange](http://www.ndexbio.org/#/networkset/e8ebbdde-86dc-11e7-a10d-0ac135e8bacf?accesskey=7fbd23635b798321954e66c63526c46397a3f45b40298cf43f22d07d4feed0fa)

Neste notebook eu faço o processamento mais básico das redes vindas do site, removendo as colunas que não serão usadas e salvando os arquivos.

In [1]:
import numpy as np
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Rede Parsimonious Composite Network (PCNet)

In [39]:
# Open the CX export (a JSON list of aspect fragments) with the json module.

pcnet_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/Parsimonious Composite Network (PCNet).cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(pcnet_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
pcnet_nodes = []
pcnet_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        pcnet_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        pcnet_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not pcnet_nodes or not pcnet_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(pcnet_file))
print("Loaded PCNet PPI network with {} nodes and {} edges".format(len(pcnet_nodes), len(pcnet_edges)))

Loaded PCNet PPI network with 19781 nodes and 2724724 edges


In [40]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(pcnet_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(pcnet_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']
edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,1,UBE2Q1,RNF14
1,0,2,UBE2Q1,UBE2Q2
2,0,6239,UBE2Q1,TMCO1
3,0,7486,UBE2Q1,UBAC1
4,0,6851,UBE2Q1,WWP1


In [41]:
# Write the named PCNet edge list to disk as a tab-separated file
# (the DataFrame index is written too, as the first column).
edgelist_names.to_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/pcnet_edgelist.tsv',
    sep='\t',
)

# Salvando o processamento puro das redes (Mesmo processamento usado no EMOGI)

In [42]:
# Reload the saved PCNet edge list; the columns that the GAT will not use are
# removed in the following step.
pcnet_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/pcnet_edgelist.tsv',
    sep='\t',
)

In [43]:
# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
pcnet_network = pcnet_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

pcnet_network

Unnamed: 0,Source_Name,Target_Name
0,UBE2Q1,RNF14
1,UBE2Q1,UBE2Q2
2,UBE2Q1,TMCO1
3,UBE2Q1,UBAC1
4,UBE2Q1,WWP1
...,...,...
2724719,SLC7A14,SLC7A10
2724720,SLC7A14,SLC7A11
2724721,SELE,SELP
2724722,SIGLEC1,SELP


## Rede ConsensusPathDB

In [44]:
# Open the ConsensusPathDB CX export (a JSON list of aspect fragments).
CPDB_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/ConsensusPathDB.cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(CPDB_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
CPDB_nodes = []
CPDB_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        CPDB_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        CPDB_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not CPDB_nodes or not CPDB_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(CPDB_file))
print("Loaded CPDB PPI network with {} nodes and {} edges".format(len(CPDB_nodes), len(CPDB_edges)))

Loaded CPDB PPI network with 16301 nodes and 1648426 edges


In [46]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(CPDB_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(CPDB_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']

# Save the plain processed network (same processing as used in EMOGI).
edgelist_names.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/cpdb_edgelist.tsv', sep='\t')

edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,10016,RNF14,VDR
1,0,132,RNF14,SMAD4
2,0,12577,RNF14,UBE2D4
3,0,2099,RNF14,UBE2D2
4,0,2100,RNF14,UBE2D3


In [47]:
# Reload the saved CPDB edge list from disk.
cpdb_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/cpdb_edgelist.tsv',
    sep='\t',
)

# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
cpdb_network = cpdb_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

cpdb_network

Unnamed: 0,Source_Name,Target_Name
0,RNF14,VDR
1,RNF14,SMAD4
2,RNF14,UBE2D4
3,RNF14,UBE2D2
4,RNF14,UBE2D3
...,...,...
1648421,CFL1,SERPINH1
1648422,DNM1L,PLEKHG1
1648423,SERPINH1,MRPL23
1648424,NFIA,NFIX


## Rede HPRD

In [2]:
# Open the HPRD CX export (a JSON list of aspect fragments).
HPRD_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/HPRD.cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(HPRD_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
HPRD_nodes = []
HPRD_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        HPRD_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        HPRD_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not HPRD_nodes or not HPRD_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(HPRD_file))
print("Loaded HPRD PPI network with {} nodes and {} edges".format(len(HPRD_nodes), len(HPRD_edges)))

Loaded HPRD PPI network with 9465 nodes and 37039 edges


In [55]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(HPRD_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(HPRD_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']

# Save the plain processed network (same processing as used in EMOGI).
edgelist_names.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/HPRD_edgelist.tsv', sep='\t')

edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,5802,RNF14,NR3C1
1,0,3134,RNF14,NCOA4
2,0,3992,RNF14,ESR1
3,0,6093,RNF14,UBE2E1
4,0,7372,RNF14,FAM46A


In [56]:
# Reload the saved HPRD edge list from disk.
HPRD_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/HPRD_edgelist.tsv',
    sep='\t',
)

# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
HPRD_network = HPRD_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

HPRD_network

Unnamed: 0,Source_Name,Target_Name
0,RNF14,NR3C1
1,RNF14,NCOA4
2,RNF14,ESR1
3,RNF14,UBE2E1
4,RNF14,FAM46A
...,...,...
37034,SELL,SELE
37035,CRYAB,CRYAA
37036,PLAU,PLAT
37037,NFIC,NFIB


## Rede IRefIndex

In [57]:
# Open the IRefIndex CX export (a JSON list of aspect fragments).
IRefIndex_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/IRefIndex.cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(IRefIndex_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
IRefIndex_nodes = []
IRefIndex_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        IRefIndex_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        IRefIndex_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not IRefIndex_nodes or not IRefIndex_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(IRefIndex_file))
print("Loaded IRefIndex PPI network with {} nodes and {} edges".format(len(IRefIndex_nodes), len(IRefIndex_edges)))

Loaded IRefIndex PPI network with 14667 nodes and 133548 edges


In [58]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(IRefIndex_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(IRefIndex_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']

# Save the plain processed network (same processing as used in EMOGI).
edgelist_names.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/IRefIndex_edgelist.tsv', sep='\t')

edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,6496,RNF14,AR
1,0,11271,RNF14,TCF3
2,0,9500,RNF14,UBE2E1
3,0,1920,RNF14,UBE2D4
4,0,1923,RNF14,UBE2D3


In [59]:
# Reload the saved IRefIndex edge list from disk.
IRefIndex_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/IRefIndex_edgelist.tsv',
    sep='\t',
)

# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
IRefIndex_network = IRefIndex_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

IRefIndex_network

Unnamed: 0,Source_Name,Target_Name
0,RNF14,AR
1,RNF14,TCF3
2,RNF14,UBE2E1
3,RNF14,UBE2D4
4,RNF14,UBE2D3
...,...,...
133543,VCAN,SELL
133544,VCAN,SELP
133545,CAND1,LDHD
133546,CAND1,LDHA


## Rede MultiNet

In [60]:
# Open the MultiNet CX export (a JSON list of aspect fragments).
MultiNet_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/MultiNet.cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(MultiNet_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
MultiNet_nodes = []
MultiNet_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        MultiNet_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        MultiNet_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not MultiNet_nodes or not MultiNet_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(MultiNet_file))
print("Loaded MultiNet PPI network with {} nodes and {} edges".format(len(MultiNet_nodes), len(MultiNet_edges)))

Loaded MultiNet PPI network with 14445 nodes and 109598 edges


In [61]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(MultiNet_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(MultiNet_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']

# Save the plain processed network (same processing as used in EMOGI).
edgelist_names.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/MultiNet_edgelist.tsv', sep='\t')

edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,8882,RNF14,VDR
1,0,11698,RNF14,ACVR1
2,0,11873,RNF14,DYRK2
3,0,9359,RNF14,UBE2E1
4,0,1911,RNF14,UBE2D4


In [62]:
# Reload the saved MultiNet edge list from disk.
MultiNet_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/MultiNet_edgelist.tsv',
    sep='\t',
)

# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
MultiNet_network = MultiNet_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

MultiNet_network

Unnamed: 0,Source_Name,Target_Name
0,RNF14,VDR
1,RNF14,ACVR1
2,RNF14,DYRK2
3,RNF14,UBE2E1
4,RNF14,UBE2D4
...,...,...
109593,LDHD,LDHB
109594,LDHD,LDHC
109595,LDHA,LDHB
109596,LDHA,LDHC


## Rede STRING

In [63]:
# Open the STRING CX export (a JSON list of aspect fragments).
STRING_file = 'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/STRING.cx'
# CX is JSON, so decode it as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(STRING_file, encoding='utf-8') as cx_f:
    all_network_components = json.load(cx_f)

# CX allows an aspect to be split across several fragments, so accumulate
# every 'nodes' / 'edges' fragment instead of keeping only the last one seen.
STRING_nodes = []
STRING_edges = []
for comp in all_network_components:
    if 'nodes' in comp:
        STRING_nodes.extend(comp['nodes'])
    if 'edges' in comp:
        STRING_edges.extend(comp['edges'])

# Fail here with a clear message; without node or edge records the DataFrame
# construction in the next step cannot work anyway.
if not STRING_nodes or not STRING_edges:
    raise ValueError("No 'nodes' or 'edges' elements found in {}".format(STRING_file))
print("Loaded STRING PPI network with {} nodes and {} edges".format(len(STRING_nodes), len(STRING_edges)))

Loaded STRING PPI network with 18266 nodes and 5135768 edges


In [64]:
# Build DataFrames for the edges and the nodes.
# NOTE(review): the positional renames below assume that, apart from '@id',
# edge records yield (source, target) and node records yield
# (name, represents) in that column order — confirm against the CX file.
edgelist = pd.DataFrame(STRING_edges).drop('@id', axis=1)
edgelist.columns = ['Source', 'Target']
# Keep the node '@id' as the index: CX edges reference their endpoints by node
# '@id', so the joins below must look names up by identifier rather than by
# the row position a default RangeIndex would give.
nodes = pd.DataFrame(STRING_nodes).set_index('@id')
nodes.columns = ['Name', 'hgnc_symbol']

# Join the edge list with the node names so every edge carries readable
# endpoint names next to the numeric ids.
edgelist_names_source = edgelist.join(nodes.drop('hgnc_symbol', axis=1), on='Source')
edgelist_names_source.columns = ['Source', 'Target', 'Source_Name']
edgelist_names = edgelist_names_source.join(nodes.drop('hgnc_symbol', axis=1), on='Target')
edgelist_names.columns = ['Source', 'Target', 'Source_Name', 'Target_Name']

# Save the plain processed network (same processing as used in EMOGI).
edgelist_names.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/STRING_edgelist.tsv', sep='\t')

edgelist_names.head()

Unnamed: 0,Source,Target,Source_Name,Target_Name
0,0,1,RNF14,UBE2Q1
1,0,10,RNF14,UBE2Q2
2,0,4,RNF14,RNF11
3,0,15951,RNF14,PPP3R1
4,0,13718,RNF14,HSPA4


In [65]:
# Reload the saved STRING edge list from disk.
STRING_edgelist = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/networks/Netwoks_NDEx/STRING_edgelist.tsv',
    sep='\t',
)

# Keep only the two name columns: 'Unnamed: 0' is the row index that was
# written to the TSV, and 'Source' / 'Target' are the numeric endpoint ids.
STRING_network = STRING_edgelist.drop(['Unnamed: 0', 'Source', 'Target'], axis=1)

STRING_network

Unnamed: 0,Source_Name,Target_Name
0,RNF14,UBE2Q1
1,RNF14,UBE2Q2
2,RNF14,RNF11
3,RNF14,PPP3R1
4,RNF14,HSPA4
...,...,...
5135763,SLC7A14,SLC7A13
5135764,SELE,SELP
5135765,SLC7A10,SLC7A13
5135766,SLC7A11,SLC7A13
