# Processar as redes e extrair informações pelo IGraph

Neste notebook eu faço todos os preparativos para atribuir as features de rede a cada uma das redes, incluindo a rede UNITY.

**Neste notebook as redes estão traduzidas e já com o número de genes correspondente ao número de features disponíveis.**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene
import h5py
import networkx as nx
import seaborn as sns
%matplotlib inline

## Rede Parsimonious Composite Network (PCNet)

In [66]:
# Load the PCNet edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
pcnet_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/PCNET_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

pcnet_network

Unnamed: 0,source,target
0,RNF14,UBE2Q1
1,UBE2Q1,UBE2Q2
2,TMCO1,UBE2Q1
3,UBAC1,UBE2Q1
4,UBE2Q1,WWP1
...,...,...
2712876,SLC7A10,SLC7A14
2712877,SLC7A11,SLC7A14
2712878,SELE,SELP
2712879,SELP,SIGLEC1


In [67]:
# Test with the network:
# build an igraph graph from the edge list so that new vertex properties
# can be computed with igraph.

# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(pcnet_network, directed=False, use_vids=False)

In [68]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 19116
Number of edges: 2712881
Density of the graph: 0.01484873359048549


In [69]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [70]:
# Closeness of every vertex of g, computed with igraph's default arguments.
closeness = g.closeness()

# The normalization can be removed (normalized=False)

In [71]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [72]:
# Local (per-vertex) undirected transitivity, used as the clustering coefficient feature.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

# mode='zero' puts 0 instead of NaN

In [73]:
# 'name' attribute of every vertex of g; used as the gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [74]:
# Per-vertex feature table: one row per gene, one column per graph measure.
# NOTE(review): this section names the column 'gene' while the HPRD, IRefIndex,
# MultiNet, STRING and UNITY sections use 'Gene' — confirm which one the
# downstream code expects.
features = {
    'gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_pcnet = pd.DataFrame(features)

features_pcnet

Unnamed: 0,gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,2113.888728,0.428117,147,0.390830
1,A1CF,21015.474359,0.453995,378,0.146689
2,A2M,25571.804053,0.466766,528,0.161908
3,A2ML1,1304.695377,0.390413,70,0.265839
4,A3GALT2,0.000000,0.258046,2,1.000000
...,...,...,...,...,...
19111,ZYG11A,571.429360,0.412664,16,0.241667
19112,ZYG11B,4193.140233,0.434382,82,0.147546
19113,ZYX,45292.050381,0.487889,887,0.155705
19114,ZZEF1,8962.770517,0.447501,160,0.134277


In [75]:
# Sanity check: True if any clustering coefficient value is missing (NaN).
features_pcnet['clustering_coefficient'].isnull().values.any()

False

In [76]:
# Summary statistics of the numeric feature columns.
features_pcnet.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,19116.0,19116.0,19116.0,19116.0
mean,13126.59,0.426686,283.833543,0.26192
std,170020.4,0.045474,342.335498,0.166531
min,0.0,0.236166,1.0,0.0
25%,500.9621,0.399273,51.0,0.160059
50%,3131.099,0.437014,168.0,0.223593
75%,11269.56,0.460836,387.25,0.321432
max,22579550.0,0.671338,10008.0,1.0


In [77]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_pcnet_num = features_pcnet[colunas_para_normalizar]

In [78]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_pcnet_num.min()
col_span = features_pcnet_num.max() - col_min
normalized_pcnet = (features_pcnet_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_pcnet[normalized_pcnet.columns] = normalized_pcnet

features_pcnet

Unnamed: 0,gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000094,0.428117,0.014590,0.390830
1,A1CF,0.000931,0.453995,0.037674,0.146689
2,A2M,0.001133,0.466766,0.052663,0.161908
3,A2ML1,0.000058,0.390413,0.006895,0.265839
4,A3GALT2,0.000000,0.258046,0.000100,1.000000
...,...,...,...,...,...
19111,ZYG11A,0.000025,0.412664,0.001499,0.241667
19112,ZYG11B,0.000186,0.434382,0.008094,0.147546
19113,ZYX,0.002006,0.487889,0.088538,0.155705
19114,ZZEF1,0.000397,0.447501,0.015889,0.134277


In [79]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_pcnet.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_pcnet2.tsv', sep='\t')

## Rede ConsensusPathDB

In [53]:
# Load the ConsensusPathDB edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
cpdb_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/CPDB_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

cpdb_network

Unnamed: 0,source,target
0,RNF14,VDR
1,RNF14,SMAD4
2,RNF14,UBE2D4
3,RNF14,UBE2D2
4,RNF14,UBE2D3
...,...,...
1641844,CFL1,SERPINH1
1641845,DNM1L,PLEKHG1
1641846,MRPL23,SERPINH1
1641847,NFIA,NFIX


In [54]:
# Test with the network:
# build an igraph graph from the edge list so that new vertex properties
# can be computed with igraph.

# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(cpdb_network, directed=False, use_vids=False)

In [55]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 16243
Number of edges: 1641849
Density of the graph: 0.012446792743046529


In [56]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [57]:
# Closeness of every vertex of g, computed with igraph's default arguments.
# NOTE(review): the summary statistics recorded for this table show a maximum
# closeness of 1.0 — presumably vertices in very small disconnected
# components; confirm that this is acceptable for the feature table.
closeness = g.closeness()

In [58]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [59]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [60]:
# 'name' attribute of every vertex of g; used as the gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [61]:
# Per-vertex feature table: one row per gene, one column per graph measure.
# NOTE(review): this section names the column 'gene' while the HPRD, IRefIndex,
# MultiNet, STRING and UNITY sections use 'Gene' — confirm which one the
# downstream code expects.
features = {
    'gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_cpdb = pd.DataFrame(features)

features_cpdb

Unnamed: 0,gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,16417.445370,0.330832,12,0.045455
1,A1CF,466.288346,0.347189,13,0.089744
2,A2M,111535.227827,0.422422,231,0.159081
3,A2ML1,5620.194837,0.411993,206,0.711295
4,A4GALT,38.784912,0.294981,5,0.000000
...,...,...,...,...,...
16238,ZXDC,11606.867427,0.391348,87,0.739642
16239,ZYG11B,2749.092811,0.377584,73,0.554033
16240,ZYX,60064.116263,0.464479,1279,0.720111
16241,ZZEF1,7085.116494,0.389488,130,0.266547


In [62]:
# Summary statistics of the numeric feature columns.
features_cpdb.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,16243.0,16243.0,16243.0,16243.0
mean,14391.1,0.368269,202.160808,0.339283
std,76413.64,0.055545,379.520736,0.290625
min,0.0,0.190106,1.0,0.0
25%,43.89172,0.33142,6.0,0.086957
50%,940.9602,0.364443,33.0,0.293151
75%,8831.295,0.404191,209.0,0.523762
max,7406854.0,1.0,2685.0,1.0


In [63]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_cpdb_num = features_cpdb[colunas_para_normalizar]

In [64]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_cpdb_num.min()
col_span = features_cpdb_num.max() - col_min
normalized_cpdb = (features_cpdb_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_cpdb[normalized_cpdb.columns] = normalized_cpdb

features_cpdb

Unnamed: 0,gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.002217,0.330832,0.004098,0.045455
1,A1CF,0.000063,0.347189,0.004471,0.089744
2,A2M,0.015058,0.422422,0.085693,0.159081
3,A2ML1,0.000759,0.411993,0.076379,0.711295
4,A4GALT,0.000005,0.294981,0.001490,0.000000
...,...,...,...,...,...
16238,ZXDC,0.001567,0.391348,0.032042,0.739642
16239,ZYG11B,0.000371,0.377584,0.026826,0.554033
16240,ZYX,0.008109,0.464479,0.476155,0.720111
16241,ZZEF1,0.000957,0.389488,0.048063,0.266547


In [65]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_cpdb.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_cpdb2.tsv', sep='\t')

## Rede HPRD

In [2]:
# Load the HPRD edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
HPRD_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/HPRD_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

HPRD_network

Unnamed: 0,source,target
0,NR3C1,RNF14
1,NCOA4,RNF14
2,ESR1,RNF14
3,RNF14,UBE2E1
4,RNF14,TENT5A
...,...,...
36839,SELE,SELL
36840,CRYAA,CRYAB
36841,PLAT,PLAU
36842,NFIB,NFIC


In [3]:
# Build an undirected igraph graph from the HPRD edge list.
# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(HPRD_network, directed=False, use_vids=False)

In [4]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 9438
Number of edges: 36844
Density of the graph: 0.0008273377506666206


In [5]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [6]:
# Closeness of every vertex of g, computed with igraph's default arguments.
# NOTE(review): the recorded output shows closeness 1.0 for A1BG, a vertex of
# degree 1 with betweenness 0 — presumably it sits in a tiny disconnected
# component; confirm that this is acceptable for the feature table.
closeness = g.closeness()

In [7]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [8]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [9]:
# 'name' attribute of every vertex of g; used as the Gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [20]:
# Per-vertex feature table: one row per gene, one column per graph measure.
features = {
    'Gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_HPRD = pd.DataFrame(features)

features_HPRD

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000000,1.000000,1,0.000000
1,A1CF,1085.665471,0.216218,5,0.100000
2,A2M,93056.528595,0.268178,28,0.010582
3,AAGAB,193.787629,0.235709,2,0.000000
4,AAMP,0.000000,0.211438,1,0.000000
...,...,...,...,...,...
9433,ZWINT,1608.560504,0.213496,4,0.333333
9434,ZXDC,47.097132,0.227652,3,0.000000
9435,ZYG11B,0.000000,0.202896,2,1.000000
9436,ZYX,14002.501200,0.271666,15,0.038095


In [21]:
# Summary statistics of the numeric feature columns.
features_HPRD.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,9438.0,9438.0,9438.0,9438.0
mean,14434.63,0.258789,7.807586,0.103293
std,61342.77,0.116118,14.51105,0.209445
min,0.0,0.102681,1.0,0.0
25%,0.0,0.221024,1.0,0.0
50%,830.5735,0.242942,3.0,0.0
75%,9252.461,0.263241,8.0,0.107143
max,2051643.0,1.0,269.0,1.0


In [24]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_HPRD_num = features_HPRD[colunas_para_normalizar]

In [25]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_HPRD_num.min()
col_span = features_HPRD_num.max() - col_min
normalized_HPRD = (features_HPRD_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_HPRD[normalized_HPRD.columns] = normalized_HPRD

features_HPRD

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000000,1.000000,0.000000,0.000000
1,A1CF,0.000529,0.216218,0.014925,0.100000
2,A2M,0.045357,0.268178,0.100746,0.010582
3,AAGAB,0.000094,0.235709,0.003731,0.000000
4,AAMP,0.000000,0.211438,0.000000,0.000000
...,...,...,...,...,...
9433,ZWINT,0.000784,0.213496,0.011194,0.333333
9434,ZXDC,0.000023,0.227652,0.007463,0.000000
9435,ZYG11B,0.000000,0.202896,0.003731,1.000000
9436,ZYX,0.006825,0.271666,0.052239,0.038095


In [26]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_HPRD.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_HPRD2.tsv', sep='\t')

## Rede IRefIndex

In [40]:
# Load the IRefIndex edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
IRefIndex_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/IREF_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

IRefIndex_network

Unnamed: 0,source,target
0,AR,RNF14
1,RNF14,TCF3
2,RNF14,UBE2E1
3,RNF14,UBE2D4
4,RNF14,UBE2D3
...,...,...
133090,SELL,VCAN
133091,SELP,VCAN
133092,CAND1,LDHD
133093,CAND1,LDHA


In [41]:
# Test with the network:
# build an igraph graph from the edge list so that new vertex properties
# can be computed with igraph.

# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(IRefIndex_network, directed=False, use_vids=False)

In [42]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 14627
Number of edges: 133095
Density of the graph: 0.0012442593294278452


In [43]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [44]:
# Closeness of every vertex of g, computed with igraph's default arguments.
# NOTE(review): the summary statistics recorded for this table show a maximum
# closeness of 1.0 — presumably vertices in very small disconnected
# components; confirm that this is acceptable for the feature table.
closeness = g.closeness()

In [45]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [46]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [47]:
# 'name' attribute of every vertex of g; used as the Gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [48]:
# Per-vertex feature table: one row per gene, one column per graph measure.
features = {
    'Gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_IRefIndex = pd.DataFrame(features)

features_IRefIndex

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,14945.627114,0.313320,10,0.022222
1,A1CF,609.936600,0.311720,8,0.035714
2,A2M,80077.362083,0.350583,95,0.034938
3,AAAS,175.276561,0.303527,3,0.000000
4,AACS,240.606680,0.397731,2,0.000000
...,...,...,...,...,...
14622,ZYG11A,0.000000,0.397147,1,0.000000
14623,ZYG11B,567.593206,0.399803,7,0.571429
14624,ZYX,41430.684930,0.411592,70,0.081573
14625,ZZEF1,3366.198437,0.398797,12,0.136364


In [49]:
# Summary statistics of the numeric feature columns.
features_IRefIndex.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,14627.0,14627.0,14627.0,14627.0
mean,13745.07,0.356689,18.198537,0.19092
std,591842.2,0.062456,75.565904,0.248986
min,0.0,0.159775,1.0,0.0
25%,4.728685,0.30436,2.0,0.0
50%,368.5127,0.397147,6.0,0.109091
75%,4033.74,0.400868,17.0,0.268197
max,71415020.0,1.0,7855.0,1.0


In [50]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_IRefIndex_num = features_IRefIndex[colunas_para_normalizar]

In [51]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_IRefIndex_num.min()
col_span = features_IRefIndex_num.max() - col_min
normalized_IRefIndex = (features_IRefIndex_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_IRefIndex[normalized_IRefIndex.columns] = normalized_IRefIndex

features_IRefIndex

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000209,0.313320,0.001146,0.022222
1,A1CF,0.000009,0.311720,0.000891,0.035714
2,A2M,0.001121,0.350583,0.011968,0.034938
3,AAAS,0.000002,0.303527,0.000255,0.000000
4,AACS,0.000003,0.397731,0.000127,0.000000
...,...,...,...,...,...
14622,ZYG11A,0.000000,0.397147,0.000000,0.000000
14623,ZYG11B,0.000008,0.399803,0.000764,0.571429
14624,ZYX,0.000580,0.411592,0.008785,0.081573
14625,ZZEF1,0.000047,0.398797,0.001401,0.136364


In [52]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_IRefIndex.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_IRefIndex2.tsv', sep='\t')

## Rede MultiNet

In [27]:
# Load the MultiNet edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
MultiNet_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/MULTINET_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

MultiNet_network

Unnamed: 0,source,target
0,RNF14,VDR
1,ACVR1,RNF14
2,DYRK2,RNF14
3,RNF14,UBE2E1
4,RNF14,UBE2D4
...,...,...
108563,LDHB,LDHD
108564,LDHC,LDHD
108565,LDHA,LDHB
108566,LDHA,LDHC


In [28]:
# Test with the network:
# build an igraph graph from the edge list so that new vertex properties
# can be computed with igraph.

# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(MultiNet_network, directed=False, use_vids=False)

In [29]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 13987
Number of edges: 108568
Density of the graph: 0.0011099763727203493


In [30]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [31]:
# Closeness of every vertex of g, computed with igraph's default arguments.
# NOTE(review): the recorded output shows closeness 1.0 for A1BG, a vertex of
# degree 1 with betweenness 0 — presumably it sits in a tiny disconnected
# component; confirm that this is acceptable for the feature table.
closeness = g.closeness()

In [32]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [33]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [34]:
# 'name' attribute of every vertex of g; used as the Gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [35]:
# Per-vertex feature table: one row per gene, one column per graph measure.
features = {
    'Gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_MultiNet = pd.DataFrame(features)

features_MultiNet

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000000,1.000000,1,0.000000
1,A1CF,13991.165081,0.270370,5,0.100000
2,A2M,29383.925891,0.321576,20,0.036842
3,AAAS,690.777544,0.303816,2,0.000000
4,AACS,121.853872,0.272239,14,0.670330
...,...,...,...,...,...
13982,ZXDC,6.240442,0.277358,3,0.000000
13983,ZYG11B,0.000000,0.262454,1,0.000000
13984,ZYX,58643.466095,0.356741,147,0.063554
13985,ZZEF1,64.776437,0.250521,3,0.000000


In [36]:
# Summary statistics of the numeric feature columns.
features_MultiNet.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,13987.0,13987.0,13987.0,13987.0
mean,16547.66,0.301565,15.52413,0.145295
std,164765.6,0.049578,42.936628,0.228178
min,0.0,0.164976,1.0,0.0
25%,5.381284,0.276362,2.0,0.0
50%,652.3142,0.299097,5.0,0.048905
75%,4932.319,0.32248,13.0,0.190476
max,11157570.0,1.0,1494.0,1.0


In [37]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_MultiNet_num = features_MultiNet[colunas_para_normalizar]

In [38]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_MultiNet_num.min()
col_span = features_MultiNet_num.max() - col_min
normalized_MultiNet = (features_MultiNet_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_MultiNet[normalized_MultiNet.columns] = normalized_MultiNet

features_MultiNet

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000000e+00,1.000000,0.000000,0.000000
1,A1CF,1.253962e-03,0.270370,0.002679,0.100000
2,A2M,2.633543e-03,0.321576,0.012726,0.036842
3,AAAS,6.191113e-05,0.303816,0.000670,0.000000
4,AACS,1.092119e-05,0.272239,0.008707,0.670330
...,...,...,...,...,...
13982,ZXDC,5.593014e-07,0.277358,0.001340,0.000000
13983,ZYG11B,0.000000e+00,0.262454,0.000000,0.000000
13984,ZYX,5.255937e-03,0.356741,0.097790,0.063554
13985,ZZEF1,5.805607e-06,0.250521,0.001340,0.000000


In [39]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_MultiNet.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_MultiNet2.tsv', sep='\t')

## Rede STRING

In [80]:
# Load the STRING edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
STRING_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/STRING_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

STRING_network

Unnamed: 0,source,target
0,RNF14,UBE2Q1
1,RNF14,UBE2Q2
2,RNF11,RNF14
3,PPP3R1,RNF14
4,HSPA4,RNF14
...,...,...
5002407,SLC7A13,SLC7A14
5002408,SELE,SELP
5002409,SLC7A10,SLC7A13
5002410,SLC7A11,SLC7A13


In [81]:
# Test with the network:
# build an igraph graph from the edge list so that new vertex properties
# can be computed with igraph.

# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(STRING_network, directed=False, use_vids=False)

In [82]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 17872
Number of edges: 5002412
Density of the graph: 0.031324737661587145


In [83]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [84]:
# Closeness of every vertex of g, computed with igraph's default arguments.
closeness = g.closeness()

In [85]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [86]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [87]:
# 'name' attribute of every vertex of g; used as the Gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [88]:
# Per-vertex feature table: one row per gene, one column per graph measure.
features = {
    'Gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_STRING = pd.DataFrame(features)

features_STRING

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,5253.644977,0.478295,428,0.374335
1,A1CF,648.937435,0.447816,122,0.220566
2,A2M,7839.773306,0.485717,537,0.274202
3,A2ML1,3595.793085,0.433100,140,0.170606
4,A4GALT,764.478846,0.444630,75,0.179099
...,...,...,...,...,...
17867,ZYG11A,175.391474,0.397876,49,0.380102
17868,ZYG11B,616.325903,0.416972,66,0.265268
17869,ZYX,8106.738542,0.490275,500,0.265828
17870,ZZEF1,14118.069789,0.495591,808,0.485185


In [89]:
# Summary statistics of the numeric feature columns.
features_STRING.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,17872.0,17872.0,17872.0,17872.0
mean,10515.19,0.463544,559.804387,0.31544
std,35184.01,0.042582,655.531382,0.145698
min,0.0,0.269073,1.0,0.0
25%,1085.367,0.435899,110.0,0.223041
50%,3401.583,0.470197,310.0,0.297449
75%,9738.051,0.494251,758.0,0.378183
max,2171985.0,0.632759,7532.0,1.0


In [90]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_STRING_num = features_STRING[colunas_para_normalizar]

In [91]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_STRING_num.min()
col_span = features_STRING_num.max() - col_min
normalized_STRING = (features_STRING_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_STRING[normalized_STRING.columns] = normalized_STRING

features_STRING

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.002419,0.478295,0.056699,0.374335
1,A1CF,0.000299,0.447816,0.016067,0.220566
2,A2M,0.003609,0.485717,0.071172,0.274202
3,A2ML1,0.001656,0.433100,0.018457,0.170606
4,A4GALT,0.000352,0.444630,0.009826,0.179099
...,...,...,...,...,...
17867,ZYG11A,0.000081,0.397876,0.006374,0.380102
17868,ZYG11B,0.000284,0.416972,0.008631,0.265268
17869,ZYX,0.003732,0.490275,0.066259,0.265828
17870,ZZEF1,0.006500,0.495591,0.107157,0.485185


In [92]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_STRING.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_STRING2.tsv', sep='\t')

## UNITY Network

In [93]:
# Load the UNITY edge list and keep only the two endpoint columns
# (the 'weight' column and the saved index column are discarded).
UNITY_network = pd.read_csv(
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final/UNITY_network.tsv',
    sep='\t',
).drop(columns=['weight', 'Unnamed: 0'])

UNITY_network

Unnamed: 0,source,target
0,TMCO1,UBE2Q1
1,UBE2Q1,ZNF706
2,RNF115,UBE2Q1
3,METTL13,UBE2Q1
4,DHX8,UBE2Q1
...,...,...
7365077,SLC7A13,SLC7A14
7365078,SELE,SELP
7365079,SLC7A10,SLC7A13
7365080,SLC7A11,SLC7A13


In [94]:
# Build an undirected igraph graph from the UNITY edge list.
# Import only the name that is used; a star import floods the notebook
# namespace with every public igraph name.
from igraph import Graph

# use_vids=False: the two edge columns hold gene symbols (vertex names),
# not integer vertex ids. python-igraph's documented default is
# use_vids=True, which rejects non-integer columns.
# NOTE(review): assumes the installed python-igraph accepts `use_vids` — confirm.
g = Graph.DataFrame(UNITY_network, directed=False, use_vids=False)

In [95]:
# Report the size of the graph and its density.
n_vertices = g.vcount()
n_edges = g.ecount()

print("Number of vertices:", n_vertices)
print("Number of edges:", n_edges)
# Density as edges over possible vertex pairs: 2m / (n * (n - 1)).
print("Density of the graph:", 2*n_edges/(n_vertices*(n_vertices-1)))

Number of vertices: 19602
Number of edges: 7365082
Density of the graph: 0.038337957235106884


In [96]:
# Betweenness of every vertex of g, computed with igraph's default arguments.
betweenness = g.betweenness()

In [97]:
# Closeness of every vertex of g, computed with igraph's default arguments.
closeness = g.closeness()

In [98]:
# Degree (number of incident edges) of every vertex of g.
degree = g.degree()

In [99]:
# Local (per-vertex) undirected transitivity, used as the clustering
# coefficient feature; mode='zero' gives 0 instead of NaN.
clustering_coefficient = g.transitivity_local_undirected(mode='zero')

In [100]:
# 'name' attribute of every vertex of g; used as the Gene column of the feature table.
named_vertex_list = g.vs()["name"]

In [101]:
# Per-vertex feature table: one row per gene, one column per graph measure.
features = {
    'Gene': named_vertex_list,
    'betweenness': betweenness,
    'closeness': closeness,
    'degree': degree,
    'clustering_coefficient': clustering_coefficient,
}
features_UNITY = pd.DataFrame(features)

features_UNITY

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,2700.808523,0.483569,456,0.366879
1,A1CF,6200.188933,0.482688,435,0.158292
2,A2M,11953.037870,0.495989,923,0.215963
3,A2ML1,3685.027173,0.481006,353,0.360546
4,A3GALT2,0.000000,0.324552,2,1.000000
...,...,...,...,...,...
19597,ZYG11A,95.002963,0.439337,58,0.372656
19598,ZYG11B,1235.513121,0.467069,163,0.245172
19599,ZYX,31136.844003,0.522178,2148,0.358591
19600,ZZEF1,14047.078172,0.498335,960,0.382458


In [102]:
# Summary statistics of the numeric feature columns.
features_UNITY.describe()

# Check that closeness does not fall outside the 0-to-1 interval

Unnamed: 0,betweenness,closeness,degree,clustering_coefficient
count,19602.0,19602.0,19602.0,19602.0
mean,10967.45,0.475996,751.4623,0.313689
std,53359.24,0.041046,798.479187,0.161185
min,0.0,0.246018,1.0,0.0
25%,905.4893,0.462081,165.0,0.220145
50%,3553.262,0.484657,490.0,0.28518
75%,10438.66,0.501519,1077.0,0.368292
max,5881562.0,0.68761,10765.0,1.0


In [103]:
# Columns that will be min-max scaled; closeness and clustering_coefficient
# are not in this list and stay as computed.
colunas_para_normalizar = ['betweenness', 'degree']

# Sub-table holding only the columns to rescale.
features_UNITY_num = features_UNITY[colunas_para_normalizar]

In [104]:
# Min-max scale each selected column: (x - min) / (max - min).
col_min = features_UNITY_num.min()
col_span = features_UNITY_num.max() - col_min
normalized_UNITY = (features_UNITY_num - col_min) / col_span

# Overwrite the raw columns of the feature table with their scaled versions.
features_UNITY[normalized_UNITY.columns] = normalized_UNITY

features_UNITY

Unnamed: 0,Gene,betweenness,closeness,degree,clustering_coefficient
0,A1BG,0.000459,0.483569,0.042271,0.366879
1,A1CF,0.001054,0.482688,0.040320,0.158292
2,A2M,0.002032,0.495989,0.085656,0.215963
3,A2ML1,0.000627,0.481006,0.032702,0.360546
4,A3GALT2,0.000000,0.324552,0.000093,1.000000
...,...,...,...,...,...
19597,ZYG11A,0.000016,0.439337,0.005295,0.372656
19598,ZYG11B,0.000210,0.467069,0.015050,0.245172
19599,ZYX,0.005294,0.522178,0.199461,0.358591
19600,ZZEF1,0.002388,0.498335,0.089093,0.382458


In [105]:
# Persist the feature table as a tab-separated file; `index` is left at its
# default, so the row index is written as an extra leading column.
features_UNITY.to_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/features_das_redes/features_UNITY2.tsv', sep='\t')