# GCN data requirements:

https://github.com/kimiyoung/planetoid#prepare-the-data

https://github.com/tkipf/gcn#data

In [1]:
import sys
import random
import numpy as np
import pandas as pd
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from IPython.display import clear_output, display, HTML

# Configuration shared by the cells below.
SEED = 42 # seeds Python's `random` here and is passed as random_state to the splits below
LABEL_RATE = 0.2 # labeled training nodes / total nodes; candidate values: [0.2, 0.1, 0.05]
random.seed(SEED)
CELL_LINE = 'GM12878' # also the directory that input/output files are read from and written to
K_MER = 5 # selects the input file ep_sentences_<K_MER>mer.csv
TEST_EID = 981 # arbitrary enhancer id to be used in tests

## Create a new DF where each element is a tuple of 3 elements: (id, name, sequence)

In [2]:
def getNodeById(df_ep, node_id):
    """Return the first node in `df_ep` whose id equals `node_id`.

    Rows are scanned in order and, within a row, the enhancer is checked
    before the promoter. Iteration is positional, so the lookup works for
    any DataFrame index, not only the default 0..n-1 RangeIndex.

    Parameters
    ----------
    df_ep : pandas.DataFrame
        Frame with 'enhancer' and 'promoter' columns whose cells are
        sequences carrying the node id at position 0 (in this notebook,
        (id, name, sentence) tuples).
    node_id : int
        Id of the node to look up.

    Returns
    -------
    tuple or None
        The matching cell, or None when no node has that id.
    """
    for enh, pro in zip(df_ep['enhancer'], df_ep['promoter']):
        if enh[0] == node_id:
            return enh
        if pro[0] == node_id:
            return pro
    # Make the "not found" outcome explicit instead of falling off the end.
    return None

In [3]:
# Load the enhancer-promoter pairs. The columns used below are
# enhancer_name, enhancer_sentence, promoter_name and promoter_sentence.
df_ep_sentences = pd.read_csv('{}/ep_sentences_{}mer.csv'.format(CELL_LINE, K_MER))

# Give every distinct node name (enhancer or promoter) an integer id,
# numbered from 0 in sorted-name order.
x_list = sorted(set(df_ep_sentences['enhancer_name']) | set(df_ep_sentences['promoter_name']))
id_dict = {name: node_id for node_id, name in enumerate(x_list)}

# DUMP ID_DICT
# The context manager closes the file even if pickling fails.
with open('{}/nodes'.format(CELL_LINE), "wb") as nodes_file:
    pkl.dump(id_dict, nodes_file)

# One (id, name, sentence) tuple per row, for each side of the pair.
e_list = [
    (id_dict[name], name, sentence)
    for name, sentence in zip(df_ep_sentences['enhancer_name'],
                              df_ep_sentences['enhancer_sentence'])
]
p_list = [
    (id_dict[name], name, sentence)
    for name, sentence in zip(df_ep_sentences['promoter_name'],
                              df_ep_sentences['promoter_sentence'])
]

df_ep = pd.DataFrame({'enhancer': e_list, 'promoter': p_list})
display(df_ep.head())

Unnamed: 0,enhancer,promoter
0,"(1265, GM12878|chr1:9685722-9686400, TGACA GAC...","(1266, GM12878|chr1:9747084-9749721, TTTTG TTT..."
1,"(1199, GM12878|chr1:24136556-24136600, GTGGC T...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
2,"(1200, GM12878|chr1:24136600-24136932, GAAAC A...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
3,"(1201, GM12878|chr1:24137625-24137875, GTGCC T...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
4,"(1202, GM12878|chr1:24139145-24139414, GCCCA C...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."


In [4]:
# Sanity check: fetch the reference enhancer and count the words in its sentence.
test_enh = getNodeById(df_ep, TEST_EID)
test_sentence_words = test_enh[2].split(' ')
print('{} \n'.format(test_enh))
print('Test sentence length = {}'.format(len(test_sentence_words)))

(981, 'GM12878|chr19:39930827-39930919', 'ACAAA CAAAT AAATG AATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAC TGACT GACTA ACTAC CTACA TACAG ACAGC CAGCT AGCTG GCTGC CTGCA TGCAT GCATG CATGT ATGTA TGTAA GTAAA TAAAT AAATA AATAG ATAGT TAGTG AGTGT GTGTT TGTTT GTTTA TTTAC TTACT TACTC ACTCT CTCTG TCTGT CTGTG TGTGC GTGCC TGCCA GCCAG CCAGG CAGGT AGGTA GGTAT GTATT TATTG ATTGG TTGGT TGGTT GGTTT GTTTA TTTAA TTAAA TAAAT AAATG AATGC ATGCT TGCTT GCTTT CTTTA TTTAA TTAAG TAAGT AAGTA AGTAT GTATG TATGT ATGTT TGTTA GTTAG TTAGC TAGCT AGCTT GCTTA CTTAT TTATT TATTT ATTTA TTTAC') 

Test sentence length = 89


# CREATE ADJACENCY MATRIX (NxN)

N = Number of nodes (enhancers + promoters)

In [5]:
# Build the symmetric NxN adjacency matrix: one undirected edge per
# enhancer-promoter row of df_ep.
# Entries are assigned one at a time, which is the inefficient path for a
# CSR matrix, so the matrix is filled in LIL format and converted to CSR
# once at the end.
n_nodes = len(id_dict)
adj_builder = sp.lil_matrix((n_nodes, n_nodes), dtype=np.int32)

for enh, pro in zip(df_ep['enhancer'], df_ep['promoter']):
    x, y = enh[0], pro[0]
    adj_builder[x, y] = 1
    adj_builder[y, x] = 1

adj = adj_builder.tocsr()

display(adj)

  self._set_intXint(row, col, x.flat[0])


<2668x2668 sparse matrix of type '<class 'numpy.int32'>'
	with 4226 stored elements in Compressed Sparse Row format>

In [6]:
# Column indices stored in row TEST_EID of `adj`, i.e. the nodes linked to the test enhancer.
print('Enhancer {} -> Promoter {}'.format(TEST_EID, adj[TEST_EID].indices))

Enhancer 981 -> Promoter [983]


# CREATE FEATURE VECTORS (NxD)

N = Number of nodes (enhancers + promoters)

D = Number of words in vocabulary (corpus)

In [7]:
# Distinct (id, name, sentence) tuples over both columns; tuples sort by
# their first element, the id.
merged_list = sorted(set(df_ep['enhancer']) | set(df_ep['promoter']))

# `corpus` is indexed by node id below (corpus[TEST_EID]), so position i
# must hold node i. Fail loudly if a node shows up with two different
# tuples or an id is missing, rather than silently shifting every later row.
assert [t[0] for t in merged_list] == list(range(len(merged_list))), \
    'node ids are not exactly 0..N-1 after merging; corpus rows would not line up with node ids'

corpus = [t[2] for t in merged_list]

print('TEST FOR ENHANCER {} \n'.format(TEST_EID))
print('Length = {} \n'.format(len(corpus[TEST_EID].split(' '))))
print(corpus[TEST_EID])

TEST FOR ENHANCER 981 

Length = 89 

ACAAA CAAAT AAATG AATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAC TGACT GACTA ACTAC CTACA TACAG ACAGC CAGCT AGCTG GCTGC CTGCA TGCAT GCATG CATGT ATGTA TGTAA GTAAA TAAAT AAATA AATAG ATAGT TAGTG AGTGT GTGTT TGTTT GTTTA TTTAC TTACT TACTC ACTCT CTCTG TCTGT CTGTG TGTGC GTGCC TGCCA GCCAG CCAGG CAGGT AGGTA GGTAT GTATT TATTG ATTGG TTGGT TGGTT GGTTT GTTTA TTTAA TTAAA TAAAT AAATG AATGC ATGCT TGCTT GCTTT CTTTA TTTAA TTAAG TAAGT AAGTA AGTAT GTATG TATGT ATGTT TGTTA GTTAG TTAGC TAGCT AGCTT GCTTA CTTAT TTATT TATTT ATTTA TTTAC


In [8]:
# Fit TF-IDF (default settings) on the sentences; row i of `features`
# is the vector for corpus[i].
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(corpus)

In [9]:
# Show the summary of the feature matrix.
display(features)

<2668x1024 sparse matrix of type '<class 'numpy.float64'>'
	with 1113047 stored elements in Compressed Sparse Row format>

# Test feature vectors

In [10]:
print('TESTS FOR NODE {} \n'.format(TEST_EID))

# Vocabulary indices stored for the test node; their order is not sorted.
vector = list(features[TEST_EID].indices)
print('{} \n'.format(vector))

# Fetch the same node from the dataframe and take its sentence.
test_enh = getNodeById(df_ep, TEST_EID)
sent = test_enh[2]

# Map each word to its vocabulary index. The vocabulary keys are
# lowercased (e.g. aactg), hence lower().
tokens = [vectorizer.vocabulary_[word.lower()] for word in sent.split(' ')]
# The feature vector holds each token once, so drop the duplicates.
tokens = list(set(tokens))

print('{} \n'.format(tokens))
print('Number of unique words in sentence: {} vs {}'.format(len(vector), len(tokens)))
print('Comparison of sets of unique words: {} \n'.format(sorted(vector) == sorted(tokens)))

print(features[TEST_EID])

TESTS FOR NODE 981 

[225, 691, 231, 718, 830, 236, 73, 172, 590, 903, 315, 807, 786, 119, 779, 568, 719, 827, 967, 814, 239, 764, 927, 499, 754, 203, 50, 478, 44, 959, 187, 751, 975, 956, 1003, 250, 57, 910, 227, 56, 969, 295, 540, 916, 741, 113, 158, 299, 14, 704, 915, 484, 330, 953, 703, 943, 633, 179, 508, 639, 1009, 252, 831, 12, 259, 797, 636, 64, 452, 962, 771, 960, 1008, 944, 594, 494, 891, 159] 

[12, 14, 540, 44, 50, 568, 56, 57, 64, 73, 590, 594, 113, 119, 633, 636, 639, 158, 159, 172, 691, 179, 187, 703, 704, 203, 718, 719, 225, 227, 741, 231, 236, 751, 239, 754, 250, 764, 252, 259, 771, 779, 786, 797, 295, 807, 299, 814, 315, 827, 830, 831, 330, 891, 903, 910, 915, 916, 927, 943, 944, 953, 956, 959, 960, 962, 452, 967, 969, 975, 478, 484, 1003, 494, 1008, 1009, 499, 508] 

Number of unique words in sentence: 78 vs 78
Comparison of sets of unique words: True 

  (0, 225)	0.09889583756087965
  (0, 691)	0.10906175037204857
  (0, 231)	0.09277112814377503
  (0, 718)	0.111881297

In [11]:
# Spot-check five randomly chosen enhancers: the number of stored feature
# columns must equal the number of distinct words in the node's sentence.
random.seed(SEED)

# randrange excludes its upper bound. randint(0, len(df_ep)) includes it
# and could therefore pick row len(df_ep), which does not exist.
sampled_rows = [random.randrange(len(df_ep)) for _ in range(5)]
test_eid_list = [df_ep['enhancer'].iloc[row][0] for row in sampled_rows]

for test_eid in test_eid_list:
    n_feature_words = len(features[test_eid].indices)

    test_enh = getNodeById(df_ep, test_eid)
    sent = test_enh[2]

    n_sentence_words = len(set(sent.split(' ')))
    print('Node {} number of unique words {} =? {}'.format(test_eid, n_feature_words, n_sentence_words))

Node 981 number of unique words 78 =? 78
Node 124 number of unique words 166 =? 166
Node 1196 number of unique words 147 =? 147
Node 2540 number of unique words 643 =? 643
Node 2295 number of unique words 99 =? 99


# CREATE BINARY LABEL MATRIX (NxE)

N = Number of nodes (enhancers + promoters)

E = Number of classes

In [12]:
# One-hot class rows: column 0 marks enhancers, column 1 marks promoters.
ENHANCER_CLASS = [1, 0]
PROMOTER_CLASS = [0, 1]

# int8 is an 8-bit signed integer (-128 to 127).
labels = np.zeros(shape=(len(id_dict), 2), dtype=np.int8)

# Walk the pairs in row order so a later row overwrites an earlier one.
for enh, pro in zip(df_ep['enhancer'], df_ep['promoter']):
    labels[enh[0]] = ENHANCER_CLASS
    labels[pro[0]] = PROMOTER_CLASS

display(labels)

array([[0, 1],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [0, 1]], dtype=int8)

# TRAIN TEST VALIDATION SPLIT

**Label rate = 0.2** (the number of labeled nodes that are used for training divided by the total number of nodes in dataset)

**20%** labeled training (x), **40%** unlabeled training (ux), **20%** validation (vx), **20%** test (tx)

allx = x + ux + vx

In [13]:
def getIdPortions(id_dict):
    """Split the node ids into labeled-train, unlabeled-train, validation and test lists.

    The split is done in three seeded steps (random_state=SEED):
    20% of all ids go to the test list; the remainder is divided into a
    labeled part and an unlabeled part according to LABEL_RATE; the
    labeled part is then halved into training and validation.

    Parameters
    ----------
    id_dict : dict
        Mapping whose values are the node ids to split.

    Returns
    -------
    tuple of four lists
        (labeled train ids, unlabeled train ids, validation ids, test ids)
    """
    all_ids = list(id_dict.values())

    # Step 1: hold out the test ids.
    train_val_ids, test_ids = train_test_split(all_ids, test_size=0.2, random_state=SEED)

    # Step 2: share of the remaining 80% that stays unlabeled.
    unlabeled_share = 1-LABEL_RATE*2/0.8
    labeled_ids, unlabeled_ids = train_test_split(
        train_val_ids, test_size=unlabeled_share, random_state=SEED)

    # Step 3: halve the labeled ids into training and validation.
    train_ids, val_ids = train_test_split(labeled_ids, test_size=0.5, random_state=SEED)

    return train_ids, unlabeled_ids, val_ids, test_ids

In [14]:
# Split the node ids and report the size of each portion
# (printed order: labeled, validation, test, unlabeled).
idx_x, idx_ux, idx_vx, idx_tx = getIdPortions(id_dict)

print(' {} labeled training \n {} validation \n {} test \n{} unlabeled training'
      .format(len(idx_x), len(idx_vx), len(idx_tx), len(idx_ux)))

 533 labeled training 
 534 validation 
 534 test 
1067 unlabeled training


## DUMP INDEX FILES

In [15]:
# Pickle each index list to '<CELL_LINE>/<portion>.index'.
# A single loop replaces four copy-pasted stanzas, and the context manager
# closes each file even if pickling fails.
index_portions = {'x': idx_x, 'ux': idx_ux, 'vx': idx_vx, 'tx': idx_tx}

for portion, indices in index_portions.items():
    with open('{}/{}.index'.format(CELL_LINE, portion), "wb") as index_file:
        pkl.dump(indices, index_file)

## DUMP FEATURE VECTORS & CLASS LABELS

In [16]:
# Pickle the feature matrix and the label matrix next to the index files.
# The context managers close the files even if pickling fails.
with open('{}/features'.format(CELL_LINE), "wb") as features_file:
    pkl.dump(features, features_file)

with open('{}/labels'.format(CELL_LINE), "wb") as labels_file:
    pkl.dump(labels, labels_file)

## DUMP GRAPH EDGES

In [17]:
# Adjacency list: row number -> column indices of the nonzero entries in that row of `adj`.
graph = {i: np.nonzero(row)[1].tolist() for i,row in enumerate(adj)}

# Print first k elements of graph
# A bare expression in the middle of a cell is evaluated and discarded,
# so it has to be displayed explicitly.
display({k: graph[k] for k in list(graph)[:15]})

# The context manager closes the file even if pickling fails.
with open('{}/graph'.format(CELL_LINE), "wb") as graph_file:
    pkl.dump(graph, graph_file)

## LOAD INDICES

In [18]:
def load_indices(cell_line):
    """Load the four pickled index lists stored under `cell_line`.

    Reads '<cell_line>/x.index', 'ux.index', 'vx.index' and 'tx.index'.

    Parameters
    ----------
    cell_line : str
        Directory that holds the '*.index' files.

    Returns
    -------
    tuple
        (x, ux, vx, tx) objects, in that order, exactly as unpickled.

    Raises
    ------
    OSError
        If one of the files cannot be opened.

    Notes
    -----
    pickle.load can execute arbitrary code; only use this on files
    written by this notebook or another trusted source.
    """
    loaded = []
    for portion in ('x', 'ux', 'vx', 'tx'):
        # The context manager closes the file even if unpickling fails.
        with open('{}/{}.index'.format(cell_line, portion), "rb") as index_file:
            loaded.append(pkl.load(index_file))

    return tuple(loaded)

In [19]:
# Reload the index lists from disk and show the first five ids and the length of each.
loaded_idx_x, loaded_idx_ux, loaded_idx_vx, loaded_idx_tx = load_indices(CELL_LINE)

print('Labeled train indices:\t\t{}\tLength = {}'.format(loaded_idx_x[:5], len(loaded_idx_x)))
print('Unlabeled train indices:\t{}\tLength = {}'.format(loaded_idx_ux[:5], len(loaded_idx_ux)))
print('Validation indices:\t\t{}\tLength = {}'.format(loaded_idx_vx[:5], len(loaded_idx_vx)))
print('Test indices:\t\t\t{}\tLength = {}\n'.format(loaded_idx_tx[:5], len(loaded_idx_tx)))

Labeled train indices:		[2625, 2472, 1978, 1248, 1877]	Length = 533
Unlabeled train indices:	[374, 206, 1348, 2556, 2088]	Length = 1067
Validation indices:		[687, 1191, 1516, 1246, 1054]	Length = 534
Test indices:			[298, 1809, 1575, 2448, 929]	Length = 534



## LOAD FEATURE VECTORS & CLASS LABELS & GRAPH

In [20]:
def load_features_labels_graph(cell_line):
    """Load the pickled features, labels and graph stored under `cell_line`.

    Reads '<cell_line>/features', '<cell_line>/labels' and
    '<cell_line>/graph'. The graph (a dict of adjacency lists) is turned
    into an adjacency matrix with networkx.

    Parameters
    ----------
    cell_line : str
        Directory that holds the three files.

    Returns
    -------
    tuple
        (features, labels, adjacency matrix)

    Raises
    ------
    OSError
        If one of the files cannot be opened.

    Notes
    -----
    pickle.load can execute arbitrary code; only use this on files
    written by this notebook or another trusted source.
    """
    # The context managers close each file even if unpickling fails.
    with open('{}/features'.format(cell_line), "rb") as features_file:
        loaded_features = pkl.load(features_file)

    with open('{}/labels'.format(cell_line), "rb") as labels_file:
        loaded_labels = pkl.load(labels_file)

    with open('{}/graph'.format(cell_line), "rb") as graph_file:
        loaded_graph = pkl.load(graph_file)

    # NOTE(review): the matrix row order follows the graph's node order,
    # presumably the key order of the pickled dict - confirm before relying on it.
    loaded_adj = nx.adjacency_matrix(nx.from_dict_of_lists(loaded_graph))

    return loaded_features, loaded_labels, loaded_adj

In [21]:
# Reload features, labels and the adjacency matrix from the files under CELL_LINE.
loaded_features, loaded_labels, loaded_adj = load_features_labels_graph(CELL_LINE)

## SELECT SUBSETS OF FEATURES & LABELS

In [22]:
# Slice the loaded feature and label matrices with each index list.
loaded_x, loaded_y = loaded_features[loaded_idx_x], loaded_labels[loaded_idx_x]
loaded_ux, loaded_uy = loaded_features[loaded_idx_ux], loaded_labels[loaded_idx_ux]
loaded_vx, loaded_vy = loaded_features[loaded_idx_vx], loaded_labels[loaded_idx_vx]
loaded_tx, loaded_ty = loaded_features[loaded_idx_tx], loaded_labels[loaded_idx_tx]

# Report the shape of every subset: features first, then labels.
feature_report = [
    ('Labeled train features:\t\t{}', loaded_x),
    ('Unlabeled train features:\t{}', loaded_ux),
    ('Validation features:\t\t{}', loaded_vx),
    ('Test features:\t\t\t{}\n', loaded_tx),
]
label_report = [
    ('Labeled train labels:\t\t{}', loaded_y),
    ('Unlabeled train labels:\t\t{}', loaded_uy),
    ('Validation labels:\t\t{}', loaded_vy),
    ('Test labels:\t\t\t{}', loaded_ty),
]
for template, subset in feature_report + label_report:
    print(template.format(subset.shape))

Labeled train features:		(533, 1024)
Unlabeled train features:	(1067, 1024)
Validation features:		(534, 1024)
Test features:			(534, 1024)

Labeled train labels:		(533, 2)
Unlabeled train labels:		(1067, 2)
Validation labels:		(534, 2)
Test labels:			(534, 2)


## TESTS FOR LOADED DATA

In [23]:
# Cross-check one unlabeled-training node: the distinct words of its
# sentence in df_ep versus the stored columns of its reloaded feature row.
node_index = 0
node_id = loaded_idx_ux[node_index]

print('Number of unique words for node {}'.format(node_id))

df_words = set(getNodeById(df_ep, node_id)[2].split(' '))
print('Actual dataframe -> {}'.format(len(df_words)))

feature_columns = loaded_ux[node_index].indices
print('Loaded features  -> {}'.format(len(feature_columns)))

Number of unique words for node 374
Actual dataframe -> 808
Loaded features  -> 808


In [24]:
# Summary of the adjacency matrix rebuilt from the pickled graph.
display(loaded_adj)

<2668x2668 sparse matrix of type '<class 'numpy.longlong'>'
	with 4226 stored elements in Compressed Sparse Row format>

In [25]:
# Summary of the in-memory adjacency matrix, for comparison with the reloaded one.
display(adj)

<2668x2668 sparse matrix of type '<class 'numpy.int32'>'
	with 4226 stored elements in Compressed Sparse Row format>

In [26]:
# Dense element-wise comparison of the in-memory and the reloaded adjacency matrices.
# toarray() is used instead of the `.A` shorthand, which SciPy sparse arrays
# (returned by newer networkx versions) do not provide.
print(np.allclose(adj.toarray(), loaded_adj.toarray()))

True


In [27]:
# Neighbours of the test enhancer according to the in-memory adjacency matrix.
print('Enhancer {} -> Promoter {}'.format(TEST_EID, adj[TEST_EID].indices))

Enhancer 981 -> Promoter [983]


In [28]:
# Neighbours of the test enhancer according to the reloaded adjacency matrix.
print('Enhancer {} -> Promoter {}'.format(TEST_EID, loaded_adj[TEST_EID].indices))

Enhancer 981 -> Promoter [983]


In [29]:
# Stored entries of the first 10 rows of the in-memory adjacency matrix.
print(adj[:10])

  (0, 1)	1
  (1, 0)	1
  (2, 4)	1
  (2, 5)	1
  (3, 83)	1
  (3, 84)	1
  (4, 2)	1
  (5, 2)	1
  (6, 7)	1
  (6, 8)	1
  (6, 9)	1
  (6, 10)	1
  (6, 11)	1
  (6, 12)	1
  (7, 6)	1
  (8, 6)	1
  (9, 6)	1


In [30]:
# Stored entries of the first 10 rows of the reloaded adjacency matrix.
print(loaded_adj[:10])

  (0, 1)	1
  (1, 0)	1
  (2, 4)	1
  (2, 5)	1
  (3, 83)	1
  (3, 84)	1
  (4, 2)	1
  (5, 2)	1
  (6, 7)	1
  (6, 8)	1
  (6, 9)	1
  (6, 10)	1
  (6, 11)	1
  (6, 12)	1
  (7, 6)	1
  (8, 6)	1
  (9, 6)	1
