# GCN data requirements:

https://github.com/kimiyoung/planetoid#prepare-the-data

https://github.com/tkipf/gcn#data

|  |  |
| :-- | :-- |
| ind.GM12878.x | the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object |
| ind.GM12878.tx | the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object |
| ind.GM12878.allx | the feature vectors of both labeled and unlabeled training instances (a superset of ind.GM12878.x) as scipy.sparse.csr.csr_matrix object |
| ind.GM12878.y | the one-hot labels of the labeled training instances as numpy.ndarray object |
| ind.GM12878.ty | the one-hot labels of the test instances as numpy.ndarray object |
| ind.GM12878.ally | the labels for instances in ind.GM12878.allx as numpy.ndarray object |
| ind.GM12878.graph | a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object |
| ind.GM12878.test.index | the indices of test instances in graph, for the inductive setting as list object |

**All objects above must be saved using python pickle module.**

In [1]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import sys
import pandas as pd
from scipy.sparse.linalg.eigen.arpack import eigsh
from sklearn.feature_extraction.text import TfidfVectorizer

CELL_LINE = 'GM12878'
K_MER = 5
TEST_EID = 981 # arbitrary enhancer id to be used tests

## Create a new DF where each element is a tuple of 3 elements: (id, name, sequence)

In [2]:
def getEnhancerById(df_ep, eid):
    for row in range(len(df_ep)):
        enh = df_ep['enhancer'][row]
        if enh[0] == eid:
            return enh

In [3]:
df_ep_sentences = pd.read_csv('data/' + CELL_LINE + '/ep_sentences_' + str(K_MER) + 'mer.csv')

e_list = []
p_list = []

for i in range(len(df_ep_sentences)):
    e_list.append( (df_ep_sentences['enhancer_name'][i], df_ep_sentences['enhancer_sentence'][i]) )
    p_list.append( (df_ep_sentences['promoter_name'][i], df_ep_sentences['promoter_sentence'][i]) )

x_list = sorted(list(set(list(df_ep_sentences['enhancer_name']) + list(df_ep_sentences['promoter_name']))))

id_dict = {}
chr_id = 0
for x in x_list:
    id_dict[x] = chr_id
    chr_id += 1
    
#print(len(id_dict), 'keys in dictionary\n')

for i in range(len(e_list)):
    e_list[i] = (id_dict[e_list[i][0]], ) + e_list[i]
    
for i in range(len(p_list)):
    p_list[i] = (id_dict[p_list[i][0]], ) + p_list[i]

df_ep = pd.DataFrame({'enhancer': e_list, 'promoter': p_list})
df_ep.head()

Unnamed: 0,enhancer,promoter
0,"(1265, GM12878|chr1:9685722-9686400, TGACA GAC...","(1266, GM12878|chr1:9747084-9749721, TTTTG TTT..."
1,"(1199, GM12878|chr1:24136556-24136600, GTGGC T...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
2,"(1200, GM12878|chr1:24136600-24136932, GAAAC A...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
3,"(1201, GM12878|chr1:24137625-24137875, GTGCC T...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."
4,"(1202, GM12878|chr1:24139145-24139414, GCCCA C...","(1205, GM12878|chr1:24193468-24194871, TGAAT G..."


In [4]:
test_enh = getEnhancerById(df_ep, TEST_EID)
print(test_enh, '\n')
print('Test sentence length =', len(test_enh[2].split(' ')))

(981, 'GM12878|chr19:39930827-39930919', 'ACAAA CAAAT AAATG AATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAC TGACT GACTA ACTAC CTACA TACAG ACAGC CAGCT AGCTG GCTGC CTGCA TGCAT GCATG CATGT ATGTA TGTAA GTAAA TAAAT AAATA AATAG ATAGT TAGTG AGTGT GTGTT TGTTT GTTTA TTTAC TTACT TACTC ACTCT CTCTG TCTGT CTGTG TGTGC GTGCC TGCCA GCCAG CCAGG CAGGT AGGTA GGTAT GTATT TATTG ATTGG TTGGT TGGTT GGTTT GTTTA TTTAA TTAAA TAAAT AAATG AATGC ATGCT TGCTT GCTTT CTTTA TTTAA TTAAG TAAGT AAGTA AGTAT GTATG TATGT ATGTT TGTTA GTTAG TTAGC TAGCT AGCTT GCTTA CTTAT TTATT TATTT ATTTA TTTAC') 

Test sentence length = 89


# CREATE ADJACENCY MATRIX (NxN)

N = Number of nodes (enhancers + promoters)

In [5]:
from scipy.sparse import csr_matrix

adj = csr_matrix((len(id_dict), len(id_dict)), dtype=np.longlong)

for i in range(len(df_ep)):
    x = df_ep['enhancer'][i][0]
    y = df_ep['promoter'][i][0]
    adj[x,y] = 1
    adj[y,x] = 1

adj

  self._set_intXint(row, col, x.flat[0])


<2668x2668 sparse matrix of type '<class 'numpy.longlong'>'
	with 4226 stored elements in Compressed Sparse Row format>

In [7]:
print('Enhancer', TEST_EID, '-> Promoter', adj[TEST_EID].indices[0])

Enhancer 981 -> Promoter 983


# CREATE FEATURE VECTORS (NxD)

N = Number of nodes (enhancers + promoters)

D = Number of words in vocabulary (corpus)

In [8]:
merged_list = list(set(list(df_ep['enhancer']) + list(df_ep['promoter'])))
merged_list = sorted(merged_list) # sort by first element (id)

corpus = []
for t in merged_list:
    corpus.append(t[2])

print('TEST FOR ENHANCER', TEST_EID, '\n')
print('Length =', len(corpus[TEST_EID].split(' ')), '\n')
print(corpus[TEST_EID])

TEST FOR ENHANCER 981 

Length = 89 

ACAAA CAAAT AAATG AATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAT TGATG GATGA ATGAC TGACT GACTA ACTAC CTACA TACAG ACAGC CAGCT AGCTG GCTGC CTGCA TGCAT GCATG CATGT ATGTA TGTAA GTAAA TAAAT AAATA AATAG ATAGT TAGTG AGTGT GTGTT TGTTT GTTTA TTTAC TTACT TACTC ACTCT CTCTG TCTGT CTGTG TGTGC GTGCC TGCCA GCCAG CCAGG CAGGT AGGTA GGTAT GTATT TATTG ATTGG TTGGT TGGTT GGTTT GTTTA TTTAA TTAAA TAAAT AAATG AATGC ATGCT TGCTT GCTTT CTTTA TTTAA TTAAG TAAGT AAGTA AGTAT GTATG TATGT ATGTT TGTTA GTTAG TTAGC TAGCT AGCTT GCTTA CTTAT TTATT TATTT ATTTA TTTAC


In [10]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(corpus)

# Test feature vectors

In [12]:
print('TESTS FOR ENHANCER', TEST_EID, '\n')

vector = list(features[TEST_EID].indices)
print(vector, '\n') # vector tokens are not ordered

# Get sentence with id = test_id from df
test_enh = getEnhancerById(df_ep, TEST_EID)
sent = test_enh[2]

tokens = []
for word in sent.split(' '):
    tokens.append(vectorizer.vocabulary_[word.lower()])
    # used lower() because vectorizer keys are lowercased (e.g. aactg)

tokens = list(set(tokens)) # since vector does not have duplicated tokens
print(tokens, '\n')

print('Number of unique words in sentence:', len(vector), "vs", len(tokens))
print('Comparison of sets of unique words:', sorted(vector) == sorted(tokens), '\n')

print(features[TEST_EID])

TESTS FOR ENHANCER 981 

[225, 691, 231, 718, 830, 236, 73, 172, 590, 903, 315, 807, 786, 119, 779, 568, 719, 827, 967, 814, 239, 764, 927, 499, 754, 203, 50, 478, 44, 959, 187, 751, 975, 956, 1003, 250, 57, 910, 227, 56, 969, 295, 540, 916, 741, 113, 158, 299, 14, 704, 915, 484, 330, 953, 703, 943, 633, 179, 508, 639, 1009, 252, 831, 12, 259, 797, 636, 64, 452, 962, 771, 960, 1008, 944, 594, 494, 891, 159] 

[12, 14, 540, 44, 50, 568, 56, 57, 64, 73, 590, 594, 113, 119, 633, 636, 639, 158, 159, 172, 691, 179, 187, 703, 704, 203, 718, 719, 225, 227, 741, 231, 236, 751, 239, 754, 250, 764, 252, 259, 771, 779, 786, 797, 295, 807, 299, 814, 315, 827, 830, 831, 330, 891, 903, 910, 915, 916, 927, 943, 944, 953, 956, 959, 960, 962, 452, 967, 969, 975, 478, 484, 1003, 494, 1008, 1009, 499, 508] 

Number of unique words in sentence: 78 vs 78
Comparison of sets of unique words: True 

  (0, 225)	0.09889583756087965
  (0, 691)	0.10906175037204857
  (0, 231)	0.09277112814377503
  (0, 718)	0.11188

In [13]:
import random
test_eid_list = []

for row in [random.randint(0, len(df_ep)) for i in range(5)]:
    test_eid_list.append(df_ep['enhancer'][row][0])

for test_eid in test_eid_list:
    L1 = len(features[test_eid].indices)

    test_enh = getEnhancerById(df_ep, test_eid)
    sent = test_enh[2]
    
    L2 = len(set(sent.split(' ')))
    print('ENHANCER', test_eid, 'number of unique words', L1, '=?', L2)

ENHANCER 384 number of unique words 281 =? 281
ENHANCER 1204 number of unique words 102 =? 102
ENHANCER 1145 number of unique words 19 =? 19
ENHANCER 2040 number of unique words 317 =? 317
ENHANCER 335 number of unique words 82 =? 82


# CREATE BINARY LABEL MATRIX (NxE)

N = Number of nodes (enhancers + promoters)

E = Number of classes

In [25]:
labels = np.zeros(shape=(len(id_dict),2), dtype=float)

for i in range(len(df_ep)):
    eid = df_ep['enhancer'][i][0]
    pid = df_ep['promoter'][i][0]
    labels[eid] = [1,0] # enhancer class
    labels[pid] = [0,1] # promoter class

labels

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [26]:
# WRITE FILES
# ind.GM12878.allx = features
# ind.GM12878.ally = labels
# ind.GM12878.graph = adj