In [25]:
import sys
import random
import numpy as np
import pandas as pd
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# --- Notebook-wide configuration ---------------------------------------------
SEED = 42  # seed handed to Python's built-in `random` module below
CELL_LINE = 'GM12878'  # sub-directory of data/ that the loading code reads from
K_MER = 5  # presumably the k-mer length for sequence features -- usage not shown here
TEST_EID = 981  # arbitrary enhancer id to be used in tests

# Only the stdlib RNG is seeded here; NumPy's global RNG is left untouched.
random.seed(SEED)

In [13]:
def sample_mask(idx, l):
    """Build a boolean mask of length ``l`` that is True at positions ``idx``.

    Args:
        idx: Index or collection of indices (anything NumPy accepts for
            integer indexing) whose positions are switched on.
        l: Total length of the mask.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``bool`` holding ``True`` at ``idx``
        and ``False`` everywhere else.
    """
    # The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the supported spelling and yields the same dtype.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

In [20]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, always closing the file."""
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # data files that come from a trusted source.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def load_data(dataset_name):
    """Load the graph, features and masked label matrices for one dataset.

    Reads the pickled files ``features``, ``labels`` and ``graph`` plus the
    node-ID files ``x.index`` (labeled training nodes), ``vx.index``
    (validation nodes) and ``tx.index`` (test nodes) from
    ``data/<dataset_name>/``.

    The unlabeled-training IDs (``ux.index``) are not read: nothing returned
    by this function depends on them.

    Args:
        dataset_name: Name of the sub-directory of ``data/`` to read from.

    Returns:
        A tuple ``(adj, features, y_train, y_val, y_test, train_mask,
        val_mask, test_mask)``:

        * ``adj`` -- adjacency matrix built by networkx from the pickled
          dict-of-lists graph.
        * ``features`` -- the unpickled feature object, returned unchanged.
        * ``y_train`` / ``y_val`` / ``y_test`` -- zero arrays with the shape
          of the label matrix, filled with the true labels only on the rows
          selected by the matching mask.
        * ``train_mask`` / ``val_mask`` / ``test_mask`` -- boolean arrays with
          one entry per label row.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    def read(name):
        # Every input lives directly under data/<dataset_name>/.
        return _load_pickle('data/{}/{}'.format(dataset_name, name))

    # STEP 1: Load all feature vectors, class labels and graph
    features = read('features')
    labels = read('labels')
    graph = read('graph')
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    # STEP 2: Load IDs of labeled_train/validation/test nodes
    idx_train = read('x.index')
    idx_val = read('vx.index')
    idx_test = read('tx.index')

    # STEP 3: Build one boolean mask per split, one entry per label row
    n_nodes = labels.shape[0]
    train_mask = sample_mask(idx_train, n_nodes)
    val_mask = sample_mask(idx_val, n_nodes)
    test_mask = sample_mask(idx_test, n_nodes)

    # STEP 4: Mask labels -- rows outside a split stay all-zero
    # (labels is indexed as [mask, :], so it is expected to be 2-D)
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

In [26]:
# Load the graph, features and per-split labels/masks for the configured cell line.
(
    adj,
    features,
    y_train,
    y_val,
    y_test,
    train_mask,
    val_mask,
    test_mask,
) = load_data(CELL_LINE)