# Convert data




In [1]:
pip install utils



In [2]:
# Instalar las versiones específicas de las librerías
!pip install numpy
!pip install scipy
!pip install scikit-learn
!pip install pandas
!pip install tensorflow
!pip install networkx
!pip install numba



In [3]:
import random
import scipy.sparse as sp

import pickle
from sklearn.preprocessing import scale
from utils import *

# Create Data Loader
# Load expression data file of shape E * N where N is number of genes and E is number of experiments
def load_data(datafile, normalize=True):
    """
    This function loads data set
    :param datafile:
    :return expression data:
    """
    # Load data file
    df = pd.read_csv(datafile, sep='\t', header=0)
    df.columns = [int(x[1:]) - 1 for x in df.columns]
    if normalize==True:
        df = pd.DataFrame(scale(df, axis=0))
    t_data = df.T
    return (t_data)

def create_train_test_split(path, adj, test_size=0.1, validation_size=0.1, save_to_file=True):
    print("Creating train test and validation split")
    g = nx.Graph(adj)

    # Use the updated function name
    adj = nx.to_scipy_sparse_array(g)

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    # Split link information to train and validation with test split size
    edgelist = convertAdjMatrixToSortedRankTSV(adj.todense())
    geneids = edgelist.iloc[:, :2]
    col1 = np.array(geneids).min(axis=1).astype(int)
    col2 = np.array(geneids).max(axis=1).astype(int)
    col3 = np.array(edgelist.iloc[:, 2])
    data_df = pd.DataFrame()
    data_df['i'] = col1
    data_df['j'] = col2
    data_df['k'] = col3
    data_df = data_df.drop_duplicates()

    pos_edges = data_df.loc[data_df.iloc[:, 2] == 1]
    neg_edgelist = data_df.loc[data_df.iloc[:, 2] == 0]
    ind = random.sample(range(len(neg_edgelist)), pos_edges.shape[0])
    neg_edges = pd.DataFrame(np.random.permutation(neg_edgelist.values))
    neg_edges = neg_edges.iloc[ind, :]

    X_pos, test_edges = train_test_split(pos_edges.values, test_size=test_size)
    X_neg, test_edges_false = train_test_split(neg_edges.values, test_size=test_size)

    train_edges, val_edges = train_test_split(X_pos, test_size=validation_size)
    train_edges_false, val_edges_false = train_test_split(X_neg, test_size=validation_size)

    assert set(map(tuple, test_edges_false)).isdisjoint(set(map(tuple, train_edges)))
    assert set(map(tuple, val_edges_false)).isdisjoint(set(map(tuple, train_edges)))
    assert set(map(tuple, train_edges_false)).isdisjoint(set(map(tuple, train_edges)))

    # assert: test, val, train false edges disjoint
    assert set(map(tuple, test_edges_false)).isdisjoint(set(map(tuple, val_edges_false)))
    assert set(map(tuple, test_edges_false)).isdisjoint(set(map(tuple, train_edges_false)))
    assert set(map(tuple, val_edges_false)).isdisjoint(set(map(tuple, train_edges_false)))

    # assert: test, val, train positive edges disjoint
    assert set(map(tuple, val_edges)).isdisjoint(set(map(tuple, train_edges)))
    assert set(map(tuple, test_edges)).isdisjoint(set(map(tuple, train_edges)))
    assert set(map(tuple, val_edges)).isdisjoint(set(map(tuple, test_edges)))

    dataset = {
        'train_pos': train_edges,
        'train_neg': train_edges_false,
        'val_pos': val_edges,
        'val_neg': val_edges_false,
        'test_pos': test_edges,
        'test_neg': test_edges_false
    }

    if save_to_file:
        with open(path + "/split_data_" + str(round(1.0 - validation_size, 2)) + ".pkl", 'wb') as test_split_file:
            pickle.dump(dataset, test_split_file)

    return dataset

# Evaluation

In [4]:
import numpy as np
from numba import jit

from sklearn.metrics import average_precision_score, roc_auc_score

def sigmoid(x):
    x = np.clip(x, -709, 709)
    return 1 / (1 + np.exp(-x))

# Evaluate ROC using predicted matrix
def evaluate_ROC_from_matrix(X_edges, y_true, matrix):
    y_predict = [sigmoid(matrix[int(edge[0]), int(edge[1])]) for edge in X_edges]
    roc = roc_auc_score(y_true, y_predict)
    if roc < 0.5:
        roc = 1 - roc
    pr = average_precision_score(y_true, y_predict)
    return roc, pr

# Load the embedding generated by other methods:LINE, Node2vec
def load_embedding(embedding_file, N, combineAttribute=False, datafile=None):
    f = open(embedding_file)
    i = 0
    line = f.readline()
    line = line.strip().split(' ')
    d = int(line[1])
    embeddings = np.random.randn(int(N), d)
    line = f.readline()
    while line:
        line = line.strip().split(' ')
        embeddings[int(line[0]),:] = line[1:]
        i = i + 1
        line = f.readline()
    f.close()
    if combineAttribute:
        data = load_datafile(datafile, N)
        # print(data.shape)
        temp = np.hstack((embeddings, data))
        # print(temp.shape)
        embeddings = temp
    return embeddings

# Load the expression data
def load_datafile(data_file, N):
    f = open(data_file)
    i = 0
    line = f.readline()
    line = line.strip().split(' ')
    d = len(line[1:])
    data = np.zeros([int(N), d])
    while line:
        # print(i)
        data[int(line[0]),:] = line[1:]
        i = i + 1
        line = f.readline()
        if i < N:
            line = line.strip().split(' ')
        else:
            break
    f.close()
    return data

@jit(nopython=True)
def get_edge_embeddings(Embeddings, edge_list):
    embs = np.empty((len(edge_list), Embeddings.shape[1]), dtype=Embeddings.dtype)
    for i in range(len(edge_list)):
        edge = edge_list[i]
        node1 = int(edge[0])
        node2 = int(edge[1])
        emb1 = Embeddings[node1]
        emb2 = Embeddings[node2]
        edge_emb = emb1 * emb2  # Element-wise multiplication
        embs[i] = edge_emb
    return embs


# Modelo GNE

In [5]:
import pandas as pd
import random
import os
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
from numba import njit

class GNE(BaseEstimator, TransformerMixin):
    def __init__(self, path, data, random_seed=2018, parameters=None):
        # bind params to class
        # bind data to class
        self.path = path
        self.nodes = data.nodes
        self.node_neighbors_map = data.node_neighbors_map
        self.node_N = data.id_N
        self.attr_M = data.attr_M
        self.X_train = data.X

        # bind model parameters to class
        self.id_embedding_size = parameters['id_embedding_size']
        self.attr_embedding_size = parameters['attr_embedding_size']
        self.batch_size = parameters['batch_size']
        self.alpha = parameters['alpha']
        self.n_neg_samples = parameters['n_neg_samples']
        self.epoch = parameters['epoch']
        self.random_seed = random_seed
        self.learning_rate = parameters['learning_rate']
        self.representation_size = parameters['representation_size']

        # init all variables in a tensorflow graph
        self._init_graph()
        print(parameters)

    def _init_graph(self):
        '''
        Init a tensorflow Graph containing: input data, variables, model, loss, optimizer
        '''
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Set graph level random seed
            tf.random.set_seed(self.random_seed)

            # Define a placeholder for input data
            self.train_data_id = tf.compat.v1.placeholder(tf.int32, shape=[None])  # batch_size * 1
            self.train_data_attr = tf.compat.v1.placeholder(tf.float32, shape=[None, self.attr_M])  # batch_size * attr_M
            self.train_labels = tf.compat.v1.placeholder(tf.int32, shape=[None, 1])  # batch_size * 1

            # Define placeholder for dropout
            self.keep_prob = tf.compat.v1.placeholder(tf.float32)

            # load initialzed variable.
            self.weights = self._initialize_weights()

            # Model.
            # Look up embeddings for node_id. u = ENC(node_id)
            self.id_embed = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.weights['in_embeddings'], self.train_data_id), 1)

            # non linear transformation of expression data
            self.attr_embed = tf.nn.l2_normalize(tf.nn.elu(tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])), 1)

            # Concatenation layer to concatenate structure and attribute
            self.embed_layer = tf.concat([self.id_embed, self.alpha * self.attr_embed], 1)

            # Non-linear transformation of concatenated representation
            self.representation_layer_dropout = tf.nn.dropout(self.embed_layer, self.keep_prob)
            self.representation_layer = tf.nn.tanh(tf.matmul(self.representation_layer_dropout, self.weights['hidden_weights']))

            # Compute the loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(self.weights['out_embeddings'], self.weights['biases'], self.train_labels, self.representation_layer, self.n_neg_samples, self.node_N))

            # Adam Optimizer.
            self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)

            # init
            init = tf.compat.v1.global_variables_initializer()
            self.sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True))

            self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()

        # Weights associated with structure embedding
        all_weights['in_embeddings'] = tf.Variable(tf.random.uniform([self.node_N, self.id_embedding_size], -1.0, 1.0))  # id_N * id_dim

        # Weights associated with attribute embedding
        all_weights['attr_embeddings'] = tf.Variable(tf.random.normal([self.attr_M, int(self.attr_embedding_size)]))  # attr_M * attr_dim

        # Weights and biases associated with neighborhood embedding
        all_weights['out_embeddings'] = tf.Variable(tf.random.normal([self.node_N, self.representation_size]))
        all_weights['biases'] = tf.Variable(tf.zeros([self.node_N]))

        # Weights associated with hidden layer transformation
        all_weights['hidden_weights'] = tf.Variable(tf.random.normal([self.id_embedding_size + self.attr_embedding_size, self.representation_size]))  # attr_M *eattr_dim
        return all_weights

    def partial_fit(self, X):
        # Create a dictionary to feed to tensorflow graph
        feed_dict = {self.train_data_id: X['batch_data_id'],
                     self.train_data_attr: X['batch_data_attr'],
                     self.train_labels: X['batch_data_label'],
                     self.keep_prob: 0.5}

        # run the graph to compute loss
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def train(self, validation_edges, validation_labels):  # fit a dataset
        # Número de iteraciones ejecutadas
        total_iterations = 0

        # Mejor precisión de validación vista hasta ahora.
        best_validation_accuracy = 0.0

        # Número de iteraciones desde la última mejora en la precisión de validación.
        last_improvement = 0

        # Detener la optimización si no se encuentra mejora en este número de iteraciones.
        require_improvement = 2

        print('Using structure and attribute embedding')
        for epoch in range(self.epoch):
            # Fijar la semilla para aleatorizar la permutación en cada epoch
            random.seed(epoch)

            # Permutación aleatoria de los datos
            perm = np.random.permutation(len(self.X_train['data_id_list']))
            self.X_train['data_id_list'] = self.X_train['data_id_list'][perm]
            self.X_train['data_attr_list'] = self.X_train['data_attr_list'][perm]
            self.X_train['data_label_list'] = self.X_train['data_label_list'][perm]

            # Calcular el número de lotes usando batch_size
            total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)

            # Loop sobre todos los lotes
            total_iterations += 1
            avg_cost = 0.
            for i in range(total_batch):
                # Generar un lote de datos
                batch_xs = {}

                # Fijar la semilla para aleatorizar la permutación en cada lote dentro de cada epoch
                random.seed(epoch * i)
                start_index = np.random.randint(0, len(self.X_train['data_id_list']) - self.batch_size)
                batch_xs['batch_data_id'] = self.X_train['data_id_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_attr'] = self.X_train['data_attr_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]

                # Entrenar el modelo con el lote de datos y calcular la pérdida
                cost = self.partial_fit(batch_xs)
                avg_cost += cost / total_batch

            # Obtener embeddings del modelo entrenado
            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
            Embeddings = Embeddings_out + Embeddings_in

            # Test de predicción de enlaces
            adj_matrix_rec = np.dot(Embeddings, Embeddings.T)
            roc, pr = evaluate_ROC_from_matrix(validation_edges, validation_labels, adj_matrix_rec)

            attr_embeddings = self.getEmbedding('attribute', self.nodes)
            # Si la precisión de validación es una mejora sobre la mejor conocida.
            if roc > best_validation_accuracy:
                # Actualizar la mejor precisión de validación conocida.
                best_validation_accuracy = roc

                # Fijar la iteración para la última mejora a la actual.
                last_improvement = total_iterations

                # Guardar todas las variables del gráfico de TensorFlow en un archivo.
                self.embedding_checkpoints(Embeddings, "save", "all")
                self.embedding_checkpoints(attr_embeddings, "save", "attribute")

                # Una cadena para imprimir a continuación, muestra que se encontró una mejora.
                improved_str = '*'
            else:
                # Una cadena vacía para imprimir a continuación.
                # Muestra que no se encontró mejora.
                improved_str = ''

            # Mensaje de estado para imprimir.
            msg = "Epoch: {0:>6}, Train-Batch Loss: {1:.9f}, Validation AUC: {2:.9f} {3}"
            print(msg.format(epoch + 1, avg_cost, roc, improved_str))

            # Detención temprana: Si no se encuentra mejora en el número requerido de iteraciones, detener el entrenamiento del modelo
            if total_iterations - last_improvement > require_improvement:
                print("No improvement found in a while, stopping optimization.")
                # Romper el bucle for.
                break

        Embeddings = self.embedding_checkpoints(Embeddings, "restore", "all")
        attr_embeddings = self.embedding_checkpoints(attr_embeddings, "restore", "attribute")
        return Embeddings, attr_embeddings

    def getEmbedding(self, type, nodes):
        # Obtener el embedding
        if type == 'embed_layer':
            feed_dict = {self.train_data_id: nodes['node_id'], self.train_data_attr: nodes['node_attr'],
                         self.keep_prob: 1}
            Embedding = self.sess.run(self.representation_layer, feed_dict=feed_dict)
            return Embedding
        if type == 'out_embedding':
            Embedding = self.sess.run(self.weights['out_embeddings'])
            return Embedding
        if type == 'attribute':
            Embedding = self.sess.run(self.weights['attr_embeddings'])
            return Embedding
        if type == 'structure':
            Embedding = self.sess.run(self.weights['in_embeddings'])
            return Embedding

    def embedding_checkpoints(self, Embeddings, type, embedding_type="all"):
        file = self.path + "Embeddings_" + embedding_type + ".txt"
        if type == "save":
            if os.path.isfile(file):
                os.remove(file)
            pd.DataFrame(Embeddings).to_csv(file, index=False, header=False)
        if type == 'restore':
            Embeddings = pd.read_csv(file, header=None)
            return np.array(Embeddings)

# Ejemplo de uso de la función get_edge_embeddings
# pos_train_edge_embs = get_edge_embeddings(embeddings, train_edges)
# neg_train_edge_embs = get_edge_embeddings(embeddings, train_edges_false)
# train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])


# Load data

In [6]:
from numba import jit
import os

class LoadData( object ):
    '''given the path of data, return the data format for GNE
    :param path
    return:
     X: a dictionary, ['data_id_list']-- len(links) of id for nodes in links ;
                      ['data_attr_list']-- a list of attrs for corresponding nodes;
                     ['data_label_list']-- len(links) of neighbor for corresponding nodes

     nodes: a dictionary, ['node_id']--len(nodes) of id for nodes, one by one; ['node_attr']--a list of attrs for corresponding nodes
    '''

    # Three files are needed in the path
    def __init__(self, path, train_links, features_file,  normalize_features=True):
        self.path = path
        self.undirected = True
        # Define files to work with
        self.train_links = train_links
        self.datafile = features_file
        self.attrfile = self.path + "data_standard.txt"
        os.remove(self.attrfile)

        # Load expression data
        data = load_data(self.datafile, normalize_features)
        data.to_csv(self.attrfile, header=None, sep=' ', mode='a')

        # Split train file to train and validation
        # create_train_test_split(self.path, self.linkfile, test_size=test_size, validation_size=validation_size)

        self.node_map = {}
        self.nodes = {}
        self.X = {}

        self.node_neighbors_map = {} # [nodeid: neighbors_set] each node id maps to its neighbors set
        self.construct_nodes()
        self.read_link()
        self.construct_node_neighbors_map()
        self.construct_X()

    def readExp(self):
        f = open(self.attrfile)
        line = f.readline()
        items = line.strip().split(' ')
        self.attr_M = len(items[1:])
        print("Dimension of attributes:", self.attr_M)

    def construct_nodes(self):
        '''construct the dictionary '''
        self.readExp()
        f = open(self.attrfile)
        i = 0
        self.nodes['node_id'] = []
        self.nodes['node_attr'] = []
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            self.node_map[int(line[0])] = i # map the node
            self.nodes['node_id'].append(i) # only put id in nodes, not the original name
            self.nodes['node_attr'].append(line[1:])
            i = i + 1
            line = f.readline()
        f.close()
        self.id_N = i
        print("Number of genes:", self.id_N)

    def construct_X(self):
        self.X['data_id_list'] = np.ndarray(shape=(len(self.links)), dtype=np.int32)
        self.X['data_attr_list'] = np.ndarray(shape=(len(self.links),  self.attr_M), dtype=np.float32)
        self.X['data_label_list'] = np.ndarray(shape=(len(self.links), 1), dtype=np.int32)

        for i in range(len(self.links)):
            self.X['data_id_list'][i] = int(self.node_map[int(self.links[i][0])])
            self.X['data_attr_list'][i] = self.nodes['node_attr'][int(self.links[i][0])]  # dimension need to change to  self.attr_dim
            self.X['data_label_list'][i, 0] = int(self.node_map[int(self.links[i][1])])  # one neighbor of the node


    def construct_node_neighbors_map(self):
        for link in self.links:
            if self.node_map[ int(link[0]) ] not in self.node_neighbors_map:
                self.node_neighbors_map[self.node_map[ link[0] ]] = set([self.node_map[int(link[1])]])
            else:
                self.node_neighbors_map[self.node_map[ link[0] ]].add(self.node_map[int(link[1])])

    def read_link(self):  # read link file to a list of links
        self.links = []
        if self.undirected:
            print("Making adjacency matrix symmetric since the graph is undirected.")
        for edge in self.train_links:
            link = [int(edge[0]), int(edge[1])]
            self.links.append(link)
            if self.undirected:
                link = [int(edge[1]), int(edge[0])]
                self.links.append(link)


In [7]:
import collections
if not hasattr(collections, 'Mapping'):
    import collections.abc
    collections.Mapping = collections.abc.Mapping

# Now you can safely import your library
# Example:
# import your_library

# Rest of your code


# Utils

In [8]:
# Parchear el archivo problemático de networkx
!sed -i 's/from collections import Mapping, Set, Iterable/from collections.abc import Mapping, Set, Iterable/' /usr/local/lib/python3.10/dist-packages/networkx/classes/reportviews.py

# Instalar o actualizar las bibliotecas necesarias
!pip install -U numpy pandas scikit-learn scipy networkx numba


Collecting numpy
  Using cached numpy-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
accelerate 0.32.1 requires numpy<2.0.0,>=1.17, but you have numpy 2.0.1 which is incompatible.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.1 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.14.0 which is incompatible.
tensorflow 2.15.0 requires numpy<2.0.0,>=1.23.5, but you have numpy 2.0.1 which is incompatible.
thinc 8.2.5 requires numpy<2.0.0,>=1.19.0; 

In [9]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 17 10:26:11 2017

@author: kishan_kc
"""
import csv
from random import randint

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import networkx as nx
from numba import jit
from scipy.sparse import coo_matrix


def convertAdjMatrixToSortedRankTSV(inputFile=None, outputFilename=None, desc=True):
    tbl = inputFile

    rownames = range(tbl.shape[0])
    # First column -> repeat the predictors
    firstCol = np.repeat(rownames, tbl.shape[1]).reshape((tbl.shape[0]*tbl.shape[1], 1))
    # Second column -> repeat the targets
    secondCol = []

    x = np.array([i for i in range(tbl.shape[0])])
    secondCol = np.tile(x, len(range(tbl.shape[1])))
    # for i in range(tbl.shape[1]):
    #     # print(i)
    #     secondCol = np.append(secondCol, range(tbl.shape[0]))
    # print(len(secondCol))

    secondCol = secondCol.reshape((tbl.shape[0]*tbl.shape[1], 1))
    thirdCol = np.matrix.flatten(np.matrix(tbl)).reshape((tbl.shape[0]*tbl.shape[1], 1))
    thirdCol = np.nan_to_num(thirdCol)
    # Gets the indices from a desc sort on the adjancy measures

    # Glue everything together
    result = np.column_stack((firstCol, secondCol, thirdCol))
    # Convert to dataframe
    result = pd.DataFrame(result)
    result.columns = ['c1','c2', 'c3']

    result = pd.DataFrame(result[result['c1']!=result['c2']])
    # Sort it using the indices obtained before
    result =  result.sort_values(['c3', 'c1', 'c2'], ascending=[0, 1, 1])
    result[['c1', 'c2']] = result[['c1', 'c2']].astype(int)
    # print("Write to file if filename is given")
    # result.to_csv(outputFilename, header=False, columns=None, index=False )
    # else write to function output
    return (result)


def convertSortedRankTSVToAdjMatrix (input=None, nodes=None, undirected=True):
    print("Converting to Adjacency matrix")
    tbl = pd.DataFrame(input).drop_duplicates()
    tbl.columns = ['c1', 'c2', 'c3']
    tbl = tbl[tbl['c3']==1]

    tbl = tbl.sort_values(['c2'], ascending=1)
    tbl[['c1', 'c2', 'c3']] = tbl[['c1', 'c2', 'c3']].astype(int)
    # Pre allocate return matrix
    m = np.zeros((nodes, nodes))
    # Get the duplicates
    dups = tbl['c2'].duplicated()

    # # Get the startIndices of another column
    startIndices = list(np.where(dups== False)[0])

    for i in range(len(startIndices)-1):
        # print(i)
        colIndex = tbl.iloc[startIndices[i], 1]
        if startIndices[i]==(startIndices[i + 1] - 1):
            rowIndexes = tbl.iloc[startIndices[i], 0]
            valuesToAdd = tbl.iloc[startIndices[i], 2]
        else:
            rowIndexes = tbl.iloc[startIndices[i]:(startIndices[i + 1]), 0].values
            valuesToAdd = tbl.iloc[startIndices[i]:(startIndices[i + 1] ), 2].values
        m[rowIndexes, colIndex] = valuesToAdd
        if undirected:
            m[colIndex, rowIndexes] = valuesToAdd


    colIndex = tbl.iloc[startIndices[len(startIndices)-1], 1]
    rowIndexes = tbl.iloc[startIndices[len(startIndices)-1]:len(tbl.iloc[:, 1]), 0]
    valuesToAdd = tbl.iloc[startIndices[len(startIndices)-1]:len(tbl.iloc[:, 1]), 2]

    m[rowIndexes, colIndex] = valuesToAdd
    if undirected:
        m[colIndex, rowIndexes] = valuesToAdd

    m = pd.DataFrame(m)
    # m.to_csv(outputFilename, header=False, columns=None, index=False )
    # else write to function output
    return (m)

def load_network(filename, num_genes):
    print ("### Loading [%s]..." % (filename))
    i, j, val = np.loadtxt(filename).T
    A = coo_matrix((val, (i, j)), shape=(num_genes, num_genes))
    A = A.todense()
    A = np.squeeze(np.asarray(A))
    A = A - np.diag(np.diag(A))
    return A


In [10]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Run GNE

In [11]:
from sklearn.linear_model import LogisticRegression

################################# Define parameters to train GNE model #######################################
parameters = {}
# Dimension of topological structure embeddings
parameters['id_embedding_size'] = 128
# Dimension of expression data embeddings
parameters['attr_embedding_size'] = 128
# Dimension of final representation after transformation of concatenated topological properties and expression data representation
parameters['representation_size'] = 128
# Importance of gene expression relative to topological properties
parameters['alpha'] = 0

# Number of negative samples for Negative Sampling
parameters['n_neg_samples'] = 10
# Number of epochs to run the model
parameters['epoch'] = 20
# Number of sample to consider in the batch
parameters['batch_size'] = 256
# Learning rate
parameters['learning_rate'] = 0.005

print(parameters)

################################################################################################################


#################################### Define dataset and files ##################################################

path = '/content/drive/MyDrive/GNE model/'
geneids = pd.read_csv('/content/drive/MyDrive/GNE model/gene_ids.tsv', sep=" ")
num_genes = 19271

# Define the input to GNE model
link_file = "/content/drive/MyDrive/GNE model/archivo_genes.txt"
feature_file = '/content/drive/MyDrive/GNE model/expression_data.tsv'

################################################################################################################


################################# Load network and split to train and test######################################

adj = load_network(link_file, num_genes)

# Perform train-test split
dataset = create_train_test_split(path, adj, test_size=0.1, validation_size=0.1)

train_edges = dataset['train_pos']
train_edges_false = dataset['train_neg']
val_edges = dataset['val_pos']
val_edges_false = dataset['val_neg']
test_edges = dataset['test_pos']
test_edges_false = dataset['test_neg']

# Inspect train/test split
print("Total genes:", adj.shape[0])
print("Training interactions (positive):", len(train_edges))
print("Training interactions (negative):", len(train_edges_false))
print("Validation interactions (positive):", len(val_edges))
print("Validation interactions (negative):", len(val_edges_false))
print("Test interactions (positive):", len(test_edges))
print("Test interactions (negative):", len(test_edges_false))

################################################################################################################


###################### Combine positive and negative interactions for valdiation and test ######################
# Create validation edges and labels
validation_edges = np.concatenate([val_edges, val_edges_false])
val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

# Create test edges and labels
test_edges_data = np.concatenate([test_edges, test_edges_false])
test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

################################################################################################################


################## load interaction and expression data to fit GNE model and learn embeddings ##################
# load dataset to fit GNE model
Data = LoadData(path, train_links=train_edges, features_file=feature_file)

# Define GNE model with data and parameters
model = GNE(path, Data, 2018, parameters)

# learn embeddings
embeddings, attr_embeddings = model.train(validation_edges, val_edge_labels)

################################################################################################################


################## Create feature matrix and true labels for training and randomize the rows  ##################
# Train-set edge embeddings
pos_train_edge_embs = get_edge_embeddings(embeddings, train_edges)
neg_train_edge_embs = get_edge_embeddings(embeddings, train_edges_false)
train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])

# Create train-set edge labels: 1 = real edge, 0 = false edge
train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

# Randomize train edges and labels
index = np.random.permutation([i for i in range(len(train_edge_labels))])
train_data = train_edge_embs[index, :]
train_labels = train_edge_labels[index]

################################################################################################################


################## Train the logistic regression on training data and predict on test dataset ##################
# Train logistic regression on train-set edge embeddings
edge_classifier = LogisticRegression(random_state=0)
edge_classifier.fit(train_data, train_labels)

# Test-set edge embeddings, labels
pos_test_edge_embs = get_edge_embeddings(embeddings, test_edges)
neg_test_edge_embs = get_edge_embeddings(embeddings, test_edges_false)
test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

# Randomize test edges and labels
index = np.random.permutation([i for i in range(len(test_edge_labels))])
test_data = test_edge_embs[index, :]
test_labels = test_edge_labels[index]

# Predict the probabilty for test edges by trained classifier
test_preds = edge_classifier.predict_proba(test_data)[:, 1]
test_roc = roc_auc_score(test_labels, test_preds)
test_ap = average_precision_score(test_labels, test_preds)

msg = "Alpha: {0:>6}, GNE Test ROC Score: {1:.9f}, GNE Test AP score: {2:.9f}"
print(msg.format(parameters['alpha'], test_roc, test_ap))

################################################################################################################


########################################## Save the embedding to a file ########################################

embeddings_file = open(path + "embeddings_trainsize_alpha_"+str(parameters['alpha'])+".pkl", 'wb')
pickle.dump(embeddings, embeddings_file)
embeddings_file.close()

################################################################################################################


{'id_embedding_size': 128, 'attr_embedding_size': 128, 'representation_size': 128, 'alpha': 0, 'n_neg_samples': 10, 'epoch': 20, 'batch_size': 256, 'learning_rate': 0.005}
### Loading [/content/drive/MyDrive/GNE model/archivo_genes.txt]...
Creating train test and validation split
Total genes: 19271
Training interactions (positive): 679186
Training interactions (negative): 679186
Validation interactions (positive): 75466
Validation interactions (negative): 75466
Test interactions (positive): 83851
Test interactions (negative): 83851
Dimension of attributes: 536
Number of genes: 19271
Making adjacency matrix symmetric since the graph is undirected.
{'id_embedding_size': 128, 'attr_embedding_size': 128, 'representation_size': 128, 'alpha': 0, 'n_neg_samples': 10, 'epoch': 20, 'batch_size': 256, 'learning_rate': 0.005}
Using structure and attribute embedding
Epoch:      1, Train-Batch Loss: 4.623316287, Validation AUC: 0.516864334 *
Epoch:      2, Train-Batch Loss: 1.379647534, Validation 

In [12]:
import pandas as pd
import itertools

# Definir el rango de números
n = 19271

# Generar todas las combinaciones únicas de dos números hasta el 19271
combinations = itertools.combinations(range(1, n+1), 2)

# Crear un dataframe a partir de las combinaciones
df = pd.DataFrame(combinations, columns=['Number1', 'Number2'])

df


Unnamed: 0,Number1,Number2
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6
...,...,...
185676080,19268,19270
185676081,19268,19271
185676082,19269,19270
185676083,19269,19271


In [13]:
df = df[df['Number1'] != df['Number2']]

In [14]:
import pandas as pd

# Ordenar los valores de cada fila para tener un orden consistente
df_sorted = pd.DataFrame({
    'min': df[['Number1', 'Number2']].min(axis=1),
    'max': df[['Number1', 'Number2']].max(axis=1)
})

# Eliminar los duplicados
df_unique = df_sorted.drop_duplicates()

# Recuperar los pares originales (a, b)
df_result = pd.merge(df, df_unique, how='inner', left_on=['Number1', 'Number2'], right_on=['min', 'max'])

# Seleccionar solo las columnas originales
df_result_bueno = df_result[['Number1', 'Number2']]

In [15]:
embeddings = pd.read_csv('/content/drive/MyDrive/GNE model/Embeddings_all.txt', header = None)

In [16]:
embeddings['vector'] = embeddings.apply(lambda row: row.values, axis=1)

In [17]:
embeddings = embeddings['vector'].reset_index()

In [18]:
inter = pd.read_csv('/content/drive/MyDrive/GNE model/archivo_genes.txt', header = None, sep = ' ')

In [19]:
inter = inter[[0,1]]

In [20]:
new_column_names = ['col_a', 'col_b']
inter.columns = new_column_names

In [21]:
inter

Unnamed: 0,col_a,col_b
0,1,1916
1,1,12887
2,1,13302
3,1,15462
4,1,3767
...,...,...
1677001,19187,18775
1677002,19052,18848
1677003,19130,18867
1677004,18981,18953


In [22]:
import pandas as pd

# Ordenar los valores de cada fila para tener un orden consistente
df_sorted = pd.DataFrame({
    'min': inter[['col_a', 'col_b']].min(axis=1),
    'max': inter[['col_a', 'col_b']].max(axis=1)
})

# Eliminar los duplicados
df_unique = df_sorted.drop_duplicates()

# Recuperar los pares originales (a, b)
df_result = pd.merge(inter, df_unique, how='inner', left_on=['col_a', 'col_b'], right_on=['min', 'max'])

# Seleccionar solo las columnas originales
df_result_int = df_result[['col_a', 'col_b']]

df_result_int

Unnamed: 0,col_a,col_b
0,1,1916
1,1,12887
2,1,13302
3,1,15462
4,1,3767
...,...,...
838498,18775,19187
838499,18848,19052
838500,18867,19130
838501,18953,18981


In [23]:
df_result_fin = pd.merge(df_result_bueno, df_result_int, how='left', left_on=['Number1', 'Number2'], right_on=['col_a', 'col_b'])

In [24]:
df_result_fin = df_result_fin[df_result_fin['col_a'].isna()]

In [25]:
df_result_fin = df_result_fin[['Number1','Number2']]

In [26]:
embeddings['index'] = embeddings['index'] + 1

In [27]:
df_fin = pd.merge(df_result_fin, embeddings, how='left', left_on=['Number1'], right_on=['index'])
df_fin2 = pd.merge(df_fin, embeddings, how='left', left_on=['Number2'], right_on=['index'])

In [28]:
df_fin2 = df_fin2[['Number1','Number2','vector_x','vector_y']]

In [29]:
df_sample = df_fin2.sample(n=1000000)

In [30]:
df_sample['mult'] = df_sample['vector_x'] * df_sample['vector_y']

In [31]:
X = np.array(df_sample['mult'].tolist())
predicciones = edge_classifier.predict_proba(X)[:, 1]

In [32]:
df_sample['pred'] = predicciones
df_sample = df_sample.sort_values(by = 'pred')

In [33]:
df_sample

Unnamed: 0,Number1,Number2,vector_x,vector_y,mult,pred
25058686,1357,3360,"[-2.6358266, -0.39663112, 1.9692703, -0.040974...","[-1.2706289, 1.2266533, -2.3100562, -1.9399147...","[3.34915745334874, -0.48652887223069596, -4.54...",0.000055
70185907,4102,11193,"[-1.7505826, -0.1848206, 1.0242184, -0.4494804...","[1.623704, 3.2561867, 1.4090309, -0.41894278, ...","[-2.8424279699504003, -0.60181037960602, 1.443...",0.000056
58730566,3360,7125,"[-1.2706289, 1.2266533, -2.3100562, -1.9399147...","[-0.36754763, 1.7158035, 1.7432274, 0.8304479,...","[0.467016640804507, 2.10469602542655, -4.02695...",0.000089
54451366,3091,17402,"[-0.01987258, -0.9651417, 1.9870379, 1.5462369...","[0.53070664, 1.1596664, 0.26280192, -0.2931547...","[-0.0105465101599312, -1.1192424007288801, 0.5...",0.000170
58728310,3360,4869,"[-1.2706289, 1.2266533, -2.3100562, -1.9399147...","[-0.4067095, 0.38181633, 2.0332396, 0.9244952,...","[0.51677684460455, 0.46835626118838897, -4.696...",0.000172
...,...,...,...,...,...,...
103730566,6518,10041,"[-2.9851806, -2.1470716, 2.2959936, -1.4347333...","[-0.11654003, -0.88268626, 0.8448115, -0.59174...","[0.347893036679418, 1.8951906005562158, 1.9396...",0.999903
70226804,4105,7428,"[-2.78083, -1.2322586, 2.721886, -1.4001176, 0...","[-2.2642584, -1.3286252, 1.4247599, -0.9420552...","[6.296517686472, 1.63720982887672, 3.878034025...",0.999905
25201131,1365,4105,"[-1.4893998, -2.5911646, 1.7913904, 0.7279224,...","[-2.78083, -1.2322586, 2.721886, -1.4001176, 0...","[4.141767645833999, 3.19298486236556, 4.875960...",0.999931
70001839,4090,4496,"[-2.4411952, -0.021944499, 3.0503387, 0.433986...","[-2.1673741, -1.4100354, 3.2584107, 1.2811902,...","[5.29098324952432, 0.0309425204252646, 9.93925...",0.999935


In [34]:
gene_ids = pd.read_csv('/content/drive/MyDrive/GNE model/gene_ids.tsv', sep = '\t')

In [35]:
gene_ids['I#ID'] = gene_ids['I#ID'].str.replace('G', '').astype(int)

In [36]:
gene_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19270 entries, 0 to 19269
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   I#ID    19270 non-null  int64 
 1   Name    19270 non-null  object
dtypes: int64(1), object(1)
memory usage: 301.2+ KB


In [37]:
df_pred = pd.merge(df_sample, gene_ids, how='left', left_on=['Number1'], right_on=['I#ID'])
df_pred2 = pd.merge(df_pred, gene_ids, how='left', left_on=['Number2'], right_on=['I#ID'])

In [38]:
df_final = df_pred2[['Name_x','Name_y','pred']]

In [39]:
df_final.sort_values(by = 'pred', ascending = False)

Unnamed: 0,Name_x,Name_y,pred
999999,NSF,RPL18A,0.999972
999998,RPL6,SORD,0.999935
999997,EIF4G2,RPL22,0.999931
999996,RPL22,UGT2A1,0.999905
999995,PTDSS1,UGT2B28,0.999903
...,...,...,...
4,PCSK2,TOP1P1,0.000172
3,NDUFV1,LCE3B,0.000170
2,PCSK2,NEBL,0.000089
1,RPL18A,CCDC177,0.000056


In [40]:
df_bueno = df_final[df_final['pred'] > 0.9]

In [41]:
df_bueno

Unnamed: 0,Name_x,Name_y,pred
983816,RORB,USP1,0.900013
983817,SLC10A2,TRIM52-AS1,0.900014
983818,PCNA,DNAJA3,0.900015
983819,GATA3,RBFA,0.900020
983820,EMILIN1,SRPX2,0.900020
...,...,...,...
999995,PTDSS1,UGT2B28,0.999903
999996,RPL22,UGT2A1,0.999905
999997,EIF4G2,RPL22,0.999931
999998,RPL6,SORD,0.999935


In [42]:
df_interacciones = pd.read_csv('/content/drive/MyDrive/genes.csv')

In [43]:
df_interacciones[(df_interacciones['Symbol_x'] == 'RPS6') & (df_interacciones['Symbol_y'] == 'GALNT6')]

Unnamed: 0,gene_id,interactant_id,Symbol_x,Symbol_y,seqname_x,seqname_y


In [44]:
df_interacciones[(df_interacciones['Symbol_x'] == 'RPS10') & (df_interacciones['Symbol_y'] == 'GALNT6')]

Unnamed: 0,gene_id,interactant_id,Symbol_x,Symbol_y,seqname_x,seqname_y


In [45]:
df_interacciones[(df_interacciones['Symbol_x'] == 'RPL21') & (df_interacciones['Symbol_y'] == 'TAX1BP1')]

Unnamed: 0,gene_id,interactant_id,Symbol_x,Symbol_y,seqname_x,seqname_y


In [46]:
df_bueno.to_csv('/content/drive/MyDrive/pred_interacciones.csv', index = False)