In [None]:
import pandas as pd
import numpy as np
import gc

from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the curated human interaction table and reduce it to the unique,
# sorted (gene_id, interactant_id) pairs.
filtered_df = pd.read_csv('drive/MyDrive/TFM/GeneRIF/interactions_human_reduced.csv')
pair_columns = ['gene_id', 'interactant_id']
df = filtered_df[pair_columns].drop_duplicates().sort_values(by=pair_columns)

# Self-interactions (a gene paired with itself) are excluded from the graph.
is_self_loop = df['gene_id'] == df['interactant_id']
graph_df = df.loc[~is_self_loop]

In [None]:
# Read the node2vec embeddings; in the CSV each gene is a column, so the table
# is transposed to get one row per gene id and one column per embedding dimension.
node2vec_df = pd.read_csv('drive/MyDrive/definitivo/node2vec_p1q1.csv')
# "Unnamed: 0" is the row counter written when the CSV was saved; it is not a gene.
node2vec_df_cleaned = node2vec_df.drop("Unnamed: 0", axis=1)
node2vec_df_transpose = node2vec_df_cleaned.T
node2vec_df_transpose.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
1,0.207478,-0.133112,-0.090474,0.063056,-0.065361,-0.287115,-0.009496,0.090515,-0.387027,-0.107791,...,-0.035289,-0.077276,-0.338468,0.005699,-0.048972,0.093171,-0.140659,-0.074125,-0.040365,-0.022939
310,0.14893,-0.197648,-0.113831,0.124112,-0.043199,-0.263765,0.070016,0.033679,-0.314659,-0.162753,...,-0.02255,-0.028462,-0.197567,-0.058505,-0.005705,0.191408,-0.093223,-0.032009,-0.043734,-0.007712
368,0.253915,-0.184889,-0.191043,0.080495,0.219197,-0.260263,-0.053996,0.137636,-0.283346,-0.085089,...,-0.020493,-0.043358,-0.294228,0.070425,-0.162516,0.04208,-0.135102,-0.162104,0.166871,0.068
1026,0.419201,-0.301697,-0.227634,0.065159,-0.07405,-0.45276,-0.462124,-0.055332,-0.040935,0.1783,...,-0.105212,-0.157691,-0.106798,0.136247,0.081322,0.182503,-0.490991,0.100194,-0.289154,0.10133
2232,0.156489,-0.178668,0.005783,0.074416,0.007197,-0.19361,-0.010682,-0.051759,-0.346724,0.153327,...,0.10125,-0.090236,-0.123796,-0.121192,-0.162831,0.26628,0.002818,-0.148038,-0.058223,-0.08839


In [None]:
# Gene-level metadata, keyed by gene_id. The embedding table is indexed by gene
# id as a string, so the metadata index is cast to str before aligning the two.
gene_csv = pd.read_csv('drive/MyDrive/TFM/data/gene_information.csv')
gene_csv = gene_csv.set_index('gene_id')
gene_csv.index = gene_csv.index.astype(str)

# Inner join on the index: only genes present in both tables are kept.
node2vec_df_transposed = pd.concat(
    [node2vec_df_transpose, gene_csv],
    join='inner',
    axis=1,
)

In [None]:
# Preview the joined table: embedding dimensions followed by the gene metadata columns.
node2vec_df_transposed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,chr_chr3,chr_chr4,chr_chr5,chr_chr6,chr_chr7,chr_chr8,chr_chr9,chr_chrM,chr_chrX,chr_chrY
1,0.207478,-0.133112,-0.090474,0.063056,-0.065361,-0.287115,-0.009496,0.090515,-0.387027,-0.107791,...,False,False,False,False,False,False,False,False,False,False
310,0.14893,-0.197648,-0.113831,0.124112,-0.043199,-0.263765,0.070016,0.033679,-0.314659,-0.162753,...,False,False,False,False,False,False,False,False,False,False
368,0.253915,-0.184889,-0.191043,0.080495,0.219197,-0.260263,-0.053996,0.137636,-0.283346,-0.085089,...,False,False,False,False,False,False,False,False,False,False
1026,0.419201,-0.301697,-0.227634,0.065159,-0.07405,-0.45276,-0.462124,-0.055332,-0.040935,0.1783,...,False,False,False,True,False,False,False,False,False,False
2232,0.156489,-0.178668,0.005783,0.074416,0.007197,-0.19361,-0.010682,-0.051759,-0.346724,0.153327,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Build one feature row per known interaction: the feature vector of the gene
# (columns suffixed '_gene') followed by the feature vector of its interactant
# (columns suffixed '_interactant').
#
# This is done with two vectorised label lookups instead of a Python loop over
# every interaction, which avoids per-row printing, per-row Series allocation
# and repeated DataFrame concatenation, and keeps the numeric column dtypes.

# A repeated gene id in the lookup table would make .loc return several rows
# for one key and silently misalign the two halves of each pair.
assert node2vec_df_transposed.index.is_unique, "node2vec_df_transposed has duplicated gene ids"

# The lookup table is indexed by gene id as a string, so cast the ids first.
gene_keys = graph_df['gene_id'].astype(str).to_numpy()
interactant_keys = graph_df['interactant_id'].astype(str).to_numpy()

# .loc with a list of labels raises KeyError if any id is missing from the table.
gene_features = (
    node2vec_df_transposed.loc[gene_keys]
    .add_suffix('_gene')
    .reset_index(drop=True)
)
interactant_features = (
    node2vec_df_transposed.loc[interactant_keys]
    .add_suffix('_interactant')
    .reset_index(drop=True)
)

# Side-by-side concatenation: row k holds the features of the k-th pair in graph_df.
node_df = pd.concat([gene_features, interactant_features], axis=1)

assert len(node_df) == len(graph_df), "one feature row is expected per interaction"
print(f"Built {node_df.shape[0]} pair rows with {node_df.shape[1]} features each")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Processing row 833505/838504
Processing row 833506/838504
Processing row 833507/838504
Processing row 833508/838504
Processing row 833509/838504
Processing row 833510/838504
Processing row 833511/838504
Processing row 833512/838504
Processing row 833513/838504
Processing row 833514/838504
Processing row 833515/838504
Processing row 833516/838504
Processing row 833517/838504
Processing row 833518/838504
Processing row 833519/838504
Processing row 833520/838504
Processing row 833521/838504
Processing row 833522/838504
Processing row 833523/838504
Processing row 833524/838504
Processing row 833525/838504
Processing row 833526/838504
Processing row 833527/838504
Processing row 833528/838504
Processing row 833529/838504
Processing row 833530/838504
Processing row 833531/838504
Processing row 833532/838504
Processing row 833533/838504
Processing row 833534/838504
Processing row 833535/838504
Processing row 833536/8385

In [None]:
# Hold out 20% of the known interaction pairs for evaluation (fixed seed).
X_train, X_test = train_test_split(node_df, random_state=42, test_size=0.2)

# Work on bare NumPy arrays so the estimator receives no column names.
X_train_np = X_train.values
X_test_np = X_test.values

# Partition the training rows into chunks that are each fitted on their own.
n_subsets = 2000
subsets = np.array_split(X_train_np, n_subsets)

# First pass: fit one One-Class SVM per chunk and keep only its support vectors.
support_vectors = []
for subset in subsets:
    model = OneClassSVM(nu=0.1, gamma='auto', kernel='rbf').fit(subset)
    support_vectors.append(model.support_vectors_)

# Pool the per-chunk support vectors into a single matrix.
combined_support_vectors = np.vstack(support_vectors)

# Second pass: fit a single One-Class SVM on the pooled support vectors.
final_model_initial = OneClassSVM(nu=0.1, gamma='auto', kernel='rbf').fit(combined_support_vectors)

# Support vectors retained by that pooled model.
initial_support_vectors = final_model_initial.support_vectors_

# Remove the support vectors from the original training set
def remove_support_vectors(data, support_vectors):
    """Return the rows of ``data`` that are not identical to any support vector.

    Rows are compared as whole vectors. An element-wise ``np.isin`` test would
    instead discard any row whose individual values each occur *somewhere* in
    the support-vector matrix, even if no support vector equals that row.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Candidate rows. Values must be convertible to float64.
    support_vectors : array-like of shape (n_vectors, n_features)
        Rows to remove from ``data``.

    Returns
    -------
    numpy.ndarray
        The rows of ``data`` (original order and dtype) that match no support vector.
    """
    data = np.asarray(data)
    # Compare on a common float64 representation so that object/bool inputs
    # match the float support vectors.
    data_f64 = np.ascontiguousarray(data, dtype=np.float64)
    sv_f64 = np.ascontiguousarray(support_vectors, dtype=np.float64)

    if data_f64.shape[0] == 0 or sv_f64.shape[0] == 0:
        return data

    # The raw bytes of a row act as a hashable key for exact row equality.
    sv_keys = {row.tobytes() for row in sv_f64}
    keep = np.fromiter(
        (row.tobytes() not in sv_keys for row in data_f64),
        dtype=bool,
        count=data_f64.shape[0],
    )
    return data[keep]

# Drop from the training set the rows selected as support vectors by the pooled first-stage model.
X_train_reduced = remove_support_vectors(X_train_np, initial_support_vectors)

# Repeat the partition-and-fit procedure on the reduced set. Empty chunks
# (possible when fewer than n_subsets rows remain) are skipped because an
# estimator cannot be fitted on zero samples.
subsets_reduced = [
    chunk for chunk in np.array_split(X_train_reduced, n_subsets) if len(chunk) > 0
]

support_vectors_reduced = []
for subset in subsets_reduced:
    model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
    model.fit(subset)
    support_vectors_reduced.append(model.support_vectors_)

combined_support_vectors_reduced = np.vstack(support_vectors_reduced)

# Final model: trained on the pooled support vectors of the reduced set.
final_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
final_model.fit(combined_support_vectors_reduced)

# Evaluate on the held-out pairs.
y_pred_test = final_model.predict(X_test_np)

# Map the predictions to 1 for "normal" rows and 0 for "anomalies".
y_pred_test = np.where(y_pred_test == 1, 1, 0)

# Every held-out row is a known interaction, so the ground truth is all ones;
# the scores below therefore measure how many known pairs are accepted.
y_true_test = np.ones(len(X_test_np))
accuracy = accuracy_score(y_true_test, y_pred_test)
# labels=[0, 1] fixes the label set explicitly: without it, a run in which
# every prediction is 1 yields a single observed class, which does not match
# the two target_names and makes classification_report raise.
report = classification_report(
    y_true_test,
    y_pred_test,
    labels=[0, 1],
    target_names=['Anomalía', 'Normal'],
    zero_division=0,
    output_dict=True,
)

# Collect the metrics in a DataFrame, with the overall accuracy repeated as a column.
metrics_df = pd.DataFrame(report).transpose()
metrics_df['accuracy'] = accuracy

print(metrics_df)

              precision    recall  f1-score        support  accuracy
Anomalía       0.000000  0.000000  0.000000       0.000000  0.964335
Normal         1.000000  0.964335  0.981844  167701.000000  0.964335
accuracy       0.964335  0.964335  0.964335       0.964335  0.964335
macro avg      0.500000  0.482168  0.490922  167701.000000  0.964335
weighted avg   1.000000  0.964335  0.981844  167701.000000  0.964335


In [None]:
# Spot-check the model on a single pair (genes '1' and '310').
# The features are read from node2vec_df_transposed, the same table used to
# build the training rows in node_df; the embedding-only table
# node2vec_df_transpose has fewer columns per gene, so a vector built from it
# does not have the number of features the model was fitted on.
pair_features = np.concatenate([
    node2vec_df_transposed.loc['1'].to_numpy(dtype=np.float64),
    node2vec_df_transposed.loc['310'].to_numpy(dtype=np.float64),
]).reshape(1, -1)
final_model.predict(pair_features)

array([1])

In [None]:

""" Tomamos un conjunto de 100.000 pares de genes para los que no hay interacción documentada con el objetivo de validar los resultados del modelo """

# Sample gene pairs with no documented interaction, used to check how often the
# model accepts pairs that are not known interactions.
N_NEGATIVE_PAIRS = 100000
NEGATIVE_SAMPLING_SEED = 42

valid_gene_ids = set(graph_df['gene_id']).union(set(graph_df['interactant_id']))
# Sorted array built once: a fixed order makes the seeded draw reproducible
# and avoids rebuilding a list from the set on every iteration.
candidate_ids = np.array(sorted(valid_gene_ids))

# Known interactions as order-independent keys.
existing_edges = {
    tuple(sorted(pair))
    for pair in zip(graph_df['gene_id'].tolist(), graph_df['interactant_id'].tolist())
}

# Fail early instead of looping forever when the request cannot be satisfied.
n_candidates = len(candidate_ids)
n_available = n_candidates * (n_candidates - 1) // 2 - len(existing_edges)
if n_available < N_NEGATIVE_PAIRS:
    raise ValueError(
        f"Only {n_available} non-interacting pairs exist; cannot sample {N_NEGATIVE_PAIRS}"
    )

rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)
sampled_pairs = set()
negative_pairs = []
while len(negative_pairs) < N_NEGATIVE_PAIRS:
    # Two distinct genes per draw; .tolist() converts NumPy scalars to plain Python values.
    i, j = rng.choice(candidate_ids, size=2, replace=False).tolist()
    pair_key = tuple(sorted((i, j)))
    # Reject documented interactions and pairs already drawn (in either order).
    if pair_key in existing_edges or pair_key in sampled_pairs:
        continue
    sampled_pairs.add(pair_key)
    negative_pairs.append((i, j))

# Look up both halves of every pair in one shot. The features come from
# node2vec_df_transposed, the table used to build the training rows, so that
# X_new has the same number of columns as the data the model was fitted on.
first_keys = [str(i) for i, _ in negative_pairs]
second_keys = [str(j) for _, j in negative_pairs]
X_new = pd.DataFrame(
    np.hstack([
        node2vec_df_transposed.loc[first_keys].to_numpy(dtype=np.float64),
        node2vec_df_transposed.loc[second_keys].to_numpy(dtype=np.float64),
    ])
)
print(X_new.shape)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
(100000, 128)


In [None]:
# Sum of the predictions on the sampled pairs (each value is 1 for interaction
# or -1 for no interaction), i.e. accepted pairs minus rejected pairs.
new_pair_predictions = final_model.predict(X_new.to_numpy())
print(new_pair_predictions.sum())

62948
