## **Generamos un conjunto de tamaño similar al nuestro para probar el tiempo de entrenamiento**

In [1]:
import pandas as pd
import joblib
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
drive.mount('/content/drive')

# Ruta del archivo Parquet
file_path = '/content/drive/MyDrive/TFM/embedding_kmer5.parquet'

result_df = pd.read_parquet(file_path)

Mounted at /content/drive


In [2]:
embedding_columns4 = [col for col in result_df.columns if col != 'gene_id']

# Función para concatenar los embeddings en un solo vector
def concatenate_values(row):
    return row.values[1:].tolist()  # Ignorar la primera columna (filename) y convertir a lista

# Aplicar la función a cada fila para concatenar embeddings en un solo vector
result_df['concatenated_vector'] = result_df.apply(concatenate_values, axis=1)
result_df.drop(columns=embedding_columns4, inplace=True)


In [3]:
path_int= "/content/drive/MyDrive/genes.csv"
inter = pd.read_csv(path_int)
inter.head(5)
inter['interactant_id'] = inter['interactant_id'].astype(str)
inter['gene_id'] = inter['gene_id'].astype(str)
result_df['gene_id'] = result_df['gene_id'].astype(str)
result_df.rename(columns={'gene_id': 'gene'}, inplace=True)


In [4]:
#info gen 1
inter_emb = pd.merge(inter, result_df, left_on='gene_id', right_on='gene', how='left')
inter_emb.rename(columns={'concatenated_vector': 'gen_vector'}, inplace=True)
#info gen 2
inter_emb = pd.merge(inter_emb, result_df, left_on='interactant_id', right_on='gene', how='left')
inter_emb.rename(columns={'concatenated_vector': 'inter_vector'}, inplace=True)

inter_emb.drop(columns=['gene_x'], inplace=True)
inter_emb.drop(columns=['gene_y'], inplace=True)

inter_emb = inter_emb[['gene_id', 'gen_vector','Symbol_x','seqname_x', 'interactant_id','inter_vector','Symbol_y','seqname_y']]

In [5]:
inter_emb['gen_vector'] = inter_emb['gen_vector'].apply(lambda x: np.array(x))
inter_emb['inter_vector'] = inter_emb['inter_vector'].apply(lambda x: np.array(x))

# Multiplicar los arrays
inter_emb['result'] = inter_emb.apply(lambda row: row['gen_vector'] * row['inter_vector'], axis=1)

In [6]:
inter_emb.shape

(838503, 9)

In [7]:
import random

X_train, X_test = train_test_split(inter_emb, test_size=0.1, random_state=42)
df_splits = np.array_split(X_train, 100)
# Selecciona 10 muestras aleatorias de las 100 divisiones
random_splits = random.sample(df_splits, 10)

In [8]:
# Since visualizing high-dimensional data directly is not feasible,
# the plot_decision_boundary function will not be used in this context.

# Train OSVM models on each subset and collect support vectors
support_vectors = np.vstack([
    OneClassSVM(kernel='rbf', gamma='auto', nu=0.1).fit(np.vstack(df_split['result'].values)).support_vectors_
    for df_split in random_splits
])

# Train a final OSVM model on the collected support vectors
final_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
final_model.fit(support_vectors)

# Note: Visualization of decision boundaries in high-dimensional space is not straightforward
# and typically not practical. You might consider dimensionality reduction techniques
# like PCA for visualization purposes, but it's outside the direct scope of this modification.

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Función para calcular y mostrar métricas
def calculate_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print(f"Precisión: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Exactitud: {accuracy:.4f}")
    print("-----------------------------")


In [10]:
X_test.shape

(83851, 9)

In [11]:
test_data =  np.vstack(X_test['result'].values)
y_true = np.ones(83851)

preds = final_model.predict(test_data)
calculate_metrics(y_true, preds)

Precisión: 1.0000
Recall: 0.9898
F1-Score: 0.9949
Exactitud: 0.9898
-----------------------------


In [12]:

X_test['prediccion'] = preds

In [13]:
X_test.groupby('prediccion').size()

prediccion
-1      853
 1    82998
dtype: int64

# Probamos con interacciones negativas

In [14]:
path_int_neg_OC = '/content/drive/MyDrive/inter_neg_OCSVM.csv'
inter_neg_modelo = pd.read_csv(path_int_neg_OC)

In [15]:
inter_neg_modelo = inter_neg_modelo.dropna()

In [16]:
inter_neg_modelo['interactant_id'] = inter_neg_modelo['interactant_id'].astype(int)
inter_neg_modelo['gene_id'] = inter_neg_modelo['gene_id'].astype(int)

In [17]:
inter_neg_modelo['interactant_id'] = inter_neg_modelo['interactant_id'].astype(str)
inter_neg_modelo['gene_id'] = inter_neg_modelo['gene_id'].astype(str)

In [18]:
#info gen 1
inter_embOC_neg = pd.merge(inter_neg_modelo, result_df, left_on='gene_id', right_on='gene', how='left')
inter_embOC_neg.rename(columns={'concatenated_vector': 'gen_vector'}, inplace=True)
#info gen 2
inter_embOC_neg = pd.merge(inter_embOC_neg, result_df, left_on='interactant_id', right_on='gene', how='left')
inter_embOC_neg.rename(columns={'concatenated_vector': 'inter_vector'}, inplace=True)

inter_embOC_neg.drop(columns=['gene_x'], inplace=True)
inter_embOC_neg.drop(columns=['gene_y'], inplace=True)

In [19]:
inter_embOC_neg['gen_vector'] = inter_embOC_neg['gen_vector'].apply(lambda x: np.array(x))
inter_embOC_neg['inter_vector'] = inter_embOC_neg['inter_vector'].apply(lambda x: np.array(x))

# Multiplicar los arrays
inter_embOC_neg['result'] = inter_embOC_neg.apply(lambda row: row['gen_vector'] * row['inter_vector'], axis=1)

In [20]:
df_unir = inter_emb[['gene_id', 'interactant_id', 'result']]
df_unir['clas'] = 1

df_unir_neg = inter_embOC_neg[['gene_id', 'interactant_id', 'result']]
df_unir_neg['clas'] = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unir['clas'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unir_neg['clas'] = -1


In [21]:
pred_data =  np.vstack(inter_embOC_neg['result'].values)
preds = final_model.predict(pred_data)

inter_embOC_neg['preds5'] = preds

In [22]:
inter_embOC_neg.groupby('preds5').size()

preds5
-1     13806
 1    762356
dtype: int64

# Pegamos los embeddings a las interacciones que vamos a meter al modelo

In [23]:
path_int_OC = '/content/drive/MyDrive/inter_OCSVM.csv'
inter_modelo = pd.read_csv(path_int_OC)

In [24]:
inter_modelo['interactant_id'] = inter_modelo['interactant_id'].astype(str)
inter_modelo['gene_id'] = inter_modelo['gene_id'].astype(str)

In [25]:
#info gen 1
inter_embOC = pd.merge(inter_modelo, result_df, left_on='gene_id', right_on='gene', how='left')
inter_embOC.rename(columns={'concatenated_vector': 'gen_vector'}, inplace=True)
#info gen 2
inter_embOC = pd.merge(inter_embOC, result_df, left_on='interactant_id', right_on='gene', how='left')
inter_embOC.rename(columns={'concatenated_vector': 'inter_vector'}, inplace=True)

inter_embOC.drop(columns=['gene_x'], inplace=True)
inter_embOC.drop(columns=['gene_y'], inplace=True)

In [26]:
inter_embOC['gen_vector'] = inter_embOC['gen_vector'].apply(lambda x: np.array(x))
inter_embOC['inter_vector'] = inter_embOC['inter_vector'].apply(lambda x: np.array(x))

# Multiplicar los arrays
inter_embOC['result'] = inter_embOC.apply(lambda row: row['gen_vector'] * row['inter_vector'], axis=1)

In [27]:
pred_data =  np.vstack(inter_embOC['result'].values)
preds = final_model.predict(pred_data)

In [28]:
inter_embOC['preds5'] = preds
inter_embOC.groupby('preds5').size()

preds5
-1      900
 1    48259
dtype: int64

# Hacemos una regresión logística

In [29]:
import pandas as pd

# Asegúrate de que los DataFrames tengan las mismas columnas
df_unir = inter_emb[['gene_id', 'interactant_id', 'result']].copy()
df_unir['clas'] = 1

df_unir_neg = inter_embOC_neg[['gene_id', 'interactant_id', 'result']].copy()
df_unir_neg['clas'] = -1

# Concatenar los DataFrames
df_concatenado = pd.concat([df_unir, df_unir_neg])

# Resetear el índice si es necesario
df_concatenado.reset_index(drop=True, inplace=True)

df_concatenado


Unnamed: 0,gene_id,interactant_id,result,clas
0,1,2886,"[0.05623543435370326, 0.006625147215156835, 0....",1
1,1,80854,"[0.06706914626234006, -0.001924178583570746, 0...",1
2,1,84236,"[0.06869531104878579, -0.001305265558446178, 0...",1
3,1,148581,"[0.07038887149240614, -0.002297709064050135, 0...",1
4,1,5655,"[0.06654438764185455, 0.005791075457994399, 0....",1
...,...,...,...,...
1614660,7756,57476,"[0.10024724992931411, 0.00017051288184483787, ...",-1
1614661,4968,55504,"[0.06660063072668354, -7.103466949281382e-05, ...",-1
1614662,1036,151742,"[0.10706084596534104, 0.003265211891773534, 0....",-1
1614663,1002,64147,"[0.04333024210438108, 0.0030701246622920897, 0...",-1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Asumimos que df_concatenado ya está creado y contiene las columnas 'result' y 'clas'

# Separar características (X) y etiquetas (y)
X = np.vstack(df_concatenado['result'])
y = df_concatenado['clas']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Escalar las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Crear el modelo de regresión logística con más iteraciones y un solver diferente
model = LogisticRegression(solver='saga', max_iter=2000)

# Entrenar el modelo
model.fit(X_train_scaled, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test_scaled)

# Evaluar el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


In [None]:
import joblib

# Guardar el modelo entrenado en un archivo
model_path = '/content/drive/MyDrive/logistic_regression_model_5.joblib'
joblib.dump(model, model_path)

print(f"Modelo guardado en {model_path}")

Modelo guardado en /content/drive/MyDrive/logistic_regression_model_5.joblib


In [None]:
pred_scaled = scaler.transform(np.vstack(inter_embOC['result'].values))
y_predscaled = model.predict(pred_scaled)
inter_embOC['preds5_logis'] = y_predscaled
inter_embOC.groupby('preds5_logis').size()

inter_embOC[['Name_x', 'Name_y', 'gene_id', 'interactant_id', 'seqname_x','seqname_y','preds5','preds5_logis']].to_csv('/content/drive/MyDrive/prediccion_5.csv', index = False)


# Sacamos la sensitividad de los casos de test

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler

# Cargar el modelo desde el archivo
model_path = '/content/drive/MyDrive/logistic_regression_model_5.joblib'
model5 = joblib.load(model_path)

# Asumimos que df_concatenado ya está creado y contiene las columnas 'result' y 'clas'

# Separar características (X) y etiquetas (y)
X = np.vstack(df_concatenado['result'])
y = df_concatenado['clas']

# Dividir los datos en conjuntos de entrenamiento y prueba, obteniendo los índices
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, df_concatenado.index, test_size=0.1, random_state=42)

# Escalar las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hacer predicciones en el conjunto de prueba
y_pred = model5.predict(X_test_scaled)

# Calcular la sensitividad para la clase 1
sensitivity = recall_score(y_test, y_pred, pos_label=1)
print("Sensitividad para la clase 1:", sensitivity)

# Crear un DataFrame con la parte de test de df_concatenado
df_test = df_concatenado.loc[test_indices].copy()
df_test['Predicted'] = y_pred

# Guardar el DataFrame en un archivo CSV si lo deseas
df_test[['gene_id','interactant_id','clas','Predicted']].to_csv('/content/drive/MyDrive/test_predictions_with_df_concatenado5.csv', index=False)


Sensitividad para la clase 1: 0.7008522523812935


# Entrenamos el modelo solo con las características con los coeficientes más altos

In [30]:
t5 = [744, 507, 405, 252, 688,  91, 340, 718, 679, 423, 561, 466, 545,
       336, 617, 501, 209, 696, 614, 115, 208, 298, 731, 602, 355,  74,
        73, 323, 753, 149, 534, 316, 148, 393, 560, 457, 665,  16, 531,
       256, 546, 576, 356, 702, 672, 225, 671, 762, 448, 371, 407,  11,
       508, 630, 305,  27, 714, 222, 306, 414,  28, 697, 206, 668, 238,
       386, 265, 537, 490, 216, 267, 430, 255, 726, 345,  92, 294, 271,
       145, 766, 236,   3,  97, 402,  14, 664, 264, 639, 575, 593, 489,
       548, 338,  69, 392, 447, 514, 313, 212, 311]

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Asumimos que df_concatenado ya está creado y contiene las columnas 'result' y 'clas'

# Separar características (X) y etiquetas (y)
X = np.vstack(df_concatenado['result'])
y = df_concatenado['clas']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, df_concatenado.index, test_size=0.1, random_state=42)

# Escalar las características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_filtered = X_train_scaled[:, t5]
X_test_filtered = X_test_scaled[:, t5]

# Crear el modelo de regresión logística con más iteraciones y un solver diferente
model = LogisticRegression(solver='saga', max_iter=2000)

# Entrenar el modelo
model.fit(X_train_filtered, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test_filtered)

# Evaluar el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Crear un DataFrame con la parte de test de df_concatenado
df_test = df_concatenado.loc[test_indices].copy()
df_test['Predicted'] = y_pred

# Guardar el DataFrame en un archivo CSV si lo deseas
df_test[['gene_id','interactant_id','clas','Predicted']].to_csv('/content/drive/MyDrive/test_predictions_with_df_concatenado5_nuevo.csv', index=False)

Accuracy: 0.6393071029993744
Classification Report:
               precision    recall  f1-score   support

          -1       0.64      0.58      0.61     77689
           1       0.64      0.70      0.67     83778

    accuracy                           0.64    161467
   macro avg       0.64      0.64      0.64    161467
weighted avg       0.64      0.64      0.64    161467

Confusion Matrix:
 [[44780 32909]
 [25331 58447]]


In [None]:
import joblib

# Guardar el modelo entrenado en un archivo
model_path = '/content/drive/MyDrive/logistic_regression_model_5_nuevo100.joblib'
joblib.dump(model, model_path)

print(f"Modelo guardado en {model_path}")

Modelo guardado en /content/drive/MyDrive/logistic_regression_model_5_nuevo100.joblib
