In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
filtered_df = pd.read_csv('drive/MyDrive/TFM/GeneRIF/interactions_human_reduced.csv')
df = filtered_df[['gene_id','interactant_id']].drop_duplicates().sort_values(by=['gene_id','interactant_id'])
graph_df = df.loc[df['gene_id'] != df['interactant_id']]

deep_df = pd.read_csv('drive/MyDrive/definitivo/node2vec_embeddings3.csv').drop('Unnamed: 0', axis=1)  # p = q = 1
nod1_df = pd.read_csv('drive/MyDrive/definitivo/node2vec_embeddings4.csv').drop('Unnamed: 0', axis=1)  # p = 0.5 ; q = 2
nod2_df = pd.read_csv('drive/MyDrive/definitivo/node2vec_embeddings5.csv').drop('Unnamed: 0', axis=1)  # p = 2 ; q = 0.5


In [4]:
gene_csv = pd.read_csv('drive/MyDrive/TFM/data/gene_information_csv').set_index('gene_id').drop(columns=['pos_min','pos_max'], axis=1)
gene_csv.index = gene_csv.index.astype(str)

dnabert3_df = pd.read_parquet('drive/MyDrive/TFM/data/embedding_kmer3.parquet').set_index('gene_id')
dnabert4_df = pd.read_parquet('drive/MyDrive/TFM/data/embedding_kmer4.parquet').set_index('gene_id')
dnabert5_df = pd.read_parquet('drive/MyDrive/TFM/data/embedding_kmer5.parquet').set_index('gene_id')
dnabert6_df = pd.read_csv('drive/MyDrive/TFM/data/embedding_dnabert6.csv').drop(columns=['Unnamed: 0']).rename(columns={'gene': 'gene_id'}).set_index('gene_id')

dnabert3_df.index = dnabert3_df.index.astype(str)
dnabert4_df.index = dnabert4_df.index.astype(str)
dnabert5_df.index = dnabert5_df.index.astype(str)
dnabert6_df.index = dnabert6_df.index.astype(str)

gdnabert3_df = pd.concat([gene_csv, dnabert3_df], axis=1, join='inner').T
gdnabert4_df = pd.concat([gene_csv, dnabert4_df], axis=1, join='inner').T
gdnabert5_df = pd.concat([gene_csv, dnabert5_df], axis=1, join='inner').T
gdnabert6_df = pd.concat([gene_csv, dnabert6_df], axis=1, join='inner').T

In [4]:
embedding_dict1 = deep_df.to_dict(orient='list')
embedding_dict2 = nod1_df.to_dict(orient='list')
embedding_dict3 = nod2_df.to_dict(orient='list')

In [5]:
G = nx.from_pandas_edgelist(graph_df, source='gene_id', target='interactant_id')
node_labels = list(G.nodes())

In [6]:
dicc = {}
for i, j in enumerate(G.nodes()):
  dicc[j] = i

idicc = {v: k for k, v in dicc.items()}

In [8]:
positive_edges = list(G.edges())

n_positive = len(positive_edges)

negative_edges = set()
while len(negative_edges) < n_positive:
    u, v = random.sample(node_labels, 2)
    if not G.has_edge(u, v) and (u, v) not in negative_edges and (v, u) not in negative_edges:
        negative_edges.add((u, v))

negative_edges = list(negative_edges)

edges = positive_edges + negative_edges
labels = np.hstack([np.ones(len(positive_edges)), np.zeros(len(negative_edges))])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(edges, labels, test_size=0.3, random_state=42)

In [10]:
def create_edge_features(edge_list, embedding_dict):
    features = []
    for u, v in edge_list:
        edge_vector = np.concatenate([embedding_dict[str(u)], embedding_dict[str(v)]])
        features.append(edge_vector)
    return np.array(features)

def create_edge_features_mean(edge_list, embedding_dict):
    features = []
    for u, v in edge_list:
        edge_vector = np.mean([embedding_dict[str(u)], embedding_dict[str(v)]], axis=0)
        features.append(edge_vector)
    return np.array(features)

def create_edge_features_df(edge_list, embedding_df):
    features = []
    i = 0
    for u, v in edge_list:
        u_vector = embedding_df[str(u)].values
        v_vector = embedding_df[str(v)].values

        edge_vector = np.mean([u_vector, v_vector], axis=0)
        features.append(edge_vector)
        i += 1
        if i % 1000 == 0:
          print(i)

    return np.array(features)

### **DeepWalk**

In [None]:
X_train_features = create_edge_features(X_train, embedding_dict1)
X_test_features = create_edge_features(X_test, embedding_dict1)

In [None]:
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train_features)
X_test_features = scaler.transform(X_test_features)

**Regresión Logística**

In [None]:
clf = LogisticRegression(random_state=42, penalty='l2', C=1.0, class_weight='balanced')

clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
y_pred_proba = clf.predict_proba(X_test_features)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

coefficients = clf.coef_[0]
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(coefficients))]

feature_importance = sorted(zip(feature_names, coefficients), key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Importance:")
for feature, coef in feature_importance:
    print(f"{feature}: {coef:.4f}")

Accuracy: 0.7510
ROC-AUC Score: 0.8119
Precision: 0.8175
Recall: 0.7764
F1 Score: 0.7964

Confusion Matrix:
[[106012  43666]
 [ 56311 195563]]

Feature Importance:
Feature 0: -8.0975
Feature 21: -7.4894
Feature 1: 7.3389
Feature 16: 7.0003
Feature 5: 6.8735
Feature 13: 6.5623
Feature 23: 6.5225
Feature 25: -6.2697
Feature 18: 5.9456
Feature 8: 5.9366
Feature 10: -5.9315
Feature 52: -5.7944
Feature 44: -5.6823
Feature 20: -5.5310
Feature 59: -5.4909
Feature 45: 5.3322
Feature 4: -5.3300
Feature 60: 4.3999
Feature 46: 4.2673
Feature 50: -4.2246
Feature 35: -4.0246
Feature 31: -4.0021
Feature 53: -3.9753
Feature 14: 3.9247
Feature 49: -3.8047
Feature 40: -3.6910
Feature 3: -3.6572
Feature 51: 3.6055
Feature 41: -3.4231
Feature 30: 3.4136
Feature 33: 3.3393
Feature 56: 3.2451
Feature 26: 3.2051
Feature 24: 2.9445
Feature 43: -2.8730
Feature 55: 2.4209
Feature 39: -2.1293
Feature 57: 2.1001
Feature 62: 2.0768
Feature 47: -1.8922
Feature 64: -1.7991
Feature 38: 1.7877
Feature 65: 1.6751
Feat

**Random Forest**

In [None]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=10, n_jobs=-1, class_weight='balanced')

print("Initial RandomForest Model Parameters:")
print(rf_clf.get_params())

rf_clf.fit(X_train_features, y_train)

print("\nRandomForest Model Parameters after Training:")
print(rf_clf.get_params())

rf_y_pred = rf_clf.predict(X_test_features)
rf_y_pred_proba = rf_clf.predict_proba(X_test_features)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)
rf_confusion_matrix = confusion_matrix(y_test, rf_y_pred)

print("\nRandomForest Model Performance with Specific Parameters")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC Score: {rf_roc_auc:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")

print("\nConfusion Matrix:")
print(rf_confusion_matrix)

feature_importances = rf_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial RandomForest Model Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Parameters after Training:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Performance with Specific Parameters
Accuracy: 0.7851
ROC-AUC Score: 0.8563
Precision: 0.7698
Recall: 0.8131
F

**XGBoost**

In [None]:
negative_class_count = sum(y_train == 0)
positive_class_count = sum(y_train == 1)
scale_pos_weight = negative_class_count / positive_class_count

xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)

print("Initial XGBoost Model Parameters:")
print(xgb_clf.get_params())

xgb_clf.fit(X_train_features, y_train)

print("\nXGBoost Model Parameters after Training:")
print(xgb_clf.get_params())

xgb_y_pred = xgb_clf.predict(X_test_features)
xgb_y_pred_proba = xgb_clf.predict_proba(X_test_features)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_roc_auc = roc_auc_score(y_test, xgb_y_pred_proba)
xgb_precision = precision_score(y_test, xgb_y_pred)
xgb_recall = recall_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_y_pred)

print("\nXGBoost Model Performance with Specific Parameters")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC Score: {xgb_roc_auc:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nConfusion Matrix:")
print(xgb_confusion_matrix)

feature_importances = xgb_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial XGBoost Model Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}


Parameters: { "use_label_encoder" } are not used.




XGBoost Model Parameters after Training:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}

XGBoost Model

### **node2vec [p = 0.5 y q = 2]**

In [None]:
X_train_features = create_edge_features(X_train, embedding_dict2)
X_test_features = create_edge_features(X_test, embedding_dict2)

In [None]:
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train_features)
X_test_features = scaler.transform(X_test_features)

**Regresión Logística**

In [None]:
clf = LogisticRegression(random_state=42, penalty='l2', C=1.0, class_weight='balanced')

clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
y_pred_proba = clf.predict_proba(X_test_features)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

coefficients = clf.coef_[0]
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(coefficients))]

feature_importance = sorted(zip(feature_names, coefficients), key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Importance:")
for feature, coef in feature_importance:
    print(f"{feature}: {coef:.4f}")

Accuracy: 0.7388
ROC-AUC Score: 0.7946
Precision: 0.8055
Recall: 0.7694
F1 Score: 0.7870

Confusion Matrix:
[[102874  46804]
 [ 58085 193789]]

Feature Importance:
Feature 16: 8.8205
Feature 0: -6.3292
Feature 52: -5.4533
Feature 13: 5.3772
Feature 20: -5.2453
Feature 49: -5.1376
Feature 14: 4.9219
Feature 25: -4.5955
Feature 19: -4.5304
Feature 37: 4.2115
Feature 8: 4.1585
Feature 12: 4.1442
Feature 56: 4.0905
Feature 61: 4.0843
Feature 59: -3.8058
Feature 40: -3.3348
Feature 7: 3.1986
Feature 23: 3.1442
Feature 47: 3.1353
Feature 26: 3.0765
Feature 10: -2.8974
Feature 33: 2.8419
Feature 50: -2.8362
Feature 5: 2.7967
Feature 3: -2.7467
Feature 53: -2.6054
Feature 57: 2.6003
Feature 51: 2.4651
Feature 39: -2.4183
Feature 44: -2.3069
Feature 60: 2.2764
Feature 31: -2.2628
Feature 11: 2.2082
Feature 2: -2.1018
Feature 28: -2.0773
Feature 45: 1.9162
Feature 36: 1.8923
Feature 17: 1.8654
Feature 1: 1.8373
Feature 22: -1.7763
Feature 21: -1.7483
Feature 48: 1.7105
Feature 32: 1.5874
Feature

**Random Forest**

In [None]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=10, n_jobs=-1)

print("Initial RandomForest Model Parameters:")
print(rf_clf.get_params())

rf_clf.fit(X_train_features, y_train)

print("\nRandomForest Model Parameters after Training:")
print(rf_clf.get_params())

rf_y_pred = rf_clf.predict(X_test_features)
rf_y_pred_proba = rf_clf.predict_proba(X_test_features)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)

print("\nRandomForest Model Performance with Specific Parameters")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC Score: {rf_roc_auc:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")

feature_importances = rf_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial RandomForest Model Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Parameters after Training:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Performance with Specific Parameters
Accuracy: 0.7848
ROC-AUC Score: 0.8484
Precision: 0.7637
Recall: 0.9514

Feature Impo

**XGBoost**

In [None]:
negative_class_count = sum(y_train == 0)
positive_class_count = sum(y_train == 1)
scale_pos_weight = negative_class_count / positive_class_count

xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)

print("Initial XGBoost Model Parameters:")
print(xgb_clf.get_params())

xgb_clf.fit(X_train_features, y_train)

print("\nXGBoost Model Parameters after Training:")
print(xgb_clf.get_params())

xgb_y_pred = xgb_clf.predict(X_test_features)
xgb_y_pred_proba = xgb_clf.predict_proba(X_test_features)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_roc_auc = roc_auc_score(y_test, xgb_y_pred_proba)
xgb_precision = precision_score(y_test, xgb_y_pred)
xgb_recall = recall_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_y_pred)

print("\nXGBoost Model Performance with Specific Parameters")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC Score: {xgb_roc_auc:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nConfusion Matrix:")
print(xgb_confusion_matrix)

feature_importances = xgb_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial XGBoost Model Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}


Parameters: { "use_label_encoder" } are not used.




XGBoost Model Parameters after Training:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}

XGBoost Model

### **node2vec** [p = 2 y q = 0.5]

In [None]:
X_train_features = create_edge_features(X_train, embedding_dict3)
X_test_features = create_edge_features(X_test, embedding_dict3)

In [None]:
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train_features)
X_test_features = scaler.transform(X_test_features)

**Regresión Logística**

In [None]:
clf = LogisticRegression(random_state=42, penalty='l2', C=1.0, class_weight='balanced')

clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
y_pred_proba = clf.predict_proba(X_test_features)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

coefficients = clf.coef_[0]
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(coefficients))]

feature_importance = sorted(zip(feature_names, coefficients), key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Importance:")
for feature, coef in feature_importance:
    print(f"{feature}: {coef:.4f}")

Accuracy: 0.7563
ROC-AUC Score: 0.8203
Precision: 0.8222
Recall: 0.7802
F1 Score: 0.8006

Confusion Matrix:
[[107173  42505]
 [ 55371 196503]]

Feature Importance:
Feature 0: -11.7498
Feature 16: 10.0500
Feature 26: 9.9966
Feature 29: -8.3773
Feature 23: 7.5330
Feature 20: -6.8936
Feature 44: -6.7206
Feature 62: 6.2197
Feature 5: 6.1112
Feature 8: 6.0667
Feature 1: 5.8001
Feature 10: -5.7669
Feature 19: -5.7470
Feature 54: -5.5753
Feature 17: 5.5124
Feature 25: -5.2928
Feature 35: -5.0806
Feature 49: -4.9880
Feature 3: -4.8704
Feature 60: 4.7246
Feature 18: 4.6630
Feature 59: -4.6302
Feature 40: -4.4945
Feature 34: -4.3835
Feature 2: -4.1879
Feature 57: 3.8619
Feature 45: 3.7292
Feature 50: -3.6287
Feature 43: 3.5879
Feature 15: -3.4904
Feature 33: 3.4471
Feature 24: 3.4398
Feature 63: -3.0112
Feature 21: -3.0062
Feature 31: -2.8416
Feature 42: -2.7943
Feature 11: 2.7932
Feature 36: 2.6792
Feature 55: -2.6616
Feature 47: 2.4696
Feature 53: -2.3162
Feature 64: -2.2361
Feature 38: -2.199

**Random Forest**

In [None]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=10, n_jobs=-1, class_weight='balanced')

print("Initial RandomForest Model Parameters:")
print(rf_clf.get_params())

rf_clf.fit(X_train_features, y_train)

print("\nRandomForest Model Parameters after Training:")
print(rf_clf.get_params())

rf_y_pred = rf_clf.predict(X_test_features)
rf_y_pred_proba = rf_clf.predict_proba(X_test_features)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)
rf_confusion_matrix = confusion_matrix(y_test, rf_y_pred)

print("\nRandomForest Model Performance with Specific Parameters")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC Score: {rf_roc_auc:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")

print("\nConfusion Matrix:")
print(rf_confusion_matrix)

feature_importances = rf_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial RandomForest Model Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Parameters after Training:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Performance with Specific Parameters
Accuracy: 0.7761
ROC-AUC Score: 0.8505
Precision: 0.7538
Recall: 0.8196
F

**XGBoost**

In [None]:
negative_class_count = sum(y_train == 0)
positive_class_count = sum(y_train == 1)
scale_pos_weight = negative_class_count / positive_class_count

xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)

print("Initial XGBoost Model Parameters:")
print(xgb_clf.get_params())

xgb_clf.fit(X_train_features, y_train)

print("\nXGBoost Model Parameters after Training:")
print(xgb_clf.get_params())

xgb_y_pred = xgb_clf.predict(X_test_features)
xgb_y_pred_proba = xgb_clf.predict_proba(X_test_features)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_roc_auc = roc_auc_score(y_test, xgb_y_pred_proba)
xgb_precision = precision_score(y_test, xgb_y_pred)
xgb_recall = recall_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_y_pred)

print("\nXGBoost Model Performance with Specific Parameters")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC Score: {xgb_roc_auc:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nConfusion Matrix:")
print(xgb_confusion_matrix)

feature_importances = xgb_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial XGBoost Model Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}


Parameters: { "use_label_encoder" } are not used.




XGBoost Model Parameters after Training:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.999541805558631, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}

XGBoost Model

### **deepWalk + DNABERT3**

In [11]:
node_train_features = create_edge_features_mean(X_train, embedding_dict1)
node_test_features = create_edge_features_mean(X_test, embedding_dict1)

In [12]:
dna_train_features = create_edge_features_df(X_train, gdnabert3_df)
dna_test_features = create_edge_features_df(X_test, gdnabert3_df)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [13]:
X_train_features = np.concatenate((node_train_features, dna_train_features), axis=1)
X_test_features = np.concatenate((node_test_features, dna_test_features), axis=1)

In [16]:
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train_features)
X_test_features = scaler.transform(X_test_features)

**Regresión Logística**

In [None]:
clf = LogisticRegression(random_state=42, penalty='l2', C=1.0)

clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
y_pred_proba = clf.predict_proba(X_test_features)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

coefficients = clf.coef_[0]
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(coefficients))]

feature_importance = sorted(zip(feature_names, coefficients), key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Importance:")
for feature, coef in feature_importance:
    print(f"{feature}: {coef:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7457
ROC-AUC Score: 0.8026
Precision: 0.7646
Recall: 0.8590
F1 Score: 0.8091

Confusion Matrix:
[[ 83065  66613]
 [ 35515 216359]]

Feature Importance:
Feature 0: -1.2347
Feature 21: -1.1661
Feature 1: 1.1352
Feature 5: 1.0381
Feature 23: 1.0241
Feature 16: 1.0124
Feature 13: 1.0109
Feature 25: -0.9671
Feature 44: -0.9380
Feature 52: -0.9062
Feature 10: -0.8918
Feature 18: 0.8874
Feature 20: -0.8840
Feature 8: 0.8763
Feature 45: 0.8628
Feature 59: -0.8536
Feature 4: -0.8173
Feature 50: -0.6991
Feature 60: 0.6910
Feature 46: 0.6516
Feature 31: -0.6385
Feature 14: 0.6353
Feature 53: -0.6126
Feature 35: -0.5962
Feature 3: -0.5822
Feature 49: -0.5705
Feature 33: 0.5627
Feature 40: -0.5515
Feature 51: 0.5486
Feature 41: -0.5436
Feature 26: 0.5280
Feature 30: 0.5191
Feature 43: -0.4708
Feature 56: 0.4694
Feature 24: 0.4447
Feature 55: 0.3802
Feature 39: -0.3756
Feature 113: 0.3512
Feature 62: 0.3385
Feature 57: 0.3301
Feature 15: -0.3226
Feature 352: 0.3224
Feature 22: 0.2973
Fea

**Random Forest**

In [None]:
rf_clf = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=10, n_jobs=-1, class_weight='balanced')

print("Initial RandomForest Model Parameters:")
print(rf_clf.get_params())

rf_clf.fit(X_train_features, y_train)

print("\nRandomForest Model Parameters after Training:")
print(rf_clf.get_params())

rf_y_pred = rf_clf.predict(X_test_features)
rf_y_pred_proba = rf_clf.predict_proba(X_test_features)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)
rf_confusion_matrix = confusion_matrix(y_test, rf_y_pred)

print("\nRandomForest Model Performance with Specific Parameters")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC Score: {rf_roc_auc:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")

print("\nConfusion Matrix:")
print(rf_confusion_matrix)

feature_importances = rf_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial RandomForest Model Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Parameters after Training:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}

RandomForest Model Performance with Specific Parameters
Accuracy: 0.6724
ROC-AUC Score: 0.7291
Precision: 0.7683
Recall: 0.6840
F

**XGBoost**

In [17]:
negative_class_count = sum(y_train == 0)
positive_class_count = sum(y_train == 1)
scale_pos_weight = negative_class_count / positive_class_count

xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)

print("Initial XGBoost Model Parameters:")
print(xgb_clf.get_params())

xgb_clf.fit(X_train_features, y_train)

print("\nXGBoost Model Parameters after Training:")
print(xgb_clf.get_params())

xgb_y_pred = xgb_clf.predict(X_test_features)
xgb_y_pred_proba = xgb_clf.predict_proba(X_test_features)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_roc_auc = roc_auc_score(y_test, xgb_y_pred_proba)
xgb_precision = precision_score(y_test, xgb_y_pred)
xgb_recall = recall_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_y_pred)

print("\nXGBoost Model Performance with Specific Parameters")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC Score: {xgb_roc_auc:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nConfusion Matrix:")
print(xgb_confusion_matrix)

feature_importances = xgb_clf.feature_importances_
feature_names = X_train_features.columns if hasattr(X_train_features, 'columns') else [f'Feature {i}' for i in range(len(feature_importances))]

feature_importance = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for feature, importance in feature_importance:
    print(f"{feature}: {importance:.4f}")

Initial XGBoost Model Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.5971770962957912, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}


Parameters: { "use_label_encoder" } are not used.




XGBoost Model Parameters after Training:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 0.5971770962957912, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}

XGBoost Mode