In [2]:
import pandas as pd 

# Load the Cox dataset (hub-gene expression plus survival columns) and preview it.
data_cox = pd.read_csv(filepath_or_buffer='data/data_cox_with_survival3yr.csv')
data_cox.head(5)

Unnamed: 0,CD74,HLA-DPB1,HLA-DRB1,HLA-DMA,HLA-DMB,HLA-DRA,HLA-DPA1,HLA-DOA,HLA-E,HLA-DQA1,...,LCP2,SERPING1,CD4,FCER1G,C1QB,C1QC,overall_survival,status,risk_score,survival_3yr
0,12.5668,8.6102,8.759,8.5993,8.9834,10.788,8.8908,6.8782,12.2597,6.8204,...,7.6154,8.2515,10.1129,9.4675,11.3329,11.2728,3574.0,False,-0.291393,1
1,15.4927,11.9222,12.3668,10.5364,10.8081,13.9021,12.5162,10.0694,13.9244,10.2799,...,9.0451,11.8442,10.8098,9.7941,12.5445,12.4334,234.0,True,1.282886,0
2,15.3739,11.4483,12.7477,10.558,10.8706,13.7942,11.965,10.3466,13.7275,10.4735,...,9.285,9.8687,11.6573,10.0406,13.3568,13.2014,1335.0,True,-0.699702,1
3,13.2865,10.4928,9.8587,9.0924,9.5139,11.8657,10.5019,8.501,11.6388,8.4154,...,7.9089,7.9684,10.2035,8.425,11.8849,11.9314,1106.0,True,-0.937324,1
4,14.4549,11.67,11.6364,10.3967,9.8603,13.2601,11.5003,9.0541,12.5336,9.8875,...,6.9549,10.4694,9.5651,8.994,11.4629,11.5642,1828.0,False,0.119908,1


In [3]:
# Clinical / outcome columns that must not be used as hub-gene features
exclude_cols = ['overall_survival', 'status', 'risk_score', 'survival_3yr']

# Hub genes: every remaining (gene-expression) column, kept in file order
is_gene_col = ~data_cox.columns.isin(exclude_cols)
top_hubs_in_expr = data_cox.columns[is_gene_col].tolist()

print(f"Número de genes hub utilizados: {len(top_hubs_in_expr)}", top_hubs_in_expr, sep="\n")

Número de genes hub utilizados: 25
['CD74', 'HLA-DPB1', 'HLA-DRB1', 'HLA-DMA', 'HLA-DMB', 'HLA-DRA', 'HLA-DPA1', 'HLA-DOA', 'HLA-E', 'HLA-DQA1', 'PTPRC', 'LYN', 'IL10', 'CD3E', 'FCGR3A', 'LCK', 'CASP1', 'CD2', 'B2M', 'LCP2', 'SERPING1', 'CD4', 'FCER1G', 'C1QB', 'C1QC']


In [19]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# Target (3-year survival label) and feature matrix (hub-gene expression)
y = data_cox['survival_3yr'].values
X = data_cox[top_hubs_in_expr].values

# Pipeline: class re-balancing step followed by the classifier
pipeline = Pipeline(steps=[
    ('smotetomek', SMOTETomek(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter search space for the 'rf' step (more combinations can be added)
param_grid = dict(
    rf__n_estimators=[100, 200, 500],
    rf__max_depth=[None, 5, 10, 20],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
    rf__max_features=['sqrt', 'log2', 0.5],
    rf__bootstrap=[True, False],
)

def safe_roc_auc_score(y_true, y_pred_proba):
    """ROC AUC that never fails on a fold containing a single class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels of the evaluated fold.
    y_pred_proba : array-like
        Scores / probabilities forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    float
        ``roc_auc_score(y_true, y_pred_proba)`` when ``y_true`` holds at least
        two distinct labels; otherwise the fixed value ``0.5``, so the
        cross-validation mean does not become NaN.
    """
    if np.unique(y_true).size >= 2:
        return roc_auc_score(y_true, y_pred_proba)
    # Fewer than two classes present: AUC is undefined, fall back to 0.5.
    return 0.5

import inspect

# Wrap the guarded AUC as a scorer that receives predict_proba output.
# scikit-learn 1.4 deprecated `needs_proba` (removed in 1.6) in favour of
# `response_method`, so use whichever keyword the installed version supports.
# Older versions silently swallow unknown keywords into **kwargs, hence the
# explicit signature check instead of a try/except.
if "response_method" in inspect.signature(make_scorer).parameters:
    scorer = make_scorer(safe_roc_auc_score, response_method="predict_proba")
else:
    scorer = make_scorer(safe_roc_auc_score, needs_proba=True)

# Cross-validation: 5 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): this is an exhaustive search. With the current param_grid it is
# 648 candidates x 5 folds = 3240 pipeline fits, and the recorded run was
# interrupted before finishing. Consider a smaller grid or a randomized search
# if the run time is not acceptable.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv,
    verbose=2,
    n_jobs=-1,
    refit=True,  # refit the best configuration on all of X, y
    return_train_score=False
)

grid_search.fit(X, y)

print(f"Mejores parámetros: {grid_search.best_params_}")
print(f"Mejor AUC medio (CV): {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 648 candidates, totalling 3240 fits


KeyboardInterrupt: 

In [None]:
import pandas as pd

# Tabulate every evaluated candidate of the grid search
results = pd.DataFrame(grid_search.cv_results_)

# Keep the three candidates with the highest mean test score
ranked = results.sort_values(by='mean_test_score', ascending=False)
top_results = ranked.iloc[:3]

n_folds = cv.get_n_splits()
for _, row in top_results.iterrows():
    print(f"Configuración:\n{row['params']}")
    print(f"AUC medio (CV): {row['mean_test_score']:.4f}")
    # Per-fold scores are stored in the split<i>_test_score columns (0-based i)
    per_fold = (row[f'split{i}_test_score'] for i in range(n_folds))
    for fold_i, score in enumerate(per_fold, start=1):
        print(f"  Fold {fold_i}: AUC = {score:.4f}")
    print("-" * 40)


In [None]:
from sklearn.base import clone

# Re-evaluate the selected configuration fold by fold.
best_params = grid_search.best_params_

# Apply the winning hyper-parameters to the shared pipeline (mutates it in place).
pipeline.set_params(**best_params)

auc_scores = []

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Fresh unfitted copy per fold to avoid contamination between folds;
    # the pipeline's SMOTETomek step balances the training split during fit.
    model = clone(pipeline).fit(X_train, y_train)

    # Column 1 of predict_proba is scored against the held-out labels
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    auc_scores.append(auc)
    print(f"Fold {fold}: AUC = {auc:.4f}")

print(f"AUC promedio con mejores parámetros: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
