In [None]:
!pip uninstall scikit-learn scikit-survival
!pip install scikit-learn scikit-survival lifelines

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Lire le fichier en sautant la première ligne
df = pd.read_excel("/content/drive/MyDrive/GastricCancerData.xlsx", skiprows=1)
# Afficher les premières lignes du dataframe
df.head()

Unnamed: 0,AGE,SEXE,Cardiopathie,Ulceregastrique,Douleurepigastrique,Ulcero-bourgeonnant,Constipation,Denitrution,Tabac,Mucineux,Tubuleux,Infiltrant,Stenosant,Metastases,Adenopathie,Traitement,Tempsdesuivi (Mois),Deces
0,65,F,NON,NON,OUI,NON,OUI,OUI,OUI,OUI,NON,OUI,OUI,OUI,OUI,Chirurgie_Exclusive,1,OUI
1,34,F,NON,NON,OUI,NON,OUI,NON,OUI,OUI,OUI,OUI,OUI,OUI,OUI,Chirurgie_Chimiotherapie,1,OUI
2,55,H,NON,NON,OUI,NON,OUI,NON,OUI,OUI,NON,OUI,NON,OUI,OUI,Chirurgie_Exclusive,1,OUI
3,60,F,OUI,OUI,OUI,NON,OUI,NON,OUI,OUI,OUI,OUI,NON,OUI,NON,Chirurgie_Exclusive,2,NON
4,65,H,OUI,NON,OUI,NON,OUI,NON,OUI,NON,NON,OUI,OUI,NON,OUI,Chirurgie_Chimiotherapie,2,OUI


In [18]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Identifier les colonnes catégoriques
CatCols = df.select_dtypes(include=['object']).columns
# Encodage des variables catégoriques
#Label Encoding
label_encoder = LabelEncoder()
for col in CatCols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))

df.head()

Unnamed: 0,AGE,SEXE,Cardiopathie,Ulceregastrique,Douleurepigastrique,Ulcero-bourgeonnant,Constipation,Denitrution,Tabac,Mucineux,Tubuleux,Infiltrant,Stenosant,Metastases,Adenopathie,Traitement,Tempsdesuivi (Mois),Deces
0,65,0,0,0,1,0,1,1,1,1,0,1,1,1,1,1,1,1
1,34,0,0,0,1,0,1,0,1,1,1,1,1,1,1,0,1,1
2,55,1,0,0,1,0,1,0,1,1,0,1,0,1,1,1,1,1
3,60,0,1,1,1,0,1,0,1,1,1,1,0,1,0,1,2,0
4,65,1,1,0,1,0,1,0,1,0,0,1,1,0,1,0,2,1


In [19]:
# Effacer les variables non pertinantes
clean=['SEXE','Tubuleux','Traitement','Constipation']
df=df.drop(columns=clean,axis=1)
df.columns

Index(['AGE', 'Cardiopathie', 'Ulceregastrique', 'Douleurepigastrique',
       'Ulcero-bourgeonnant', 'Denitrution', 'Tabac', 'Mucineux', 'Infiltrant',
       'Stenosant', 'Metastases', 'Adenopathie', 'Tempsdesuivi (Mois)',
       'Deces'],
      dtype='object')

In [None]:
!pip install tensorflow keras-tuner

In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf
from kerastuner.tuners import RandomSearch
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored, integrated_brier_score


# Préparation des données
# Supposons que df est votre DataFrame
X = df.drop(["Deces", "Tempsdesuivi (Mois)"], axis=1)
y = df[["Deces", "Tempsdesuivi (Mois)"]].values.astype(np.float32)

# Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Conversion pour scikit-survival
train_surv = np.array([(bool(e), t) for e, t in y_train], dtype=[('event', '?'), ('time', '<f8')])
test_surv = np.array([(bool(e), t) for e, t in y_test], dtype=[('event', '?'), ('time', '<f8')])

# Définition de la fonction de perte Cox
def cox_loss(y_true, y_pred):
    event = y_true[:, 0]
    time = y_true[:, 1]

    # Tri par temps décroissant
    sorted_idx = tf.argsort(time, direction='DESCENDING')
    event = tf.gather(event, sorted_idx)
    time = tf.gather(time, sorted_idx)
    pred = tf.gather(y_pred, sorted_idx)

    # Calcul des termes de risque
    exp_pred = tf.exp(pred)
    cumsum = tf.cumsum(exp_pred)
    log_cumsum = tf.math.log(cumsum + 1e-15)  # Éviter log(0)

    # Calcul de la perte
    loss = (pred - log_cumsum) * event
    return -tf.reduce_mean(loss)

# Construction du modèle de base
def build_model(input_shape, num_units=32, learning_rate=0.01):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_units, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.Dense(1, activation='linear')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=cox_loss
    )
    return model

# Entraînement du modèle initial
model = build_model(X_train.shape[1])
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=0
)

# Évaluation du C-index
def evaluate_model(model, X, y):
    risk_scores = model.predict(X).flatten()
    event = y[:, 0].astype(bool)
    time = y[:, 1]
    return concordance_index_censored(event, time, risk_scores)[0]

cindex_initial = evaluate_model(model, X_test, y_test)
print(f"C-index initial: {cindex_initial:.3f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
C-index initial: 0.814


## **Optimisation des hyperparamètres**

In [None]:
import numpy as np
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

def create_model(units=64, num_layers=1, lr=1e-3, units_0=64, units_1=64, units_2=64):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units=units, activation='relu', input_shape=(X_train.shape[1],)))
    
    for i in range(num_layers):
        if i == 0:
            model.add(tf.keras.layers.Dense(units=units_0, activation='relu'))
        elif i == 1:
            model.add(tf.keras.layers.Dense(units=units_1, activation='relu'))
        elif i == 2:
            model.add(tf.keras.layers.Dense(units=units_2, activation='relu'))
    
    model.add(tf.keras.layers.Dense(1, activation='linear'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss=cox_loss)
    return model

# Enrobage du modèle avec scikeras
keras_estimator = KerasRegressor(model=create_model, epochs=50, verbose=0)

# Grille d'hyperparamètres avec le préfixe "model__" pour les paramètres du modèle
param_grid = {
    'model__units': [16, 32, 64, 128],
    'model__num_layers': [1, 2, 3],
    'model__lr': [1e-2, 1e-3, 1e-4],
    'model__units_0': [16, 32, 64, 128],
    'model__units_1': [16, 32, 64, 128],
    'model__units_2': [16, 32, 64, 128]
}

grid = GridSearchCV(estimator=keras_estimator,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',
                    cv=3)

grid_result = grid.fit(X_train, y_train)
print("Meilleurs hyperparamètres :", grid_result.best_params_)

# Récupération du meilleur modèle
best_model = grid_result.best_estimator_.model

# Ré-entraînement sur l'ensemble des données
X_full = np.concatenate([X_train, X_test])
y_full = np.concatenate([y_train, y_test])
best_model.fit(X_full, y_full, epochs=100, verbose=0)

# Évaluation finale
cindex_final = evaluate_model(best_model, X_test, y_test)
print(f"C-index optimisé : {cindex_final:.3f}")

In [25]:
# Optimisation des hyperparamètres
def model_builder(hp):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(
        units=hp.Int('units', 16, 128, step=16),
        activation='relu',
        input_shape=(X_train.shape[1],)
    ))

    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(tf.keras.layers.Dense(
            units=hp.Int(f'units_{i}', 16, 128, step=16),
            activation='relu'
        ))

    model.add(tf.keras.layers.Dense(1, activation='linear'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Choice('lr', [1e-2, 1e-3, 1e-4])
        ),
        loss=cox_loss
    )
    return model

tuner = RandomSearch(
    model_builder,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='deep_surv'
)

tuner.search(
    X_train, y_train,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=0
)

# Meilleur modèle
best_model = tuner.get_best_models(num_models=1)[0]
best_hps = tuner.get_best_hyperparameters()[0]

# Ré-entraînement avec tous les données
best_model.fit(
    np.concatenate([X_train, X_test]),
    np.concatenate([y_train, y_test]),
    epochs=100,
    verbose=0
)

# Évaluation finale
cindex_final = evaluate_model(best_model, X_test, y_test)
print(f"C-index optimisé: {cindex_final:.3f}")

Reloading Tuner from tuner_dir/deep_surv/tuner0.json


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 115ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
C-index optimisé: 0.877


## **Affichage des hyperparamètres optimisés**

In [28]:
import numpy as np
from scipy import stats
from sklearn.utils import resample

# 1. Affichage des hyperparamètres optimisés
print("\nHyperparamètres optimisés:")
print(f"Unités couche initiale: {best_hps.get('units')}")
print(f"Nombre de couches cachées: {best_hps.get('num_layers')}")
for i in range(best_hps.get('num_layers')):
    print(f"Unités couche {i+1}: {best_hps.get(f'units_{i}')}")
print(f"Taux d'apprentissage: {best_hps.get('lr')}")


Hyperparamètres optimisés:
Unités couche initiale: 64
Nombre de couches cachées: 3
Unités couche 1: 96
Unités couche 2: 32
Unités couche 3: 48
Taux d'apprentissage: 0.01


## **Calcul des intervalles de confiance du C-index**

In [30]:
# 2. Calcul des intervalles de confiance du C-index
def cindex_with_ci(model, X, y, n_boot=1000, ci=95):
    event = y[:, 0].astype(bool)
    time = y[:, 1]
    risk_scores = model.predict(X).flatten()

    # Calcul initial
    c, concordant, discordant, tied_risk, tied_time = concordance_index_censored(event, time, risk_scores)

    # Bootstrapping pour l'intervalle de confiance
    boot_stats = []
    for _ in range(n_boot):
        indices = resample(np.arange(len(X)))
        sample_event = event[indices]
        sample_time = time[indices]
        sample_risk = risk_scores[indices]

        if np.sum(sample_event) == 0:  # Éviter les échantillons sans événements
            continue

        boot_c = concordance_index_censored(sample_event, sample_time, sample_risk)[0]
        boot_stats.append(boot_c)

    # Calcul des percentiles
    lower = np.percentile(boot_stats, (100 - ci)/2)
    upper = np.percentile(boot_stats, 100 - (100 - ci)/2)
    mean_c = np.mean(boot_stats)

    return c, (lower, upper), mean_c

# 3. Évaluation avec intervalles de confiance
cindex_final, ci, mean_cindex = cindex_with_ci(best_model, X_test, y_test)

print(f"\nC-index final: {cindex_final:.3f}")
print(f"Moyenne C-index (bootstrapped): {mean_cindex:.2f}")
print(f"Intervalle de confiance {ci[1]-ci[0]:.0f}%: [{ci[0]:.2f}, {ci[1]:.2f}]")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

C-index final: 0.877
Moyenne C-index (bootstrapped): 0.88
Intervalle de confiance 0%: [0.83, 0.92]


## **Brier Score Integred**

In [61]:
import numpy as np
from sksurv.metrics import integrated_brier_score

# Définition des bornes des temps de suivi en excluant la borne supérieure
min_time = np.min(y_test[:, 1])  # Temps minimum dans les données de test
max_time = np.max(y_test[:, 1]) * 0.99  # Réduction de la borne max pour éviter les erreurs

# Création de la grille de temps strictement dans l'intervalle valide
time_grid = np.linspace(min_time, max_time, 100)

# Prédiction des fonctions de survie avec le modèle Deep Survival
def predict_survival_function(model, X, time_grid):
    """
    Prédit les probabilités de survie pour une grille de temps donnée.
    """
    risk_scores = model.predict(X).flatten()
    surv_probs = np.zeros((len(X), len(time_grid)))

    # Calcul des probabilités de survie pour chaque point de la grille de temps
    for i, t in enumerate(time_grid):
        surv_probs[:, i] = np.exp(-np.exp(risk_scores) ** t )
        # Modèle de survie exponentiel

    return surv_probs

# Prédiction des probabilités de survie
surv_probs = predict_survival_function(best_model, X_test, time_grid)

# Conversion des données pour scikit-survival
train_surv = np.array([(bool(e), t) for e, t in y_train], dtype=[('event', '?'), ('time', '<f8')])
test_surv = np.array([(bool(e), t) for e, t in y_test], dtype=[('event', '?'), ('time', '<f8')])

# Calcul du Brier Score Intégré
ibs = integrated_brier_score(train_surv, test_surv, surv_probs, time_grid)

print(f"Integrated Brier Score: {ibs:.1f}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Integrated Brier Score: 0.2


  surv_probs[:, i] = np.exp(-np.exp(risk_scores) ** t )
