In [1]:
import pandas as pd

data = pd.read_csv("CP_count_PUMA.csv")
data

Unnamed: 0,smiles,1_1,10_19,10_20,100_277,100_278,104_291,105_355,105_369,105_361,...,Cells_Neighbors_FirstClosestObjectNumber_Adjacent,Cells_Neighbors_SecondClosestObjectNumber_5,Cells_Neighbors_SecondClosestObjectNumber_Adjacent,Cells_Parent_Nuclei,Cytoplasm_Number_Object_Number,Cytoplasm_Parent_Cells,Cytoplasm_Parent_Nuclei,Nuclei_Neighbors_FirstClosestObjectNumber_1,Nuclei_Neighbors_SecondClosestObjectNumber_1,Nuclei_Number_Object_Number
0,CCOC(=O)c1ccc(NC(=S)N2CCSC2c2ccc(OC)cc2)cc1,,,,,,,,,,...,1.765625,0.890625,0.890625,1.328125,1.328125,1.328125,1.328125,0.960938,1.320312,1.328125
1,O=C(CC1NC(=O)NC1=O)Nc1cccc2ccccc12,0.0,,,,,,,,,...,3.078125,2.718750,2.718750,2.503906,2.503906,2.503906,2.503906,3.242188,2.578125,2.503906
2,CC(Nc1nc(nc2ccccc12)N1CCCC1)c1ccccc1,,,,,,,,,,...,-7.765625,-8.140625,-8.140625,-8.250000,-8.250000,-8.250000,-8.250000,-7.710938,-7.546875,-8.250000
3,CCCC(Oc1ccc(Br)cc1)c1nc2c3cc(OC)c(OC)cc3nc(S)n2n1,,,,,,,,,,...,-7.320312,-7.828125,-7.828125,-7.796875,-7.804688,-7.796875,-7.796875,-7.859375,-7.695312,-7.804688
4,CC1=C(C(NC(=O)N1)c1ccc(F)cc1)C(=O)OCc1ccc2OCOc2c1,,,,,,,,,,...,-10.953125,-10.390625,-10.390625,-10.960938,-10.960938,-10.960938,-10.960938,-11.406250,-10.781250,-10.960938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16165,COC(=O)[C@H]1[C@H](CO)[C@H]2Cn3c(=O)c(\C=C\C)c...,,,,,,,,,,...,-2.039062,-2.031250,-2.031250,-1.550781,-1.550781,-1.550781,-1.550781,-1.546875,-2.945312,-1.550781
16166,C\C=C\c1ccc2n(C[C@H]3[C@H](CO)[C@H](N(C)[C@@H]...,,,,,,,,,,...,-0.851562,-0.359375,-0.359375,-0.835938,-0.835938,-0.835938,-0.835938,-1.328125,-0.804688,-0.835938
16167,C\C=C\c1ccc2n(C[C@@H]3[C@@H](CO)[C@@H](N(C)[C@...,,,,,,,,,,...,-3.851562,-3.359375,-3.359375,-3.835938,-3.835938,-3.835938,-3.835938,-3.328125,-4.054688,-3.835938
16168,C\C=C\c1ccc2n(C[C@H]3[C@H](CO)[C@@H](C(=O)N[C@...,,,,,,,,,,...,-4.039062,-2.460938,-2.460938,-3.300781,-3.300781,-3.300781,-3.300781,-3.382812,-2.937500,-3.300781


In [9]:
assays_list = data.columns.to_list()[1:-13]

In [10]:
len(assays_list)

270

In [13]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

# GPU Configuration
def configure_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)
            tf.config.experimental.set_visible_devices(physical_devices, 'GPU')
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            logging.info(f'{len(physical_devices)} Physical GPUs, {len(logical_gpus)} Logical GPUs')
        except RuntimeError as e:
            logging.error(f"GPU configuration error: {e}")
    else:
        logging.warning("No GPU devices found")

configure_gpu()

# Check if TensorFlow is using GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

missing_value_indicator = -1

def check_data(data, name="Data"):
    """Check if the data contains NaN or infinite values."""
    if np.any(np.isnan(data)) or np.any(np.isinf(data)):
        logging.warning(f"{name} contains NaN or infinite values.")
    else:
        logging.info(f"{name} is clean.")

# Custom Loss Function to Ignore Missing Values
def custom_loss(y_true, y_pred):
    mask = tf.not_equal(y_true, missing_value_indicator)
    mask = tf.cast(mask, dtype=tf.float32)
    y_true_masked = tf.multiply(y_true, mask)
    y_pred_masked = tf.multiply(y_pred, mask)
    loss = tf.keras.losses.binary_crossentropy(y_true_masked, y_pred_masked)
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)  # Normalize by the number of available targets
    return loss

# Custom AUC function to ignore missing values
class MaskedAUC(tf.keras.metrics.AUC):
    def __init__(self, name='masked_auc', **kwargs):
        super(MaskedAUC, self).__init__(name=name, **kwargs)

    def update_state(self, y_true, y_pred, sample_weight=None):
        mask = tf.not_equal(y_true, missing_value_indicator)
        mask = tf.cast(mask, dtype=tf.bool)
        y_true_masked = tf.boolean_mask(y_true, mask)
        y_pred_masked = tf.boolean_mask(y_pred, mask)
        super(MaskedAUC, self).update_state(y_true_masked, y_pred_masked, sample_weight)

# Build the Neural Network with Regularization and Batch Normalization
def build_model(input_dim, output_dim):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(output_dim, activation='sigmoid')
    ])
    
    model.compile(optimizer='adam', loss=custom_loss, metrics=[MaskedAUC()])
    return model

def load_and_preprocess_data(train_path, test_path, data, assays_list):
    train_data = pd.read_csv(train_path)[["smiles"]]
    train_data = pd.merge(train_data, data, on="smiles")
    
    test_data = pd.read_csv(test_path)[["smiles"]]
    test_data = pd.merge(test_data, data, on="smiles")
    
    X_train = train_data[["Cells_Number_Object_Number"]].values
    X_test = test_data[["Cells_Number_Object_Number"]].values
    
    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    y_train = train_data[assays_list].replace(np.nan, missing_value_indicator).values
    y_test = test_data[assays_list].replace(np.nan, missing_value_indicator).values
    
    return X_train, X_test, y_train, y_test

def train_and_evaluate_model(fold, data, assays_list):
    train_path = f"PUMA/predictions/chemical_cv{fold}/assay_matrix_discrete_train_scaff.csv"
    test_path = f"PUMA/predictions/chemical_cv{fold}/assay_matrix_discrete_test_scaff.csv"
    
    X_train, X_test, y_train, y_test = load_and_preprocess_data(train_path, test_path, data, assays_list)
    
    # Check data for NaN or infinite values
    check_data(X_train, "X_train")
    check_data(y_train, "y_train")
    check_data(X_test, "X_test")
    check_data(y_test, "y_test")
    
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]
    
    model = build_model(input_dim, output_dim)

    # Create a stratified split for the validation set using multilabel stratification
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, val_index in mskf.split(X_train, y_train):
        X_train_split, X_val_split = X_train[train_index], X_train[val_index]
        y_train_split, y_val_split = y_train[train_index], y_train[val_index]
        break  # Only need the first split

    # Define early stopping, learning rate scheduler, and model checkpoint
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_masked_auc', patience=10, restore_best_weights=True, mode='max')
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_masked_auc', factor=0.5, patience=5, mode='max')
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f'best_model_fold_{fold}.h5', save_best_only=True, monitor='val_masked_auc', mode='max')

    # Train the Neural Network
    history = model.fit(X_train_split, y_train_split, epochs=100, batch_size=32, validation_data=(X_val_split, y_val_split),
                        callbacks=[early_stopping, lr_scheduler, model_checkpoint])

    # Evaluate the Model
    metrics = model.evaluate(X_test, y_test)
    for name, value in zip(model.metrics_names, metrics):
        logging.info(f"Fold {fold} - {name}: {value}")

    # Make predictions
    predictions = model.predict(X_test)
    
    predictions_df = pd.DataFrame(predictions, columns=assays_list)
    predictions_df.to_csv(f"predictions_fold_{fold}.csv", index=False)
    
    y_test_df = pd.DataFrame(y_test, columns=assays_list)
    y_test_df.to_csv(f"y_test_fold_{fold}.csv", index=False)

# Main execution
if __name__ == "__main__":
    for fold in range(5):
        train_and_evaluate_model(fold, data, assays_list)


INFO:root:8 Physical GPUs, 8 Logical GPUs


Num GPUs Available:  8


INFO:root:X_train is clean.
INFO:root:y_train is clean.
INFO:root:X_test is clean.
INFO:root:y_test is clean.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


INFO:root:Fold 0 - loss: 0.00040168085251934826
INFO:root:Fold 0 - masked_auc: 0.7511682510375977




INFO:root:X_train is clean.
INFO:root:y_train is clean.
INFO:root:X_test is clean.
INFO:root:y_test is clean.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100


INFO:root:Fold 1 - loss: 0.0004035759484395385
INFO:root:Fold 1 - masked_auc: 0.7356223464012146




INFO:root:X_train is clean.
INFO:root:y_train is clean.
INFO:root:X_test is clean.
INFO:root:y_test is clean.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


INFO:root:Fold 2 - loss: 0.00041717931162565947
INFO:root:Fold 2 - masked_auc: 0.7351248860359192




INFO:root:X_train is clean.
INFO:root:y_train is clean.
INFO:root:X_test is clean.
INFO:root:y_test is clean.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100


INFO:root:Fold 3 - loss: 0.0004185629077255726
INFO:root:Fold 3 - masked_auc: 0.7344642877578735




INFO:root:X_train is clean.
INFO:root:y_train is clean.
INFO:root:X_test is clean.
INFO:root:y_test is clean.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


INFO:root:Fold 4 - loss: 0.0004164405108895153
INFO:root:Fold 4 - masked_auc: 0.7431594729423523


