In [None]:
#Train test split
import pandas as pd
from sklearn.model_selection import train_test_split

# Assume df is your original DataFrame with 'target' as the label column

# Separate the rows by label so each class can be explored independently
df_class_0 = df.loc[df['target'] == 0]
df_class_1 = df.loc[df['target'] == 1]

# Perform EDA separately on df_class_0 and df_class_1 here

# Recombine once EDA is done: class 0 rows come first, class 1 rows follow
df_merged = pd.concat([df_class_0, df_class_1])

# Labels, then every column except the label as features
y = df_merged['target']
X = df_merged.drop(columns=['target'])

# Stratify on y so the train and test sets keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the normalised class frequencies of each split
print(f"Training set class distribution: {y_train.value_counts(normalize=True)}")
print(f"Test set class distribution: {y_test.value_counts(normalize=True)}")


## Some links of interest

### MFEUsLNet
https://www.sciencedirect.com/science/article/pii/S2215098624000181#b0155  

From the state of the art:

Ghosh et al. [31] proposed **SkinNet-16**, a deep-learning model aimed at distinguishing between benign and malignant skin lesions. Their approach employs an advanced neural network architecture to enhance classification accuracy.

*Pronab Ghosh, Sami Azam, Ryana Quadir, Asif Karim, F.M. Shamrat, Shohag Kumar Bhowmik, Mirjam Jonkman, Khan Md Hasib, Kawsar Ahmed, SkinNet-16: a deep learning approach to identify benign and malignant skin lesions, Front. Oncol. 12 (2022) 931141.*

### EfficientNet V2 B0
https://www.kaggle.com/code/matthewjansen/transfer-learning-skin-cancer-classification  

### Inception-ResNet-v2
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9759648/


### Sample Code for using Optuna

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load the iris example data and hold out 20% of it for validation
data = load_iris()
X = data.data
y = data.target
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Define the objective function
def objective(trial):
    """Train a random forest with trial-suggested hyperparameters and score it.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on the validation split (``X_val``,
        ``y_val``). The value is returned unchanged (not negated) because the
        study is created with ``direction='maximize'``.
    """
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    # 'auto' is intentionally not offered: for classifiers it was only an alias
    # of 'sqrt', and scikit-learn 1.3 removed it, so sampling it fails the trial.
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Create the model with these hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the validation set
    preds = model.predict(X_val)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, preds)

    # Higher is better: the study maximizes this value, so return it as-is
    return accuracy

# Build a study that searches for the highest objective value
study = optuna.create_study(direction='maximize')

# Run 100 trials; n_jobs=-1 uses all CPUs
study.optimize(
    objective,
    n_jobs=-1,
    n_trials=100,
)

# Report the winning trial
best_trial = study.best_trial
print(f'Best trial number: {best_trial.number}')
print(f'Best value (accuracy): {best_trial.value}')
print(f'Best hyperparameters: {best_trial.params}')

# XGBoost imbalance classification

https://machinelearningmastery.com/xgboost-for-imbalanced-classification/

# Enabling GPUs with Keras

In [None]:
# Check GPU availability (before building the model)
# No earlier cell imports TensorFlow, so import it here to keep the cell
# runnable on a fresh kernel.
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# 1. Specify GPU Device

import tensorflow as tf

# List available devices
devices = tf.config.list_physical_devices('GPU')
print("Available GPUs: ", devices)

# Set memory growth to avoid allocating all GPU memory upfront.
# Same guard as the "Forcing GPU Usage" example further down: TensorFlow raises
# RuntimeError when the setting is changed after the GPUs were initialised, so
# report it instead of aborting the cell.
try:
    for device in devices:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as e:
    print(e)

# Specify a specific GPU device to use (if you have multiple GPUs)
with tf.device('/GPU:0'):  # or '/GPU:1', etc.
    # Your model code here
    pass


In [None]:
# 2. Set GPU Device Visibility

import os
# Restrict the CUDA runtime to device index 0.
# NOTE(review): this variable is presumably only read when the GPU runtime is
# first initialised, so it likely has no effect once an earlier cell has made
# TensorFlow query the GPUs — confirm, and if so run this before the first
# TensorFlow GPU call.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only the first GPU

In [None]:
# 3. Limit GPU Memory Usage

import tensorflow as tf

# List available devices
devices = tf.config.list_physical_devices('GPU')
print("Available GPUs: ", devices)

# Same guard as the "Forcing GPU Usage" example further down: TensorFlow raises
# RuntimeError when these settings are changed after the GPUs were initialised,
# so report it instead of aborting the cell.
try:
    for device in devices:
        tf.config.experimental.set_memory_growth(device, True)
        # Or, instead of memory growth, set a specific memory limit (in MB) if needed
        # tf.config.set_logical_device_configuration(
        #     device,
        #     [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])
except RuntimeError as e:
    print(e)

In [None]:
# 4. Check Device Placement

# Opens a device scope for '/GPU:0'; model code is meant to go inside it.
# NOTE(review): the body is only a placeholder (`pass`), so this cell does not
# report or verify any placement by itself — presumably placement logging has to
# be enabled separately; confirm what check was intended here.
with tf.device('/GPU:0'):
    # Define and compile your model here
    pass

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.initializers import glorot_uniform

# Set environment variable to use only the first GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Verify GPU availability
# (the original line was cut off mid-expression, which is a syntax error;
# completed to match the identical check in the "Forcing GPU Usage" cell)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
# Example of Forcing GPU Usage

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.initializers import glorot_uniform

# Expose only the first GPU to CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Report how many GPUs TensorFlow can see
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Optionally turn on memory growth so TensorFlow does not allocate all GPU
# memory upfront; with no GPUs the loop simply does nothing
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

# Code below to build the model...

# Steps to design a NN

In [None]:
# 1. Prepare data
import pandas as pd

# Toy table: three numeric feature columns plus a binary label column
data = {
    'feature1': [1.2, 2.3, 3.1, 4.5],
    'feature2': [5.1, 3.3, 6.2, 1.9],
    'feature3': [7.1, 8.3, 9.4, 10.2],
    'label': [0, 1, 0, 1],
}
df = pd.DataFrame.from_dict(data)

# Labels are the 'label' column; features are every other column
y = df['label']
X = df.drop(columns='label')

In [None]:
# 2. Create TensorFlow Datasets
import tensorflow as tf

# Convert DataFrame to TensorFlow dataset
def df_to_dataset(dataframe, labels, shuffle=True, batch_size=32):
    """Wrap features and labels in a batched, prefetching ``tf.data.Dataset``.

    Args:
        dataframe: Feature table; its ``.values`` array is sliced row by row.
        labels: Label container; its ``.values`` array is paired with the features.
        shuffle: When true, shuffle with a buffer as large as ``len(dataframe)``.
        batch_size: Number of elements per batch.

    Returns:
        The dataset after optional shuffling, batching and prefetching with
        ``tf.data.AUTOTUNE``.
    """
    features_and_labels = (dataframe.values, labels.values)
    dataset = tf.data.Dataset.from_tensor_slices(features_and_labels)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    return dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

# Define batch size
BATCH_SIZE = 32

# Positional 80/20 split: the leading 80% of rows train, the remainder validates
train_size = int(len(X) * 0.8)
train_X = X.iloc[:train_size]
val_X = X.iloc[train_size:]
train_y = y.iloc[:train_size]
val_y = y.iloc[train_size:]

# Build the datasets; only the training data is shuffled
train_dataset = df_to_dataset(train_X, train_y, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = df_to_dataset(val_X, val_y, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# 3. Design the Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Assemble the network layer by layer: two ReLU hidden layers, each followed by
# 50% dropout, then a single sigmoid output unit
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(train_X.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile with Adam and binary cross-entropy, tracking accuracy and AUC
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)


In [None]:
# 4. Train the Model
# Checkpoint that is only written when save_best_only allows it
best_checkpoint_path = "../models/tabular_nn_best.keras"
best_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_checkpoint_path,
    save_best_only=True,
)

# Checkpoint without save_best_only, written to a separate "final" path
final_checkpoint_path = "../models/tabular_nn_final.keras"
final_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=final_checkpoint_path,
)

# Scale the learning rate by 0.1 after 5 epochs without val_loss improvement,
# never going below 1e-7
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-7,
)

# Fit for 50 epochs, validating on val_dataset
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50,
    callbacks=[best_checkpoint_callback, final_checkpoint_callback, reduce_lr_callback],
)

In [None]:
# 5. Loading and Continuing Training
# Restore the model stored at the best-checkpoint path
model = tf.keras.models.load_model(best_checkpoint_path)

# Resume training for 10 further epochs with the same datasets and callbacks
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    callbacks=[best_checkpoint_callback, final_checkpoint_callback, reduce_lr_callback],
    epochs=10,  # Continue for more epochs
)

Making your CNN model deterministic in training involves setting seeds for random number generators across various components of your machine learning environment, such as TensorFlow/Keras, NumPy, and Python. This minimizes the randomness in weight initialization, data shuffling, and GPU operations, leading to more consistent training results.

Here’s how you can modify your code to make it as deterministic as possible:

1. Set Seeds for TensorFlow, NumPy, and Python
```python
import tensorflow as tf
import numpy as np
import random
import os

# Set random seeds for reproducibility
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Ensure deterministic behavior on the GPU
os.environ['TF_DETERMINISTIC_OPS'] = '1'
```
2. Control Randomness in Data Augmentation (If Applicable)
If you are using data augmentation, ensure the operations are deterministic. For example, in Keras’ ImageDataGenerator, you can set a seed when generating augmented data:

```python
datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)
train_generator = datagen.flow(X_train, y_train, batch_size=32, seed=SEED)
```
3. Modify Your Model Initialization
In your model architecture, you've already set a seed for the GlorotUniform initializer. Ensure you set the same seed for all the initializers if they're used in other layers.

```python
from tensorflow.keras.initializers import glorot_uniform

# Initialize the ResNet model
base_model = ResNet152V2(weights='imagenet', include_top=False, input_shape=(IM_SIZE, IM_SIZE, 3))

# Make all layers trainable
for layer in base_model.layers:
    layer.trainable = True

# Build the custom layers on top of ResNet
x = GlobalAveragePooling2D()(base_model.output)    
x = Dense(NEURONS_1, kernel_initializer=glorot_uniform(seed=SEED),  activation=EVALUATION, kernel_regularizer=tf.keras.regularizers.l2(REG_RATE))(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(NEURONS_2, kernel_initializer=glorot_uniform(seed=SEED), activation=EVALUATION, kernel_regularizer=tf.keras.regularizers.l2(REG_RATE))(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(NEURONS_3, kernel_initializer=glorot_uniform(seed=SEED), activation=EVALUATION, kernel_regularizer=tf.keras.regularizers.l2(REG_RATE))(x)
x = Dropout(DROPOUT_RATE)(x)  
output = Dense(1, activation='sigmoid')(x)

# Build the model
model = Model(inputs=base_model.input, outputs=output)

# Compile the model with the Adam optimizer
optimizer = Adam(learning_rate=LEARN_RATE)
eval_metrics = ["accuracy", AUC(from_logits=False), SpecificityAtSensitivity(sensitivity=0.8)]
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=eval_metrics)
```
4. Use Deterministic Operations (for GPU)
As TensorFlow can exhibit non-deterministic behavior on GPUs, it's important to ensure that deterministic operations are enforced. (In TensorFlow 2.8 and later, calling `tf.config.experimental.enable_op_determinism()` is the documented API for this.)

```python
# Set the environment variable to enforce deterministic operations
os.environ['TF_DETERMINISTIC_OPS'] = '1'
```
5. Limitations
GPU Operations: Even with deterministic operations enabled, some GPU operations may still introduce non-determinism due to the nature of parallel computation. You may need to run on a CPU to get fully deterministic results, though this may significantly slow down training.
Dropout and Batch Normalization: Dropout draws a random mask on every training step, and Batch Normalization statistics depend on which samples land in each batch, so both layers are sensitive to any remaining randomness. Setting seeds and ensuring deterministic behavior can mitigate, but not completely eliminate, this variability.
6. Potential Post-Training Solution
If full determinism isn't feasible, consider saving the model's weights after training and loading them for prediction to ensure consistency. This won't solve the variability during training but will guarantee that your predictions remain consistent.

```python
# Save the model weights after training
model.save_weights('model_weights.h5')

# Load the model weights for prediction
model.load_weights('model_weights.h5')
```
Conclusion
By setting seeds across TensorFlow, NumPy, and Python, and enforcing deterministic operations, you can greatly reduce the non-determinism in your CNN training process. However, due to the inherent randomness in neural network training, especially on GPUs, complete determinism may not always be achievable. Saving and loading the model weights can provide consistency for predictions.








In your final Kaggle notebook submission, you can indeed include the code to train your models, but as you've noted, training a Convolutional Neural Network (CNN) might not result in the exact same model each time due to its inherent non-determinism. However, there are ways to address this issue, and you can structure your notebook accordingly:
1. Include the Training Code:
•	Include the code to train your CNN model, as this shows transparency and ensures that your approach is fully reproducible in principle.
•	If you're using specific random seeds to control for some randomness (e.g., in data splitting, weight initialization), make sure to include those in your notebook.
2. Explain the Non-Determinism:
•	In the markdown cells of your notebook, clearly explain that due to the nature of neural networks and certain aspects of the training process (like GPU operations, dropout, etc.), the model might not produce the exact same results every time.
•	Mention that while the architecture and training process are reproducible, minor variations in the trained weights might occur.
3. Save and Load the Model (Optional but Transparent):
•	If you're allowed by the competition rules, you could save the trained CNN model (the whole model, as in the example below, or just its weights) within the notebook after training and load it later to generate predictions. This way, you ensure that your exact trained model is used for prediction, even if it's trained on-the-fly in the notebook.
•	Example:
```python
# Save the model
model.save('cnn_model.h5')

# Later, load the model
from tensorflow.keras.models import load_model
model = load_model('cnn_model.h5')
```
•	Note: Ensure this is done within the notebook session and doesn't rely on external files not generated within the notebook.
4. Potential Workaround:
•	If the competition allows pre-trained models from public sources, consider training your CNN model on a larger dataset or using a pre-trained model that’s publicly available (like a model pre-trained on ImageNet) and fine-tuning it. This way, you could avoid some randomness and make your approach more reproducible.
5. Document the Process:
•	Document your training procedure, including any attempts to control for randomness and explain why exact reproducibility might be challenging.
•	Describe how your feature extraction works and how this feature vector is used in the downstream task.
6. Use Model Checkpointing:
•	If you have time constraints, use checkpointing in your training process to save the best model during training and then use this model to generate predictions.
Example Workflow:
1.	Preprocessing and CNN training code: Include the code for preprocessing and training the CNN. Set random seeds where possible.
2.	Model checkpoint: Save the model weights during training.
3.	Loading and generating predictions: Load the saved weights and generate predictions.
4.	Markdown explanations: Include markdown cells explaining the non-deterministic nature of CNNs and how you’ve handled it.
Conclusion:
While training a CNN can introduce non-determinism, you can mitigate this by controlling randomness where possible, saving and loading model weights within the notebook, and clearly documenting the process. Ensure that your notebook can be executed from start to finish on Kaggle's platform to generate the required submission file (submission.csv). If exact reproducibility isn't feasible, providing the code and explaining the limitations will demonstrate your good faith effort in making the process as transparent and reproducible as possible.

