# **Install Dependencies and Ignore Warnings**

In [None]:
# No need to run this cell on colab since it already has these installed
!pip install tensorflow numpy scikit-learn

In [None]:
# Install scikeras for compatibility between Scikit-learn and Keras.
!pip install -q scikeras -q scikeras

In [None]:
# Import warnings module to filter out any warning messages for a cleaner output.
import warnings
warnings.filterwarnings('ignore')

# **Create Dataset**

In [None]:
import numpy as np
from sklearn.datasets import make_classification

# Build the synthetic binary-classification problem (1000 samples, 20 features)
# that every later cell trains and evaluates on. The fixed seed keeps it reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

# Report the array shapes as a quick sanity check.
for caption, array in (("Inputs shape: ", X), ("Outputs shape: ", y)):
    print(caption, array.shape)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# pandas backs the plotting frame built here and the result tables in later cells.
import pandas as pd

# Project the 20-dimensional inputs onto their first two principal components
# so the dataset can be drawn on a plane.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Gather the projected coordinates and the class labels in a single frame for seaborn.
df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2']).assign(Label=y)

# Scatter the two components, colouring every point by its class label.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Label', palette='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2D Visualization of the Synthetic Dataset')
plt.show()

# **Build Model**

In [None]:
'''Define the ANN Model'''
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier


# Define the ANN Model
def create_model(optimizer=None):
    """
    Creates and compiles a simple feed-forward network for binary classification.

    The network expects 20 input features and has two hidden ReLU layers
    (32 and 16 units) followed by a single sigmoid output unit.

    Args:
    optimizer: The optimizer to use for training the model, either a Keras optimizer
        instance or an optimizer name such as 'adam'. When None (the default), a new
        Adam optimizer with a learning rate of 0.01 is created for this model.

    Returns:
    A compiled Keras model ready for training.
    """
    # Create the default optimizer per call. An instance used as a default argument is
    # built once, when the function is defined, and would then be shared by every model
    # created with the default, although an optimizer keeps state tied to one model.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    model = Sequential([
        tf.keras.Input(shape=(20,)),      # 20 input features (replaces the legacy `input_dim` argument)
        Dense(32, activation='relu'),     # First hidden layer with 32 units
        Dense(16, activation='relu'),     # Second hidden layer with 16 units
        Dense(1, activation='sigmoid'),   # Output layer for binary classification
    ])

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model


# **Grid Search for Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# Wrap the Keras model factory in SciKeras' KerasClassifier so it behaves like a
# scikit-learn estimator. The factory is passed through `model=`; `build_fn=` is the
# deprecated spelling of the same argument.
model = KerasClassifier(model=create_model, verbose=0)

# Define the grid search parameters: 3 optimizers x 3 batch sizes x 2 epoch counts = 18 candidates.
param_grid = {'optimizer': ['adam', 'rmsprop', 'sgd'],
              'batch_size': [16, 32, 64],
              'epochs': [10, 20]}

# Initialize GridSearchCV to evaluate every candidate with 2-fold cross-validation,
# using all available CPU cores.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=2)

# Fit the grid search model to find the best hyperparameters.
print("Performing Grid Search for Hyperparameter Tuning...\n")
grid_result = grid.fit(X, y) # This step might take some time to execute.

# Print the best mean cross-validated accuracy and the parameters that produced it.
print(f"\nBest Accuracy achieved: {grid_result.best_score_} using {grid_result.best_params_}")

In [None]:
from itertools import combinations

# Convert the GridSearchCV results to a DataFrame (one row per candidate).
cv_results = pd.DataFrame(grid_result.cv_results_)

# Expand each candidate's parameter dict into one column per hyperparameter.
# Building the frame from the list of dicts avoids a slow row-wise `apply(pd.Series)`.
params = pd.DataFrame(cv_results['params'].tolist(), index=cv_results.index)

# Attach the expanded hyperparameter columns to the remaining result columns.
results_df = pd.concat([cv_results.drop(columns='params'), params], axis=1)

# Define hyperparameters for which to generate pairwise combinations.
hyperparams = ['optimizer', 'epochs', 'batch_size']

# Create all unique pairs of hyperparameters for visualization.
hyperparam_combinations = list(combinations(hyperparams, 2))

# One heatmap per hyperparameter pair, laid out side by side.
fig, axs = plt.subplots(1, len(hyperparam_combinations), figsize=(24, 8))

for ax, (param1, param2) in zip(axs, hyperparam_combinations):
    # Average the mean test score over the hyperparameter that is not part of the pair.
    # The string 'mean' is used because passing the callable np.mean is deprecated in
    # recent pandas releases. Missing cells, if any, are filled with 0.
    pivot_df = results_df.pivot_table(
        index=param1,
        columns=param2,
        values='mean_test_score',
        aggfunc='mean',
    ).fillna(0)

    # Plot the heatmap on the corresponding subplot.
    sns.heatmap(pivot_df, ax=ax, annot=True, fmt=".2f", cmap="YlGnBu")
    ax.set_title(f'Grid Search CV Results: {param1} vs {param2}')
    ax.set_xlabel(param2)  # Label for the x-axis
    ax.set_ylabel(param1)  # Label for the y-axis

plt.tight_layout()  # Adjust layout to prevent overlap.
plt.show()

# **Random Search for Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Reinitialize the Keras wrapper for RandomizedSearchCV.
# The factory function itself is passed (not the result of calling it): handing over a
# pre-built, already-compiled model would mean create_model is never called with the
# sampled `optimizer` value, so that hyperparameter would not be searched at all.
model = KerasClassifier(model=create_model, verbose=0)

# Define the random search parameters.
param_dist = {'optimizer': ['adam', 'rmsprop', 'sgd'],
              'batch_size': [16, 32, 64],
              'epochs': [10, 20]}

# Initialize RandomizedSearchCV to randomly sample the parameter space.
# n_iter and cv are left at the scikit-learn defaults; random_state fixes which
# candidates are drawn so the search is repeatable across runs.
rand = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_jobs=-1, random_state=42)

# Fit the random search model to find the best hyperparameters.
print("Performing Randomized Search for Hyperparameter Tuning...\n")
rand_result = rand.fit(X, y)

# Print the best mean cross-validated accuracy and the parameters that produced it.
print(f"Best: {rand_result.best_score_} using {rand_result.best_params_}")

In [None]:
# Convert the RandomizedSearchCV results to a DataFrame (one row per sampled candidate).
rand_results = pd.DataFrame(rand_result.cv_results_)

# Expand each candidate's parameter dict into one column per hyperparameter.
# Building the frame from the list of dicts avoids a slow row-wise `apply(pd.Series)`.
params = pd.DataFrame(rand_results['params'].tolist(), index=rand_results.index)

# Attach the expanded hyperparameter columns to the remaining result columns.
results_df = pd.concat([rand_results.drop(columns='params'), params], axis=1)

# Define hyperparameters for which to generate pairwise combinations.
hyperparams = ['optimizer', 'epochs', 'batch_size']

# Create all unique pairs of hyperparameters for visualization.
hyperparam_combinations = list(combinations(hyperparams, 2))

# One heatmap per hyperparameter pair, laid out side by side.
fig, axs = plt.subplots(1, len(hyperparam_combinations), figsize=(24, 8))

for ax, (param1, param2) in zip(axs, hyperparam_combinations):
    # Average the mean test score over the hyperparameter that is not part of the pair.
    # The string 'mean' is used because passing the callable np.mean is deprecated in
    # recent pandas releases.
    # Combinations the random search never sampled stay NaN on purpose: seaborn leaves
    # those cells blank, whereas filling them with 0 would show them as 0.00 accuracy
    # and stretch the colour scale.
    pivot_df = results_df.pivot_table(
        index=param1,
        columns=param2,
        values='mean_test_score',
        aggfunc='mean',
    )

    # Plot the heatmap on the corresponding subplot.
    sns.heatmap(pivot_df, ax=ax, annot=True, fmt=".2f", cmap="YlGnBu")
    # The title names the search that actually produced these scores.
    ax.set_title(f'Random Search CV Results: {param1} vs {param2}')
    ax.set_xlabel(param2)  # Label for the x-axis
    ax.set_ylabel(param1)  # Label for the y-axis

plt.tight_layout()  # Adjust layout to prevent overlap.
plt.show()

# **Learning Rate (LR) Scheduler**

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Callback that multiplies the learning rate by 0.2 whenever the validation loss has
# not improved for 5 epochs, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.00001,
)

# Fresh model that starts from a deliberately high learning rate (0.1) so the
# scheduler has something to reduce.
model = create_model(optimizer=Adam(learning_rate=0.1))

# Train with 20% of the data held out for validation; the scheduler watches that split.
history = model.fit(
    X,
    y,
    validation_split=0.2,
    epochs=64,
    batch_size=32,
    callbacks=[reduce_lr],
)

In [None]:
# Extract the per-epoch learning rate recorded by the ReduceLROnPlateau callback.
# Newer Keras releases log it under 'learning_rate', older ones under 'lr'; accept
# either so the cell does not fail with a KeyError on an older installation.
lr_key = 'learning_rate' if 'learning_rate' in history.history else 'lr'
lr_history = history.history[lr_key]

# Plot the learning rate against the epoch index.
plt.figure(figsize=(10, 6))
plt.plot(lr_history)
plt.title('Learning Rate Scheduler Performance')
plt.xlabel('Epochs')
plt.ylabel('Learning Rate')
plt.show()

# **Early Stopping**

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Callback that stops training once the validation loss has gone 10 epochs without
# improving, then rolls the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Fresh model for this experiment, trained with Adam at a learning rate of 0.01.
model = create_model(optimizer=Adam(learning_rate=0.01))

# Train for up to 64 epochs with 20% of the data held out for validation.
history = model.fit(
    X,
    y,
    validation_split=0.2,
    epochs=64,
    batch_size=32,
    callbacks=[early_stopping],
)

In [None]:
# Draw the training and validation loss curves from the early-stopping run on one
# set of axes; the curves end at the epoch where training was halted.
plt.figure(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss with Early Stopping')
plt.legend()
plt.show()

# **Inference and Conclusion**

### Minibatch Gradient Descent (MBGD)

#### Batch Sizes:
- **Impact of Batch Size**: Different batch sizes can have significant impacts on model performance.
  - **Smaller Batch Sizes**: Produce a noisier estimate of the gradient but perform more weight updates per epoch, which can lead to faster convergence. The added noise can act as a mild regularizer, but it also makes the training loss fluctuate more.
  - **Larger Batch Sizes**: Provide a smoother, more accurate estimate of the gradient, which can lead to more stable convergence. They typically require more memory and perform fewer updates per epoch, so convergence can take more epochs.

### Hyperparameter Tuning

#### Grid Search:
- **Best Parameters**: The Grid Search identified the best combination of hyperparameters to achieve optimal model performance.
- **Heatmaps**: The heatmaps provided a visual understanding of how different hyperparameter combinations affect model performance.
- **Accuracy**: The best mean cross-validated accuracy (`best_score_`) is printed together with the hyperparameters that produced it (`best_params_`).

#### Random Search:
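- **Efficiency**: Random Search evaluates a random subset of hyperparameter combinations rather than all possible combinations, so its cost is set by the number of sampled candidates instead of the size of the grid. Note that in this notebook the two searches use different numbers of cross-validation folds (2 for Grid Search, the scikit-learn default of 5 for Random Search), so their run times are not directly comparable.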
- **Best Parameters**: The Random Search also identified a set of hyperparameters that yielded a high accuracy.
- **Comparison**: Both Grid and Random Search are effective, but Random Search can be more efficient in terms of computation time, especially with larger hyperparameter spaces.

### Learning Rate Scheduler
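- **Performance**: The learning rate scheduler (`ReduceLROnPlateau`) dynamically adjusted the learning rate during training, multiplying it by 0.2 whenever the validation loss failed to improve for 5 consecutive epochs. Starting from a high learning rate and lowering it this way can help the model converge.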
- **Plot**: The learning rate plot illustrated how the learning rate was reduced over epochs, which contributed to stabilizing the training process.

### Early Stopping
- **Prevention of Overfitting**: Early stopping halted training once the validation loss had not improved for 10 consecutive epochs and restored the weights from the best epoch, which limits overfitting.
- **Efficiency**: This method saved computational resources by stopping training early when further training would not improve the model.