This notebook contains functions to:

- Perform data augmentation on images to oversample under-represented classes and balance the dataset
- Generate datasets at various values of k (the number of components retained) after SVD reconstruction

In [None]:
import os
import pandas as pd
import numpy as np
from keras.preprocessing.image import ImageDataGenerator, array_to_img

def save_augmented_image(original_image_path, row_data, counter):
    """
    Write one randomly augmented copy of a row's image to disk.

    Parameters:
    - original_image_path: Not read by this function; kept in the signature
      so existing callers keep working.
    - row_data (pd.Series): Row providing 'image' (the pixel array that gets
      augmented) plus 'image_id' and 'cell_type' (both used in the file name).
    - counter (int): Number embedded in the file name to keep names distinct.

    Returns:
    - pd.Series: Copy of row_data whose 'image_id' is the new file name.
    """
    # Make sure the destination folder is there before anything is written
    target_dir = 'Original_Augmented'
    os.makedirs(target_dir, exist_ok=True)

    # Random transform settings used for the augmentation
    augmenter_settings = {
        'rotation_range': 20,
        'zoom_range': 0.1,
        'width_shift_range': 0.2,
        'height_shift_range': 0.1,
        'horizontal_flip': True,
        'brightness_range': (0.2, 0.7),
    }
    augmenter = ImageDataGenerator(**augmenter_settings)

    # The generator works on batches, so wrap the image in a batch of one
    pixels = row_data['image']
    single_batch = pixels.reshape(1, *pixels.shape)

    # Draw one augmented sample and cast it to 8-bit values
    augmented_batch = next(augmenter.flow(single_batch, batch_size=1))
    augmented_pixels = augmented_batch[0].astype(np.uint8)

    # NOTE(review): array_to_img is called with its defaults; the Keras docs
    # describe a default scale=True that rescales values to span 0-255 --
    # confirm that rescaling is intended for the saved file.
    picture = array_to_img(augmented_pixels)

    # File name combines the source id, the counter and the class label
    new_image_name = f"{row_data['image_id']}_{counter}_orgbcc_{row_data['cell_type']}_aug.png"
    destination = os.path.join(target_dir, new_image_name)
    picture.save(destination)

    # Hand back a copy of the row that points at the new file
    updated_row = row_data.copy()
    updated_row['image_id'] = new_image_name
    return updated_row

def augmentation_dataset(dataset, label, augmentations_per_image=1):
    """
    Perform data augmentation on a dataset for a specific label.

    Every row whose 'cell_type' equals `label` is passed to
    save_augmented_image `augmentations_per_image` times; the returned rows
    are collected and written to "Original_<label>_augmentation.csv".

    Parameters:
    - dataset (pd.DataFrame): DataFrame with at least the 'cell_type',
      'image_id' and 'image' columns.
    - label (str): Label of the class to augment in the dataset.
    - augmentations_per_image (int): How many augmented copies to generate
      for each matching row. Defaults to 1, the previous hard-coded value.

    Saves:
    - CSV file: Augmented rows with updated image names. When no row matches
      `label`, the file still carries the input's column header.
    """
    # Filter dataset by label
    matching_rows = dataset[dataset['cell_type'] == label]

    # Make sure the image output directory exists even if nothing matches
    os.makedirs('Original_Augmented', exist_ok=True)

    # Running counter keeps generated file names distinct across rows
    image_counter = 1
    new_rows = []

    for _, row in matching_rows.iterrows():
        for iteration in range(1, augmentations_per_image + 1):
            print(f"Processing image {row['image_id']} iteration {iteration}")
            new_rows.append(save_augmented_image(row['image'], row, image_counter))
            image_counter += 1

    # Passing the columns explicitly keeps the header when new_rows is empty
    new_df = pd.DataFrame(new_rows, columns=dataset.columns)

    # NOTE(review): the 'image' column is written to the CSV as-is; if it
    # holds arrays they end up as text -- confirm no reader relies on it.
    new_df.to_csv(f"Original_{label}_augmentation.csv", index=False)

# Example usage: oversample every class named in class_list.
# NOTE: `dataset` is not defined in this cell; it must already exist in the
# notebook session (presumably a DataFrame with 'image', 'image_id' and
# 'cell_type' columns -- confirm against the loading cell).
class_list = ["Basal cell carcinoma"]  # List of classes to augment

# Perform augmentation for each class in the class_list; each call writes
# its own "Original_<label>_augmentation.csv" file
for label in class_list:
    augmentation_dataset(dataset, label)

print("Data augmentation complete and new DataFrame saved as CSV.")


In [None]:
import os
import cv2
import pandas as pd
from sklearn.decomposition import PCA

def pca_image(image, n_components):
    """
    Fit a separate PCA to each of the three channels of an image.

    Parameters:
    - image (numpy.ndarray): Three-channel image; the channels returned by
      cv2.split are treated as blue, green, red, in that order.
    - n_components (int): Number of principal components to retain.

    Returns:
    - transformed_channels (list of numpy.ndarray): PCA scores ordered
      [red, green, blue].
    - pca_solvers (list of PCA objects): Fitted PCA objects ordered
      [blue, green, red]. Note this is the reverse of the scores list;
      reconstruct_pca_image pairs them up accordingly.
    """
    # One independent full-SVD solver per channel, in split (B, G, R) order
    channels = cv2.split(image)
    solvers = [PCA(n_components, svd_solver='full') for _ in channels]

    # Fit each solver on its own channel and keep the projected scores
    scores = [
        solver.fit_transform(channel)
        for solver, channel in zip(solvers, channels)
    ]
    blue_scores, green_scores, red_scores = scores
    solver_b, solver_g, solver_r = solvers

    return [red_scores, green_scores, blue_scores], [solver_b, solver_g, solver_r]

def reconstruct_pca_image(transformed_channels, pca_solvers):
    """
    Rebuild an 8-bit image from per-channel PCA scores.

    Parameters:
    - transformed_channels (list of numpy.ndarray): PCA scores ordered
      [red, green, blue].
    - pca_solvers (list of PCA objects): Fitted objects exposing
      inverse_transform, ordered [blue, green, red].

    Returns:
    - reconstructed_image (numpy.ndarray): uint8 array of shape
      (rows, cols, 3) with channels stacked in blue, green, red order.
    """
    # The two lists are in opposite orders, so pair index 0 with index 2
    red_inverted = pca_solvers[2].inverse_transform(transformed_channels[0])
    green_inverted = pca_solvers[1].inverse_transform(transformed_channels[1])
    blue_inverted = pca_solvers[0].inverse_transform(transformed_channels[2])

    # Stack the channels along a new last axis (same layout cv2.merge gives)
    merged = np.stack((blue_inverted, green_inverted, red_inverted), axis=-1)

    # A truncated reconstruction can overshoot the 0-255 range and is
    # fractional; round and clip first so the uint8 cast cannot wrap around
    reconstructed_image = np.clip(np.rint(merged), 0, 255).astype(np.uint8)

    return reconstructed_image

def save_images(df, save_path, image_column, components):
    """
    Save PCA-compressed versions of the images held in a pandas DataFrame.

    Args:
    - df (pd.DataFrame): DataFrame containing an 'image_id' column and the
      column named by `image_column`.
    - save_path (str): Path to the directory for saving images; created if
      it does not exist.
    - image_column (str): Column name containing the image arrays.
    - components (int): Number of PCA components to retain per channel.

    Saves:
    - One "<image_id>.jpg" file per row in `save_path`. A message is printed
      for every row saying whether the write succeeded.
    """
    # Create the save directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Walk the two columns in lockstep rather than looking rows up by index
    # label, so a DataFrame with repeated index labels is handled row by row
    for image_array, image_id in zip(df[image_column], df['image_id']):
        # Compress and rebuild the image with the requested component count
        img_reduced_arr, pca_solver = pca_image(image_array, components)
        image_to_save = reconstruct_pca_image(img_reduced_arr, pca_solver)

        # Use JPG format for simplicity, adjust as needed
        full_path = os.path.join(save_path, f"{image_id}.jpg")

        # NOTE(review): the channel swap before writing is only correct if
        # the stored arrays are in RGB order -- confirm how they were loaded.
        written = cv2.imwrite(full_path, cv2.cvtColor(image_to_save, cv2.COLOR_BGR2RGB))

        # imwrite signals failure through its return value, not an exception
        if written:
            print(f"Image saved: {full_path}")
        else:
            print(f"Failed to save image: {full_path}")

# Example usage:
# `dataset` is not defined in this cell; it must already exist in the notebook
# session as a pandas DataFrame with 'image_id' and 'image' columns.
# Adjust '400_Components_PCA_Data' to your desired save path and `components`
# to the number of components to keep per channel.
save_images(dataset, '400_Components_PCA_Data', 'image', components=400)
