In [8]:
import os
import glob

def count_images(folder_path):
    """Count the image files that sit directly inside ``folder_path``.

    A file counts as an image when its extension, compared case-insensitively,
    is one of .jpg, .jpeg, .png, .gif, .bmp, .tiff or .tif.

    The directory is scanned with ``os.scandir`` rather than ``glob`` so that:
      * upper/mixed-case extensions such as ``photo.JPG`` are counted
        (glob patterns are case-sensitive on POSIX systems);
      * glob metacharacters in ``folder_path`` (``[``, ``*``, ``?``) are taken literally;
      * directories that merely look like images (``folder.png/``) are ignored.

    Args:
        folder_path: Directory to inspect. Sub-directories are not searched.

    Returns:
        int: Number of matching files, or 0 when ``folder_path`` is not a directory.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif'}

    # A missing folder used to produce an empty glob result, i.e. a count of 0.
    if not os.path.isdir(folder_path):
        return 0

    with os.scandir(folder_path) as entries:
        return sum(
            1
            for entry in entries
            # Dot-files were never matched by the old '*.ext' patterns; keep skipping them.
            if not entry.name.startswith('.')
            and entry.is_file()
            and os.path.splitext(entry.name)[1].lower() in image_extensions
        )

# Example usage: count the images stored directly in the 'jds' folder
# (path is relative to the notebook's working directory) and report the total.
folder_path = 'jds'
total_images = count_images(folder_path)
print(f'Total number of images: {total_images}')


Total number of images: 623


In [None]:
#  Creating Categories

In [3]:
import os
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
from tqdm import tqdm

def extract_color_features(image_path):
    """Return the mean colour of an image as a 3-element feature vector.

    The image is converted to RGB and shrunk to 100x100 pixels (Lanczos
    resampling) before averaging, so every image contributes a vector
    computed from the same number of pixels.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        numpy.ndarray of shape (3,) holding the per-channel mean (R, G, B),
        or None when PIL raises OSError while opening/decoding the file
        (a message naming the file is printed in that case).
    """
    try:
        with Image.open(image_path) as source:
            thumbnail = source.convert('RGB').resize((100, 100), Image.Resampling.LANCZOS)
            # Average over both spatial axes -> one value per colour channel.
            mean_rgb = np.array(thumbnail).mean(axis=(0, 1))
    except OSError as error:
        print(f"Skipping file due to error: {image_path}, {error}")
        return None
    return mean_rgb

def calculate_lobe_areas(image_path):
    """Measure how much of an image each distinct non-black colour covers.

    Every exact RGB value other than pure black (0, 0, 0) is treated as its
    own region. The share of each colour is its pixel count divided by the
    number of non-black pixels.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        dict mapping an (R, G, B) tuple to its fraction of the non-black
        area. Empty when the image contains only black pixels, or when PIL
        raises OSError (a message is printed in that case).
    """
    try:
        with Image.open(image_path) as source:
            pixels = np.array(source.convert('RGB')).reshape(-1, 3)
    except OSError as error:
        print(f"Failed to calculate lobe areas due to error: {error}")
        return {}

    # One row per distinct colour, together with how many pixels carry it.
    unique_colors, pixel_counts = np.unique(pixels, axis=0, return_counts=True)

    area_by_color = {}
    for rgb, pixel_count in zip(unique_colors, pixel_counts):
        if np.array_equal(rgb, [0, 0, 0]):
            continue  # black is background, not a region
        area_by_color[tuple(rgb)] = pixel_count

    non_black_total = sum(area_by_color.values())
    return {rgb: pixel_count / non_black_total for rgb, pixel_count in area_by_color.items()}

def _extract_valid_features(image_paths):
    """Read each image once; return (paths, features) for the readable ones only."""
    valid_paths = []
    features = []
    for path in tqdm(image_paths, desc="Extracting color features"):
        feature = extract_color_features(path)
        if feature is not None:
            valid_paths.append(path)
            features.append(feature)
    return valid_paths, np.array(features)


def _kmeans_labels(features, n_clusters):
    """Cluster the feature rows with KMeans; tolerate empty or tiny inputs."""
    if len(features) == 0:
        return np.array([], dtype=int)
    # KMeans refuses to fit when asked for more clusters than there are samples.
    n_clusters = max(1, min(n_clusters, len(features)))
    return KMeans(n_clusters=n_clusters, random_state=0).fit(features).labels_


def cluster_images(image_paths, n_clusters=6):
    """Cluster images by their mean colour.

    Each image is read exactly once. Images that cannot be read are left
    out, so the returned array has one label per *readable* image, in the
    order those images appear in ``image_paths``.

    Args:
        image_paths: Iterable of image file paths.
        n_clusters: Requested number of clusters; reduced automatically when
            fewer readable images are available.

    Returns:
        numpy.ndarray of integer cluster labels (empty when nothing is readable).
    """
    _, features = _extract_valid_features(image_paths)
    return _kmeans_labels(features, n_clusters)


def categorize_and_save_images(folder_path, brain_image_path):
    """Cluster the images in ``folder_path`` and move each into a category sub-folder.

    The number of categories and the ``size`` factor in each folder name come
    from the colour regions of ``brain_image_path``: the (up to) six largest
    non-black colour areas are used, each expressed relative to the smallest
    of them. If no areas can be computed, six equally sized categories are used.

    Images are MOVED (``os.rename``) into
    ``<folder_path>/category_<label>_size_<factor>``; unreadable images stay
    where they are. Cluster labels are paired with size factors by index only;
    there is no colour correspondence between a cluster and a region.

    Args:
        folder_path: Folder whose top-level .png/.jpg/.jpeg files are categorised.
        brain_image_path: Reference image passed to ``calculate_lobe_areas``.
    """
    max_categories = 6

    # Filter out unsupported files like WMF; sort so runs are repeatable.
    supported_formats = ('.png', '.jpg', '.jpeg')
    image_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(supported_formats)
        and os.path.isfile(os.path.join(folder_path, name))
    ]

    lobe_proportions = calculate_lobe_areas(brain_image_path)
    if not lobe_proportions:
        print("Using equal distribution for categories.")
        lobe_sizes = [1] * max_categories
    else:
        # Keep the dominant regions: an anti-aliased or JPEG-compressed image
        # contains many stray colours that would otherwise crowd out the real ones.
        lobe_sizes = sorted(lobe_proportions.values(), reverse=True)[:max_categories]

    # Express every kept region relative to the smallest kept region.
    smallest = min(lobe_sizes)
    lobe_size_factors = [size / smallest for size in lobe_sizes]

    # Labels are produced for readable images only, so they must be paired with
    # that same filtered list - not with the full directory listing.
    valid_paths, features = _extract_valid_features(image_paths)
    labels = _kmeans_labels(features, len(lobe_size_factors))

    for label, path in zip(labels, valid_paths):
        factor = int(lobe_size_factors[label])
        target_folder = os.path.join(folder_path, f'category_{label}_size_{factor}')
        os.makedirs(target_folder, exist_ok=True)
        os.rename(path, os.path.join(target_folder, os.path.basename(path)))

# Example usage
# NOTE: categorize_and_save_images moves (os.rename) the images it finds in
# folder_path into category_* sub-folders, so the folder contents change on every run.
folder_path = 'categories'  # Replace with the actual folder path
brain_image_path = 'data/input.jpg'  # Replace with the actual brain image path

# Cluster the images and move each one into its category folder.
categorize_and_save_images(folder_path, brain_image_path)


Extracting color features:  26%|███▌          | 108/420 [00:04<00:10, 28.89it/s]

Skipping file due to error: categories/blancocorredeirajessica_108_977_Mnemonic project, Jessica Blanco_image_9.png, cannot find loader for this WMF file


Extracting color features: 100%|██████████████| 420/420 [00:19<00:00, 21.36it/s]


In [None]:
#Changing background colors

import cv2
import numpy as np
import os
from sklearn.cluster import KMeans

def is_not_black(color, threshold=30):
    """
    Tell whether a colour is brighter than "near black".

    The result is truthy as soon as at least one channel of ``color`` is
    strictly greater than ``threshold``; a colour whose channels are all
    <= ``threshold`` is considered black.
    """
    channels = np.asarray(color)
    return (channels > threshold).any()

def extract_distinct_colors_and_areas(image_path, num_colors=6, exclude_black=True, black_threshold=30, random_state=0):
    """
    Extract the dominant colours of an image with KMeans and the share of pixels each covers.

    Args:
        image_path: Image file to load with ``cv2.imread`` (channels are in OpenCV's BGR order).
        num_colors: Number of KMeans clusters, i.e. candidate colours.
        exclude_black: When True, drop clusters whose centre is not brighter than
            ``black_threshold`` in any channel (see ``is_not_black``).
        black_threshold: Per-channel threshold used by the black test.
        random_state: Seed passed to KMeans so repeated runs yield the same colours.

    Returns:
        List of ``((b, g, r), ratio)`` pairs with plain Python ints/floats. Ratios are
        relative to the kept clusters only and sum to 1. Empty when every cluster is excluded.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Failed to load image from {image_path}")

    pixels = image.reshape((-1, 3))
    kmeans = KMeans(n_clusters=num_colors, random_state=random_state)
    kmeans.fit(pixels)
    centers = kmeans.cluster_centers_.astype(int)
    # Pixel count per cluster; minlength keeps empty clusters addressable by index.
    cluster_sizes = np.bincount(kmeans.labels_, minlength=num_colors)

    if exclude_black:
        kept = [index for index, color in enumerate(centers) if is_not_black(color, black_threshold)]
    else:
        kept = list(range(num_colors))

    total_area = sum(int(cluster_sizes[index]) for index in kept)
    if total_area == 0:
        return []

    return [
        (tuple(int(channel) for channel in centers[index]), int(cluster_sizes[index]) / total_area)
        for index in kept
    ]

def change_background(image, new_bg_color):
    """
    Replace the brighter part of an image with a solid colour.

    The image is split into two classes with Otsu's threshold on its
    grayscale version. Pixels at or below the threshold (the darker class)
    are kept as they are; pixels above it are filled with ``new_bg_color``.
    The approach therefore treats the lighter class as the background.

    Args:
        image: Colour image as loaded by OpenCV (converted with COLOR_BGR2GRAY).
        new_bg_color: Fill colour, one value per channel.

    Returns:
        Array of the same shape and dtype as ``image``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, otsu_mask = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 255 where the original pixel is kept (dark class), replicated on all three channels.
    keep_mask = cv2.cvtColor(cv2.bitwise_not(otsu_mask), cv2.COLOR_GRAY2BGR)
    # Complement: 255 where the fill colour goes.
    fill_mask = cv2.bitwise_not(keep_mask)

    kept_pixels = cv2.bitwise_and(image, keep_mask)
    fill_pixels = cv2.bitwise_and(np.full_like(image, new_bg_color), fill_mask)

    # The two masks never overlap, so the sum simply merges both parts.
    return cv2.add(kept_pixels, fill_pixels)

def generate_color_shades(color, num_shades=5):
    """
    Build ``num_shades`` variations of a BGR colour.

    The colour is converted to HSV; for shade ``k`` (1..num_shades) both
    saturation and value are multiplied by ``0.8 + 0.4 * k / num_shades``
    (i.e. from just above 0.8x up to 1.2x) and clipped to 0..255. The hue
    is left untouched.

    Returns:
        List of ``[b, g, r]`` lists, one per shade, darkest factor first.
    """
    base_hsv = cv2.cvtColor(np.uint8([[color]]), cv2.COLOR_BGR2HSV)

    shades = []
    for step in range(1, num_shades + 1):
        scale = 0.8 + 0.4 * (step / num_shades)
        candidate = np.copy(base_hsv)
        candidate[0, 0, 1] = np.clip(candidate[0, 0, 1] * scale, 0, 255)  # saturation
        candidate[0, 0, 2] = np.clip(candidate[0, 0, 2] * scale, 0, 255)  # value / brightness
        shades.append(cv2.cvtColor(candidate, cv2.COLOR_HSV2BGR)[0, 0].tolist())
    return shades

def process_folder_with_weighted_colors_variations(folder_path, output_folder, distinct_colors_and_areas, num_shades=5):
    """
    Recolour the background of every image in a folder, sharing out colours by area ratio.

    Each colour in ``distinct_colors_and_areas`` receives a quota of
    ``round(ratio * number_of_images)`` images, and the images inside a quota cycle
    through that colour's shades. Because of rounding the quotas may not add up to
    the number of images: surplus images reuse the last allocated shade, and when
    every quota rounds to zero the dominant colour's first shade is used for all.

    Args:
        folder_path: Folder whose .png/.jpg/.jpeg files are processed.
        output_folder: Destination folder (created if missing); file names are kept.
        distinct_colors_and_areas: Sequence of ``(color, ratio)`` pairs.
        num_shades: Shades generated per colour; must be at least 1.

    Raises:
        ValueError: If ``num_shades`` < 1, or if there are images to process but
            ``distinct_colors_and_areas`` is empty.
    """
    if num_shades < 1:
        raise ValueError(f"num_shades must be at least 1, got {num_shades}")

    images = [img for img in os.listdir(folder_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))]
    total_images = len(images)

    # Build the flat list "image position -> background shade".
    color_allocation = []
    for color, ratio in distinct_colors_and_areas:
        shades = generate_color_shades(color, num_shades)
        quota = int(round(ratio * total_images))
        color_allocation.extend(shades[i % num_shades] for i in range(quota))

    if images and not color_allocation:
        if not distinct_colors_and_areas:
            raise ValueError("distinct_colors_and_areas is empty; no background colour to apply")
        # Few images + many colours: every quota rounded down to zero.
        dominant_color, _ = max(distinct_colors_and_areas, key=lambda item: item[1])
        color_allocation.append(generate_color_shades(dominant_color, num_shades)[0])

    os.makedirs(output_folder, exist_ok=True)

    for index, image_name in enumerate(images):
        image_path = os.path.join(folder_path, image_name)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error: {image_name} is not a valid image.")
            continue
        # Images beyond the allocated quotas reuse the last shade.
        shade_index = min(index, len(color_allocation) - 1)
        new_bg_color = color_allocation[shade_index]
        final_image = change_background(image, new_bg_color)
        output_path = os.path.join(output_folder, image_name)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(output_path, final_image):
            print(f"Error: could not write {output_path}")
            continue
        print(f"Processed {image_name}, saved to {output_path}")

# Example usage
input_image_path = 'data/input.jpg'  # Update this path to your input image
folder_path = 'images from pdfs'  # Update this path to your folder with images
output_folder = 'background changed 2'  # Update this path to your desired output folder
# Make sure the destination exists before any image is written into it.
os.makedirs(output_folder, exist_ok=True)

# Step 1: derive the colour palette (and area ratios) from the reference image.
distinct_colors_and_areas = extract_distinct_colors_and_areas(input_image_path, num_colors=6)
# Step 2: recolour every image of folder_path and save the results to output_folder.
process_folder_with_weighted_colors_variations(folder_path, output_folder, distinct_colors_and_areas, num_shades=5)

In [None]:
# which image is best suited for the background

In [7]:
#Version 3
import cv2
import numpy as np
import os
from sklearn.cluster import KMeans

def is_not_black(color, threshold=30):
    """
    Tell whether a colour is brighter than "near black".

    The result is truthy as soon as at least one channel of ``color`` is
    strictly greater than ``threshold``; a colour whose channels are all
    <= ``threshold`` is considered black.
    """
    channels = np.asarray(color)
    return (channels > threshold).any()

def extract_distinct_colors_and_areas(image_path, num_colors=6, exclude_black=True, black_threshold=30, random_state=0):
    """
    Extract the dominant colours of an image with KMeans and the share of pixels each covers.

    Args:
        image_path: Image file to load with ``cv2.imread`` (channels are in OpenCV's BGR order).
        num_colors: Number of KMeans clusters, i.e. candidate colours.
        exclude_black: When True, drop clusters whose centre is not brighter than
            ``black_threshold`` in any channel (see ``is_not_black``).
        black_threshold: Per-channel threshold used by the black test.
        random_state: Seed passed to KMeans so repeated runs yield the same colours.

    Returns:
        List of ``((b, g, r), ratio)`` pairs with plain Python ints/floats. Ratios are
        relative to the kept clusters only and sum to 1. Empty when every cluster is excluded.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Failed to load image from {image_path}")

    pixels = image.reshape((-1, 3))
    kmeans = KMeans(n_clusters=num_colors, random_state=random_state)
    kmeans.fit(pixels)
    centers = kmeans.cluster_centers_.astype(int)
    # Pixel count per cluster; minlength keeps empty clusters addressable by index.
    cluster_sizes = np.bincount(kmeans.labels_, minlength=num_colors)

    if exclude_black:
        kept = [index for index, color in enumerate(centers) if is_not_black(color, black_threshold)]
    else:
        kept = list(range(num_colors))

    total_area = sum(int(cluster_sizes[index]) for index in kept)
    if total_area == 0:
        return []

    return [
        (tuple(int(channel) for channel in centers[index]), int(cluster_sizes[index]) / total_area)
        for index in kept
    ]

def generate_lighter_shades(color, num_shades=5):
    """
    Produce ``num_shades`` progressively lighter versions of a colour.

    Shade ``k`` (1..num_shades) moves each channel the fraction
    ``k / (num_shades + 1)`` of the way from its value towards 255, so the
    original colour and pure white themselves are never returned.

    Args:
        color: numpy array of channel values.
        num_shades: How many shades to produce.

    Returns:
        List of integer numpy arrays, darkest shade first.
    """
    steps = num_shades + 1
    return [
        (color + (255 - color) * (step / steps)).astype(int)
        for step in range(1, steps)
    ]

def find_best_background_color(image, distinct_colors):
    """
    Pick a background colour for ``image`` from ``distinct_colors``.

    Placeholder implementation: ``image`` is not inspected at all; one entry
    of ``distinct_colors`` is drawn uniformly at random via numpy's global
    random generator and returned.
    """
    return distinct_colors[np.random.randint(0, len(distinct_colors))]

def change_background(image, new_bg_color):
    """
    Replace the brighter part of an image with a solid colour.

    The image is split into two classes with Otsu's threshold on its
    grayscale version. Pixels at or below the threshold (the darker class)
    are kept as they are; pixels above it are filled with ``new_bg_color``.
    The approach therefore treats the lighter class as the background.

    Args:
        image: Colour image as loaded by OpenCV (converted with COLOR_BGR2GRAY).
        new_bg_color: Fill colour, one value per channel.

    Returns:
        Array of the same shape and dtype as ``image``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, otsu_mask = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 255 where the original pixel is kept (dark class), replicated on all three channels.
    keep_mask = cv2.cvtColor(cv2.bitwise_not(otsu_mask), cv2.COLOR_GRAY2BGR)
    # Complement: 255 where the fill colour goes.
    fill_mask = cv2.bitwise_not(keep_mask)

    kept_pixels = cv2.bitwise_and(image, keep_mask)
    fill_pixels = cv2.bitwise_and(np.full_like(image, new_bg_color), fill_mask)

    # The two masks never overlap, so the sum simply merges both parts.
    return cv2.add(kept_pixels, fill_pixels)

def process_folder_with_best_fit_backgrounds(folder_path, output_folder, distinct_colors_and_areas, num_shades=5):
    """
    Recolour the background of every image in a folder with a lighter shade of a palette colour.

    For each image a palette colour is chosen by ``find_best_background_color``,
    one of its lighter shades is drawn at random (numpy global RNG), and the
    result of ``change_background`` is written to ``output_folder`` under the
    original file name.

    Args:
        folder_path: Folder whose .png/.jpg/.jpeg files are processed.
        output_folder: Destination folder (created if missing).
        distinct_colors_and_areas: Sequence of ``(color, ratio)`` pairs; only the
            colours are used here.
        num_shades: Lighter shades generated per colour; must be at least 1.

    Raises:
        ValueError: If ``num_shades`` < 1, or if there are images to process but
            ``distinct_colors_and_areas`` is empty.
    """
    if num_shades < 1:
        raise ValueError(f"num_shades must be at least 1, got {num_shades}")

    images = [img for img in os.listdir(folder_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # The palette is the same for every image, so build it once up front.
    candidate_colors = [color for color, _ in distinct_colors_and_areas]
    if images and not candidate_colors:
        raise ValueError("distinct_colors_and_areas is empty; no background colour to choose from")

    def _color_key(color):
        # Plain-int tuple: hashable and independent of numpy scalar types.
        return tuple(int(channel) for channel in color)

    shades_by_color = {
        _color_key(color): [tuple(shade) for shade in generate_lighter_shades(np.array(color), num_shades)]
        for color in candidate_colors
    }

    os.makedirs(output_folder, exist_ok=True)

    for image_name in images:
        image_path = os.path.join(folder_path, image_name)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error: {image_name} is not a valid image.")
            continue

        best_background_color = find_best_background_color(image, candidate_colors)
        shades = shades_by_color[_color_key(best_background_color)]

        # Randomly select a shade
        best_shade = shades[np.random.randint(0, len(shades))]

        final_image = change_background(image, np.array(best_shade))
        output_path = os.path.join(output_folder, image_name)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(output_path, final_image):
            print(f"Error: could not write {output_path}")
            continue
        print(f"Processed {image_name}, saved to {output_path}")

# Example usage
input_image_path = 'data/input.jpg'  # Update this path
folder_path = 'images from pdfs'  # Update this path
output_folder = 'background changed 3'  # Update this path
# Make sure the destination exists before any image is written into it.
os.makedirs(output_folder, exist_ok=True)

# Step 1: derive the colour palette (and area ratios) from the reference image.
distinct_colors_and_areas = extract_distinct_colors_and_areas(input_image_path, num_colors=6)
# Step 2: recolour every image of folder_path and save the results to output_folder.
process_folder_with_best_fit_backgrounds(folder_path, output_folder, distinct_colors_and_areas, num_shades=5)


Processed ec790191-8a5f-4dd6-aa2c-0f286377b545.png, saved to background changed 3/ec790191-8a5f-4dd6-aa2c-0f286377b545.png
Processed e5d4d74f-d4ba-4bb4-8737-25e88944a04f.png, saved to background changed 3/e5d4d74f-d4ba-4bb4-8737-25e88944a04f.png
Processed limaroxana_107_874_The Tapper_image_0_0.png, saved to background changed 3/limaroxana_107_874_The Tapper_image_0_0.png
Processed 692c7495-5460-451b-830d-67cc80275ed1.png, saved to background changed 3/692c7495-5460-451b-830d-67cc80275ed1.png
Processed barrybinta_264_981_MnemomicProject_image_0_0.png, saved to background changed 3/barrybinta_264_981_MnemomicProject_image_0_0.png
Processed 64e91fba-6afc-4115-bfc4-00fc1ab95aa5.png, saved to background changed 3/64e91fba-6afc-4115-bfc4-00fc1ab95aa5.png
Processed 68b07f84-0089-4ff2-a241-3235022f11d1.png, saved to background changed 3/68b07f84-0089-4ff2-a241-3235022f11d1.png
Processed 0784619c-8a3d-4dc3-9a88-0ed5db9df967.png, saved to background changed 3/0784619c-8a3d-4dc3-9a88-0ed5db9df96

In [13]:
import cv2
import numpy as np
import os
from itertools import product
from tqdm import tqdm

class Options:
    """Settings bundle for the mosaic generator.

    Attributes set by the constructor:
        input:  path of the target image (from ``input_path``)
        output: path the mosaic is written to (from ``output_path``)
        pool:   folder holding the tile images (from ``pool_path``)
        stride: edge length, in pixels, of one square tile
    """

    def __init__(self, input_path="data/input.jpg", output_path="data/output.jpg", pool_path="jds", stride=35):
        self.input, self.output = input_path, output_path
        self.pool, self.stride = pool_path, stride

def get_component_images(path, size):
    """Load every readable image of a folder as a square tile.

    Each regular file directly inside ``path`` is passed to ``cv2.imread``;
    files OpenCV cannot decode are silently skipped. Loaded images are
    resized to ``size`` x ``size`` pixels.

    Returns:
        (tiles, mean_colors): the list of resized images and a numpy array
        with one per-channel mean colour row for each tile, in the same order.
    """
    candidates = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isfile(full_path):
            candidates.append(full_path)

    tiles, tile_means = [], []
    for candidate in tqdm(candidates, desc="Reading images"):
        loaded = cv2.imread(candidate, cv2.IMREAD_COLOR)
        if loaded is None:
            continue  # not an image OpenCV understands
        tile = cv2.resize(loaded, (size, size))
        tiles.append(tile)
        tile_means.append(np.mean(tile, axis=(0, 1)))
    return tiles, np.array(tile_means)

def calculate_max_dimensions(opt, input_image):
    """Return the largest (width, height) that fits in the image and is a whole number of tiles.

    Both dimensions of ``input_image`` (a 3-D array: rows, columns, channels)
    are rounded down to the nearest multiple of ``opt.stride``.
    """
    rows, cols, _channels = input_image.shape
    step = opt.stride
    # Dropping the remainder is the same as floor-dividing and multiplying back.
    return cols - cols % step, rows - rows % step

def calculate_required_images(opt, input_image):
    """Return how many ``opt.stride``-sized tiles cover the usable area of the image.

    The usable area comes from ``calculate_max_dimensions``; the result is
    the number of tile columns multiplied by the number of tile rows.
    """
    columns, rows = (
        extent // opt.stride
        for extent in calculate_max_dimensions(opt, input_image)
    )
    return columns * rows

def main(opt):
    """Build a photo mosaic of ``opt.input`` from the images in ``opt.pool``.

    The input image is divided into ``opt.stride``-sized square tiles. For every
    tile that is not almost black, the still-unused pool image whose mean colour
    is closest (Euclidean distance) to the tile's mean colour is pasted in; each
    pool image is used at most once. Dark tiles, and tiles reached after the pool
    is exhausted, stay black. The mosaic is written to ``opt.output``.

    All errors are caught and reported with a printed message, so this function
    does not raise. Nothing is written when the pool holds fewer files than there
    are tiles, when no pool image can be decoded, or when an error occurs.

    Args:
        opt: Object exposing ``input``, ``output``, ``pool`` and ``stride``
            (see ``Options``).
    """
    # Tiles whose mean intensity is below this value are left black.
    # Adjust the darkness threshold or remove the check as needed.
    darkness_threshold = 30
    progress_bar = None
    try:
        if opt.stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {opt.stride}")

        os.makedirs("output_snapshots", exist_ok=True)
        input_image = cv2.imread(opt.input, cv2.IMREAD_COLOR)
        if input_image is None:
            raise ValueError(f"Failed to load input image from {opt.input}")

        required_images = calculate_required_images(opt, input_image)
        available_images = len([f for f in os.listdir(opt.pool) if os.path.isfile(os.path.join(opt.pool, f))])

        if available_images < required_images:
            print(f"Insufficient images to create the mosaic. Required: {required_images}, Available: {available_images}")
            return

        max_width, max_height = calculate_max_dimensions(opt, input_image)
        blank_image = np.zeros((max_height, max_width, 3), np.uint8)
        images, avg_colors = get_component_images(opt.pool, opt.stride)
        if len(images) == 0:
            # Without this check an all-black image would be saved and reported as success.
            raise ValueError(f"No readable images found in {opt.pool}")

        # available[k] is True while pool image k has not been placed yet.
        available = np.ones(len(images), dtype=bool)
        progress_bar = tqdm(total=required_images, desc="Generating mosaic")

        for i, j in product(range(0, max_width, opt.stride), range(0, max_height, opt.stride)):
            target_section = input_image[j:j+opt.stride, i:i+opt.stride]
            if np.mean(target_section) < darkness_threshold:
                progress_bar.update(1)
                continue

            if available.any():
                target_color = np.mean(target_section, axis=(0, 1))
                # Distance from the tile colour to every pool image at once;
                # already-used images are pushed to infinity so they cannot win.
                distances = np.linalg.norm(avg_colors - target_color, axis=1)
                distances[~available] = np.inf
                # argmin returns the first minimum, i.e. the lowest index on ties.
                best_match_idx = int(np.argmin(distances))
                blank_image[j:j+opt.stride, i:i+opt.stride] = images[best_match_idx]
                available[best_match_idx] = False

            progress_bar.update(1)

        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(opt.output, blank_image):
            raise OSError(f"Failed to write mosaic to {opt.output}")
        print(f"Final image successfully saved to {opt.output}")
    except Exception as e:
        print(f"Error occurred: {e}")
    finally:
        if progress_bar is not None:
            progress_bar.close()

# Update these paths and parameters as needed
# (input: target image, output: where the mosaic is saved, pool: folder of tile
# images, stride: tile edge length in pixels).
opt = Options(input_path="data/input.jpg", output_path="data/output.jpg", pool_path="jds", stride=35)
main(opt)

Reading images: 100%|█████████████████████████| 632/632 [00:10<00:00, 60.58it/s]
Generating mosaic: 100%|████████████████████| 598/598 [00:00<00:00, 2986.30it/s]

Final image successfully saved to data/output.jpg





In [None]:
#Remove duplicate images in the jds folder

In [7]:
import os
import hashlib
import cv2
from tqdm import tqdm

def compute_image_hash(image_path):
    """Return a coarse content fingerprint of an image, or None if it cannot be read.

    The image is loaded in grayscale and reduced to 8x8 pixels; each of the
    64 pixels becomes '1' when brighter than the mean of those pixels and
    '0' otherwise. The MD5 hex digest of that 64-character bit string is
    returned. Images that share the same bit pattern share the same
    fingerprint, even if their files differ.
    """
    grayscale = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if grayscale is None:
        return None

    thumbnail = cv2.resize(grayscale, (8, 8), interpolation=cv2.INTER_AREA)
    average = thumbnail.mean()
    # flatten() walks the pixels row by row, matching a nested row/pixel loop.
    bits = ''.join('1' if value > average else '0' for value in thumbnail.flatten())
    return hashlib.md5(bits.encode('utf-8')).hexdigest()

def remove_duplicate_images(folder_path, dry_run=False):
    """Delete images in ``folder_path`` whose fingerprint matches an earlier file.

    Files are visited in sorted path order, so the copy that survives is always
    the first one alphabetically rather than whichever ``os.listdir`` happens to
    return first. Files for which ``compute_image_hash`` returns None are left
    untouched.

    WARNING: deletion is permanent. Files are matched on the hash returned by
    ``compute_image_hash``, so any two files with the same hash are treated as
    duplicates. Use ``dry_run=True`` to review the list first.

    Args:
        folder_path: Folder to clean; only its top-level files are considered.
        dry_run: When True, report the duplicates without deleting anything.

    Returns:
        list[str]: Paths of the duplicate files (deleted unless ``dry_run``).
    """
    all_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )

    first_seen = {}  # hash -> first file that produced it (the one that is kept)
    duplicate_files = []
    for file_path in tqdm(all_files, desc="Processing images"):
        image_hash = compute_image_hash(file_path)
        if image_hash is None:
            continue
        if image_hash in first_seen:
            duplicate_files.append(file_path)
        else:
            first_seen[image_hash] = file_path

    for duplicate in tqdm(duplicate_files, desc="Removing duplicates"):
        if dry_run:
            print(f"Duplicate image (dry run, not removed): {duplicate}")
            continue
        os.remove(duplicate)
        print(f"Removed duplicate image: {duplicate}")

    return duplicate_files

# Entry point: clean the image folder in place.
# WARNING: remove_duplicate_images deletes files (os.remove) - keep a backup of the folder.
if __name__ == "__main__":
    folder_path = "jds"  # Update with the path to your image folder
    remove_duplicate_images(folder_path)


Processing images: 100%|██████████████████████| 632/632 [00:12<00:00, 49.52it/s]
Removing duplicates: 0it [00:00, ?it/s]
