In [82]:
# Necessary imports for the EDA
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import seaborn as sns
import cv2
import plotly.express as px
import xml.etree.ElementTree as ET # Using this to parse the XML annotations file
from imblearn.over_sampling import RandomOverSampler # For balancing
from imblearn.under_sampling import RandomUnderSampler # For balancing

In [83]:
# Dataset locations, given relative to the notebook's working directory.
# Each dataset has one directory of images and one directory of annotation files.

# BCCD dataset
bccd_images_dir = './BCCD_Dataset/BCCD/JPEGImages/'
bccd_annotations_dir = './BCCD_Dataset/BCCD/Annotations/'
# Complete Blood Cell Count (CBC) dataset, training split
cbc_images_dir = './Complete-Blood-Cell-Count-Dataset/Training/Images/'
cbc_annotations_dir = './Complete-Blood-Cell-Count-Dataset/Training/Annotations/'

In [84]:
def _parse_int(text):
    '''Convert annotation text to int, also accepting float-formatted values such as "12.0".'''
    try:
        return int(text)
    except ValueError:
        # Some annotation tools write coordinates as floats; int() drops the fractional part.
        return int(float(text))


def _optional_int(parent, tag, default=0):
    '''Return the integer value of a descendant <tag>, or `default` when the tag is absent or empty.'''
    node = parent.find('.//' + tag)
    if node is None or node.text is None or not node.text.strip():
        return default
    return _parse_int(node.text)


def load_annos(annotation_path):
    '''
    Load and parse an XML annotation file.

    This function reads an XML file, parses it, and extracts one record for every
    <object> element found in the document.

    Parameters:
    - annotation_path (str): The file path to the XML annotation file.

    Returns:
    - dict: A dictionary with a single key 'objects' holding a list of objects, each
      represented as a dictionary with keys 'name', 'pose', 'truncated', 'difficult'
      and 'bbox'. The bounding box itself is a dictionary with integer 'xmin', 'ymin',
      'xmax', and 'ymax'.
      Optional tags are tolerated: a missing <pose> yields None, and a missing or
      empty <truncated>/<difficult> yields 0.

    Raises:
    - xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    - AttributeError: If an object lacks a <name>, a <bndbox>, or one of the four
      coordinate tags (these are required).
    - ValueError: If a numeric tag contains text that is not a number.
    '''
    root = ET.parse(annotation_path).getroot()

    annotations = {
        'objects': []
    }

    for obj in root.findall('.//object'):
        # <name> is required: fail loudly rather than record an unlabeled object
        obj_name = obj.find('.//name').text

        # <pose>, <truncated> and <difficult> are optional metadata
        pose_node = obj.find('.//pose')
        obj_pose = pose_node.text if pose_node is not None else None
        obj_truncated = _optional_int(obj, 'truncated')
        obj_difficult = _optional_int(obj, 'difficult')

        # The bounding box and all four of its coordinates are required
        bndbox = obj.find('.//bndbox')
        bbox = {
            'xmin': _parse_int(bndbox.find('.//xmin').text),
            'ymin': _parse_int(bndbox.find('.//ymin').text),
            'xmax': _parse_int(bndbox.find('.//xmax').text),
            'ymax': _parse_int(bndbox.find('.//ymax').text)
        }

        annotations['objects'].append({
            'name': obj_name,
            'pose': obj_pose,
            'truncated': obj_truncated,
            'difficult': obj_difficult,
            'bbox': bbox
        })

    return annotations

In [85]:
def load_data(images_dir, annotations_dir):
    '''
    Load images and their corresponding annotations from specified directories.

    This function iterates (in sorted filename order) through all '.jpg' files in the
    images folder, constructs the path to the XML annotation file with the same base
    name in the annotations directory, and loads both the image and its annotation.

    Problems with individual files are logged with print() and the file is skipped:
    - no annotation file exists for the image,
    - the image (or annotation file) cannot be opened (IOError),
    - the annotation file is not well-formed XML (ET.ParseError).

    Parameters:
    - images_dir (str): The directory path that contains the image files.
    - annotations_dir (str): The directory path that contains the XML annotation files.

    Returns:
    - list of dicts: Each dictionary contains 'Image' (a copy of the loaded image, still
      usable after the file handle is closed), 'Path' (the file path of the image), and
      'Annotation' (the parsed annotation data as returned by load_annos).
    '''
    data = []
    # os.listdir returns entries in arbitrary order; sort so repeated runs give the same list
    for filename in sorted(os.listdir(images_dir)):
        # Only working with .jpg's
        if not filename.endswith('.jpg'):
            continue

        img_path = os.path.join(images_dir, filename)
        annotation_filename = filename.rsplit('.', 1)[0] + '.xml'
        annotation_path = os.path.join(annotations_dir, annotation_filename)
        try:
            with Image.open(img_path) as img:
                if not os.path.exists(annotation_path):
                    print(f"Warning: No annotation file found for {filename}")
                    continue
                annotation = load_annos(annotation_path)
                # Copy so the pixel data outlives the context manager closing the file
                data.append({'Image': img.copy(), 'Path': img_path, 'Annotation': annotation})
        except IOError as e:
            print(f"Error opening image {filename}: {e}")
        except ET.ParseError as e:
            # A single malformed annotation should not abort the whole load
            print(f"Error parsing annotation for {filename}: {e}")
    return data

In [86]:
def extract_features(data):
    '''
    Summarise every annotated blood smear image as one row of per-image features.

    Parameters:
    - data (list of dicts): One dictionary per image with the keys:
        - 'Image': Object exposing `width` and `height` attributes (e.g. a PIL image).
        - 'Path': File path of the image.
        - 'Annotation': Dictionary whose 'objects' list holds one entry per annotated
          object, each with a 'name' and a 'bbox' ('xmin', 'ymin', 'xmax', 'ymax').

    Returns:
    - pandas.DataFrame: One row per image with the columns:
        - 'Path', 'Width', 'Height': Image path and dimensions.
        - 'Num_RBCs', 'Num_WBCs', 'Num_Platelets': Number of objects named
          'RBC', 'WBC' and 'Platelets' respectively.
        - 'Avg_BBox_Area', 'Var_BBox_Area': Mean and variance of the bounding box
          areas over all objects in the image.
        - 'Avg_BBox_Aspect_Ratio': Mean width/height ratio of the bounding boxes
          (a box of zero height contributes a ratio of 0).
      The three bounding box statistics are 0 for an image without objects.
    '''
    def _area_and_ratio(box):
        # Area and width/height ratio of a single bounding box
        left, right = box['xmin'], box['xmax']
        top, bottom = box['ymin'], box['ymax']
        box_w = right - left
        box_h = bottom - top
        if box_h != 0:
            ratio = box_w / box_h
        else:
            ratio = 0
        return box_w * box_h, ratio

    rows = []
    for record in data:
        image = record['Image']
        cells = record['Annotation']['objects']

        # Per-class counts come from the object labels
        labels = [cell['name'] for cell in cells]

        # Geometry of every box, regardless of class
        shapes = [_area_and_ratio(cell['bbox']) for cell in cells]
        areas = [area for area, _ in shapes]
        ratios = [ratio for _, ratio in shapes]

        if shapes:
            mean_area = np.mean(areas)
            area_variance = np.var(areas)
            mean_ratio = np.mean(ratios)
        else:
            # No annotated objects: report zeros instead of NaN
            mean_area = area_variance = mean_ratio = 0

        rows.append({
            'Path': record['Path'],
            'Width': image.width,
            'Height': image.height,
            'Num_RBCs': labels.count('RBC'),
            'Num_WBCs': labels.count('WBC'),
            'Num_Platelets': labels.count('Platelets'),
            'Avg_BBox_Area': mean_area,
            'Var_BBox_Area': area_variance,
            'Avg_BBox_Aspect_Ratio': mean_ratio
        })

    return pd.DataFrame(rows)

In [89]:
# Load data: one list per dataset, each entry pairing an image with its parsed annotation
bccd_data = load_data(bccd_images_dir, bccd_annotations_dir)
cbc_data = load_data(cbc_images_dir, cbc_annotations_dir)

In [90]:
# Extract features: one DataFrame per dataset, one row of features per loaded image
bccd_df = extract_features(bccd_data)
cbc_df = extract_features(cbc_data)

In [108]:
# Basic stats of BCCD: count, mean, std, min, quartiles and max of every numeric feature column
print(bccd_df.describe())

       Width  Height    Num_RBCs    Num_WBCs  Num_Platelets  Avg_BBox_Area  \
count  364.0   364.0  364.000000  364.000000     364.000000     364.000000   
mean   640.0   480.0   11.414835    1.021978       0.991758   12421.594016   
std      0.0     0.0    4.267360    0.233692       1.151087    3999.947717   
min    640.0   480.0    0.000000    0.000000       0.000000    1560.000000   
25%    640.0   480.0    9.000000    1.000000       0.000000   10562.078297   
50%    640.0   480.0   11.000000    1.000000       1.000000   11806.468750   
75%    640.0   480.0   14.000000    1.000000       2.000000   13333.823661   
max    640.0   480.0   27.000000    2.000000       6.000000   47998.000000   

       Var_BBox_Area  Avg_BBox_Aspect_Ratio  
count   3.640000e+02             364.000000  
mean    7.455876e+07               1.067435  
std     7.517148e+07               0.080191  
min     0.000000e+00               0.840134  
25%     2.230198e+07               1.023435  
50%     5.770411e+07 

In [110]:
# Basic stats of CBC: count, mean, std, min, quartiles and max of every numeric feature column
print(cbc_df.describe())

       Width  Height    Num_RBCs    Num_WBCs  Num_Platelets  Avg_BBox_Area  \
count  300.0   300.0  300.000000  300.000000     300.000000     300.000000   
mean   640.0   480.0   11.136667    1.023333       1.013333   12205.574038   
std      0.0     0.0    4.453925    0.250996       1.139070    3863.041721   
min    640.0   480.0    0.000000    0.000000       0.000000    1560.000000   
25%    640.0   480.0    9.000000    1.000000       0.000000   10437.554622   
50%    640.0   480.0   11.000000    1.000000       1.000000   11573.200000   
75%    640.0   480.0   14.000000    1.000000       2.000000   13091.020833   
max    640.0   480.0   27.000000    2.000000       6.000000   47998.000000   

       Var_BBox_Area  Avg_BBox_Aspect_Ratio  
count   3.000000e+02             300.000000  
mean    6.811002e+07               1.066081  
std     7.068528e+07               0.083887  
min     0.000000e+00               0.840134  
25%     2.082834e+07               1.021895  
50%     4.928613e+07 