# Intro

This calculates the following statistics about the dataset:

1. Cardinality of the dataset (mean number of labels per pixel)
2. Density (mean number of labels per pixel/number of labels in the dataset)
3. Single label pixels (percentage of pixels in the dataset with only one active label)
4. Frequency of each label per image
5. Frequency of each label per pixel
6. Imbalance ratio for each label (# time the most frequent label appears/# of times the
label of interest appears)
7. Mean imbalance ratio

The results can be seen in Table 4 in the MultiSolSegment paper

## setup

In [None]:
import numpy as np
import os
from collections import Counter

In [7]:
root = "/Users/ojas/Desktop/saj/SANDIA/pvcracks_data/Channeled_Combined_CWRU_LBNL_ASU/"

img_folder = root + "img/all/"
mask_folder = root + "ann/all/"

# image_file_extension = "tiff"
# image_file_extension = "jpg"

category_mapping = {0: "empty", 1: "dark", 2: "busbar", 3: "crack", 4: "non-cell"}
categories_list = list(category_mapping.values())

In [8]:
files = [c for c in os.listdir(mask_folder)]
n_masks = len(files)

# regex_string = r'(.+)\.' + image_file_extension + r'\.json'

# n_masks = [re.search(regex_string, f).group(1) for f in files]

-----

## class information

In [9]:
files

['mx_SW2-DH3000-500h-PT-cell54-flr.npy',
 'SW3-DH3000-1500h-PT-cell44-r180.npy',
 'mxy_2002.npy',
 'mxy_0601.npy',
 'mxy_SW3-DH3000-2000h-PT-cell46-r180.npy',
 'mxy_SW1-DH3000-500h-PT-cell50-flr.npy',
 'mx_1217 - Old TC for PID_2_front_Isc (EL Image).npy',
 'my_SW1-DH3000-1500h-PT-cell56.npy',
 '1_EL_18.09.2023-15-40-40_unknownID_ASU Minisample_SHJ5_103.npy',
 'mxy_sa19976_sub_EL_9-c04.npy',
 'mxy_sa19958_sub_EL_9-c02.npy',
 'mxy_sa19966_sub_EL_9-c02.npy',
 'mx_SW1-DH3000-2000h-PT-cell37.npy',
 'mxy_sa19990_sub_EL_9-c04.npy',
 'mx_3_EL_18.09.2023-15-00-34_unknownID_ASU Minisample_07_68.npy',
 'mxy_SW3-DH3000-2000h-PT-cell37-flr.npy',
 'SW2-DH3000-1500h-PT-cell54-flr.npy',
 'mxy_sa19979_sub_EL_9-c03.npy',
 'mxy_sa19980_sub_EL_9-c02.npy',
 'mx_SW2-DH3000-500h-PT-cell54-fup.npy',
 'my_SW3-DH3000-2000h-PT-cell37-r180.npy',
 '1240 - NICE_SHJ_2_Isc (EL Image).npy',
 'SW1-DH3000-1500h-PT-cell49-flr.npy',
 'mx_SW3-DH3000-2000h-PT-cell42-flr.npy',
 'mx_sa19958_sub_EL_9-c04.npy',
 'my_sa19959_su

In [10]:
class_tally = {k: 0 for k in category_mapping.values()}

mask_data_list = []

for f in files:
    mask_path = os.path.join(mask_folder, f)
    if not os.path.exists(mask_path):
        print(f"Mask file {mask_path} does not exist. Skipping.")
        continue

    mask_data = np.load(mask_path, allow_pickle=True)
    mask_data_list.append(mask_data)

    for i, class_name in enumerate(categories_list):
        if np.any(mask_data[i]):
            class_tally[class_name] += 1

print("Class Tally:")
for class_name, count in class_tally.items():
    print(f"{class_name}: {count}")

Class Tally:
empty: 0
dark: 364
busbar: 2340
crack: 1688
non-cell: 2340


In [11]:
"""
mask_data_list: list of all the 5-channeled masks
mask_data_list[0]: 5-channeled mask for image. 400x400x5
mask_data_list[0][0]: 1 channel of the mask of an image. 400x400
mask_data_list[0][0][0]: 1 dimension of 1 channel of the mask of an image. 400
"""

'\nmask_data_list: list of all the 5-channeled masks\nmask_data_list[0]: 5-channeled mask for image. 400x400x5\nmask_data_list[0][0]: 1 channel of the mask of an image. 400x400\nmask_data_list[0][0][0]: 1 dimension of 1 channel of the mask of an image. 400\n'

In [12]:
# Calculate the cardinality of the dataset
num_images = len(mask_data_list)
sum_labels_per_pixel = 0
for mask_data in mask_data_list:
    pixels_in_image = mask_data[1].size
    labels_in_image = np.sum(mask_data[1:])
    labels_per_pixel = labels_in_image / pixels_in_image
    sum_labels_per_pixel += labels_per_pixel

cardinality = sum_labels_per_pixel / n_masks
# cardinality = (sum_labels_per_pixel + n_masks) / n_masks

print(f"Cardinality: {cardinality}")

Cardinality: 0.24049954329816203


In [13]:
# Calculate the density of the dataset
num_labels = 0
for mask_data in mask_data_list:
    labels_in_image = np.sum(mask_data[1:])
    num_labels += labels_in_image

density = cardinality / num_labels
# density = cardinality / (mask_data[1][1].size * num_images)

print(f"Density: {density}")

Density: 1.9699911319772553e-09


In [14]:
# Calculate the percentage of instances with only one active label
total_single_label_instances = 0
for mask_data in mask_data_list:
    pixel_wise_label_sums = np.sum(mask_data[1:], axis=0)
    single_label_instances = np.sum(pixel_wise_label_sums == 1)
    total_single_label_instances += single_label_instances

num_labels = 0
for mask_data in mask_data_list:
    labels_in_image = np.sum(mask_data[1:])
    num_labels += labels_in_image

final_single_label_instances = total_single_label_instances / num_labels
final_single_label_instances *= 100

# single_label_instances = np.mean([np.sum(mask_data) == 1 for mask_data in mask_data_list])
print(f"Single label instances: {final_single_label_instances}")

Single label instances: 95.23864116519634


In [None]:
total_imgs = class_tally["non-cell"]

for id, category in enumerate(list(category_mapping.values())[1:]):
    num_images_with_categoy = class_tally[category]
    freq = num_images_with_categoy / total_imgs
    print(f"{(freq * 100):.3f}% of images include Class {id} ({category})")


# Calculate the frequency of each label and distinct label set
# label_frequency = Counter([label for mask_data in mask_data_list for label in np.where(mask_data)[0]])

# print("Label Frequency:")
# for label, frequency in label_frequency.items():
# print(f"{category_mapping[label]}: {frequency}")

15.556% of images include Class 0 (dark)
100.000% of images include Class 1 (busbar)
72.137% of images include Class 2 (crack)
100.000% of images include Class 3 (non-cell)


In [20]:
# Calculate the frequency of each label
# Create an empty list to store all channel indices
all_channels = []

# Iterate through each mask in the list
for mask_data in mask_data_list:
    # Get the channel indices (first array from np.where)
    channel_indices = np.where(mask_data)[0]
    # Add these indices to our list
    all_channels.extend(channel_indices)
# Count the frequency of each channel
label_frequency = Counter(all_channels)

# Calculate the total number of pixels
total_pixels = mask_data_list[0][0].size * num_images

# Calculate the percentage of pixels that are each class
for label, frequency in label_frequency.items():
    print(f"{category_mapping[label]}: {(frequency / total_pixels) * 100:.3f}%")

busbar: 20.259%
crack: 3.505%
non-cell: 7.813%
dark: 1.031%


In [17]:
label_frequency

Counter({np.int64(2): 75849800,
         np.int64(4): 29251268,
         np.int64(3): 13121664,
         np.int64(1): 3858804})

In [18]:
# Calculate the imbalance ratio for each label
max_frequency = max(label_frequency.values())

imbalance_ratio = {}

for label, frequency in label_frequency.items():
    imbalance_ratio[label] = frequency / max_frequency


# Print the imbalance ratio for each label
for label, ratio in imbalance_ratio.items():
    print(f"{category_mapping[label]}: {ratio}")

busbar: 1.0
crack: 0.1729953671598343
non-cell: 0.3856472660442084
dark: 0.050874280485907676


In [19]:
# Calculate the mean imbalance ratio
mean_imbalance_ratio = np.mean(list(imbalance_ratio.values()))
print(f"Mean imbalance ratio: {mean_imbalance_ratio}")

Mean imbalance ratio: 0.40237922842248763
