In [1]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.datasets import cifar10

In [2]:
# Load training and testing data separately
(x_train, y_train), (x_test, y_test) = cifar10.load_data()


In [3]:
# Reshape x_train from 4D to 2D array (number of samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Reshape y_train to 1D array
y_train = y_train.reshape(-1)

# Combine training data and labels into a single numpy array for easier manipulation
train_data = np.column_stack((x_train, y_train))

# Randomly shuffle the training data
np.random.shuffle(train_data)


In [4]:
def get_chunks_with_overlap(data, num_chunks=10, min_chunk_size=0.2, max_chunk_size=0.3):
    num_samples = len(data)
    num_classes = 10  # cifar10 has 10 classes

    # Select random chunk sizes that are multiples of the number of classes
    chunk_sizes = [np.random.choice(np.arange(int(min_chunk_size * num_samples), 
                                              int(max_chunk_size * num_samples)+1, 
                                              num_classes)) for _ in range(num_chunks)]
    
    chunks = []
    
    # Split data by classes
    data_by_class = {i: [] for i in range(num_classes)}
    for row in data:
        data_by_class[int(row[-1])].append(row)

    # Ensure each sample is included at least once
    for i in range(num_classes):
        np.random.shuffle(data_by_class[i])

    # Copy of the original class samples before popping
    data_by_class_original = {i: list(data) for i, data in data_by_class.items()}
    
    for size in chunk_sizes:
        chunk = []
        for _ in range(size // num_classes):
            for c in range(num_classes):
                if data_by_class[c]:  # if there are samples left
                    chunk.append(data_by_class[c].pop())
                else:  # if all samples of this class have been used, start reusing
                    chunk.append(data_by_class_original[c][np.random.choice(len(data_by_class_original[c]))])
        chunks.append(np.array(chunk))
    return chunks

# Split train_data into 10 chunks
chunks = get_chunks_with_overlap(train_data)




In [5]:
# check to ensure that each sample in the total training dataset is chosen at least one

total = np.concatenate(chunks, axis=0)
unique_samples = np.unique(total, axis=0)

len(unique_samples)



50000

In [6]:
def calculate_overlap(chunks):
    overlaps = np.zeros((len(chunks), len(chunks)))
    for i in range(len(chunks)):
        for j in range(i + 1, len(chunks)):
            set_i = set(tuple(row) for row in chunks[i])
            set_j = set(tuple(row) for row in chunks[j])
            intersection = set_i & set_j
            union = set_i | set_j
            overlap = len(intersection) / len(union)
            overlaps[i, j] = overlap
            overlaps[j, i] = overlap  # the overlap is symmetric
    return overlaps
overlaps = calculate_overlap(chunks)


In [7]:
for i in range(len(chunks)):
    for j in range(i+1, len(chunks)):
        print(f" pair ({i},{j}) is: {overlaps[i,j]}")

 pair (0,1) is: 0.0
 pair (0,2) is: 0.0
 pair (0,3) is: 0.0
 pair (0,4) is: 0.10311784228485954
 pair (0,5) is: 0.12491266593472403
 pair (0,6) is: 0.11426889317062727
 pair (0,7) is: 0.1270743466125349
 pair (0,8) is: 0.13056675361631492
 pair (0,9) is: 0.12787663107947805
 pair (1,2) is: 0.0
 pair (1,3) is: 0.0
 pair (1,4) is: 0.10132829074808597
 pair (1,5) is: 0.12891417264407617
 pair (1,6) is: 0.11741340959644106
 pair (1,7) is: 0.12270979981029405
 pair (1,8) is: 0.13796357459658393
 pair (1,9) is: 0.13679334467763282
 pair (2,3) is: 0.0
 pair (2,4) is: 0.11747982775199632
 pair (2,5) is: 0.14480021405636817
 pair (2,6) is: 0.1337878142309331
 pair (2,7) is: 0.14521377564855825
 pair (2,8) is: 0.15866809881847477
 pair (2,9) is: 0.15549828178694158
 pair (3,4) is: 0.09409850189058537
 pair (3,5) is: 0.12428446186375122
 pair (3,6) is: 0.10909191338930622
 pair (3,7) is: 0.11975173420956553
 pair (3,8) is: 0.1285483949772805
 pair (3,9) is: 0.13141734217636944
 pair (4,5) is: 0.1

In [8]:
def remove_duplicate_sample_from_chunk(chunks):
    unique_chunks = []
    additional_samples = []
    num_classes = 10  # cifar10 has 10 classes

    for i, chunk in enumerate(chunks):
        unique_samples = np.unique(chunk, axis=0)
        unique_chunks.append(unique_samples)

        # Count the number of samples from each category in the original and unique chunks
        original_counts = np.bincount(chunk[:,-1], minlength=num_classes)
        unique_counts = np.bincount(unique_samples[:,-1], minlength=num_classes)

        # Compute the additional_samples for each category as the difference between the original and unique counts
        additional_sample = original_counts - unique_counts
        additional_samples.append(additional_sample.tolist())
        print(f"chunk {i+1}: {len(chunk)} vs {len(unique_samples)} --> additional sample = {additional_sample.tolist()}")
        print(f"additional sample list: {additional_sample}")
        
    return zip(unique_chunks, additional_samples)

requirement = remove_duplicate_sample_from_chunk(chunks)


chunk 1: 10970 vs 10970 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 2: 11250 vs 11250 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 3: 14100 vs 14100 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 4: 10230 vs 10230 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 5: 14470 vs 12629 --> additional sample = [212, 182, 186, 163, 182, 190, 188, 175, 192, 171]
additional sample list: [212 182 186 163 182 190 188 175 192 171]
chunk 6: 13140 vs 11571 --> additional sample = [148, 145, 164, 148, 157, 167, 155, 170, 157, 158]
additional sample list: [148 145 164 148 157 167 155 170 157 158]
chunk 7: 10990 vs 9849 --> additional sample = [116, 106, 98, 110, 117, 120, 125, 112, 110, 127]
additional sample list: [116 106  98 110 117 120 125 112 110 127]
chunk 8

In [9]:
chunk = chunks[0]

chunk.shape

(10970, 3073)

In [10]:
def fill_chunks(requirement, chunks, subset_factor=2):
    num_classes = 10  # cifar10 has 10 classes

    for i, (unique_chunk, additional_samples) in enumerate(requirement):
        # Create a set for faster membership tests
        unique_set = set(map(tuple, unique_chunk))

        # Convert the list of additional samples into a numpy array for easier manipulation
        additional_samples = np.array(additional_samples)

        while np.any(additional_samples > 0):
            # Randomly select a chunk different from the current one
            while True:
                random_chunk_index = np.random.choice(len(chunks))
                if random_chunk_index != i:
                    break

            random_chunk = chunks[random_chunk_index]
            np.random.shuffle(random_chunk)

            for sample in random_chunk:
                # If sample is not already in unique_set and there are still missing samples for its category
                if tuple(sample) not in unique_set and additional_samples[int(sample[-1])] > 0:
                    # Add sample to unique_chunk
                    unique_chunk = np.vstack([unique_chunk, sample])
                    # Add sample to unique_set
                    unique_set.add(tuple(sample))
                    # Decrease the count of missing samples for this category
                    additional_samples[int(sample[-1])] -= 1

            # Replace the old chunk with the new one in the chunks list
            chunks[i] = unique_chunk

    return chunks

# Use the fill_chunks function
filled_chunks = fill_chunks(requirement, chunks)


In [11]:
result = remove_duplicate_sample_from_chunk(filled_chunks)

chunk 1: 10970 vs 10970 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 2: 11250 vs 11250 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 3: 14100 vs 14100 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 4: 10230 vs 10230 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 5: 14470 vs 14470 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 6: 13140 vs 13140 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 7: 10990 vs 10990 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]
chunk 8: 12710 vs 12710 --> additional sample = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
additional sample list: [0 0 0 0 0 0 0 0 0 0]


In [12]:
# Unzip the result to get chunks and additional_samples separately
chunks, _ = zip(*result)

for chunk in chunks:
    print(chunk.shape)



(10970, 3073)
(11250, 3073)
(14100, 3073)
(10230, 3073)
(14470, 3073)
(13140, 3073)
(10990, 3073)
(12710, 3073)
(14900, 3073)
(14780, 3073)


In [13]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique_labels, counts))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    proportions = {}
    for label, count in label_distribution.items():
        proportions[label] = count / chunk_size
    return proportions




In [14]:
import os
folder = "10_chunks_overlap"
if not os.path.exists(folder):
    os.makedirs(folder)


In [15]:
iid_folder = "iid"
iid_folder = os.path.join(folder, iid_folder)
if not os.path.exists(iid_folder):
    os.makedirs(iid_folder)


In [16]:
# iid distribution

chunk_info = []
for i, chunk in enumerate(chunks):

    label_distribution = get_label_distribution(chunk)
    chunk_size =len(chunk)
    # save info
    info = {}
    info['chunk'] = i+1
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution
    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{iid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [17]:
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# print dataframe
print(df)



   chunk   size                                 label_distribution  \
0      1  10970  {0: 1097, 1: 1097, 2: 1097, 3: 1097, 4: 1097, ...   
1      2  11250  {0: 1125, 1: 1125, 2: 1125, 3: 1125, 4: 1125, ...   
2      3  14100  {0: 1410, 1: 1410, 2: 1410, 3: 1410, 4: 1410, ...   
3      4  10230  {0: 1023, 1: 1023, 2: 1023, 3: 1023, 4: 1023, ...   
4      5  14470  {0: 1447, 1: 1447, 2: 1447, 3: 1447, 4: 1447, ...   
5      6  13140  {0: 1314, 1: 1314, 2: 1314, 3: 1314, 4: 1314, ...   
6      7  10990  {0: 1099, 1: 1099, 2: 1099, 3: 1099, 4: 1099, ...   
7      8  12710  {0: 1271, 1: 1271, 2: 1271, 3: 1271, 4: 1271, ...   
8      9  14900  {0: 1490, 1: 1490, 2: 1490, 3: 1490, 4: 1490, ...   
9     10  14780  {0: 1478, 1: 1478, 2: 1478, 3: 1478, 4: 1478, ...   

                                   label_proportions  
0  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
1  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
2  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
3  {0: 0.

In [18]:
# Save dataframe to csv
df.to_csv(f"{iid_folder}/chunks_info.csv", index=False)


In [20]:
# sample = chunks[0][5]
# image = sample[:-1]
# label = sample[-1]
# image = image.reshape((32, 32, 3))

# import matplotlib.pyplot as plt
# plt.imshow(image)
