In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from tensorflow.keras.datasets import cifar10

In [2]:
# CIFAR-10 is returned as two (images, labels) pairs: the training split and the test split
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split


In [3]:
# Flatten every image: 4D (samples, width, height, channels) -> 2D (samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Flatten the labels to a 1D array so they can be appended as a single column
y_train = y_train.reshape(-1)

# Append the label as the LAST column, so every row carries its own label
# and rows can be shuffled / sliced without losing the image-label pairing.
train_data = np.column_stack((x_train, y_train))

# Seed NumPy's global RNG before the first random operation so that this shuffle,
# and the later np.random draws in the notebook, are reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Randomly shuffle the rows of the training data (in place)
np.random.shuffle(train_data)


In [4]:
# Bucket the shuffled rows by class label (the label is stored in the last column)
data_by_class = {}
for label in range(10):
    data_by_class[label] = []

for sample in train_data:
    label = int(sample[-1])
    data_by_class[label].append(sample)

In [5]:
# Sizes of the partition
NUM_CHUNKS = 10
NUM_CLASSES = 10            # chunk sizes are kept multiples of this so classes split evenly
NUM_TRAIN_SAMPLES = 50000   # total number of training rows to distribute over the chunks

# Draw one raw weight per chunk from a normal distribution.
# NOTE: the weights are re-normalised below, so only the ratio std_dev / mean
# influences the final proportions; `mean` itself is not the final average share.
mean = 0.05
std_dev = 0.01
proportions = np.random.normal(mean, std_dev, NUM_CHUNKS)

# Clip values to ensure they're in range [0,1] and normalize so they sum to 1
proportions = np.clip(proportions, 0, 1)
proportions /= np.sum(proportions)

# Compute the chunk sizes based on proportions, rounded down to a multiple of NUM_CLASSES
chunk_sizes = [
    int(p * NUM_TRAIN_SAMPLES) // NUM_CLASSES * NUM_CLASSES for p in proportions
]

# The last chunk absorbs the rounding remainder so the sizes sum to NUM_TRAIN_SAMPLES
chunk_sizes[-1] = NUM_TRAIN_SAMPLES - sum(chunk_sizes[:-1])

# Fail loudly instead of silently producing an empty (or negative-sized) chunk
assert all(size > 0 for size in chunk_sizes), f"non-positive chunk size in {chunk_sizes}"

# Sorting in descending order using sorted()
chunk_sizes = sorted(chunk_sizes, reverse=True)
chunk_sizes

[7150, 5370, 5100, 5080, 5060, 5010, 4950, 4860, 4480, 2940]

In [6]:
# Create 10 chunks with specified sizes and IID distribution
# (every chunk receives the same number of samples from each of the 10 classes).
#
# Each class is walked with a running offset instead of slicing data_by_class in
# place: data_by_class is left untouched, so re-running this cell rebuilds chunks
# with the same membership rather than producing empty chunks.
chunks = []
class_offsets = {label: 0 for label in range(10)}
for size in chunk_sizes:
    samples_per_class = size // 10  # Number of samples for each class in the chunk
    chunk = []
    for label in range(10):
        start = class_offsets[label]
        stop = start + samples_per_class
        # Take the next unused samples of this class
        chunk.extend(data_by_class[label][start:stop])
        class_offsets[label] = stop
    # Shuffle the chunk so the classes are interleaved rather than grouped
    np.random.shuffle(chunk)
    chunks.append(np.array(chunk))

In [7]:
# Sanity check: stack all chunks back together and count the distinct rows.
# If every training sample was assigned to a chunk exactly once, the count
# equals the size of the training set.
total = np.concatenate(chunks)  # default axis=0: stack rows
unique_samples = np.unique(total, axis=0)

unique_samples.shape[0]



50000

In [8]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    """Count how many rows of ``chunk`` carry each label.

    Args:
        chunk: 2-D NumPy array whose last column holds the label of each row.

    Returns:
        dict mapping every label present in the chunk to its number of rows,
        ordered by label. Keys and values are built-in Python scalars.
    """
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    # .tolist() converts NumPy scalars to plain Python scalars. Without it the
    # dict would hold NumPy scalars, whose repr under NumPy >= 2 looks like
    # ``np.int64(715)`` and would leak into printed frames and exported text.
    return dict(zip(unique_labels.tolist(), counts.tolist()))


# Helper: turn a NumPy scalar into the equivalent built-in Python scalar
def _to_builtin(value):
    """Return ``value.item()`` for NumPy scalars; return anything else unchanged."""
    return value.item() if isinstance(value, np.generic) else value


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    """Convert per-label counts into per-label fractions of the chunk.

    Args:
        label_distribution: dict mapping label -> number of rows with that label.
        chunk_size: total number of rows in the chunk (the divisor).

    Returns:
        dict mapping each label to ``count / chunk_size``. Keys and values are
        built-in Python scalars even when the input holds NumPy scalars, so the
        dict has a clean repr regardless of the NumPy version. An empty
        ``label_distribution`` yields an empty dict.
    """
    return {
        _to_builtin(label): float(count / chunk_size)
        for label, count in label_distribution.items()
    }




In [9]:
# Root output folder for all 10-chunk partitions.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
folder = "10_chunks"
os.makedirs(folder, exist_ok=True)


In [10]:
# Sub-folder that receives the IID chunks and their summary.
# makedirs also creates missing parent folders; exist_ok=True makes re-runs a no-op.
iid_folder = os.path.join(folder, "iid")
os.makedirs(iid_folder, exist_ok=True)


In [11]:
# iid distribution: summarise every chunk and persist it to disk

chunk_info = []
for chunk_number, chunk in enumerate(chunks, start=1):
    chunk_size = len(chunk)
    label_distribution = get_label_distribution(chunk)
    # Dedicated name on purpose: `proportions` already holds the chunk-size
    # proportions drawn earlier in the notebook and must not be overwritten.
    label_proportions = get_label_proportions(label_distribution, chunk_size)

    # save info (one record per chunk; chunk numbers are 1-based)
    chunk_info.append({
        'chunk': chunk_number,
        'size': chunk_size,
        'label_distribution': label_distribution,
        'label_proportions': label_proportions,
    })

    # Save the IID (class-balanced) chunk as a pickle file
    chunk_path = os.path.join(iid_folder, f'chunk_{chunk_number}.pickle')
    with open(chunk_path, 'wb') as f:
        pickle.dump(chunk, f)


In [12]:
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# table (rich display) instead of the plain-text dump produced by print().
df



   chunk  size                                 label_distribution  \
0      1  7150  {0: 715, 1: 715, 2: 715, 3: 715, 4: 715, 5: 71...   
1      2  5370  {0: 537, 1: 537, 2: 537, 3: 537, 4: 537, 5: 53...   
2      3  5100  {0: 510, 1: 510, 2: 510, 3: 510, 4: 510, 5: 51...   
3      4  5080  {0: 508, 1: 508, 2: 508, 3: 508, 4: 508, 5: 50...   
4      5  5060  {0: 506, 1: 506, 2: 506, 3: 506, 4: 506, 5: 50...   
5      6  5010  {0: 501, 1: 501, 2: 501, 3: 501, 4: 501, 5: 50...   
6      7  4950  {0: 495, 1: 495, 2: 495, 3: 495, 4: 495, 5: 49...   
7      8  4860  {0: 486, 1: 486, 2: 486, 3: 486, 4: 486, 5: 48...   
8      9  4480  {0: 448, 1: 448, 2: 448, 3: 448, 4: 448, 5: 44...   
9     10  2940  {0: 294, 1: 294, 2: 294, 3: 294, 4: 294, 5: 29...   

                                   label_proportions  
0  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
1  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
2  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
3  {0: 0.1, 1: 0.1, 

In [13]:
# Export the per-chunk summary table next to the pickled chunks (row index omitted)
csv_path = f"{iid_folder}/chunks_info.csv"
df.to_csv(csv_path, index=False)
