In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.datasets import cifar10

2023-08-18 06:35:32.516248: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-18 06:35:32.551561: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load training and testing data separately
(x_train, y_train), (x_test, y_test) = cifar10.load_data()


In [3]:
# Reshape x_train from 4D to 2D array (number of samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Reshape y_train to 1D array
y_train = y_train.reshape(-1)

# Combine training data and labels into a single numpy array for easier manipulation
train_data = np.column_stack((x_train, y_train))

# Randomly shuffle the training data
np.random.shuffle(train_data)


In [101]:
# Generate proportions for 10 chunks using a normal distribution around the mean
mean = 0.05
std_dev = 0.015
proportions = np.random.normal(mean, std_dev, 10)

# Clip values to ensure they're in range [0,1] and normalize so they sum to 1
proportions = np.clip(proportions, 0, 1)
proportions /= np.sum(proportions)

# Compute the chunk sizes based on proportions, and ensure they're multiples of 10
chunk_sizes = [int(p * 50000) // 10 * 10 for p in proportions]

# Adjust the last chunk size to ensure the total sum is 50000
chunk_sizes[-1] = 50000 - sum(chunk_sizes[:-1])

# Sorting in descending order using sorted()
chunk_sizes = sorted(chunk_sizes, reverse=True)
chunk_sizes, proportions

([6840, 6740, 6230, 6010, 5290, 4380, 4350, 3770, 3670, 2720],
 array([0.12024607, 0.10583159, 0.07340995, 0.08775413, 0.12479917,
        0.13484228, 0.07549685, 0.05441033, 0.13699571, 0.08621391]))

In [102]:
# Categorize samples by label
samples_by_label = {}
for label in range(10):
    samples_by_label[label] = train_data[train_data[:, -1] == label]

def get_samples_for_chunk(label_data, chunk_size):
    chunk = []
    for label, samples in label_data.items():
        # Use a normal distribution to get the proportion of samples for this label
        prop = np.random.normal(0.095, 0.025) # mean at 9.5% and std deviation 2.5% to get a range ~7% to ~12%
        prop = np.clip(prop, 0.085, 0.1) 
        num_samples = int(chunk_size * prop)
        
        # Extract these samples
        chunk_samples = samples[:num_samples]
        chunk.extend(chunk_samples)
        
        # Remove these samples from the label data
        label_data[label] = samples[num_samples:]
    return chunk, label_data

chunks = []
label_data_copy = samples_by_label.copy()

for size in chunk_sizes:
    chunk, label_data_copy = get_samples_for_chunk(label_data_copy, size)
    chunks.append(chunk)

# Check if all samples are used
unused_samples = sum([len(samples) for samples in label_data_copy.values()])
assert unused_samples == 0, f"{unused_samples} samples are not used."

# Verify the chunk sizes
for chunk in chunks:
    print(len(chunk))


AssertionError: 3469 samples are not used.

In [103]:
# Verify the chunk sizes
for chunk in chunks:
    print(len(chunk))

6483
6404
5758
5589
4832
3990
4050
3531
3443
2451


In [104]:
for category in label_data_copy.values():
    # print(len(category))
    chunks[-1].extend(category)

In [105]:
len(chunks[-1])

5920

In [None]:
# chunks = []
# start_index = 0
# for size in chunk_sizes:
#     chunk = train_data[start_index: start_index + size]
#     chunks.append(chunk)
#     start_index += size


In [106]:
# check to ensure that each sample in the total training dataset is chosen at least one
total = np.concatenate(chunks, axis=0)
unique_samples = np.unique(total, axis=0)

len(unique_samples)



50000

In [107]:
len(chunks[0][0]), chunks[0][0][-1]

(3073, 0)

In [108]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique_labels, counts))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    proportions = {}
    for label, count in label_distribution.items():
        proportions[label] = count / chunk_size
    return proportions




In [109]:
import os
folder = "10_chunks"
if not os.path.exists(folder):
    os.makedirs(folder)


In [110]:
iid_folder = "non_iid"
iid_folder = os.path.join(folder, iid_folder)
if not os.path.exists(iid_folder):
    os.makedirs(iid_folder)


In [111]:
# iid distribution

chunk_info = []
for i, chunk in enumerate(chunks):
    chunk = np.array(chunk)    
    label_distribution = get_label_distribution(chunk)
    chunk_size =len(chunk)
    # save info
    info = {}
    info['chunk'] = i+1
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution
    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{iid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [112]:
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# print dataframe
print(df)



   chunk  size                                 label_distribution  \
0      1  6483  {0: 684, 1: 581, 2: 684, 3: 684, 4: 581, 5: 68...   
1      2  6404  {0: 641, 1: 674, 2: 665, 3: 576, 4: 572, 5: 59...   
2      3  5758  {0: 529, 1: 529, 2: 531, 3: 529, 4: 529, 5: 62...   
3      4  5589  {0: 541, 1: 542, 2: 510, 3: 510, 4: 601, 5: 60...   
4      5  4832  {0: 529, 1: 449, 2: 529, 3: 529, 4: 449, 5: 44...   
5      6  3990  {0: 438, 1: 379, 2: 372, 3: 438, 4: 385, 5: 42...   
6      7  4050  {0: 435, 1: 369, 2: 369, 3: 435, 4: 435, 5: 43...   
7      8  3531  {0: 377, 1: 320, 2: 377, 3: 320, 4: 320, 5: 36...   
8      9  3443  {0: 367, 1: 367, 2: 367, 3: 311, 4: 311, 5: 36...   
9     10  5920  {0: 459, 1: 790, 2: 596, 3: 668, 4: 817, 5: 45...   

                                   label_proportions  
0  {0: 0.10550670985654789, 1: 0.0896190035477402...  
1  {0: 0.10009369144284821, 1: 0.1052467207995003...  
2  {0: 0.09187217783952761, 1: 0.0918721778395276...  
3  {0: 0.09679728037

In [113]:
# Save dataframe to csv
df.to_csv(f"{iid_folder}/chunks_info.csv", index=False)
