In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.datasets import cifar10

2023-08-18 06:35:06.587378: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-18 06:35:06.655403: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fetch CIFAR-10 and unpack its train / test splits one step at a time
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split


In [3]:
# Seed NumPy's global RNG so the shuffle below (and the later np.random calls
# in this notebook) produce the same split on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Flatten each image: 4D array -> 2D array (number of samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Flatten the labels to a 1D vector so they can be appended as a single column
y_train = y_train.reshape(-1)

# One row per sample: the flattened pixels followed by the label in the last column
train_data = np.column_stack((x_train, y_train))

# Shuffle the rows in place; pixels and label of a sample move together
np.random.shuffle(train_data)


In [7]:
# Bucket the shuffled rows by label (the label is the last value of each row)
data_by_class = {label: [] for label in range(10)}
for sample in train_data:
    label = int(sample[-1])
    data_by_class[label].append(sample)

In [17]:
# Draw one raw weight per chunk (10 chunks) from a normal distribution
mean = 0.05
std_dev = 0.01
proportions = np.random.normal(mean, std_dev, 10)

# Force the weights into [0, 1], then rescale them so they sum to 1
proportions = np.clip(proportions, 0, 1)
proportions = proportions / proportions.sum()

# Turn each weight into a sample count, rounded down to a multiple of 10
chunk_sizes = []
for weight in proportions:
    chunk_sizes.append((int(weight * 50000) // 10) * 10)

# The last chunk absorbs the rounding remainder so all sizes add up to 50000
chunk_sizes[-1] = 50000 - sum(chunk_sizes[:-1])

# Order the sizes from largest to smallest
chunk_sizes.sort(reverse=True)
chunk_sizes

[6800, 6630, 5990, 5380, 5340, 4420, 4350, 3900, 3820, 3370]

In [19]:
# Create 10 chunks with the specified sizes and an IID (equal per-class) label mix.
# Each class is consumed through a running offset instead of deleting rows from
# data_by_class, so this cell can be re-run without first rebuilding data_by_class.
chunks = []
class_offsets = {label: 0 for label in range(10)}
for size in chunk_sizes:
    chunk = []
    samples_per_class = size // 10  # Number of samples for each class in the chunk
    for label in range(10):
        # Take the next unused slice of this class
        start = class_offsets[label]
        stop = start + samples_per_class
        chunk.extend(data_by_class[label][start:stop])
        class_offsets[label] = stop
    # Shuffle the chunk so rows are not grouped by class
    np.random.shuffle(chunk)
    chunks.append(np.array(chunk))

In [22]:
# Sanity check: stack all chunks back together and count the distinct rows
total = np.concatenate(chunks, axis=0)

# Rows are compared as whole samples (pixels + label)
unique_samples = np.unique(total, axis=0)

len(unique_samples)



50000

In [23]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    """Count how many rows of each label a chunk contains.

    Args:
        chunk: 2-D array whose last column holds the label of each row.

    Returns:
        dict mapping every label present in the chunk to its number of rows.
        Keys and counts are native Python scalars rather than NumPy scalars,
        so the dict prints and serialises as ``{0: 680, ...}`` regardless of
        how the installed NumPy version renders its scalar types.
    """
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    # .tolist() converts the NumPy scalars into plain Python values
    return dict(zip(unique_labels.tolist(), counts.tolist()))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    """Convert per-label counts into fractions of the chunk size.

    Args:
        label_distribution: dict mapping label -> number of rows with that label.
        chunk_size: total number of rows in the chunk (the divisor).

    Returns:
        dict mapping each label to ``count / chunk_size``, in the same key order
        as ``label_distribution``.
    """
    return {
        label: count / chunk_size
        for label, count in label_distribution.items()
    }




In [25]:
import os

# Root output directory for the generated chunks
folder = "10_chunks"
# exist_ok=True makes creation idempotent and avoids the check-then-create race
os.makedirs(folder, exist_ok=True)


In [26]:
# Sub-directory of the output root that holds the IID chunks
iid_folder = os.path.join(folder, "iid")
# exist_ok=True makes creation idempotent and avoids the check-then-create race
os.makedirs(iid_folder, exist_ok=True)


In [27]:
# iid distribution: summarise every chunk and write it to disk

chunk_info = []
for i, chunk in enumerate(chunks):
    chunk_size = len(chunk)
    label_distribution = get_label_distribution(chunk)
    proportions = get_label_proportions(label_distribution, chunk_size)

    # Per-chunk metadata; chunk numbering is 1-based
    info = {
        'chunk': i+1,
        'size': chunk_size,
        'label_distribution': label_distribution,
        'label_proportions': proportions,
    }
    chunk_info.append(info)

    # Serialise the chunk array as chunk_<n>.pickle inside the iid folder
    with open(f'{iid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [28]:
# Tabulate the per-chunk metadata: one row per chunk, one column per info key
df = pd.DataFrame(data=chunk_info)

# Show the table as plain text
print(df)



   chunk  size                                 label_distribution  \
0      1  6800  {0: 680, 1: 680, 2: 680, 3: 680, 4: 680, 5: 68...   
1      2  6630  {0: 663, 1: 663, 2: 663, 3: 663, 4: 663, 5: 66...   
2      3  5990  {0: 599, 1: 599, 2: 599, 3: 599, 4: 599, 5: 59...   
3      4  5380  {0: 538, 1: 538, 2: 538, 3: 538, 4: 538, 5: 53...   
4      5  5340  {0: 534, 1: 534, 2: 534, 3: 534, 4: 534, 5: 53...   
5      6  4420  {0: 442, 1: 442, 2: 442, 3: 442, 4: 442, 5: 44...   
6      7  4350  {0: 435, 1: 435, 2: 435, 3: 435, 4: 435, 5: 43...   
7      8  3900  {0: 390, 1: 390, 2: 390, 3: 390, 4: 390, 5: 39...   
8      9  3820  {0: 382, 1: 382, 2: 382, 3: 382, 4: 382, 5: 38...   
9     10  3370  {0: 337, 1: 337, 2: 337, 3: 337, 4: 337, 5: 33...   

                                   label_proportions  
0  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
1  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
2  {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0....  
3  {0: 0.1, 1: 0.1, 

In [29]:
# Persist the chunk summary table as CSV (row index omitted)
csv_path = f"{iid_folder}/chunks_info.csv"
df.to_csv(csv_path, index=False)


In [None]:
# Save dataframe to csv
# NOTE(review): this cell repeats the previous cell's save call verbatim and has
# no execution count (In [None]); it looks redundant — consider removing it.
df.to_csv(f"{iid_folder}/chunks_info.csv", index=False)
