In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.datasets import cifar100

In [2]:
# Load training and testing data separately
(x_train, y_train), (x_test, y_test) = cifar100.load_data()


In [3]:
# Reshape x_train from 4D to 2D array (number of samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Reshape y_train to 1D array
y_train = y_train.reshape(-1)

# Combine training data and labels into a single numpy array for easier manipulation
train_data = np.column_stack((x_train, y_train))

# Randomly shuffle the training data
np.random.shuffle(train_data)


In [4]:
x_train.shape

(50000, 3072)

In [5]:
np.unique(y_train, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
        500, 500, 500, 500, 500, 500, 5

In [6]:
train_data.shape

(50000, 3073)

In [7]:
# Separate the training dataset by category
data_by_class = {i: [] for i in range(100)}
for row in train_data:
    data_by_class[int(row[-1])].append(row)

In [8]:
# Generate proportions for 10 chunks using a normal distribution around the mean
mean = 0.05
std_dev = 0.01
proportions = np.random.normal(mean, std_dev, 10)

# Clip values to ensure they're in range [0,1] and normalize so they sum to 1
proportions = np.clip(proportions, 0, 1)
proportions /= np.sum(proportions)

# Compute the chunk sizes based on proportions, and ensure they're multiples of 10
chunk_sizes = [int(p * 50000) // 100 * 100 for p in proportions]

# Adjust the last chunk size to ensure the total sum is 50000
chunk_sizes[-1] = 50000 - sum(chunk_sizes[:-1])

# Sorting in descending order using sorted()
chunk_sizes = sorted(chunk_sizes, reverse=True)
chunk_sizes

[6300, 6100, 5700, 5700, 4900, 4800, 4700, 4600, 4500, 2700]

In [9]:
len(chunk_sizes)

10

In [10]:
sum(chunk_sizes)

50000

In [11]:
# Create 10 chunks with specified sizes and IID distribution
chunks = []
for size in chunk_sizes:
    chunk = []
    samples_per_class = size // 100  # Number of samples for each class in the chunk
    for i in range(100):
        # Append the correct number of samples from each class
        chunk.extend(data_by_class[i][:samples_per_class])
        # Remove the samples from data_by_class
        data_by_class[i] = data_by_class[i][samples_per_class:]
    # Shuffle the chunk
    np.random.shuffle(chunk)
    chunks.append(np.array(chunk))

In [12]:
# check to ensure that each sample in the total training dataset is chosen at least one
total = np.concatenate(chunks, axis=0)
unique_samples = np.unique(total, axis=0)

len(unique_samples)



49995

In [13]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique_labels, counts))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    proportions = {}
    for label, count in label_distribution.items():
        proportions[label] = count / chunk_size
    return proportions




In [14]:
import os
folder = "10_chunks"
if not os.path.exists(folder):
    os.makedirs(folder)


In [15]:
iid_folder = "iid"
iid_folder = os.path.join(folder, iid_folder)
if not os.path.exists(iid_folder):
    os.makedirs(iid_folder)


In [16]:
# iid distribution

chunk_info = []
for i, chunk in enumerate(chunks):

    label_distribution = get_label_distribution(chunk)
    chunk_size =len(chunk)
    # save info
    info = {}
    info['chunk'] = i+1
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution
    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{iid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [17]:
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# print dataframe
print(df)



   chunk  size                                 label_distribution  \
0      1  6300  {0: 63, 1: 63, 2: 63, 3: 63, 4: 63, 5: 63, 6: ...   
1      2  6100  {0: 61, 1: 61, 2: 61, 3: 61, 4: 61, 5: 61, 6: ...   
2      3  5700  {0: 57, 1: 57, 2: 57, 3: 57, 4: 57, 5: 57, 6: ...   
3      4  5700  {0: 57, 1: 57, 2: 57, 3: 57, 4: 57, 5: 57, 6: ...   
4      5  4900  {0: 49, 1: 49, 2: 49, 3: 49, 4: 49, 5: 49, 6: ...   
5      6  4800  {0: 48, 1: 48, 2: 48, 3: 48, 4: 48, 5: 48, 6: ...   
6      7  4700  {0: 47, 1: 47, 2: 47, 3: 47, 4: 47, 5: 47, 6: ...   
7      8  4600  {0: 46, 1: 46, 2: 46, 3: 46, 4: 46, 5: 46, 6: ...   
8      9  4500  {0: 45, 1: 45, 2: 45, 3: 45, 4: 45, 5: 45, 6: ...   
9     10  2700  {0: 27, 1: 27, 2: 27, 3: 27, 4: 27, 5: 27, 6: ...   

                                   label_proportions  
0  {0: 0.01, 1: 0.01, 2: 0.01, 3: 0.01, 4: 0.01, ...  
1  {0: 0.01, 1: 0.01, 2: 0.01, 3: 0.01, 4: 0.01, ...  
2  {0: 0.01, 1: 0.01, 2: 0.01, 3: 0.01, 4: 0.01, ...  
3  {0: 0.01, 1: 0.01

In [18]:
# Save dataframe to csv
df.to_csv(f"{iid_folder}/chunks_info.csv", index=False)
