In [1]:
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras import datasets


2023-06-22 11:28:04.255533: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-22 11:28:04.282457: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the CIFAR-10 splits through Keras, then unpack images and labels for each split.
cifar10_splits = datasets.cifar10.load_data()
(train_images, train_labels), (test_images, test_labels) = cifar10_splits


In [3]:
# Build and save the testing set.
# Pixels are stored as raw, un-normalised values, the same scale the training
# data in this notebook is stored in. The former
# `test_image = test_images / 255.0` line wrote to a misspelled name that
# nothing read, so it never affected the saved array and has been removed.
test_set_path = '../../../data/cifar_data/test_set.pickle'

# Flatten every image to one row: (number of samples, width*height*channels)
x_test = test_images.reshape(test_images.shape[0], -1)
# Flatten the labels to a 1D array aligned with the rows of x_test
y_test = test_labels.reshape(-1)
# One row per sample: pixel columns followed by the label in the last column
test_data = np.column_stack((x_test, y_test))

# Make sure the target directory exists so the write also works on a fresh checkout
os.makedirs(os.path.dirname(test_set_path), exist_ok=True)

# Save test set as a pickle file
with open(test_set_path, 'wb') as f:
    pickle.dump(test_data, f)


In [6]:
# Training set
# Fixed seed for the shuffle below, so re-running the notebook reproduces the
# same row order instead of a different permutation on every run.
SHUFFLE_SEED = 42

# Reshape x_train from 4D to 2D array (number of samples, width*height*channels)
x_train = train_images.reshape(train_images.shape[0], -1)

# Reshape y_train to 1D array
y_train = train_labels.reshape(-1)

# Combine pixels and labels into a single array (label in the last column) so
# that shuffling and splitting keep every image attached to its label
train_data = np.column_stack((x_train, y_train))

# Shuffle the rows in place with a dedicated seeded generator; this leaves the
# global NumPy random state untouched
np.random.default_rng(SHUFFLE_SEED).shuffle(train_data)

# Optional: save the whole shuffled training set as a pickle file (left disabled)
# with open('../../../data/cifar_data/training_set.pickle', 'wb') as f:
#     pickle.dump((train_data), f)


In [6]:
# Inspect the flattened training pixels (one row per image). Only train_data is
# shuffled above, so this array itself is not reordered by that shuffle.
x_train

array([[ 59,  62,  63, ..., 123,  92,  72],
       [154, 177, 187, ..., 143, 133, 144],
       [255, 255, 255, ...,  80,  86,  84],
       ...,
       [ 35, 178, 235, ...,  12,  31,  50],
       [189, 211, 240, ..., 195, 190, 171],
       [229, 229, 239, ..., 163, 163, 161]], dtype=uint8)

In [50]:

# How many chunks the training set is cut into
n_chunks = 5

# Draw one Zipf(a=1.5) sample per chunk; the draws act as relative weights,
# which gives power-law distributed chunk sizes.
# NOTE(review): no seed is set in the visible cells, so the sizes differ from run to run.
zipf_draws = np.random.zipf(1.5, n_chunks)

# Scale the draws to the number of training samples and truncate to whole counts
n_samples = len(train_data)
sizes = (zipf_draws / zipf_draws.sum() * n_samples).astype(int)

# Truncation can leave some samples unassigned; give them to the last chunk so
# the sizes add up to the full training set
leftover = n_samples - sizes.sum()
sizes[-1] += leftover

# Show the final chunk sizes
sizes

array([20833,  4166, 12500,  4166,  8335])

In [52]:
# Create an array of indices at which to split the training data
split_indices = np.cumsum(sizes)[:-1]

# Split the training data into chunks of different sizes
chunks = np.split(train_data, split_indices)


In [57]:
import os

# Directory that receives the per-chunk pickle files and the summary CSV
save_path = "../../../data/cifar_data/5_chunks/"

# Create the directory together with any missing parents. exist_ok=True makes
# re-running the cell a no-op and replaces the separate exists()-then-create
# step, which could fail if the directory appeared between the two calls.
os.makedirs(save_path, exist_ok=True)

In [53]:
import pickle
# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    """Count how many rows of ``chunk`` carry each label.

    Args:
        chunk: 2-D NumPy array whose last column holds the label of each row.

    Returns:
        dict mapping every label present in the chunk to its number of rows.
        Keys and values are built-in Python numbers rather than NumPy scalars,
        so the dict prints as ``{0: 2099, ...}`` regardless of how the
        installed NumPy version formats its scalar types. An empty chunk
        yields ``{}``.
    """
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    # .tolist() converts the NumPy scalars to plain Python numbers, keeping the
    # dict's text form (which ends up in the summary CSV) free of NumPy type
    # wrappers.
    return dict(zip(unique_labels.tolist(), counts.tolist()))

# Summarise every chunk (1-based number, row count, label histogram) and write
# each chunk to its own pickle file under save_path
chunk_info = []
for chunk_no, chunk_rows in enumerate(chunks, start=1):
    chunk_info.append({
        'chunk': chunk_no,
        'size': len(chunk_rows),
        'label_distribution': get_label_distribution(chunk_rows),
    })

    # Persist this chunk as chunk_<number>.pickle
    with open(f'{save_path}/chunk_{chunk_no}.pickle', 'wb') as out_file:
        pickle.dump(chunk_rows, out_file)





In [54]:
import pandas as pd

# Tabulate the per-chunk summaries: one row per chunk, one column per recorded field
df = pd.DataFrame(data=chunk_info)

# Show the table as plain text in the cell output
print(df)

   chunk   size                                 label_distribution
0      1  20833  {0: 2099, 1: 2099, 2: 2083, 3: 2076, 4: 2101, ...
1      2   4166  {0: 422, 1: 425, 2: 413, 3: 425, 4: 395, 5: 43...
2      3  12500  {0: 1256, 1: 1241, 2: 1234, 3: 1227, 4: 1235, ...
3      4   4166  {0: 421, 1: 410, 2: 438, 3: 447, 4: 434, 5: 40...
4      5   8335  {0: 802, 1: 825, 2: 832, 3: 825, 4: 835, 5: 86...


In [59]:
# Write the chunk summary table next to the chunk pickles, without the index column
info_csv_path = f"{save_path}/5_chunks_info.csv"
df.to_csv(info_csv_path, index=False)
