In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.datasets import cifar10

2023-07-20 03:58:54.976395: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load training and testing data separately
(x_train, y_train), (x_test, y_test) = cifar10.load_data()


In [6]:
# Reshape x_train from 4D to 2D array (number of samples, width*height*channels)
x_train = x_train.reshape(x_train.shape[0], -1)

# Reshape y_train to 1D array
y_train = y_train.reshape(-1)

# Combine training data and labels into a single numpy array for easier manipulation
train_data = np.column_stack((x_train, y_train))

# Randomly shuffle the training data
np.random.shuffle(train_data)


In [10]:
import numpy as np

# Generate an array of 20 random numbers around the mean using a normal distribution
mean = 0.05
std_dev = 0.005  # Small standard deviation to keep numbers close to the mean
array = np.random.normal(mean, std_dev, 20)

# Clip values to range [0,1]
array = np.clip(array, 0, 1)

# Normalize the array so the sum of all elements equals 1
array /= np.sum(array)

print(array)
print(np.sum(array))  # This should print '1.0'


[0.04736183 0.05161072 0.05689929 0.05039335 0.04208938 0.05147302
 0.05004039 0.04254451 0.05239361 0.05142695 0.05020426 0.03729472
 0.05356149 0.05485727 0.0554516  0.04848357 0.04788524 0.04906771
 0.05829911 0.048662  ]
0.9999999999999998


In [12]:
total_samples = len(train_data)

# Distribute the total number of samples proportionally and round to the nearest integer
sizes = np.round(array * total_samples).astype(int)

# Adjust the last element to ensure the sum equals to total_samples
sizes[-1] = total_samples - np.sum(sizes[:-1])

sizes, np.sum(sizes)  # This should print '50000'

(array([2368, 2581, 2845, 2520, 2104, 2574, 2502, 2127, 2620, 2571, 2510,
        1865, 2678, 2743, 2773, 2424, 2394, 2453, 2915, 2433]),
 50000)

In [13]:
# Create an array of indices at which to split the training data
split_indices = np.cumsum(sizes)[:-1]

# Split the training data into chunks of different sizes
chunks = np.split(train_data, split_indices)


In [14]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique_labels, counts))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    proportions = {}
    for label, count in label_distribution.items():
        proportions[label] = count / chunk_size
    return proportions




In [15]:
import os
folder = "20_chunks_iid"
if not os.path.exists(folder):
    os.makedirs(folder)


In [16]:
iid_folder = "iid"
iid_folder = os.path.join(folder, iid_folder)
if not os.path.exists(iid_folder):
    os.makedirs(iid_folder)


In [17]:
# iid distribution

chunk_info = []
for i, chunk in enumerate(chunks):

    label_distribution = get_label_distribution(chunk)
    chunk_size =len(chunk)
    # save info
    info = {}
    info['chunk'] = i+1
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution
    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{iid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [18]:
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# print dataframe
print(df)



    chunk  size                                 label_distribution  \
0       1  2368  {0: 237, 1: 219, 2: 243, 3: 260, 4: 245, 5: 23...   
1       2  2581  {0: 263, 1: 252, 2: 268, 3: 242, 4: 241, 5: 26...   
2       3  2845  {0: 288, 1: 293, 2: 284, 3: 298, 4: 281, 5: 29...   
3       4  2520  {0: 243, 1: 302, 2: 251, 3: 252, 4: 252, 5: 22...   
4       5  2104  {0: 238, 1: 201, 2: 225, 3: 214, 4: 209, 5: 20...   
5       6  2574  {0: 246, 1: 291, 2: 247, 3: 263, 4: 275, 5: 25...   
6       7  2502  {0: 237, 1: 244, 2: 250, 3: 267, 4: 246, 5: 24...   
7       8  2127  {0: 218, 1: 198, 2: 182, 3: 208, 4: 218, 5: 23...   
8       9  2620  {0: 279, 1: 280, 2: 240, 3: 291, 4: 257, 5: 24...   
9      10  2571  {0: 272, 1: 243, 2: 269, 3: 277, 4: 234, 5: 23...   
10     11  2510  {0: 258, 1: 216, 2: 279, 3: 236, 4: 264, 5: 23...   
11     12  1865  {0: 176, 1: 171, 2: 178, 3: 188, 4: 200, 5: 19...   
12     13  2678  {0: 277, 1: 284, 2: 301, 3: 233, 4: 271, 5: 25...   
13     14  2743  {0:

In [19]:
# Save dataframe to csv
df.to_csv(f"{iid_folder}/chunks_info.csv", index=False)


In [16]:
def make_distribution_unbalanced(chunk, seed=None):
    # The label is in the last column
    labels = chunk[:, -1]

    # Get unique labels in the chunk
    unique_labels = np.unique(labels)

    # Set a random seed for reproducibility
    np.random.seed(seed)

    # Assign random weights to each unique label
    label_weights = np.random.rand(len(unique_labels))

    # Normalize weights so they sum to 1
    label_weights /= label_weights.sum()

    # Create a new chunk with an unbalanced label distribution
    unbalanced_chunk = np.empty((0, chunk.shape[1]), dtype=chunk.dtype)

    for label, weight in zip(unique_labels, label_weights):
        # Calculate the number of samples for this label
        num_samples = int(weight * len(chunk))

        # Get all samples with this label
        label_samples = chunk[labels == label]

        # Randomly select samples of this label
        selected_samples = np.random.choice(len(label_samples), size=num_samples)

        # Add selected samples to the new chunk
        unbalanced_chunk = np.vstack([unbalanced_chunk, label_samples[selected_samples]])

    return unbalanced_chunk



In [17]:
noniid_folder = "noniid"
noniid_folder = os.path.join(folder, noniid_folder)
if not os.path.exists(noniid_folder):
    os.makedirs(noniid_folder)


In [18]:

# Store size and label distribution of each chunk, and save each chunk as a pickle file
unbalanced_chunk_info = []
# Now, make the distribution unbalanced for each chunk before saving
for i, chunk in enumerate(chunks):
    unbalanced_chunk = make_distribution_unbalanced(chunk, seed=i)

    info = {}
    info['chunk'] = i+1
    label_distribution = get_label_distribution(unbalanced_chunk)
    chunk_size =len(unbalanced_chunk)
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution

    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    unbalanced_chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{noniid_folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(unbalanced_chunk, f)


In [19]:
# Convert list of dictionaries to DataFrame for better visualization
noniid_df = pd.DataFrame(unbalanced_chunk_info)

# print dataframe
print(noniid_df)

   chunk   size                                 label_distribution  \
0      1   4996  {0: 445, 1: 580, 2: 489, 3: 442, 4: 344, 5: 52...   
1      2   7495  {0: 994, 1: 1717, 3: 720, 4: 349, 5: 220, 6: 4...   
2      3   9995  {0: 1215, 1: 72, 2: 1531, 3: 1213, 4: 1171, 5:...   
3      4  12494  {0: 1472, 1: 1893, 2: 777, 3: 1365, 4: 2387, 5...   
4      5  14996  {0: 2507, 1: 1418, 2: 2521, 3: 1853, 4: 1808, ...   

                                   label_proportions  
0  {0: 0.08907125700560449, 1: 0.1160928742994395...  
1  {0: 0.13262174783188793, 1: 0.2290860573715810...  
2  {0: 0.12156078039019509, 1: 0.0072036018009004...  
3  {0: 0.11781655194493357, 1: 0.1515127261085321...  
4  {0: 0.16717791411042945, 1: 0.0945585489463857...  


In [21]:
# Save dataframe to csv
noniid_df.to_csv(f"{noniid_folder}/chunks_info.csv", index=False)


In [68]:
df.size

30

In [69]:
df

Unnamed: 0,chunk,size,label_distribution
0,1,2173,"{0: 211, 1: 226, 2: 215, 3: 207, 4: 232, 5: 21..."
1,2,2173,"{0: 209, 1: 212, 2: 228, 3: 230, 4: 223, 5: 24..."
2,3,8695,"{0: 867, 1: 915, 2: 852, 3: 840, 4: 858, 5: 87..."
3,4,2173,"{0: 203, 1: 201, 2: 211, 3: 242, 4: 230, 5: 20..."
4,5,4347,"{0: 447, 1: 425, 2: 397, 3: 417, 4: 462, 5: 45..."
5,6,2173,"{0: 216, 1: 210, 2: 226, 3: 224, 4: 209, 5: 19..."
6,7,13043,"{0: 1231, 1: 1256, 2: 1333, 3: 1319, 4: 1328, ..."
7,8,2173,"{0: 228, 1: 240, 2: 221, 3: 188, 4: 210, 5: 19..."
8,9,10869,"{0: 1166, 1: 1086, 2: 1085, 3: 1120, 4: 1056, ..."
9,10,2181,"{0: 222, 1: 229, 2: 232, 3: 213, 4: 192, 5: 24..."


In [71]:
df.label_distribution


0    {0: 211, 1: 226, 2: 215, 3: 207, 4: 232, 5: 21...
1    {0: 209, 1: 212, 2: 228, 3: 230, 4: 223, 5: 24...
2    {0: 867, 1: 915, 2: 852, 3: 840, 4: 858, 5: 87...
3    {0: 203, 1: 201, 2: 211, 3: 242, 4: 230, 5: 20...
4    {0: 447, 1: 425, 2: 397, 3: 417, 4: 462, 5: 45...
5    {0: 216, 1: 210, 2: 226, 3: 224, 4: 209, 5: 19...
6    {0: 1231, 1: 1256, 2: 1333, 3: 1319, 4: 1328, ...
7    {0: 228, 1: 240, 2: 221, 3: 188, 4: 210, 5: 19...
8    {0: 1166, 1: 1086, 2: 1085, 3: 1120, 4: 1056, ...
9    {0: 222, 1: 229, 2: 232, 3: 213, 4: 192, 5: 24...
Name: label_distribution, dtype: object

In [21]:
import numpy as np
import os
import pickle
import pandas as pd

from tensorflow.keras import datasets
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

# Define the proportion of each chunk
chunk_sizes = [0.1, 0.15, 0.2, 0.25, 0.3]

# Define the proportion of each category within each chunk
category_dist_in_chunks = [
    [0.3, 0.2, 0.15, 0.1, 0.05, 0.05, 0.05, 0.05, 0.025, 0.025],
    [0.1, 0.2, 0.1, 0.15, 0.1, 0.1, 0.1, 0.05, 0.05, 0.1],
    [0.1, 0.05, 0.15, 0.1, 0.2, 0.1, 0.1, 0.05, 0.1, 0.05],
    [0.05, 0.1, 0.05, 0.2, 0.1, 0.15, 0.1, 0.1, 0.05, 0.1],
    [0.1, 0.05, 0.1, 0.05, 0.15, 0.1, 0.2, 0.1, 0.1, 0.05]
]

# Normalize the proportions within each chunk
category_dist_in_chunks = [np.array(dist)/sum(dist) for dist in category_dist_in_chunks]

def save_to_pkl(images, labels, path):
    """Save the images and labels to a Pickle file."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        # Combine images and labels into one list
        data = [np.append(image.flatten(), label) for image, label in zip(images, labels)]
        pickle.dump(data, f)
    print(f'Dataset saved to {path}')

# Load CIFAR10 data
(train_images, train_labels), _ = datasets.cifar10.load_data()

# Create a dictionary where the keys are the labels and the values are the images
data_dict = defaultdict(list)
for image, label in zip(train_images, train_labels):
    data_dict[label[0]].append(image)

chunk_info = []
# Create chunks
for i in range(5):
    chunk_images = []
    chunk_labels = []
    total_samples = sum([len(images) for images in data_dict.values()])
    chunk_size = int(total_samples * chunk_sizes[i])
    
    for label, images in data_dict.items():
        n_samples = int(len(images) * category_dist_in_chunks[i][label])
        # Split data for this category
        chunk, data_dict[label] = train_test_split(images, train_size=n_samples, shuffle=True)
        chunk_images.extend(chunk)
        chunk_labels.extend([label]*len(chunk))
    
    # Correct for rounding errors
    diff = chunk_size - len(chunk_images)
    if diff != 0:
        label, images = max(data_dict.items(), key=lambda x: len(x[1]))
        extra_samples, data_dict[label] = train_test_split(images, train_size=diff, shuffle=True)
        chunk_images.extend(extra_samples)
        chunk_labels.extend([label]*len(extra_samples))
    
    # Save the chunk
    save_to_pkl(chunk_images, chunk_labels, f'data2/chunk_{i+1}.pkl')

    # Collect the chunk information
    chunk_info.append({
        'chunk': i+1,
        'size': len(chunk_images),
        'label_distribution': dict(Counter(chunk_labels))
    })

# Save the chunk information into a CSV file
df = pd.DataFrame(chunk_info)
df.to_csv('data2/chunk_info.csv', index=False)


Dataset saved to data2/chunk_1.pkl
Dataset saved to data2/chunk_2.pkl
Dataset saved to data2/chunk_3.pkl


ValueError: train_size=4463 should be either positive and smaller than the number of samples 3765 or a float in the (0, 1) range

In [1]:
import tensorflow as tf

2023-08-01 09:44:12.508986: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-01 09:44:12.718013: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
import tensorflow as tf

2023-08-01 21:45:40.427380: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-01 21:45:40.516313: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
epoch = 200
datasize = 12500
batch_size = 128

total_1 = epoch * datasize / batch_size
total_1

19531.25

In [3]:
def get_cosine_lr_scheduler(total_update_times: int = 380, initial_learning_rate: float = 0.1):
    lr_scheduler = tf.keras.experimental.CosineDecay(initial_learning_rate= initial_learning_rate, 
                                                    decay_steps= total_update_times)
    return lr_scheduler

In [4]:
total_update_times = 390
lr_scheduler = get_cosine_lr_scheduler(total_update_times= total_update_times)

for update_time in range(total_update_times):
    print(f"update time: {update_time + 1}, learning rate: {float(lr_scheduler(update_time))}")


2023-08-01 21:47:20.307834: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-01 21:47:20.337595: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-01 21:47:20.337633: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-01 21:47:20.339589: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-01 21:47:20.339621: I tensorflow/compile

update time: 1, learning rate: 0.10000000149011612
update time: 2, learning rate: 0.09999837726354599
update time: 3, learning rate: 0.0999935194849968
update time: 4, learning rate: 0.09998539835214615
update time: 5, learning rate: 0.09997405111789703
update time: 6, learning rate: 0.09995945543050766
update time: 7, learning rate: 0.09994161128997803
update time: 8, learning rate: 0.09992053359746933
update time: 9, learning rate: 0.09989621490240097
update time: 10, learning rate: 0.09986865520477295
update time: 11, learning rate: 0.09983786195516586
update time: 12, learning rate: 0.09980384260416031
update time: 13, learning rate: 0.09976658225059509
update time: 14, learning rate: 0.09972609579563141
update time: 15, learning rate: 0.09968238323926926
update time: 16, learning rate: 0.09963544458150864
update time: 17, learning rate: 0.09958528727293015
update time: 18, learning rate: 0.09953191131353378
update time: 19, learning rate: 0.09947532415390015
update time: 20, learn

In [None]:
epoch = 200
datasize = 12500
batch_size = 128

total_1 = epoch * datasize / batch_size
initial_lr = 0.1
lr_scheduler = get_cosine_lr_scheduler(initial_learning_rate= initial_lr,
                                       total_update_times= total_1)

for update_time in range(total_update_times):
    print(f"update time: {update_time + 1}, learning rate: {float(lr_scheduler(update_time))}")
