In [1]:
import pickle
import numpy as np

In [2]:
def load_data(path):
    with open(path, "rb") as f:
        dataset = pickle.load(f)
    X = dataset[:, :-1]
    y = dataset[:, -1]
    return X, y

In [4]:
X = []
y = []
data_folder = "/home/student02/thaile/working_with_ember_dataset/data"
for i in range(10):
    X_chunk, y_chunk = load_data(f'{data_folder}/chunk_{i}.pickle')
    X.append(X_chunk)
    y.append(y_chunk)
    # break

X = np.concatenate(X, axis=0)
y = np.concatenate(y, axis=0)
print('X shape:', X.shape, '-- y shape:', y.shape)

X shape: (600000, 2381) -- y shape: (600000,)


In [48]:
# check to ensure that each sample in the total training dataset is chosen at least one
unique_samples = np.unique(X, axis=0)

len(unique_samples)



599785

In [5]:
# Combine training data and labels into a single numpy array for easier manipulation
train_data = np.column_stack((X, y))

# Randomly shuffle the training data
np.random.shuffle(train_data)

In [7]:
data_size = len(train_data)
data_size

600000

In [21]:
# Generate proportions for 10 chunks using a normal distribution around the mean
mean = 0.05
std_dev = 0.01
proportions = np.random.normal(mean, std_dev, 7)

# Clip values to ensure they're in range [0,1] and normalize so they sum to 1
proportions = np.clip(proportions, 0, 1)
proportions /= np.sum(proportions)

# Compute the chunk sizes based on proportions, and ensure they're multiples of 10
chunk_sizes = [int(p * data_size) // 10 * 10 for p in proportions]

# Adjust the last chunk size to ensure the total sum is data_size
chunk_sizes[-1] = data_size - sum(chunk_sizes[:-1])

# Sorting in descending order using sorted()
chunk_sizes = sorted(chunk_sizes, reverse=True)
chunk_sizes


[104360, 99720, 93430, 87450, 79130, 77890, 58020]

In [24]:
chunks_portions = [size / data_size for size in chunk_sizes]

chunks_portions

[0.17393333333333333,
 0.1662,
 0.15571666666666667,
 0.14575,
 0.13188333333333332,
 0.12981666666666666,
 0.0967]

In [26]:
# # 1. Segregate the train_data into two lists based on class labels
# data_class_0 = train_data[train_data[:, -1] == 0]
# data_class_1 = train_data[train_data[:, -1] == 1]

# chunks = []

# # 2. For each chunk size, generate a random fraction for imbalance and sample data
# for size in chunk_sizes:
#     # Random fraction for class 0 between 0.1 and 0.9 for example
#     fraction_class_0 = np.random.uniform(0.1, 0.9)
#     num_class_0 = int(size * fraction_class_0)  # fraction of the chunk size for class 0
#     num_class_1 = size - num_class_0  # remaining data for class 1
    
#     # Handle case when available samples are less than required samples
#     if num_class_0 > len(data_class_0):
#         num_class_0 = len(data_class_0)
#         num_class_1 = size - num_class_0
#     if num_class_1 > len(data_class_1):
#         num_class_1 = len(data_class_1)
#         num_class_0 = size - num_class_1
    
#     # Sample without replacement
#     chunk_class_0 = np.random.choice(len(data_class_0), num_class_0, replace=False)
#     chunk_class_1 = np.random.choice(len(data_class_1), num_class_1, replace=False)

#     # Append to chunks
#     chunks.append(np.vstack((data_class_0[chunk_class_0], data_class_1[chunk_class_1])))

#     # 3. Remove the sampled data from main segregated lists
#     data_class_0 = np.delete(data_class_0, chunk_class_0, axis=0)
#     data_class_1 = np.delete(data_class_1, chunk_class_1, axis=0)


In [33]:
# # 1. Segregate the train_data into two lists based on class labels
# data_class_0 = train_data[train_data[:, -1] == 0]
# data_class_1 = train_data[train_data[:, -1] == 1]

# chunks = []

# # 2. For each chunk size, generate a random fraction for imbalance and sample data
# # Note: loop until len(chunk_sizes) - 1
# for size in chunk_sizes[:-2]:
#     # Random fraction for class 0 between 0.1 and 0.9 for example
#     fraction_class_0 = np.random.uniform(0.1, 0.9)
#     num_class_0 = int(size * fraction_class_0)  # fraction of the chunk size for class 0
#     num_class_1 = size - num_class_0  # remaining data for class 1
    
#     # Handle case when available samples are less than required samples
#     if num_class_0 > len(data_class_0):
#         num_class_0 = len(data_class_0)
#         num_class_1 = size - num_class_0
#     if num_class_1 > len(data_class_1):
#         num_class_1 = len(data_class_1)
#         num_class_0 = size - num_class_1
    
#     # Sample without replacement
#     chunk_class_0 = np.random.choice(len(data_class_0), num_class_0, replace=False)
#     chunk_class_1 = np.random.choice(len(data_class_1), num_class_1, replace=False)

#     # Append to chunks
#     chunks.append(np.vstack((data_class_0[chunk_class_0], data_class_1[chunk_class_1])))

#     # 3. Remove the sampled data from main segregated lists
#     data_class_0 = np.delete(data_class_0, chunk_class_0, axis=0)
#     data_class_1 = np.delete(data_class_1, chunk_class_1, axis=0)

# # 4. The last chunk takes all remaining values
# chunks.append(np.vstack((data_class_0, data_class_1)))


In [42]:
# 1. Segregate the train_data into two lists based on class labels
data_class_0 = train_data[train_data[:, -1] == 0]
data_class_1 = train_data[train_data[:, -1] == 1]

chunks = []

# 2. Process until len(chunk_sizes) - 3
for size in chunk_sizes[:-3]:
    # Random fraction for class 0 between 0.1 and 0.9 for example
    fraction_class_0 = np.random.uniform(0.1, 0.9)
    num_class_0 = int(size * fraction_class_0)
    num_class_1 = size - num_class_0
    
    # Sample without replacement
    chunk_class_0 = np.random.choice(len(data_class_0), num_class_0, replace=False)
    chunk_class_1 = np.random.choice(len(data_class_1), num_class_1, replace=False)

    # Append to chunks
    chunks.append(np.vstack((data_class_0[chunk_class_0], data_class_1[chunk_class_1])))

    # Remove the sampled data from main segregated lists
    data_class_0 = np.delete(data_class_0, chunk_class_0, axis=0)
    data_class_1 = np.delete(data_class_1, chunk_class_1, axis=0)

# 3. For the last three chunks, combine, shuffle, and split the data
combined_data = np.vstack((data_class_0, data_class_1))
np.random.shuffle(combined_data)

# Split according to the sizes of the last three chunks
split_index1 = chunk_sizes[-3]  # Size of the third last chunk
split_index2 = split_index1 + chunk_sizes[-2]  # Size of the second last chunk
chunks.append(combined_data[:split_index1])
chunks.append(combined_data[split_index1:split_index2])
chunks.append(combined_data[split_index2:])


In [49]:

# Function to get label distribution in a chunk
def get_label_distribution(chunk):
    # The label is in the last column
    labels = chunk[:, -1]
    unique_labels, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique_labels, counts))


# Function to get label proportions in a chunk
def get_label_proportions(label_distribution, chunk_size):
    proportions = {}
    for label, count in label_distribution.items():
        proportions[label] = count / chunk_size
    return proportions




In [51]:
import os
folder = "./7_chunks"
if not os.path.exists(folder):
    os.makedirs(folder)

folder = "non_iid"
folder = os.path.join(folder, folder)
if not os.path.exists(folder):
    os.makedirs(folder)


In [52]:
# iid distribution

chunk_info = []
for i, chunk in enumerate(chunks):

    label_distribution = get_label_distribution(chunk)
    chunk_size =len(chunk)
    # save info
    info = {}
    info['chunk'] = i+1
    info['size'] = chunk_size
    info['label_distribution'] = label_distribution
    proportions = get_label_proportions(label_distribution, chunk_size)
    info['label_proportions'] = proportions

    chunk_info.append(info)

    # Save unbalanced chunk as a pickle file
    with open(f'{folder}/chunk_{i+1}.pickle', 'wb') as f:
        pickle.dump(chunk, f)


In [54]:
import pandas as pd
# Convert list of dictionaries to DataFrame for better visualization
df = pd.DataFrame(chunk_info)

# print dataframe
print(df)



   chunk    size        label_distribution  \
0      1  104360  {0.0: 86162, 1.0: 18198}   
1      2   99720  {0.0: 19279, 1.0: 80441}   
2      3   93430  {0.0: 23448, 1.0: 69982}   
3      4   87450  {0.0: 44198, 1.0: 43252}   
4      5   79130  {0.0: 46820, 1.0: 32310}   
5      6   77890  {0.0: 46008, 1.0: 31882}   
6      7   58020  {0.0: 34085, 1.0: 23935}   

                                   label_proportions  
0  {0.0: 0.8256228440015332, 1.0: 0.1743771559984...  
1  {0.0: 0.1933313277176093, 1.0: 0.8066686722823...  
2  {0.0: 0.25096863962324734, 1.0: 0.749031360376...  
3  {0.0: 0.5054088050314466, 1.0: 0.4945911949685...  
4  {0.0: 0.5916845696954379, 1.0: 0.4083154303045...  
5  {0.0: 0.5906791629220696, 1.0: 0.4093208370779...  
6  {0.0: 0.5874698379869011, 1.0: 0.4125301620130...  


In [55]:
df.to_csv(f"{folder}/chunks_info.csv", index=False)