In [1]:
# The purpose of this Jupyter notebook is to work out how to build a dataloader whose load is spread evenly.
# We want a function that splits the data, at file granularity, into groups of roughly equal size.

import os, sys
from functools import lru_cache
import glob
import numpy as np
from tqdm import tqdm

def get_list_of_clients(base_dir: str):
    """Count the clients stored in every ``*_data.npz`` file under ``base_dir``.

    Args:
        base_dir: Directory that directly contains the ``<file_id>_data.npz`` files.

    Returns:
        A tuple ``(base_dir, clientlist)`` where ``clientlist`` is a list of
        ``(file_id, numclients)`` pairs. ``file_id`` is the file name without the
        ``_data.npz`` suffix and ``numclients`` is the first-axis length of the
        ``'load'`` array stored in that file. Entries are ordered by file path so
        that repeated runs produce the same listing.
    """
    suffix = '_data.npz'
    # glob returns paths in arbitrary order; sort for a reproducible result.
    paths = sorted(glob.glob(os.path.join(base_dir, '*' + suffix)))
    clientlist = []
    for path in tqdm(paths):
        # Slice the suffix off explicitly: str.rstrip('_data.npz') would strip any
        # trailing run of the characters '_', 'd', 'a', 't', '.', 'n', 'p', 'z',
        # corrupting IDs that happen to end in one of them.
        file_id = os.path.basename(path)[:-len(suffix)]
        # NpzFile keeps the archive open until closed; release it for each file.
        with np.load(path) as archive:
            numclients = archive['load'].shape[0]
        clientlist.append((file_id, numclients))
    return base_dir, clientlist
    

In [2]:
# Build the list of (filename, num_clients) pairs so that it can be saved as a text file.
# get_list_of_clients returns (base_dir, clientlist), so the pairs themselves are in w[1].
# Every .npz file under base_dir is opened, which is slow: the recorded run below
# took roughly ten minutes for 3042 files.

base_dir = '/lcrc/project/NEXTGENOPT/NREL_COMSTOCK_DATA/grouped'
w = get_list_of_clients(base_dir=base_dir)

  0%|                                                                                                                                                                                                | 0/3042 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3042/3042 [09:37<00:00,  5.27it/s]


In [21]:
# Save the (filename, num_clients) pairs gathered in the previous block to a text
# file, one '<filename>,<count>' record per line, in the parent directory of base_dir.
#
# The parent directory is derived with os.path rather than
# base_dir.rstrip('/grouped'): rstrip removes any trailing run of the characters
# '/', 'g', 'r', 'o', 'u', 'p', 'e', 'd', not the literal suffix, and would eat
# into the parent directory's name if it ended in one of those characters.

client_count_path = os.path.join(os.path.dirname(os.path.normpath(base_dir)), 'client_count.txt')
with open(client_count_path, 'w') as file:
    for name, count in w[1]:
        file.write(f'{name},{count}\n')

In [22]:
# partition the (filename, numclients) records into m groups whose client counts sum to approximately the same value

def split_clients_into_m_almost_equal_parts(file_name_with_path: str, m: int):
    """Partition (fileID, count) records into ``m`` groups with similar totals.

    The records are processed from the largest count to the smallest, and each
    one is placed in the group whose running total is currently the smallest
    (ties go to the lowest-numbered group).

    Args:
        file_name_with_path: Path of a text file holding one ``fileID,count``
            record per line. Blank lines are ignored. The count is taken from
            the text after the last comma, so a fileID may itself contain commas.
        m: Number of groups to produce; must be at least 1.

    Returns:
        A list of ``m`` lists of ``(fileID, count)`` tuples. Groups may be empty
        when there are fewer records than groups.

    Raises:
        ValueError: If ``m`` is smaller than 1, or a non-blank line does not end
            in ``,<integer>``.
    """
    if m < 1:
        raise ValueError(f'm must be a positive integer, got {m}')

    # Read file
    data = []
    with open(file_name_with_path, 'r') as file:
        for line in file:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Split on the last comma only so that IDs containing commas survive.
            string, number = record.rsplit(',', 1)
            data.append((string, int(number)))

    # Sort by count, largest first (stable, so equal counts keep file order).
    sorted_data = sorted(data, reverse=True, key=lambda x: x[1])

    # Populate sublists
    sublists = [[] for _ in range(m)]
    sums = [0] * m

    for item, number in sorted_data:
        # Single pass to find the lightest group; min() returns the first
        # minimal index, matching a "lowest index wins" tie-break.
        idx = min(range(m), key=sums.__getitem__)
        sublists[idx].append((item, number))
        sums[idx] += number

    return sublists
        

In [23]:
# Split the saved client counts into 12 groups of roughly equal total size.
# The parent directory of base_dir is derived with os.path; str.rstrip('/grouped')
# strips a set of characters rather than the literal suffix and only happened to
# work for this particular path.
split_clients_into_m_almost_equal_parts(
    os.path.join(os.path.dirname(os.path.normpath(base_dir)), 'client_count.txt'),
    12,
)

[[('G0600370', 10251),
  ('G4400070', 967),
  ('G0800410', 850),
  ('G0501190', 789),
  ('G3100550', 711),
  ('G1600010', 699),
  ('G1200330', 627),
  ('G4801210', 573),
  ('G3901510', 551),
  ('G1800030', 505),
  ('G3400050', 460),
  ('G3300150', 454),
  ('G1801410', 426),
  ('G3900930', 373),
  ('G1200690', 350),
  ('G5500090', 343),
  ('G3901550', 309),
  ('G0200200', 304),
  ('G3600270', 285),
  ('G0600070', 280),
  ('G4200410', 250),
  ('G5500870', 247),
  ('G2600930', 233),
  ('G0800350', 230),
  ('G0101250', 213),
  ('G5107700', 211),
  ('G2600210', 196),
  ('G4500030', 196),
  ('G1901630', 185),
  ('G3900890', 181),
  ('G5401070', 172),
  ('G4500350', 165),
  ('G2200150', 156),
  ('G1200610', 153),
  ('G5400810', 147),
  ('G3701550', 146),
  ('G4801890', 137),
  ('G0100690', 134),
  ('G5500730', 129),
  ('G1701130', 122),
  ('G4803670', 118),
  ('G2200050', 116),
  ('G1801050', 111),
  ('G2701090', 110),
  ('G4201070', 105),
  ('G4701190', 102),
  ('G3701570', 101),
  ('G270141