In [1]:
#!pip install h5py lz4 ujson msgpack luxai_s2

In [2]:
import h5py
import msgpack
from replay2 import load_replay
import numpy as np 
import lz4.frame
import copy

In [3]:

def compress_step_lz4(st):
    """Pack a step's non-board fields with msgpack and LZ4-frame compress them.

    Args:
        st: dict-like step; must contain the keys 'board', 'ep_id' and 'phase'.

    Returns:
        A new dict with the original 'board' object (same reference, not
        copied), the 'ep_id' and 'phase' values, and the compressed payload
        stored under 'comp_lz4'. Note that 'ep_id' and 'phase' are also part
        of the compressed payload, since only 'board' is excluded from it.
    """
    payload = {}
    for name, value in st.items():
        if name != 'board':
            payload[name] = value
    # Compress before building the result so a packing failure surfaces first.
    blob = lz4.frame.compress(msgpack.packb(payload))
    return {
        'board': st['board'],
        'ep_id': st['ep_id'],
        'phase': st['phase'],
        'comp_lz4': blob,
    }

def compress_step(st):
    """Pack a step's non-board fields with msgpack (no LZ4 compression).

    Args:
        st: dict-like step; must contain the keys 'board', 'ep_id' and 'phase'.

    Returns:
        A new dict with the original 'board' object (same reference), the
        'ep_id' and 'phase' values, and the msgpack bytes under 'comp'.
    """
    non_board = dict((name, value) for name, value in st.items() if name != 'board')
    packed = msgpack.packb(non_board)
    result = {'board': st['board']}
    result['ep_id'] = st['ep_id']
    result['phase'] = st['phase']
    result['comp'] = packed
    return result


def extract_step_h5_lz4(hf, ix):
    """Rebuild one step dict from column-wise storage with an LZ4 payload.

    Args:
        hf: mapping whose keys are dataset names and whose values can be
            indexed with ``ix`` (an open h5py.File in this notebook).
        ix: index of the step to read from every dataset.

    Returns:
        A dict with 'board' (deep copy of the values gathered from every
        ``board_*`` dataset, keyed by the name without that prefix) plus all
        fields obtained by LZ4-decompressing and msgpack-unpacking the
        'comp_lz4' entry.
    """
    prefix = "board_"
    fields = {"board": {}}
    for name in hf:
        value = hf[name][ix]
        if name.startswith(prefix):
            fields["board"][name[len(prefix):]] = value
        else:
            fields[name] = value

    step = {'board': copy.deepcopy(fields['board'])}
    packed = lz4.frame.decompress(fields['comp_lz4'])
    step.update(msgpack.unpackb(packed))
    return step

def extract_step_h5(hf, ix):
    """Rebuild one step dict from column-wise storage with a plain msgpack payload.

    Args:
        hf: mapping whose keys are dataset names and whose values can be
            indexed with ``ix``.
        ix: index of the step to read from every dataset.

    Returns:
        A dict with 'board' (deep copy of the values gathered from every
        ``board_*`` dataset, keyed by the name without that prefix) plus all
        fields obtained by msgpack-unpacking the 'comp' entry.
    """
    prefix = "board_"
    row = {"board": {}}
    for dataset_name in hf:
        entry = hf[dataset_name][ix]
        if not dataset_name.startswith(prefix):
            row[dataset_name] = entry
        else:
            row["board"][dataset_name[len(prefix):]] = entry

    restored = {'board': copy.deepcopy(row['board'])}
    restored.update(msgpack.unpackb(row['comp']))
    return restored


In [4]:
import os
from tqdm import tqdm

# Build the in-memory list of compressed steps from every episode in `ep_dir`.
# Episodes are discovered through their "<name>_info.json" file; the matching
# "<name>.json" file is passed to load_replay together with it.
total_compressed_steps = []
total_games = 0
ep_dir = "../archive4/episodes/"
info_suffix = "_info.json"
max_games = None  # set to a small int to process only the first few episodes

# os.listdir() returns entries in arbitrary order; sort them so the order of
# steps (and therefore every step index in the written dataset) is reproducible.
meta_fnames = sorted(x for x in os.listdir(ep_dir) if x.endswith(info_suffix))

for meta_fname in tqdm(meta_fnames[:max_games]):
    ep_fname = meta_fname[:-len(info_suffix)] + ".json"
    # The meaning of 1200 and 100 is defined by replay2.load_replay (not visible here).
    r = load_replay(ep_dir + ep_fname, ep_dir + meta_fname, 1200, 100)
    total_compressed_steps.extend(compress_step_lz4(x) for x in r.steps)
    total_games += 1

print(total_games, len(total_compressed_steps))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 655/655 [35:56<00:00,  3.29s/it]

655 653402





In [5]:
# Write all collected steps to HDF5 column-wise: one dataset per board field
# (named "board_<key>") and one per remaining top-level field, each holding
# that field's value for every step in list order.
with h5py.File('dataset_lz4_2.h5', 'w') as hf:
    # The first step defines which keys become datasets.
    first_step = total_compressed_steps[0]

    for board_key in first_step["board"]:
        hf.create_dataset(
            f"board_{board_key}",
            data=np.array([step['board'][board_key] for step in total_compressed_steps]),
        )

    for field in first_step:
        if field != "board":
            if field != "comp_lz4":
                dta = np.array([step[field] for step in total_compressed_steps])
            else:
                # Byte payloads go through a fixed-width bytes array wrapped in np.void
                # (presumably so h5py stores them as opaque binary, not strings — TODO confirm).
                dta = np.void(
                    np.array([step[field] for step in total_compressed_steps], dtype=np.bytes_)
                )
            hf.create_dataset(f"{field}", data=dta)

In [6]:
# List the HDF5 files in the working directory with their sizes and timestamps.
!ls -al *.h5

-rw-rw-r-- 1 nea nea 27585331780 апр 22 02:20 dataset_lz4_2.h5
-rw-rw-r-- 1 nea nea 21841928200 апр 18 10:28 dataset_lz4.h5


In [7]:
# Open a stored dataset read-only; the handle stays open until the hf.close() cell.
# NOTE(review): this reads 'dataset_lz4.h5', while the write cell produces
# 'dataset_lz4_2.h5' — confirm which file is meant to be inspected.
hf = h5py.File('dataset_lz4.h5', 'r')

In [8]:
# Spot check: decode a single stored step (index 1234) back into a plain dict.
stp = extract_step_h5_lz4(hf, 1234)

In [9]:
# Display the decoded step to inspect the round-trip result by eye.
stp

{'board': {'factories_per_team': 4,
  'ice': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int8),
  'lichen': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int8),
  'lichen_strains': array([[-1, -1, -1, ..., -1, -1, -1],
         [-1, -1, -1, ..., -1, -1, -1],
         [-1, -1, -1, ..., -1, -1, -1],
         ...,
         [-1, -1, -1, ..., -1, -1, -1],
         [-1, -1, -1, ..., -1, -1, -1],
         [-1, -1, -1, ..., -1, -1, -1]], dtype=int8),
  'ore': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
      

In [10]:
# Release the HDF5 file handle opened for the spot check.
hf.close()