In [60]:
import struct
import os
import numpy as np
from tqdm import tqdm

In [61]:
# Maps the header's "Data type" text to the matching struct format character.
_VECTOR_TYPE_FORMATS = {
    'unsigned char': 'B',
    'short': 'h',
    'float': 'f',
}


def _read_null_terminated(f):
    """Return the bytes read from ``f`` up to (not including) the next NUL byte.

    Raises:
        ValueError: if the stream ends before a NUL byte is found. Without this
            check a truncated file would loop forever, because ``f.read(1)``
            keeps returning ``b''`` at end of file.
    """
    chunks = []
    while True:
        char = f.read(1)
        if not char:
            raise ValueError(
                "Unexpected end of file while reading null-terminated text"
            )
        if char == b'\0':
            return b''.join(chunks)
        chunks.append(char)


def read_file_header(filename):
    """Parse a binary feature file: the header and every sample record after it.

    Despite the name, the whole file is consumed. Fields are read in this order:

    * size of header   - unsigned 32-bit integer
    * format code      - 8 bytes, ASCII, NUL padded
    * illustration     - ASCII text terminated by a NUL byte
    * code type        - 20 bytes, ASCII, NUL padded
    * code length      - signed 16-bit integer (bytes per label)
    * data type        - 20 bytes, ASCII, NUL padded
    * sample number    - unsigned 32-bit integer
    * dimensionality   - unsigned 32-bit integer
    * ``sample number`` records, each a label of ``code length`` bytes (decoded
      as GBK) followed by ``dimensionality`` values of ``data type``.

    Integers and vector values are decoded as little-endian with standard
    sizes so the result does not depend on the host CPU.
    NOTE(review): little-endian is assumed for this file format - confirm
    against the format specification.

    Args:
        filename: path of the file to read.

    Returns:
        dict with the keys ``size_of_header``, ``format_code``,
        ``illustration``, ``code_type``, ``code_length``, ``data_type``,
        ``sample_number``, ``dimensionality`` and ``samples``. ``samples`` is a
        list of ``{'label': str, 'vector': tuple}`` dicts in file order.

    Raises:
        ValueError: if the data type is not one of 'unsigned char', 'short' or
            'float' (only checked when there is at least one sample), if the
            illustration text is not NUL terminated, or if bytes remain after
            the last sample.
        struct.error: if a fixed-size field or a vector is cut short.
    """
    with open(filename, 'rb') as f:
        size_of_header = struct.unpack('<I', f.read(4))[0]
        format_code = f.read(8).decode('ascii').rstrip('\0')
        illustration = _read_null_terminated(f).decode('ascii')
        code_type = f.read(20).decode('ascii').rstrip('\0')
        code_length = struct.unpack('<h', f.read(2))[0]
        data_type = f.read(20).decode('ascii').rstrip('\0')
        sample_number = struct.unpack('<I', f.read(4))[0]
        dimensionality = struct.unpack('<I', f.read(4))[0]

        samples = []
        if sample_number > 0:
            # The element type is the same for every sample, so resolve it and
            # compile the vector layout once instead of once per record.
            type_format = _VECTOR_TYPE_FORMATS.get(data_type.strip())
            if type_format is None:
                raise ValueError(f"Unsupported data type: {data_type}")
            vector_struct = struct.Struct(f'<{dimensionality}{type_format}')

            for _ in range(sample_number):
                label = f.read(code_length).decode('GBK').rstrip('\0')
                vector = vector_struct.unpack(f.read(vector_struct.size))
                samples.append({
                    'label': label,
                    'vector': vector
                })

        # Every byte must be accounted for; leftovers mean the header fields
        # and the actual payload disagree.
        if f.tell() != os.path.getsize(filename):
            raise ValueError("文件还未完全读取!")

        return {
            "size_of_header": size_of_header,
            "format_code": format_code,
            "illustration": illustration,
            "code_type": code_type,
            "code_length": code_length,
            "data_type": data_type,
            "sample_number": sample_number,
            "dimensionality": dimensionality,
            "samples": samples
        }

In [81]:
# Directory holding the .mpf feature files to convert.
train_dir = '/Users/hpc-419/Downloads/OLHWDB1.1tst/'

# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of `dataset` / `anno` reproducible from run to run. Filtering
# before the loop also keeps the progress bar total equal to the number of
# files actually parsed.
mpf_filenames = sorted(
    name for name in os.listdir(train_dir) if name.endswith('.mpf')
)

anno = []     # labels, one entry per sample
dataset = []  # feature vectors, aligned index-for-index with `anno`
for filename in tqdm(mpf_filenames):
    file_path = os.path.join(train_dir, filename)
    data = read_file_header(file_path)
    for sample in data['samples']:
        anno.append(sample['label'])
        dataset.append(sample['vector'])

100%|██████████| 60/60 [00:03<00:00, 16.80it/s]


In [82]:
# Total number of feature vectors collected across all parsed files.
len(dataset)

224559

In [83]:
# Pack the collected vectors into one uint8 array and write it to disk in
# NumPy's .npy format.
dataset_np = np.asarray(dataset, dtype=np.uint8)
np.save('/Users/hpc-419/dataset/dataset_test.npy', dataset_np)

In [84]:
# Dimensions of the packed array built from the list of vectors.
dataset_np.shape

(224559, 512)

In [85]:
# Element type of the packed array (requested as 'uint8' when it was built).
dataset_np.dtype

dtype('uint8')

In [86]:
# Largest value stored anywhere in the packed array.
dataset_np.max()

250

In [87]:
import json

# Serialize the label list as a JSON array and write it next to the saved
# feature array; entry i is the label of row i in `dataset`.
with open('/Users/hpc-419/dataset/anno_test.json','w') as anno_file:
    anno_file.write(json.dumps(anno))

In [88]:
# Peek at the first decoded label.
anno[0]

'蜒'