In [1]:
import os
import struct
import numpy as np
import zipfile
import tables as tb
import pickle


class Bunch(dict):
    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.__dict__ = self
        
        
class MPF(Bunch):
    
    def __init__(self, buffer, *args, **kwds):
        super().__init__(*args, **kwds)
        size = struct.unpack('l', buffer[:4])[0]
        dt = np.dtype({'names': 
                       ['Format code', 'text', 
                        'Code type', 'Code length',
                        'Data type', 'nrows', 'ndims'],
                       'formats': f'S8 S{size-62} S20 h S20 l l'.split(' ')})
        self.header = np.frombuffer(buffer[4:size], dtype=dt)
        self.feature = buffer[size:]
        
    def __iter__(self):
        n = self.header['Code length'][0]
        ndims = self.header['ndims'][0]
        m =  n + ndims
        for i in range(0, m * self.header['nrows'][0], m):
            label = self.feature[i: i + n].decode('gb18030')
            data = np.frombuffer(self.feature[i + n : m], 'B')
            yield data, label


class PTTS(Bunch):
    def __init__(self, buffer, *args, **kwds):
        super().__init__(*args, **kwds)

        
class POT(Bunch):
    
    def __init__(self, buffer, *args, **kwds):
        super().__init__(*args, **kwds)
        self.buffer = buffer
        self.dt = np.dtype({'names': ['Sample size', 'Tag code', 'Stoke number'],
                       'formats': 'H S4 H'.split(' ')})
        
    def __iter__(self):
        j = 0
        for i, _ in enumerate(self.buffer):
            if i == j:
                item = np.frombuffer(buffer[j : j + 8], dtype=self.dt)
                size = item['Sample size'][0]
                tag_code = item['Tag code'][0].decode('gb18030')
                stroke_num = item['Stoke number'][0]
                strokes = np.frombuffer(self.buffer[j + 8: j + size], dtype='h')
                j += size
                print(i)
            else:
                continue
            yield tag_code, stroke_num, strokes
            
        
class GNT(Bunch):
    
    def __init__(self, buffer, *args, **kwds):
        super().__init__(*args, **kwds)
        self.buffer = buffer
          
    def __iter__(self):
        j = 0
        for i, _ in enumerate(self.buffer):
            if i == j:
                size = np.frombuffer(self.buffer[j : j + 4], 'I')[0]
                tag = self.buffer[j + 4: j + 6].decode('gb18030')
                width, height = np.frombuffer(self.buffer[j + 6: j + 10], 'H')
                bitmap = np.frombuffer(self.buffer[j + 10: j + size], 'B').reshape((height, width))
                j += size
                yield {tag : bitmap}
        
class DGR(Bunch):
    
    def __init__(self, buffer, *args, **kwds):
        super().__init__(*args, **kwds)
        

class HW(Bunch):
    
    def __init__(self, root, filename, *args, **kwds):
        super().__init__(*args, **kwds)
        self._type(filename)
        path = f'{root}{filename}'
        self.Z = zipfile.ZipFile(path)
        
    def _type(self, filename):
        if 'gnt' in filename:
            self['type'] = 'gnt'
        elif 'pot' in filename:
            self['type'] = 'pot'
        else:
            self['type'] = 'mpf'
            
    def __iter__(self):
        for info in self.Z.infolist():
            if not info.is_dir():
                buffer = self.Z.read(info)
                yield buffer
        
class Feature(Bunch):

    def __init__(self, root, *args, **kwds):
        super().__init__(*args, **kwds)
        for filename in os.listdir(root):
            name, _ = os.path.splitext(filename)
            name = name.replace('.', '')
            hw = HW(root, filename)
            self[name] = hw

In [2]:
root = 'E:/OCR/CASIA/HW/'

In [3]:
%%time
ft = Feature(root)
dataset = ft.HWDB11tst_gnt

Wall time: 616 ms


In [4]:
%%time
gnts = []
for buffer in iter(dataset):
    gnts.append(GNT(buffer))

Wall time: 11.4 s


In [6]:
gnt = gnts[0]

In [7]:
for item in iter(gnt):
    break

In [8]:
item

{'角': array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        ...,
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255]], dtype=uint8)}

In [23]:
%%time
X = np.asanyarray([item for item in iter(gnt)])

Wall time: 33min 29s


In [41]:
A = pd.DataFrame.from_dict(X)