In [5]:
import os
import struct

import numpy as np
import scipy.misc
import skimage.exposure


def read_gnt_in_directory(gnt_dirpath):
    """Yield ``(bitmap, tagcode)`` for every sample of every ``.gnt`` file in a directory.

    Each sample record is a 10-byte header followed by ``width * height``
    bytes of pixel data. The header is decoded as:

    * bytes 0-3: total record size, header included (little-endian)
    * bytes 4-5: tag code (byte 4 is the high byte)
    * bytes 6-7: width (little-endian)
    * bytes 8-9: height (little-endian)

    Files are visited in sorted name order so that the position of a sample
    in the stream is the same on every run.

    Args:
        gnt_dirpath: directory to scan (not recursive); only entries whose
            name ends with ``.gnt`` are read.

    Yields:
        ``(bitmap, tagcode)`` where ``bitmap`` is a ``uint8`` array of shape
        ``(height, width)`` and ``tagcode`` is a plain Python ``int``.

    Raises:
        ValueError: if a file ends in the middle of a header or of a bitmap.
        AssertionError: if the record size stored in the header disagrees
            with ``width * height``.
    """
    header_size = 10

    def samples(f):
        # read samples from f until no bytes remaining
        while True:
            header = np.fromfile(f, dtype='uint8', count=header_size)
            if not header.size:
                break
            if header.size < header_size:
                raise ValueError(
                    'truncated sample header: expected %d bytes, got %d'
                    % (header_size, header.size))

            # Decode through struct so the fields are Python ints. Shifting
            # and adding the uint8 elements directly can wrap around at 256
            # (NumPy keeps the uint8 type when combined with a Python int).
            raw = header.tobytes()
            sample_size, = struct.unpack('<I', raw[0:4])
            tagcode, = struct.unpack('>H', raw[4:6])
            width, height = struct.unpack('<HH', raw[6:10])
            assert header_size + width*height == sample_size, \
                'header size field does not match width*height'

            # reshape raises ValueError if fewer than width*height bytes remain
            bitmap = np.fromfile(f, dtype='uint8', count=width*height).reshape((height, width))
            yield bitmap, tagcode

    for file_name in sorted(os.listdir(gnt_dirpath)):
        if file_name.endswith('.gnt'):
            file_path = os.path.join(gnt_dirpath, file_name)
            with open(file_path, 'rb') as f:
                for bitmap, tagcode in samples(f):
                    yield bitmap, tagcode


def normalize_bitmap(bitmap, size=256, border=4):
    """Pad a 2-D bitmap to a square, resize it and surround it with a border.

    The shorter axis is padded with the value 255 until the bitmap is
    square, the square is resized to ``size - 2*border`` pixels per side,
    and a ``border``-pixel frame of value 255 is added so that the final
    image is ``size`` x ``size``. A leading axis of length 1 is prepended.

    Args:
        bitmap: 2-D array indexed as ``(row, column)``.
        size: side length of the returned image (default 256).
        border: width in pixels of the 255-valued frame added on every side
            (default 4).

    Returns:
        Array of shape ``(1, size, size)``.

    Raises:
        AssertionError: if the resized image does not have the expected shape.

    Note:
        ``scipy.misc.imresize`` was removed in SciPy 1.3, so this function
        needs an older SciPy (with Pillow installed).
    """
    rows, cols = bitmap.shape

    # pad the bitmap to make it squared; when the difference is odd the
    # extra pixel goes on the trailing side so the result is exactly square
    diff = abs(rows - cols)
    before = diff // 2
    after = diff - before
    if rows < cols:
        pad_dims = ((before, after), (0, 0))
    else:
        pad_dims = ((0, 0), (before, after))
    bitmap = np.pad(bitmap, pad_dims, mode='constant', constant_values=255)

    # rescale and add empty border
    inner = size - 2 * border
    bitmap = scipy.misc.imresize(bitmap, (inner, inner))
    bitmap = np.pad(bitmap, ((border, border), (border, border)),
                    mode='constant', constant_values=255)
    assert bitmap.shape == (size, size)

    bitmap = np.expand_dims(bitmap, axis=0)
    assert bitmap.shape == (1, size, size)
    return bitmap

def preprocess_bitmap(bitmap):
    """Stretch the contrast of ``bitmap`` between its 2nd and 98th percentiles.

    The two percentiles must be more than 10 apart; otherwise the assertion
    fails. The result is whatever ``skimage.exposure.rescale_intensity``
    returns for that input range.
    """
    low, high = np.percentile(bitmap, (2, 98))
    assert abs(low - high) > 10

    stretched = skimage.exposure.rescale_intensity(bitmap, in_range=(low, high))

    # An Otsu threshold (skimage.filters.threshold_otsu) was tried at this
    # point and is intentionally left disabled.
    return stretched


def tagcode_to_unicode(tagcode):
    """Return the text obtained by decoding a 16-bit tag code as GB2312.

    The code is serialized as two bytes, high byte first, and those bytes
    are decoded with the ``gb2312`` codec.
    """
    gb_bytes = struct.pack('>H', tagcode)
    return gb_bytes.decode('gb2312')

def unicode_to_tagcode(tagcode_unicode):
    """Return the 16-bit tag code for a text string encoded as GB2312.

    The string is encoded with the ``gb2312`` codec and the resulting two
    bytes are read as one unsigned integer, high byte first.
    """
    gb_bytes = tagcode_unicode.encode('gb2312')
    (code,) = struct.unpack('>H', gb_bytes)
    return code

In [6]:
HOME='/home/roger/Desktop/chinese/HWDB1.1trn/'
DATASET='/home/roger/Desktop/chinese/HWDB1.1trn_imgs/'
from scipy.misc import toimage
from os.path import join
import cv2

# Export every sample under HOME as DATASET/<index>.jpg and record one
# "<image path> <tagcode>" line per sample in a manifest file inside HOME.
# NOTE(review): every normalized bitmap is also kept in dset_bitmap, so
# memory use grows with the number of samples.
dset_bitmap = {}
dset_tagcode = {}

# `with` closes the manifest even if reading or writing a sample fails.
with open(join(HOME, 'path_HWDB1.1trn.txt'), 'w') as pathout:
    for i, (bitmap, tagcode) in enumerate(read_gnt_in_directory(HOME)):
        dset_bitmap[i]  = normalize_bitmap(bitmap)
        dset_tagcode[i] = tagcode

        fout = join(DATASET, str(i)+'.jpg')
        # cv2.imwrite reports failure through its return value instead of
        # raising, so check it; the manifest line is written only after the
        # image has actually been saved.
        if not cv2.imwrite(fout, dset_bitmap[i][0]):
            raise IOError('could not write image %s' % fout)
        pathout.write('%s %d\n' % (fout, tagcode))

        # progress report every 1000 samples
        if i % 1000 == 0 and i != 0:
            print(i)

In [12]:
from PIL import Image
# `img` was not assigned in any cell of this notebook, so this cell failed
# with NameError on a fresh kernel.
# NOTE(review): binding it to the last normalized bitmap is an assumption
# that matches the recorded (256, 256) output -- confirm this is the image
# that was meant to be shown.
img = dset_bitmap[i][0]
print(img.shape)
toimage(img).show()

(256, 256)


In [23]:
# Sanity check: tag code 45217 (0xB0A1) printed as UTF-8 bytes.
print(tagcode_to_unicode(45217).encode('utf-8'))

啊
