This notebook converts the unicode labels to tokens, pads the converted labels with `<start>`, `<stop>` and the `<pad>` tokens.

In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import unicodedata
import tarfile
import pandas as pd
from tqdm import tqdm

In [0]:
# tarfile_loc = '/content/drive/My Drive/Project STation/TD/recognition_real_dataset/cropped_data.tar'
tarfile_loc = '/content/drive/My Drive/Project STation/TD/synthetic_cropped.tar.gz'

with tarfile.open(tarfile_loc) as tarf:
    tarf.extractall()

In [0]:
# These ranges are in hexadecimal format
def _unicode_to_idx_map(lower_range, upper_range, start_index):
    l_range = int(lower_range, 16) 
    u_range = int(upper_range, 16)
    name2idx = dict()
    idx = start_index
    for char_code in range(l_range, u_range + 1):
        code_point = chr(char_code)
        char_name = unicodedata.name(code_point)
        name2idx[char_name] = idx
        idx = idx + 1
    return name2idx, idx

In [0]:
start_index = 3  ## Start from 3 as the first three inidices are reserved for <start> <stop> and <PAD> token
name2idx, idx = _unicode_to_idx_map("0900", "094D", start_index)
digits, idx = _unicode_to_idx_map("0966", "096F", idx)
om, idx = _unicode_to_idx_map("0950", "0950", idx)
additional, idx = _unicode_to_idx_map("0958", "095F", idx)

name2idx.update(digits)
name2idx.update(om)
name2idx.update(additional)

In [0]:
def get_labels(text, name2idx):
    labels = list()
    for char in text:
        name = unicodedata.name(char)
        index = name2idx[name]
        labels.append(index)
    return labels

In [0]:
# annotation_file_loc = '/content/cropped_data/annotations.txt'
annotation_file_loc = '/content/cropped_dir/annotation.txt'
with open(annotation_file_loc) as fp:
    # find the max length of the data labels for padding
    labels = list()
    for line in fp.readlines():
        hindi_text = line.split('\t')[1].strip()
        try:
            labels.append(get_labels(hindi_text, name2idx))
        except KeyError:
            pass
    max_len = max([len(label) for label in labels]) + 2  # including start and stop symbols

In [0]:
cols = ['name', 'text', 'labels', 'unpadded_length']
annotation_df = pd.DataFrame()
with open(annotation_file_loc) as fp:
    data = {}
    for line in tqdm(fp.readlines()):
        img_path, hindi_text = line.split('\t')[:2]
        img_path = img_path.strip()
        hindi_text = hindi_text.strip()
        data['name'] = img_path
        data['text'] = f"<start> {hindi_text} <stop>"
        label = [0]  ## Initialize with the <start> token
        try:
            label.extend(get_labels(hindi_text, name2idx))
        except KeyError:
            continue
        label.append(1)  ## End with the <stop> token
        data['unpadded_length'] = int(len(label))
        extra_padding = max_len - len(label)
        label.extend([2] * extra_padding)
        data['labels'] = label
        annotation_df = annotation_df.append(data, ignore_index=True)

100%|██████████| 101660/101660 [09:06<00:00, 129.91it/s]


In [0]:
print(annotation_df.head())
annotation_df.to_pickle('/content/drive/My Drive/Project STation/TD/annotation_synthetic_preprocessed.pkl')

                                              labels  ... unpadded_length
0    [0, 23, 51, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]  ...             4.0
1  [0, 36, 51, 39, 65, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]  ...             6.0
2    [0, 27, 51, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]  ...             4.0
3    [0, 49, 51, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]  ...             4.0
4  [0, 24, 60, 66, 18, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]  ...             6.0

[5 rows x 4 columns]
