In [1]:
import cv2
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence

from tqdm.notebook import tqdm
from PIL import Image

from data_loader import CustomDataset


TRAIN_IMAGES_PATH = r'Synthetic_Rec_En_V5\train\images'

In [2]:
torch.__version__

'1.13.1+cu117'

In [3]:
img_path = r'Synthetic_Rec_En_V5'

img_labels = []
label_file = os.path.join(img_path,f'train/gt.txt')
        
with open(label_file,'r') as f:
    for line in f.readlines():
        img_labels.append((line.split('\t')[0],line.split('\t')[1].rstrip()))

In [4]:
image_paths,image_texts=[],[]
for file_ in img_labels:
    img_path,img_txt = file_
    image_paths.append(img_path)
    image_texts.append(img_txt)

In [5]:
vocab = set("".join(map(str, image_texts)))
print(sorted(vocab))

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']


In [6]:
max_label_len = max([len(str(text)) for text in image_texts])
max_label_len

71

In [7]:
char_list = sorted(vocab)
print(char_list)
print(len(char_list))

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
95


In [22]:

def encode_to_labels(txt,char_list):
    # encoding each output word into digits
    dig_lst = []
    
    for index, char in enumerate(txt):
        try:
            print(index)
            print(char_list.index(char))
            dig_lst.append(char_list.index(char))
        except:
            print('******')
            print(char)
    
    return dig_lst
    # return pad_sequence([torch.tensor(dig_lst)])

In [20]:
dig_lst = encode_to_labels(image_texts,char_list)


0
******
versioning gastroenterol rearview
1
******
4KVSSJ5VY4MMM6100315M8911215<<<<<<<<<<<<<<<<
2
******
GZDX6PEN36VOP4210201X9400200<<<<<<<<<<<<<<<<
3
******
Dylid Antiqued Towel&
4
******
65V0
5
******
2037/05/06
6
******
C82906
7
******
IPIRLX650971186<<<<<<<<<<<<<<<
8
******
Jobseeker Meek Talcott Eus Travolta
9
******
18/09/1938
10
******
Icann ` Contrat Idaho
11
******
Datsun Lic Interrogations Huggable Catchments Internazionale Dismissal
12
******
Jun 21, 1980
13
******
2025-04-03
14
******
STRATOSPHERE APPROP EDIFACT IVAR BITES
15
******
DENVER SEXFILMEAA $W TECHNICALITY TELEVUE TAILGATEW LITANY
16
******
2008-10-22
17
******
2000/10/27
18
******
97U5A9
19
******
TRAVELER<<HAPPY<<<<<<<<<<<<<<<
20
******
SCLERODERMA LAAGLB BRECHI AA
21
******
DAYS FOLKSONOMY FUTURISTIC
22
******
PSYCHOLOGICALLY MURROW TROLL LIFESIZE SAYYAF WEATHERS
23
******
Ftb Spenders Seasonic Arschloch Teething
24
******
WINGNUT PAS WHITMAN CHRONICALLY PRENDERGAST CEASAR
25
******
02-06-2013
26
******
1989/

In [21]:
print(dig_lst)

[23, 18, 17, 18, 21, 18]


In [15]:
padded_image_texts = list(map(encode_to_labels, image_texts))

padded_image_texts[0]

tensor([[86],
        [69],
        [82],
        [83],
        [73],
        [79],
        [78],
        [73],
        [78],
        [71],
        [ 0],
        [71],
        [65],
        [83],
        [84],
        [82],
        [79],
        [69],
        [78],
        [84],
        [69],
        [82],
        [79],
        [76],
        [ 0],
        [82],
        [69],
        [65],
        [82],
        [86],
        [73],
        [69],
        [87]])

In [26]:
from itertools import groupby


def ctc_decoder(predictions):
    '''
    input: given batch of predictions from text rec model
    output: return lists of raw extracted text

    '''
    text_list = []
    
    pred_indcies = np.argmax(predictions, axis=1)
    
    for i in range(pred_indcies.shape[0]):
        ans = ""
        
        ## merge repeats
        merged_list = [k for k,_ in groupby(pred_indcies[i])]
        
        ## remove blanks
        for p in merged_list:
            if p != len(char_list):
                ans += char_list[int(p)]
        
        text_list.append(ans)
        
    return text_list

In [3]:
import torch
import torch.nn as nn

seq_length=512
batch_size = 8
input_size=32
hidden_size = 64
x = torch.rand((seq_length,batch_size,input_size))

lstm1 = nn.LSTM(input_size,hidden_size)
# print(lstm1(x)[0].shape)
# print(lstm1(x)[1][0].shape)
# print(lstm1(x)[1][1].shape)


recurrent, _ = lstm1(x)
print(recurrent.shape)

sequence_length, batch_size, inputs_size = recurrent.shape
sequence_length2 = recurrent.view(sequence_length * batch_size, inputs_size)

print(sequence_length2.shape)
print()
print("Output shape:")
print(sequence_length2.view(sequence_length,batch_size,-1).size())


torch.Size([512, 8, 64])
torch.Size([4096, 64])

Output shape:
torch.Size([512, 8, 64])


In [3]:
in_channels = [3,32,64,128]
out_channels= [32,64,128,256]

x = torch.rand((batch_size,3,32,640))
print(f"Initial image size: {x.size()}")
for i,(in_ch,out_ch) in enumerate(zip(in_channels,out_channels)):
    conv1 = nn.Conv2d(in_ch,out_ch,3,1,1)
    pool = nn.MaxPool2d(2,2)

    ch1 = conv1(x)
    ch2 = pool(ch1)
    x = ch2
    print(f"Conv Layer-{i+1}: {ch1.size()}")
    print(f"Pool Layer-{i+1}: {ch2.size()}")
    print('*'*10)

print(f"Final shape: {ch2.size()}")
ch_sq = ch2.squeeze(2)
print(f"After squeezing : {ch_sq.size()}")
print(f"After permuting:{ch_sq.permute(2, 0, 1).size()} ")

Initial image size: torch.Size([8, 3, 24, 416])
Conv Layer-1: torch.Size([8, 32, 24, 416])
Pool Layer-1: torch.Size([8, 32, 12, 208])
**********
Conv Layer-2: torch.Size([8, 64, 12, 208])
Pool Layer-2: torch.Size([8, 64, 6, 104])
**********
Conv Layer-3: torch.Size([8, 128, 6, 104])
Pool Layer-3: torch.Size([8, 128, 3, 52])
**********
Conv Layer-4: torch.Size([8, 256, 3, 52])
Pool Layer-4: torch.Size([8, 256, 1, 26])
**********
Final shape: torch.Size([8, 256, 1, 26])
After squeezing : torch.Size([8, 256, 26])
After permuting:torch.Size([26, 8, 256]) 


In [11]:
import torch
import torch.nn as nn

batch_size = 8

x = torch.rand((batch_size,3,640,32))
print(f"Initial image size: {x.size()}")

conv1 = nn.Conv2d(3,64,3,1,1)
pool = nn.MaxPool2d(2,2)
ch1 = conv1(x)
ch2 = pool(ch1)
print(f"Conv Layer 1: {ch1.size()}")
print(f"Pool Layer 1: {ch2.size()}")
print('*'*10)

conv2 = nn.Conv2d(64,64,3,1,1)
pool2 = nn.MaxPool2d((1,2),(1,2))
ch3 = conv2(ch2)
ch4 = pool2(ch3)
print(f"Conv Layer-2: {ch3.size()}")
print(f"Pool Layer-2: {ch4.size()}")
print('*'*10)

conv3 = nn.Conv2d(64,128,3,1,1)
pool3 = nn.MaxPool2d(2,2)
ch5 = conv3(ch4)
ch6 = pool3(ch5)
print(f"Conv Layer-3: {ch5.size()}")
print(f"Pool Layer-3: {ch6.size()}")
print('*'*10)

conv4 = nn.Conv2d(128,128,3,1,1)
pool4 = nn.MaxPool2d(2,2)
ch7 = conv4(ch6)
ch8 = pool4(ch7)
print(f"Conv Layer-4: {ch7.size()}")
print(f"Pool Layer-4: {ch8.size()}")
print('*'*10)

conv5 = nn.Conv2d(128,128,3,1,1)
pool5 = nn.MaxPool2d(2,(1,2))
ch9 = conv5(ch8)
ch10 = pool5(ch9)
print(f"Conv Layer-5: {ch9.size()}")
print(f"Pool Layer-5: {ch10.size()}")
print('*'*10)

conv6 = nn.Conv2d(128,128,3,1,1)
ch11 = conv6(ch10)
print(f"Conv Layer-6: {ch11.size()}")

ch12 = ch11.squeeze(dim=-1)
print(f"After Squeezing: {ch12.size()}")
print('*'*10)
print(f"After permuting: {ch12.permute(2,0,1).size()}")


Initial image size: torch.Size([8, 3, 640, 32])
Conv Layer 1: torch.Size([8, 64, 640, 32])
Pool Layer 1: torch.Size([8, 64, 320, 16])
**********
Conv Layer-2: torch.Size([8, 64, 320, 16])
Pool Layer-2: torch.Size([8, 64, 320, 8])
**********
Conv Layer-3: torch.Size([8, 128, 320, 8])
Pool Layer-3: torch.Size([8, 128, 160, 4])
**********
Conv Layer-4: torch.Size([8, 128, 160, 4])
Pool Layer-4: torch.Size([8, 128, 80, 2])
**********
Conv Layer-5: torch.Size([8, 128, 80, 2])
Pool Layer-5: torch.Size([8, 128, 79, 1])
**********
Conv Layer-6: torch.Size([8, 128, 79, 1])
After Squeezing: torch.Size([8, 128, 79])
**********
After permuting: torch.Size([79, 8, 128])


In [26]:
import fastwer
hypo = ['This is an exammple 8}.', 'This is another example .']
ref = ['This is an example:|.', 'This is the example .']

# Corpus-Level WER: 40.0
fastwer.score(hypo, ref)
# Corpus-Level CER: 25.5814
fastwer.score(hypo, ref, char_level=True)

# Sentence-Level WER: 40.0
fastwer.score_sent(hypo[0], ref[0])
# Sentence-Level CER: 22.7273
fastwer.score_sent(hypo[0], ref[0], char_level=True)

19.0476

In [None]:
class Tokenizer():
    def __init__(self,label_path,max_length):
        self.image_labels=[]

        with open(label_path,'r') as f:
            for line in f.readlines():
                self.image_labels.append(line.split('\t')[1].rstrip())

        self.vocab = sorted(set("".join(map(str, img_labels))))
        self.idx2wrd  = {k:v for k,v in enumerate(sorted(self.vocab))}
        self.wrd2idx = {v:k for k,v in enumerate(sorted(self.vocab))}
        self.max_length = max_length

    def encode(self,text):
        encoded_text = [self.wrd2idx[char] for char in text]
        print("Encoded Text len",len(encoded_text))
        if len(encoded_text)>self.max_length:
            encoded_text = encoded_text[:self.max_length]
        return encoded_text

In [None]:
from encode_decode import Tokenizer

sample = "This is sample string"
tokenizer = Tokenizer()
encoded_text = 


In [None]:
torch.LongTensor([self.encode(txt) for txt in batch_text])

In [6]:
import os
from encode_decode import Tokenizer
img_path = r'Synthetic_Rec_En_V5'
label_path = os.path.join(img_path,f'train/gt.txt')
image_labels = []
label_lengths = []

with open(label_path,'r') as f:
    for line in f.readlines():
        image_labels.append(line.split('\t')[1].rstrip())
        label_lengths.append(len(line.split('\t')[1].rstrip()))

print(image_labels[:5])
print(label_lengths[:5])


['versioning gastroenterol rearview', '4KVSSJ5VY4MMM6100315M8911215<<<<<<<<<<<<<<<<', 'GZDX6PEN36VOP4210201X9400200<<<<<<<<<<<<<<<<', 'Dylid Antiqued Towel&', '65V0']
[33, 44, 44, 21, 4]


In [3]:
tokenizer = Tokenizer(label_path,max_length=52)

encoded_text = tokenizer.batch_encode(image_labels[:5])

In [4]:
print(encoded_text)

tensor([86, 69, 82, 83, 73, 79, 78, 73, 78, 71,  0, 71, 65, 83, 84, 82, 79, 69,
        78, 84, 69, 82, 79, 76,  0, 82, 69, 65, 82, 86, 73, 69, 87, 20, 43, 54,
        51, 51, 42, 21, 54, 57, 20, 45, 45, 45, 22, 17, 16, 16, 19, 17, 21, 45,
        24, 25, 17, 17, 18, 17, 21, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        28, 28, 28, 28, 28, 39, 58, 36, 56, 22, 48, 37, 46, 19, 22, 54, 47, 48,
        20, 18, 17, 16, 18, 16, 17, 56, 25, 20, 16, 16, 18, 16, 16, 28, 28, 28,
        28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 36, 89, 76, 73, 68,
         0, 33, 78, 84, 73, 81, 85, 69, 68,  0, 52, 79, 87, 69, 76,  6, 22, 21,
        54, 16])


In [7]:
decoded_text = tokenizer.decode1D(encoded_text,label_lengths)
print(decoded_text)

[['v', 'e', 'r', 's', 'i', 'o', 'n', 'i', 'n', 'g', ' ', 'g', 'a', 's', 't', 'r', 'o', 'e', 'n', 't', 'e', 'r', 'o', 'l', ' ', 'r', 'e', 'a', 'r', 'v', 'i', 'e', 'w'], ['4', 'K', 'V', 'S', 'S', 'J', '5', 'V', 'Y', '4', 'M', 'M', 'M', '6', '1', '0', '0', '3', '1', '5', 'M', '8', '9', '1', '1', '2', '1', '5', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<'], ['M', 'M', '6', '1', '0', '0', '3', '1', '5', 'M', '8', '9', '1', '1', '2', '1', '5', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', '<', 'G', 'Z', 'D', 'X', '6', 'P', 'E', 'N', '3', '6', 'V'], ['M', 'M', '6', '1', '0', '0', '3', '1', '5', 'M', '8', '9', '1', '1', '2', '1', '5', '<', '<', '<', '<'], ['r', 'o', 'l', ' '], ['i', 'o', 'n', 'i', 'n', 'g', ' ', 'g', 'a', 's'], [' ', 'g', 'a', 's', 't', 'r'], ['n', 'i', 'n', 'g', ' ', 'g', 'a', 's', 't', 'r', 'o', 'e', 'n', 't', 'e', 'r', 'o', 'l', ' ', 'r', 'e', 'a', 'r', 'v', 'i', 'e', 'w', '4', 'K', 'V'], ['i', 'e', 'w', '4', 'K

In [26]:
from data_loader import CustomDataset
import config
from encode_decode import Tokenizer
from torch.utils.data import DataLoader
import torch
import matplotlib.pyplot as plt
import cv2
import numpy as np

In [2]:
def collate_func(batch):
    # (B, [images, encodings, lengths])
    images, encodings, lengths = zip(*batch)
    images = torch.stack(images)
    encodings = torch.cat(encodings, dim=-1)
    lengths = torch.cat(lengths, dim=-1)
    return images, encodings, lengths

In [3]:
data = CustomDataset(config.DATA_PATH,config.MODEL_INPUT_SHAPE[1],config.MODEL_INPUT_SHAPE[0],config.TOKENIZER)

train_data = DataLoader(data,batch_size=1,shuffle=True,collate_fn=collate_func)
# print(next(iter(train_img,train_label,label_lens)))

In [22]:
img,label,label_lens = next(iter(train_data))


img = img.squeeze(0).permute(2,1,0)
# label = next(iter(train_data))[1]



print(''.join(config.TOKENIZER.decode(label)))

Thomason Rko Bourne Heintz Touches


In [24]:
print(type(img))

<class 'torch.Tensor'>


In [27]:

print(''.join(config.TOKENIZER.decode(label)))
cv2.imshow('Image',((img.numpy()+1)*127).astype(np.uint8))
cv2.waitKey(0)
cv2.destroyAllWindows()

Thomason Rko Bourne Heintz Touches


In [3]:
import config

image_labels = []

with open(config.LABEL_PATH,'r') as f:
    for line in f.readlines():
        image_labels.append(line.split('\t')[1].rstrip())

vocab = sorted(set("".join(map(str, image_labels))))
idx2wrd  = {k:v for k,v in enumerate(vocab)}
wrd2idx = {v:k for k,v in enumerate(vocab)}
max_length = 50

In [16]:
sample_string = "This is a sample to encode"

encoded_text = [wrd2idx[wrd] for wrd in sample_string]

In [17]:
if len(encoded_text)<max_length:
    encoded_text = encoded_text +[0]*(max_length-len(encoded_text))
    print(encoded_text)

[52, 72, 73, 83, 0, 73, 83, 0, 65, 0, 83, 65, 77, 80, 76, 69, 0, 84, 79, 0, 69, 78, 67, 79, 68, 69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
