In [1]:
## Change the work dir:
import os
# Change to a new directory
new_directory = "D:/RL_Finance/Image_to_Text"
os.chdir(new_directory)



# Verify the change
print("Current working directory:", os.getcwd())

Current working directory: D:\RL_Finance\Image_to_Text


## Package Importing

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import argparse

from PIL import Image
from text_recognizer.data.create_save_argument_dataset import (load_processed_crops_and_labels, 
                                          ArgumentParagraphDataset, 
                                          DL_DATA_DIRNAME, 
                                          inverse_mapping,
                                          mapping,
                                          save_argument_data_as_tensors,
                                          load_argument_data_as_tensors,
                                          extract_images_and_labels)
from torch.utils.data import Dataset, DataLoader, random_split
from text_recognizer.stems.paragraph import ParagraphStem
import text_recognizer.metadata.iam_paragraphs as metadata_iam_paragraphs
from text_recognizer.models.resnet_transformer import ResnetTransformer
from text_recognizer.data.base_data_module import BaseDataModule


device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dims=metadata_iam_paragraphs.DIMS
output_dims=metadata_iam_paragraphs.OUTPUT_DIMS



## Import Model

In [14]:
TF_DIM = 256
TF_FC_DIM = 256
TF_DROPOUT = 0.4
TF_LAYERS = 4
TF_NHEAD = 4

RESNET_DIM = 512  # hard-coded

In [15]:
data_config = {
    "input_dims": input_dims,  # (channels, height, width)
    "output_dims": output_dims,  # Maximum output sequence length
    "mapping": mapping,  # Example mapping for digits
    "inverse_mapping": inverse_mapping,
}

args = argparse.Namespace(tf_dim=TF_DIM, tf_fc_dim=TF_FC_DIM, tf_nhead=TF_NHEAD, tf_dropout=TF_DROPOUT, tf_layers=TF_LAYERS)

In [16]:
model = ResnetTransformer(data_config, args).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load checkpoint
checkpoint_path =r"D:\RL_Finance\MLops\fslab\lab07\text_recognizer\artifacts\paragraph-text-recognizer\model.pt"
model_script = torch.jit.load(checkpoint_path, map_location="cuda" if torch.cuda.is_available() else "cpu")
state_dict = model_script.state_dict()


## Make the keys names match
new_state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}
model.load_state_dict(new_state_dict)

<All keys matched successfully>

## Prepare the data into Dataloader

In [18]:
class CustomDataset(Dataset):
    def __init__(self, data_dir, dataset_len):
        crops, labels=load_processed_crops_and_labels(split="train", data_dirname=data_dir)
        # Create dataset
        argument_dataset = ArgumentParagraphDataset(
            line_crops=crops,
            line_labels=labels,
            dataset_len=dataset_len,
            inverse_mapping=inverse_mapping,
            input_dims=input_dims,
            output_dims=output_dims,
            transform=ParagraphStem(augment=False),
        )

        # Generate training data
        argument_data = argument_dataset.generate_argument_paragraphs()
        self.images, self.targets = extract_images_and_labels(argument_data)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], self.targets[idx]


class CustomDataModule(BaseDataModule):  
    def __init__(self, data_dir, dataset_len, batch_size, val_split=0.2):  
        super().__init__()  
        self.data_dir = data_dir  
        self.dataset_len=dataset_len
        self.batch_size = batch_size  
        self.val_split = val_split  

    def setup(self):  
        dataset = CustomDataset(self.data_dir, self.dataset_len)  
        val_size = int(len(dataset) * self.val_split)  
        train_size = len(dataset) - val_size  

        # Split into train and validation datasets
        self.train_dataset, self.val_dataset = random_split(dataset, [train_size, val_size])

    def train_dataloader(self):  
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)  

    def val_dataloader(self):  
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False) 

In [19]:
BATCH_SIZE = 32
S = data_config["output_dims"][0]  # Output sequence length

data_module = CustomDataModule(data_dir=DL_DATA_DIRNAME, dataset_len=1000, batch_size=BATCH_SIZE)
data_module.setup()  # Load data into train/val sets

# Get DataLoader
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()

## Test on the data

In [None]:
## Evaluate the model using data
image, target = argument_data[1]

image=image.unsqueeze(0)

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    image = image.to(device)
    output = model(image)  # Encode image

    
    # Get the most likely label indices
    ## predicted_labels = torch.argmax(logits, dim=1)  # (B, Sy)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


In [None]:
decoded_text = ''.join(mapping[idx] for idx in output[0].tolist())
print(decoded_text)

<S>permitted him to enjoy anything. 'The Pearl', he wrote,
evening.
cordid, and when she experiences it for the
LORD SIDUEY WROTE TO DOUGUSS UNGHIDR
the Director of Public Prosecutions I know petty
chase of several mink coats which,
a effective alleviation of his painful malady. None
This phenamemon has nevertheless been
almost unchanged in 1959 for couples with two or more
in they May 1834.
woodfiller in the usual way and paint the frame in
from me almost instantly, but it had<E><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P

In [None]:
decoded_text = ''.join(mapping[idx] for idx in target.tolist())
print(decoded_text)

<S>permitted him to enjoy anything. 'The Pearl', he wrote,
evening.
sordid, and when she experiences it for the
LORD SIDNEY WROTE TO DOUGLAS KINNAIRD
the Director of Public Prosecutions I know pretty
chase of several mink coats which,
an effective alleviation of his painful malady. None
again and again it is the visual qualities of
This phenomenon has nevertheless been
almost unchanged in 1959 for couples with two or more
in # May 1834.
woodfiller in the usual way and paint the frame in
from me almost instantly, but it had<E><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P>


## Fine Tune

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10):

    model.to(device)

    for epoch in range(num_epochs):
        # -------- TRAINING --------
        model.train()  # Set model to training mode
        total_train_loss = 0

        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()  # Reset gradients
            
            # Teacher forcing

            x = model.encode(x)  
            logits = model.decode(x, y[:, :-1]).permute(1, 2, 0)

            loss = criterion(logits, y[:, 1:])  
            
            loss.backward()  # Compute gradients
            optimizer.step()  # Update weights
            
            total_train_loss += loss.item()
            print(f"Epoch [{epoch+1}/{num_epochs}] | Batch [{batch_idx+1}/{len(train_loader)}] | Batch Loss: {loss.item():.4f}")


        avg_train_loss = total_train_loss / len(train_loader)

        # -------- VALIDATION --------
        model.eval()  # Set model to evaluation mode
        total_val_loss = 0

        with torch.no_grad():  # Disable gradient computation for validation
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)

                x = model.encode(x)
                logits = model.decode(x, y[:, :-1]).permute(1, 2, 0)

                loss = criterion(logits, y[:, 1:])    

                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        # Print Epoch Summary
        print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    print("Training complete!")



In [None]:
train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=4)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch [1/4] | Batch [1/25] | Batch Loss: 1.0791
Epoch [1/4] | Batch [2/25] | Batch Loss: 0.4445
Epoch [1/4] | Batch [3/25] | Batch Loss: 0.4941
Epoch [1/4] | Batch [4/25] | Batch Loss: 0.5003
Epoch [1/4] | Batch [5/25] | Batch Loss: 0.2846
Epoch [1/4] | Batch [6/25] | Batch Loss: 0.3435
Epoch [1/4] | Batch [7/25] | Batch Loss: 0.2976
Epoch [1/4] | Batch [8/25] | Batch Loss: 0.2524
Epoch [1/4] | Batch [9/25] | Batch Loss: 0.3278
Epoch [1/4] | Batch [10/25] | Batch Loss: 0.5325
Epoch [1/4] | Batch [11/25] | Batch Loss: 0.2092
Epoch [1/4] | Batch [12/25] | Batch Loss: 0.3254
Epoch [1/4] | Batch [13/25] | Batch Loss: 0.3271
Epoch [1/4] | Batch [14/25] | Batch Loss: 0.3206
Epoch [1/4] | Batch [15/25] | Batch Loss: 0.2726
Epoch [1/4] | Batch [16/25] | Batch Loss: 0.2964
Epoch [1/4] | Batch [17/25] | Batch Loss: 0.2661
Epoch [1/4] | Batch [18/25] | Batch Loss: 0.2278
Epoch [1/4] | Batch [19/25] | Batch Loss: 0.1838
Epoch [1/4] | Batch [20/25] | Batch Loss: 0.3044
Epoch [1/4] | Batch [21/25] |

## Perform a simple test on picture

In [None]:
example_input = "a01-077.png"
# Open the image
image = Image.open(example_input)
transform=ParagraphStem(augment=False)
image_tensor = transform(image)

image_tensor=image_tensor.unsqueeze(0)

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    image_tensor = image_tensor.to(device)
    output = model(image_tensor)  # Encode image

In [None]:
image.show()

In [None]:
decoded_text = ''.join(mapping[idx] for idx in output[0].tolist())
print(decoded_text)

<S>And, since this is election gear in West
Germany, Dr. Adenauer is in a tough
spot. Joyce Egginton cables: President
Kennedy at his Washington Press con-
ference admitted he did not know
Whether America was lagging behind
Russia in missile power. He said he
Was waiting for his senior military
aides to come up with the answer on
February 20.<E><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P

## LORA fine tune

In [None]:
from peft import LoraConfig, get_peft_model

target_module_1 = [
    "self_attn.in_proj_weight",
    "self_attn.out_proj",
    "multihead_attn.in_proj_weight",
    "multihead_attn.out_proj",
    "linear1",
    "linear2",
]


lora_config = LoraConfig(
    r=8,   # Rank of decomposition
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,
    target_modules=target_module_1
)

In [None]:
lora_model = get_peft_model(model, lora_config)

# Check trainable parameters
lora_model.print_trainable_parameters()

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=data_config["inverse_mapping"]["<P>"])  # Ignore padding token
optimizer = optim.AdamW(lora_model.parameters(), lr=0.0001)  

trainable params: 65,536 || all params: 14,054,292 || trainable%: 0.4663


In [None]:
train_model(lora_model, train_loader, val_loader, optimizer, criterion, device, num_epochs=6)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch [1/6] | Batch [1/25] | Batch Loss: 0.7589
Epoch [1/6] | Batch [2/25] | Batch Loss: 0.6345
Epoch [1/6] | Batch [3/25] | Batch Loss: 0.8879
Epoch [1/6] | Batch [4/25] | Batch Loss: 0.8949
Epoch [1/6] | Batch [5/25] | Batch Loss: 1.8075
Epoch [1/6] | Batch [6/25] | Batch Loss: 1.1619
Epoch [1/6] | Batch [7/25] | Batch Loss: 0.9878
Epoch [1/6] | Batch [8/25] | Batch Loss: 0.6346
Epoch [1/6] | Batch [9/25] | Batch Loss: 0.9811
Epoch [1/6] | Batch [10/25] | Batch Loss: 1.3444
Epoch [1/6] | Batch [11/25] | Batch Loss: 0.9411
Epoch [1/6] | Batch [12/25] | Batch Loss: 0.6562
Epoch [1/6] | Batch [13/25] | Batch Loss: 0.9765
Epoch [1/6] | Batch [14/25] | Batch Loss: 0.6465
Epoch [1/6] | Batch [15/25] | Batch Loss: 0.7510
Epoch [1/6] | Batch [16/25] | Batch Loss: 1.0983
Epoch [1/6] | Batch [17/25] | Batch Loss: 0.8488
Epoch [1/6] | Batch [18/25] | Batch Loss: 0.5181
Epoch [1/6] | Batch [19/25] | Batch Loss: 0.7783
Epoch [1/6] | Batch [20/25] | Batch Loss: 0.9842
Epoch [1/6] | Batch [21/25] |

Evaluate on the single picture

In [None]:
example_input = "a01-077.png"
# Open the image
image = Image.open(example_input)
transform=ParagraphStem(augment=False)
image_tensor = transform(image)

image_tensor=image_tensor.unsqueeze(0)

lora_model.eval()  # Set model to evaluation mode
with torch.no_grad():
    image_tensor = image_tensor.to(device)
    output = lora_model(image_tensor)  # Encode image

In [None]:
image.show()

In [None]:
decoded_text = ''.join(mapping[idx] for idx in output[0].tolist())
print(decoded_text)

<S>And, since this is election year in West
Germany, Dr. Adenauer is in a tough
spot. Joyce Egginton cables: President
ference admitted he did not know
ference admitted he did not know
Russia in missile power. He said he
Russia in missile power. He said he
was waiting for his senior military
aides to come Up with the answer on
February 2.<E><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P><P>