In [1]:
!pip install -q datasets jiwer evaluate transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
import os
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
from google.colab import drive

# Attach Google Drive; the dataset and checkpoint paths used further down all
# live underneath this mount point.
DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /drive


In [14]:
# Locate the held-out evaluation split of the IAM word dataset, building it
# from the full dataset CSV when it has not been created yet.
dataset_path = '/drive/MyDrive/Comp 542/IAM/'
test_set_file_path = os.path.join(dataset_path, 'evaluation_df.csv')
# The rest of the notebook reads the "file_name" and "text" columns.
column_renames = {'file_path': "file_name", 'label': "text"}

if os.path.exists(test_set_file_path):
  print(f"Found '{test_set_file_path}' on Google Drive.")
  test_df = pd.read_csv(test_set_file_path, sep=',').rename(columns=column_renames)
else:
  print(f"Warning '{test_set_file_path}' does not exist on Google Drive.")
  print("Creating the train/evaluation split and saving it to Google Drive.")

  data_set_file_path = os.path.join(dataset_path, 'dataset.csv')
  df = pd.read_csv(data_set_file_path, sep=',').rename(columns=column_renames)

  # Fixed random_state keeps the 90/10 split identical across runs.
  train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
  train_df = train_df.reset_index(drop=True)
  test_df = test_df.reset_index(drop=True)

  # Write the split next to the dataset, i.e. to the same path the existence
  # check above looks at; writing to the local working directory would leave
  # that check failing on every fresh runtime.
  train_df.to_csv(os.path.join(dataset_path, "training_df.csv"), index=False)
  test_df.to_csv(test_set_file_path, index=False)

print(test_df)
print(f"Amount of test samples {len(test_df)}")

Found '/drive/MyDrive/Comp 542/IAM/evaluation_df.csv' on Google Drive.
                                              file_name     text bin_category  \
0     /drive/MyDrive/Comp 542/IAM/words/h07/h07-037/...       to   very short   
1     /drive/MyDrive/Comp 542/IAM/words/r06/r06-027/...        ,   very short   
2     /drive/MyDrive/Comp 542/IAM/words/b01/b01-000/...    issue        short   
3     /drive/MyDrive/Comp 542/IAM/words/f07/f07-039b...       in   very short   
4     /drive/MyDrive/Comp 542/IAM/words/p01/p01-174/...      led   very short   
...                                                 ...      ...          ...   
9061  /drive/MyDrive/Comp 542/IAM/words/c06/c06-043/...  seizure        short   
9062  /drive/MyDrive/Comp 542/IAM/words/e04/e04-103/...    third        short   
9063  /drive/MyDrive/Comp 542/IAM/words/g04/g04-011/...     been   very short   
9064  /drive/MyDrive/Comp 542/IAM/words/r06/r06-130/...        ,   very short   
9065  /drive/MyDrive/Comp 542/IAM/word

In [11]:
import torch
from torch.utils.data import Dataset
from PIL import Image
class IAMDataset(Dataset):
    """Torch dataset pairing word images with their tokenized transcriptions.

    Every row of ``df`` provides an image path (``file_name`` column) and the
    ground-truth text (``text`` column). An item is a dict with

    * ``pixel_values``: the processor output for the image, batch axis removed;
    * ``labels``: token ids padded/truncated to ``max_target_length`` with every
      PAD id replaced by -100.

    Args:
        root_dir: Stored on the instance. Paths in ``df['file_name']`` are
            opened exactly as given, so this value is not used for lookups.
        df: DataFrame with ``file_name`` and ``text`` columns.
        processor: Callable image processor that also exposes a ``tokenizer``
            attribute (the notebook passes a ``TrOCRProcessor``).
        max_target_length: Fixed length of the label sequence.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def _load_item(self, position):
        """Build the encoding for the row at 0-based ``position``."""
        # Positional access: label-based ``df[col][idx]`` only works while the
        # frame still carries a default 0..n-1 index.
        file_name = self.df['file_name'].iloc[position]
        text = self.df['text'].iloc[position]

        # prepare image (i.e. resize + normalize); the context manager closes
        # the underlying file as soon as the RGB copy exists.
        with Image.open(file_name) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # squeeze(0) removes only the batch axis added by return_tensors="pt".
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

    def __getitem__(self, idx):
        """Return the encoding for sample ``idx``.

        If the sample cannot be loaded (unreadable image, unusable text), the
        closest preceding sample that loads is returned instead, wrapping around
        the dataset, and a warning is emitted.

        Raises:
            IndexError: ``idx`` is outside ``[-len(self), len(self))``.
            RuntimeError: no sample in the whole dataset could be loaded.
        """
        import warnings

        size = len(self)
        idx = int(idx)
        if not -size <= idx < size:
            raise IndexError(f"index {idx} is out of range for dataset of size {size}")

        last_error = None
        # Bounded search instead of unbounded recursion: at most one attempt
        # per sample, so a run of bad files can never exhaust the stack.
        for step in range(size):
            position = (idx - step) % size
            try:
                encoding = self._load_item(position)
            except (OSError, ValueError, TypeError) as error:
                last_error = error
                continue
            if step:
                warnings.warn(
                    f"Sample {idx} could not be loaded ({last_error!r}); "
                    f"returning sample {position} instead."
                )
            return encoding

        raise RuntimeError("No sample in the dataset could be loaded.") from last_error

In [12]:
from evaluate import load

def compute_metrics(pred):
    """Compute character-level precision, recall and CER for an evaluation run.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"cer"``.

    Relies on the module-level ``processor`` for decoding, so it must be
    defined before this function is called.
    """
    cer_metric = load("cer")
    pad_token_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    # NOTE(review): some model outputs are tuples whose first element holds the
    # token ids - confirm whether this can occur for this model.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is a padding/ignore marker, not a real token id, so swap it for the
    # PAD id before decoding. np.where builds new arrays, leaving the caller's
    # ``pred.label_ids`` untouched.
    pred_ids = np.where(pred_ids == -100, pad_token_id, pred_ids)
    labels_ids = np.where(pred.label_ids == -100, pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    precision = compute_precision(pred_str, label_str)
    recall = compute_recall(pred_str, label_str)

    return {"precision":precision, "recall":recall, "cer": cer}

def compute_precision(predictions=None,references=None):
    """Character-level precision over a batch of decoded strings.

    Precision = correctly matched characters / total predicted characters,
    accumulated over all (prediction, reference) pairs.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        The precision as a float; 0.0 when no characters were predicted at all.
    """
    correct = 0
    total = 0
    for prediction, reference in zip(predictions, references):
        correct += correctly_matched_characters(prediction, reference)
        total += len(prediction)

    # An empty batch or all-empty predictions would otherwise divide by zero.
    if total == 0:
        return 0.0
    return correct / total

def compute_recall(predictions=None,references=None):
    """Character-level recall over a batch of decoded strings.

    Recall = correctly matched characters / total reference characters,
    accumulated over all (prediction, reference) pairs.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        The recall as a float; 0.0 when the references contain no characters.
    """
    correct = 0
    total = 0
    for prediction, reference in zip(predictions, references):
        correct += correctly_matched_characters(prediction, reference)
        total += len(reference)

    # An empty batch or all-empty references would otherwise divide by zero.
    if total == 0:
        return 0.0
    return correct / total

def correctly_matched_characters(prediction=None,reference=None):
    """Count the characters of ``prediction`` that match ``reference`` in order.

    The count is the length of the longest common subsequence of the two
    strings: the largest number of characters that can be paired up while
    keeping their relative order, with any number of insertions, deletions or
    substitutions in between. A single-character look-ahead undercounts as soon
    as the strings are offset by more than one character (e.g. "abc" against
    "xxabc").

    Args:
        prediction: Predicted string.
        reference: Ground-truth string.

    Returns:
        Number of matched characters; 0 if either input is empty or None.
    """
    if not prediction or not reference:
        return 0

    # Row-by-row dynamic programme; only the previous row is needed.
    previous_row = [0] * (len(reference) + 1)
    for predicted_char in prediction:
        current_row = [0]
        for column, reference_char in enumerate(reference, start=1):
            if predicted_char == reference_char:
                current_row.append(previous_row[column - 1] + 1)
            else:
                current_row.append(max(previous_row[column], current_row[column - 1]))
        previous_row = current_row
    return previous_row[-1]
# Precision = correctly matched characters / number of predicted characters

# Recall = correctly matched characters / number of reference characters

In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Arguments for the Seq2SeqTrainer that runs the evaluation below.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    eval_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="/drive/MyDrive/Comp 542/model",
    logging_steps=2,
    save_steps=25,
    eval_steps=100,
    # The string "none" switches off every logging integration (W&B included).
    # Passing None does not, which is why the deprecated WANDB_DISABLED
    # environment variable used to be needed here.
    report_to="none",
    fp16=True,
    # Apex optimisation levels are spelled with the letter O ("O0".."O3"), not
    # the digit zero. NOTE(review): only relevant if the Apex backend is used.
    fp16_opt_level="O3",
    num_train_epochs=1
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
import warnings
from transformers import logging as transformers_logging

# Keep the evaluation output readable: only errors from transformers, and no
# Python warnings at all.
transformers_logging.set_verbosity(transformers_logging.ERROR)
warnings.filterwarnings(action="ignore")

In [17]:
# Processor and model are both restored from the same saved checkpoint folder.
checkpoint_dir = '/drive/MyDrive/Comp 542/model/checkpoint-2750'
processor = TrOCRProcessor.from_pretrained(checkpoint_dir)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_dir)

# Wrap the evaluation split so the trainer can iterate over encoded samples.
eval_dataset = IAMDataset(
    root_dir='/drive/MyDrive/Comp 542/IAM/',
    df=test_df,
    processor=processor,
)

In [None]:
# Report how many samples will be scored.
num_eval_samples = len(eval_dataset)
print(num_eval_samples)

# Only an evaluation dataset is supplied; the trainer is used purely to run
# generation and feed the outputs to compute_metrics.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

results = trainer.evaluate()

In [None]:
# Persist the metrics as (metric, value) rows. With orient='index' the metric
# names live in the index, so writing with index=False drops them and leaves a
# column of unlabeled numbers.
results_table = pd.Series(results, name="value").rename_axis("metric").reset_index()
results_table.to_csv("results_test.csv", index=False)

In [None]:
# Echo the raw metrics dictionary returned by trainer.evaluate().
print("results{}".format(results))