## Results

- no training involved
- tested with pretrained model of trocr, handwritten and printed (vit + bert)
- tested on both coloured and black/white cleaned, no difference in prediction when using same model, for both models
- printed generally scores higher than handwritten but is not too good either
- ran on kaggle gpu with batch 16, takes 5 mins. runnning on cpu takes 1.5 hours

results:
- handwritten model character accuracy (position important): 0.42
- printed model character accuracy (position important): 0.52

- complete matches printed: 330/1967, 0.17
- complete matches handwritten: 219/1967, 0.11


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

train_c_path = '/kaggle/input/train-colour/train_cleaned_color_resized'
test_c_path = '/kaggle/input/test-colour/test_cleaned_color_resized'
train_bw_path = '/kaggle/input/train-black1/train_cleaned_black_resized'
test_bw_path = '/kaggle/input/test-black/test_cleaned_black_resized'
train_c_uc_path = '/kaggle/input/train-unclean-c-resized/train_resized'
test_c_uc_path = '/kaggle/input/test-unclean-c-resized/test_resized'

train_bw_df = pd.DataFrame(os.listdir(train_bw_path), columns=['filename'])
test_bw_df = pd.DataFrame(os.listdir(test_bw_path), columns=['filename'])
train_c_df = pd.DataFrame(os.listdir(train_c_path), columns=['filename'])
test_c_df = pd.DataFrame(os.listdir(test_c_path), columns=['filename'])
train_c_uc_df = pd.DataFrame(os.listdir(train_c_uc_path), columns=['filename'])
test_c_uc_df = pd.DataFrame(os.listdir(test_c_uc_path), columns=['filename'])

# get labels
train_bw_df["captcha_text"] = train_bw_df["filename"].str[:-6]
test_bw_df["captcha_text"] = test_bw_df["filename"].str[:-6]
train_c_df["captcha_text"] = train_c_df["filename"].str[:-6]
test_c_df["captcha_text"] = test_c_df["filename"].str[:-6]
train_c_uc_df["captcha_text"] = train_c_uc_df["filename"].str[:-6]
test_c_uc_df["captcha_text"] = test_c_uc_df["filename"].str[:-6]

# get complete file path
train_bw_df['filename'] = train_bw_df['filename'].apply(lambda x: os.path.join(train_bw_path, x))
test_bw_df['filename'] = test_bw_df['filename'].apply(lambda x: os.path.join(test_bw_path, x))
train_c_df['filename'] = train_c_df['filename'].apply(lambda x: os.path.join(train_c_path, x))
test_c_df['filename'] = test_c_df['filename'].apply(lambda x: os.path.join(test_c_path, x))
train_c_uc_df['filename'] = train_c_uc_df['filename'].apply(lambda x: os.path.join(train_c_uc_path, x))
test_c_uc_df['filename'] = test_c_uc_df['filename'].apply(lambda x: os.path.join(test_c_uc_path, x))

In [None]:
test_c_df = test_c_df[~test_c_df["filename"].str.endswith("_Store")]
test_c_df

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests
from PIL import Image
import torch

#or test with microsoft/trocr-base-handwritten
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [None]:
import torch
from tqdm.auto import tqdm

tqdm.pandas()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_batch(file_paths):
    images = [Image.open(fp).convert("RGB") for fp in file_paths]
    pixel_values = processor(images, return_tensors="pt", padding=True).pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_texts

batch_size = 16
predictions = []

for i in tqdm(range(0, len(test_bw_df), batch_size)):
    batch_files = test_bw_df['filename'][i:i+batch_size].tolist()
    batch_preds = predict_batch(batch_files)
    predictions.extend(batch_preds)

test_c_df['predicted_text_printed'] = predictions


In [1]:
import pandas as pd
file = "predictions_bw_full.csv"
df = pd.read_csv(file, sep=",")
df

Unnamed: 0,filename,captcha_text,predicted_text,predicted_text_printed
0,/kaggle/input/test-black/test_cleaned_black_re...,q02a9jk,# # Q 0 2A.T.K.000,QOLATSK
1,/kaggle/input/test-black/test_cleaned_black_re...,nowqcrpn,N O. W. Q.C.R.P.N,NO WQCRPN
2,/kaggle/input/test-black/test_cleaned_black_re...,jpquvw6,( I p.O U.V W.S.,JQQUVW6
3,/kaggle/input/test-black/test_cleaned_black_re...,4edetz3,HEDET 23,4EDET23
4,/kaggle/input/test-black/test_cleaned_black_re...,l2dxd3,12dx.a,/2PKDA
...,...,...,...,...
1962,/kaggle/input/test-black/test_cleaned_black_re...,o1s3al0,", 0 1 & 3 & 10.",0183A\0
1963,/kaggle/input/test-black/test_cleaned_black_re...,x4ou7j,4 17 0 3/ 3,$ 0.00
1964,/kaggle/input/test-black/test_cleaned_black_re...,lzjyzk9n,2 1 12 12 12 12 12 12 12 12 12 12 July 2 days ...,12JYZKSR
1965,/kaggle/input/test-black/test_cleaned_black_re...,w7na,# with a,W7MAS


In [None]:
import re
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9]', '', str(text)).lower()

df["cleaned_predicted_text"] = df["predicted_text"].apply(clean_text)
df["cleaned_predicted_text_printed"] = df["predicted_text_printed"].apply(clean_text)
df

Unnamed: 0,filename,captcha_text,predicted_text,predicted_text_printed,cleaned_predicted_text,cleaned_predicted_text_printed
0,/kaggle/input/test-black/test_cleaned_black_re...,q02a9jk,# # Q 0 2A.T.K.000,QOLATSK,q02atk000,qolatsk
1,/kaggle/input/test-black/test_cleaned_black_re...,nowqcrpn,N O. W. Q.C.R.P.N,NO WQCRPN,nowqcrpn,nowqcrpn
2,/kaggle/input/test-black/test_cleaned_black_re...,jpquvw6,( I p.O U.V W.S.,JQQUVW6,ipouvws,jqquvw6
3,/kaggle/input/test-black/test_cleaned_black_re...,4edetz3,HEDET 23,4EDET23,hedet23,4edet23
4,/kaggle/input/test-black/test_cleaned_black_re...,l2dxd3,12dx.a,/2PKDA,12dxa,2pkda
...,...,...,...,...,...,...
1962,/kaggle/input/test-black/test_cleaned_black_re...,o1s3al0,", 0 1 & 3 & 10.",0183A\0,01310,0183a0
1963,/kaggle/input/test-black/test_cleaned_black_re...,x4ou7j,4 17 0 3/ 3,$ 0.00,417033,000
1964,/kaggle/input/test-black/test_cleaned_black_re...,lzjyzk9n,2 1 12 12 12 12 12 12 12 12 12 12 July 2 days ...,12JYZKSR,2112121212121212121212july2daysagoagoago195012,12jyzksr
1965,/kaggle/input/test-black/test_cleaned_black_re...,w7na,# with a,W7MAS,witha,w7mas


In [14]:
def character_accuracy(actual, predicted):
    min_len = min(len(actual), len(predicted))
    correct = sum([1 if actual[i] == predicted[i] else 0 for i in range(min_len)])
    return correct

chr_acc_hw = [character_accuracy(row['captcha_text'], row['cleaned_predicted_text']) for _, row in df.iterrows()]
df['chr_acc_hw'] = chr_acc_hw

chr_acc_p = [character_accuracy(row['captcha_text'], row['cleaned_predicted_text_printed']) for _, row in df.iterrows()]
df['chr_acc_p'] = chr_acc_p

print(df[['captcha_text', 'cleaned_predicted_text', 'cleaned_predicted_text_printed', 'chr_acc_hw', 'chr_acc_p']].head())


  captcha_text cleaned_predicted_text cleaned_predicted_text_printed  \
0      q02a9jk              q02atk000                        qolatsk   
1     nowqcrpn               nowqcrpn                       nowqcrpn   
2      jpquvw6                ipouvws                        jqquvw6   
3      4edetz3                hedet23                        4edet23   
4       l2dxd3                  12dxa                          2pkda   

   chr_acc_hw  chr_acc_p  
0           4          3  
1           8          8  
2           4          6  
3           5          6  
4           3          0  


In [15]:
sum_chr_acc_hw = df['chr_acc_hw'].sum()
sum_chr_acc_p = df['chr_acc_p'].sum()
sum_len_captcha_text = df['captcha_text'].apply(len).sum()

print(f"chr_acc_hw: {sum_chr_acc_hw /sum_len_captcha_text}")
print(f"chr_acc_p: {sum_chr_acc_p /sum_len_captcha_text}")

chr_acc_hw: 0.4162432249322493
chr_acc_p: 0.5249830623306233


In [19]:
total_matches_p = (df['captcha_text'] == df['cleaned_predicted_text_printed']).sum()
total_matches_hw = (df['captcha_text'] == df['cleaned_predicted_text']).sum()

print(f"complete matches printed: {total_matches_p} / {len(df)}, {total_matches_p/len(df)}")
print(f"complete matches handwritten: {total_matches_hw} / {len(df)}, {total_matches_hw/len(df)}")


complete matches printed: 330 / 1967, 0.1677681748856126
complete matches handwritten: 219 / 1967, 0.11133706151499746


In [None]:
df.to_csv('trocr_basic_with_results.csv', index=False)