In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path
import regex
import pandas as pd

In [45]:
DATA_DIR = Path('..') / 'data'
CORPORA = DATA_DIR / 'corpora'

WORDS_EN = CORPORA / 'nltk' / 'en'
MASC_500 = CORPORA / 'masc_500k_texts'

US_STATES = CORPORA / 'traiter' / 'US_states.csv'
US_COUNTIES = CORPORA / 'traiter' / 'US_counties.csv'
NAME_PARTS = CORPORA / 'traiter' / 'name_parts.csv'

ITIS_TAXA = CORPORA / 'plant_taxa.csv'
CITIES = CORPORA / 'US' / 'US.txt'

OCR_CSV = DATA_DIR / 'OCR_IE-partII_score.csv'

SPLITTER = '[^a-z]+'
VOWELS = '[aeiouy]'
CHARS = r'[\u0020-\u007e\s]'

In [4]:
def strip_words(words):
    words = {w for w in words
             if len(w) == 1 or regex.search(VOWELS, w)}
    return words

In [5]:
corpora = set()

In [6]:
with open(WORDS_EN) as in_file:
    words = in_file.read().lower().split()
    corpora |= strip_words(words)

In [7]:
for path in MASC_500.glob('**/*.txt'):
    with open(path, encoding='ISO-8859-1') as text_file:
        text = text_file.read()
    words = set(regex.split(SPLITTER, text.lower()))
    corpora |= strip_words(words)

In [8]:
df = pd.read_csv(US_STATES, na_filter=False)
corpora |= set(df['postal'].str.lower())

words = df['state'].str.lower().str.split()
words = {regex.sub(SPLITTER, '', a) for aa in words for a in aa if a}

words = df['abbrev'].str.lower().str.split(',')
words = {regex.sub(SPLITTER, '', a) for aa in words for a in aa if a}
corpora |= strip_words(words)

In [9]:
df = pd.read_csv(US_COUNTIES, na_filter=False)
words = df['County'].str.lower().str.split(SPLITTER)
words = {a for aa in words for a in aa if a}
corpora |= strip_words(words)

In [10]:
df = pd.read_csv(NAME_PARTS, na_filter=False)
words = set(df['name'].str.lower())
corpora |= strip_words(words)

In [11]:
df = pd.read_csv(ITIS_TAXA, na_filter=False)
words = df['complete_name'].str.lower().str.split(SPLITTER)
words = {a for aa in words for a in aa if a}
corpora |= strip_words(words)

In [12]:
df = pd.read_csv(CITIES, sep='\t', header=None)
words = df[2].str.lower().str.split(SPLITTER)
words = {a for aa in words for a in aa if a}
corpora |= strip_words(words)

In [13]:
len(corpora)

335583

In [46]:
word_scores = []
char_scores = []

df = pd.read_csv(OCR_CSV, na_filter=False, index_col='subject_id')
for subject_id, row in df.iterrows():
    text = row['predicted_text'].lower()

    chars = list(text)
    found = sum(1 for c in chars if regex.match(CHARS, c))
    char_score = round(found / len(chars), 4)
    char_scores.append(char_score)

    words = regex.split(SPLITTER, text.lower())
    found = sum(1 for w in words if w in corpora)
    word_score = round(found / len(words), 4)
    word_scores.append(word_score)
    # print(f'{subject_id:<10}\t{word_score:0.4f}\t{char_score:0.4f}')

data = {
    'subject_id': df.index,
    'word_score': word_scores,
    'char_score': char_scores}

df2 = pd.DataFrame(data=data)
df2.to_csv(DATA_DIR / 'OCR_scores.csv', index=False)
df2.head()

Unnamed: 0,subject_id,word_score,char_score
0,11782933,0.625,0.9797
1,11782141,0.7465,0.984
2,11782436,0.9136,1.0
3,4128609,0.6,0.7222
4,11782773,0.9074,1.0


In [47]:
text = df.at[10668179, 'predicted_text']
chars = list(text)
for char in chars:
    print(f'{char} {regex.match(CHARS, char) is not None}')
#  = sum(1 for c in chars if regex.match(CHARS, c))

H True
e True
r True
b True
a True
r True
i True
u True
m True
  True
o True
f True
  True
t True
h True
e True
  True
U True
n True
i True
v True
e True
r True
s True
i True
t True
y True
  True
o True
f True
  True
N True
o True
r True
t True
h True
  True
C True
a True
r True
o True
l True
i True
n True
a True

 True
  True
S True
O True
U True
T True
H True
  True
C True
A True
R True
O True
L True
I True
N True
A True

 True
  True
C True
h True
a True
r True
l True
e True
s True
t True
o True
n True
  True
C True
o True
u True
n True
t True
y True

 True
  True

 True
  True
P True
t True
e True
r True
i True
s True
  True
v True
i True
t True
t True
a True
t True
a True
  True
L True
. True

 True
  True
W True
a True
l True
l True
s True
  True
o True
f True
  True
F True
o True
r True
t True
  True
M True
o True
u True
l True
t True
r True
i True
e True
, True
  True
S True
u True
l True
l True
i True
v True
a True
n True
' True
s True

 True
  True

 True
  True
I True
s True