In [67]:
import csv
import json
import numpy as np
import pandas as pd
import random
from pathlib import Path
from lxml import etree
from nltk import word_tokenize
from collections import Counter
from tqdm.notebook import tqdm
import pickle
from nltk.corpus import words
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet 
from scipy.special import softmax

In [2]:
# Directory whose entries are the OCR comparison result files read by the
# cells that follow.
ocr_path = Path("/home/allekim/stonybook-data/hathi/ocr_model_results/double_books/")
# Path.glob yields entries in an arbitrary, filesystem-dependent order. Sort
# them so positional lookups (result_paths[1]) and the seeded train/test split
# of this list give the same result on every run and machine.
result_paths = sorted(ocr_path.glob('*'))

In [3]:
def parse_entities(full_hid):
    """Collect the recurring words of named entities annotated for one volume.

    ``full_hid`` is split on its first ``.`` into two parts that, joined under
    the processed-data directory, locate ``character_coref_annotated.xml``.

    Returns:
        ``None`` when that annotation file does not exist; otherwise the set
        of alphabetic tokens that occur more than once across the ``phrase``
        attributes of all PERSON / ORGANIZATION / LOCATION ``<entity>``
        elements.

    Raises:
        ValueError: if ``full_hid`` contains no ``.``.
    """
    entity_types = {"PERSON", "ORGANIZATION", "LOCATION"}
    hlab, hid = full_hid.split('.', 1)
    hid_path = Path("/home/allekim/stonybook-data/hathi/processed/") / hlab / hid / 'character_coref_annotated.xml'
    if not hid_path.exists():
        return None
    root = etree.parse(str(hid_path))
    word_count = Counter()
    for e in root.iter('entity'):
        if e.get('ner') not in entity_types:
            continue
        phrase = e.get('phrase')
        if not phrase:
            # An <entity> without a phrase attribute has nothing to tokenize;
            # passing None to word_tokenize would raise instead of skipping it.
            continue
        # Named `tokens` (not `words`) so the nltk `words` corpus imported by
        # this notebook is not shadowed inside the function.
        tokens = [tok for tok in word_tokenize(phrase) if tok.isalpha()]
        word_count.update(tokens)
    # Keep only words seen at least twice.
    return {tok for tok, count in word_count.items() if count > 1}

In [4]:
# Union of the recurring entity words over every volume referenced by the
# result files.
all_entities = set()
# Volumes already handed to parse_entities; skipping repeats avoids re-parsing
# the same annotation XML if a hid recurs across result files.
seen_hids = set()
for path in tqdm(result_paths):
    # Only the first row's hid1/hid2 are used, so read just those two columns
    # of a single row instead of loading the whole file.
    first_row = pd.read_csv(path, usecols=['hid1', 'hid2'], nrows=1)
    if len(first_row) == 0:
        continue
    for hid in (first_row['hid1'].iloc[0], first_row['hid2'].iloc[0]):
        if hid in seen_hids:
            continue
        seen_hids.add(hid)
        entities = parse_entities(hid)
        # parse_entities returns None when the volume has no annotation file.
        if entities:
            all_entities.update(entities)

  0%|          | 0/10641 [00:00<?, ?it/s]

In [11]:
# Write the collected entity-word set to all_entities.pkl using pickle
# protocol 4.
with open('all_entities.pkl', mode='wb') as entity_file:
    pickle.dump(all_entities, entity_file, protocol=4)

In [37]:
# Accepted vocabulary: the NLTK word list plus the collected entity words.
en_words = set(words.words()).union(all_entities)
# Lower-cased copy, used for case-insensitive membership tests.
lower_en_words = {entry.lower() for entry in en_words}

In [51]:
# Load a single result file for interactive inspection. The ctx*/diff* cells
# are stored as text and are converted back into Python objects on read.
# NOTE(review): eval executes whatever expression a cell contains; this is
# acceptable only if these files come from a trusted source — consider
# ast.literal_eval if the cells are plain list/tuple literals (confirm).
df = pd.read_csv(result_paths[1], converters={'ctx1': eval, 'ctx2': eval, 'diff1': eval, 'diff2': eval})

In [58]:
def validate_ocr(orig, corr):
    """Check that a proposed correction consists only of known words.

    Args:
        orig: token sequence of the presumed OCR error; only its length is
            inspected.
        corr: token sequence of the presumed correction.

    Returns:
        False when ``orig`` is empty. Otherwise True exactly when every
        purely alphabetic token of ``corr`` is, lower-cased, a member of
        ``lower_en_words``. Non-alphabetic tokens are ignored, so a ``corr``
        with no alphabetic tokens (including an empty one) yields True.
    """
    if not len(orig):
        return False
    for token in corr:
        # Only alphabetic tokens are looked up in the vocabulary.
        if token.isalpha() and token.lower() not in lower_en_words:
            return False
    return True

def is_good_example(row, min_confidence=0.55):
    """Decide whether one result row is a usable (OCR error, correction) pair.

    Args:
        row: mapping or DataFrame row providing ``loss1``, ``loss2``,
            ``ctx1``, ``ctx2`` (token sequences) and ``diff1``, ``diff2``
            (start/end indices into the matching ``ctx``).
        min_confidence: minimum value the larger of ``softmax([loss1, loss2])``
            must reach for the row to be considered. Defaults to 0.55, the
            threshold this function previously hard-coded.

    Returns:
        False when the two losses are too close to separate; otherwise the
        result of ``validate_ocr`` with the lower-loss side as the correction.
    """
    loss1, loss2 = row['loss1'], row['loss2']
    # For two inputs the larger softmax value depends only on
    # |loss1 - loss2|, so this rejects rows whose losses are nearly equal.
    if max(softmax([loss1, loss2])) < min_confidence:
        return False
    diff1, diff2 = row['diff1'], row['diff2']
    # The differing token span on each side of the pair.
    ocr1 = row['ctx1'][diff1[0]:diff1[1]]
    ocr2 = row['ctx2'][diff2[0]:diff2[1]]
    # The side with the lower loss is treated as the correction, the other as
    # the OCR error; ties fall to side 2 as the correction.
    if loss1 < loss2:
        return validate_ocr(ocr2, ocr1)
    return validate_ocr(ocr1, ocr2)

In [59]:
# Display the loaded result frame (pandas abbreviates long frames).
# NOTE(review): the saved output already contains an `is_ok` column, which is
# only added by a later cell — the output appears to come from out-of-order
# execution; re-run from a fresh kernel to confirm.
df

Unnamed: 0,hid1,hid2,ctx1,ctx2,diff1,diff2,loss1,loss2,is_ok
0,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[SMITTEN, AND, SLAIN, ,, fire, !, fire, !, fir...","[``, It, must, be, agreed, that, if, a, worthy...","(0, 10)","(0, 0)",5.396681,5.094469,True
1,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[``, It, must, be, agreed, that, if, a, worthy...","[``, It, must, be, agreed, that, if, a, worthy...","(15, 16)","(15, 15)",5.501276,5.094469,True
2,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[``, It, must, be, agreed, that, if, a, worthy...","[``, It, must, be, agreed, that, if, a, worthy...","(45, 46)","(44, 45)",5.501276,5.094469,True
3,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[HOWEVER, much, men, may, differ, from, one, a...","[HOWEVER, much, men, may, differ, from, one, a...","(8, 9)","(8, 8)",4.298444,3.983488,True
4,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[For, every, one, yaivns, sometimes, .]","[For, every, one, yawns, sometimes, .]","(3, 4)","(3, 4)",8.544419,8.164243,False
...,...,...,...,...,...,...,...,...,...
495,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[And, ere, many, seconds, had, elapsed, ,, Mau...","[And, ere, many, seconds, had, elapsed, ,, Mau...","(49, 50)","(49, 49)",4.663724,4.486324,True
496,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[How, I, wish, I, had, never, acquired, d, thi...","[How, I, wish, I, had, never, acquired, this, ...","(7, 8)","(7, 7)",6.077575,4.806390,True
497,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[To, satisfy, him, ,, though, only, too, well,...","[To, satisfy, him, ,, though, only, too, well,...","(10, 11)","(10, 10)",5.435009,5.171368,True
498,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[Has, she, had, any, particular, excitement, t...","[``, Has, she, had, any, particular, excitemen...","(6, 7)","(7, 7)",6.366374,5.640049,True


In [60]:
# Label every row with the is_good_example verdict. This adds a column to df
# in place, so any later display of df includes `is_ok`.
df['is_ok'] = df.apply(is_good_example, axis=1)

Unnamed: 0,hid1,hid2,ctx1,ctx2,diff1,diff2,loss1,loss2
0,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[SMITTEN, AND, SLAIN, ,, fire, !, fire, !, fir...","[``, It, must, be, agreed, that, if, a, worthy...","(0, 10)","(0, 0)",5.396681,5.094469
1,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[``, It, must, be, agreed, that, if, a, worthy...","[``, It, must, be, agreed, that, if, a, worthy...","(15, 16)","(15, 15)",5.501276,5.094469
2,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[``, It, must, be, agreed, that, if, a, worthy...","[``, It, must, be, agreed, that, if, a, worthy...","(45, 46)","(44, 45)",5.501276,5.094469
3,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[HOWEVER, much, men, may, differ, from, one, a...","[HOWEVER, much, men, may, differ, from, one, a...","(8, 9)","(8, 8)",4.298444,3.983488
6,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[Li, Francho, ,, Lino\sam, 's, father, ,, was,...","[Li, Francho, ,, Lingsam, 's, father, ,, was, ...","(3, 4)","(3, 4)",5.695899,5.364439
...,...,...,...,...,...,...,...,...
494,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[The, lamp, of, her, life, was, but, flickerin...","[The, lamp, of, her, life, was, but, nickering...","(7, 8)","(7, 8)",5.716770,6.591751
496,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[How, I, wish, I, had, never, acquired, d, thi...","[How, I, wish, I, had, never, acquired, this, ...","(7, 8)","(7, 7)",6.077575,4.806390
497,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[To, satisfy, him, ,, though, only, too, well,...","[To, satisfy, him, ,, though, only, too, well,...","(10, 11)","(10, 10)",5.435009,5.171368
498,uc2.ark+=13960=t2f76fq6r,uc1.b3326445,"[Has, she, had, any, particular, excitement, t...","[``, Has, she, had, any, particular, excitemen...","(6, 7)","(7, 7)",6.366374,5.640049


In [69]:
# Split the result paths 80/20 into train and test. The fixed random_state
# makes the shuffle deterministic, so the split is reproducible as long as
# result_paths is in the same order on every run.
train_paths, test_paths = train_test_split(result_paths, test_size=0.2, random_state=1729)

In [74]:
def write_full_csv(fname, paths):
    """Filter each result file with ``is_good_example`` and save the survivors.

    Args:
        fname: destination CSV path; written without the index column.
        paths: iterable of result CSV paths to read.

    Files with no rows are skipped. ``pd.concat`` raises ``ValueError`` when
    no file contributed a frame.
    """
    # NOTE(review): eval executes whatever expression a cell contains; only
    # safe for trusted input files.
    converters = {column: eval for column in ('ctx1', 'ctx2', 'diff1', 'diff2')}
    kept_frames = []
    for p in tqdm(paths):
        frame = pd.read_csv(p, converters=converters)
        if len(frame) == 0:
            continue
        # Flag rows, keep the flagged ones, then drop the helper column so the
        # output has the same columns as the input files.
        flagged = frame.assign(is_ok=frame.apply(is_good_example, axis=1))
        kept_frames.append(flagged[flagged['is_ok'] == True].drop(columns='is_ok'))
    pd.concat(kept_frames).to_csv(fname, index=False)

In [75]:
# Filter the training-split result files and write the kept rows to train.csv.
write_full_csv('train.csv', train_paths)

  0%|          | 0/8512 [00:00<?, ?it/s]

In [76]:
# Filter the test-split result files and write the kept rows to test.csv.
write_full_csv('test.csv', test_paths)

  0%|          | 0/2129 [00:00<?, ?it/s]

In [83]:
# Scratch check: a length-5 integer array of zeros.
x = np.zeros((5,), dtype=int)

In [84]:
# Scratch check: display the array before the slice assignment.
x

array([0, 0, 0, 0, 0])

In [85]:
# Scratch check: assign a scalar to the half-open slice [2, 4); only indices
# 2 and 3 change.
x[2:4] = 1

In [86]:
# Scratch check: display the array after the slice assignment.
x

array([0, 0, 1, 1, 0])