In [90]:
import fitz
import numpy as np
import pandas as pd
from tqdm import tqdm

import io
from PIL import Image, ImageDraw, ImageFont, ImageColor

import math
import re

In [2]:
# For each volume: the source PDF path, the opened PyMuPDF document, and a
# list holding one page object per page index of that document.
vol1_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf'
vol1_doc = fitz.open(vol1_path)
vol1_pages = [vol1_doc[page_idx] for page_idx in range(vol1_doc.page_count)]

vol2_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf'
vol2_doc = fitz.open(vol2_path)
vol2_pages = [vol2_doc[page_idx] for page_idx in range(vol2_doc.page_count)]

vol3_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf'
vol3_doc = fitz.open(vol3_path)
vol3_pages = [vol3_doc[page_idx] for page_idx in range(vol3_doc.page_count)]

In [3]:
# Per-volume tables loaded from pickles (presumably one row per extracted
# character -- TODO confirm against the notebook that writes them), each
# followed by the page indices of that volume's index section.
# NOTE: range() excludes its end value, so e.g. volume 1 covers 616..638.
vol1_char_df = pd.read_pickle("../input/char_df/vol1_df.pkl")
vol1_index = [*range(616, 639)]

vol2_char_df = pd.read_pickle("../input/char_df/vol2_df.pkl")
vol2_index = [*range(703, 725)]

vol3_char_df = pd.read_pickle("../input/char_df/vol3_df.pkl")
vol3_index = [*range(555, 583)]

In [4]:
# Rendering resolution for page images. The same TARGET_DPI / 72 factor is
# applied to both axes of the matrix used when rasterising pages.
TARGET_DPI = 300
zoom = TARGET_DPI/ 72
mat = fitz.Matrix(zoom, zoom)

### finding the columns 
### & checking if a word is a strict match for the genus / epithet pattern

In [5]:
def epithet_match(row):
    """Strict epithet pattern: first word of its line, purely alphabetic, all lowercase.

    `row` must provide 'word_num' and 'word'.
    """
    if row['word_num'] != 0:
        return False
    token = row['word']
    return token.isalpha() and token.islower()

def genus_match(row):
    """Strict genus pattern: first word of its line, purely alphabetic, capitalised.

    "Capitalised" means an uppercase first letter followed by one or more
    lowercase letters. `row` must provide 'word_num' and 'word'.
    """
    if row['word_num'] != 0:
        return False
    token = row['word']
    if not token.isalpha():
        return False
    head, tail = token[0], token[1:]
    return head.isupper() and tail.islower()

In [19]:
def get_center_x0(vol_char_df, page_num, bias = 30):
    """Return the x threshold that separates the two columns of a page.

    The threshold is the midpoint between the leftmost x0 and the rightmost
    x1 of all 'line_bbox' values on `page_num`, shifted left by `bias`.

    WARNING: a large bias causes miscategorization of the page number printed
    in the book.
    """
    page_boxes = vol_char_df.loc[vol_char_df['page_num'] == page_num, 'line_bbox']

    # Horizontal extent covered by any line bounding box on the page.
    leftmost = page_boxes.map(lambda box : box[0]).min()
    rightmost = page_boxes.map(lambda box : box[2]).max()

    midpoint = (rightmost + leftmost) / 2
    return midpoint - bias


def get_col_num(coords, center_x0):
    """Return 1 when the box's left edge is at or right of `center_x0`, else 0.

    `coords` is an (x0, y0, x1, y1) bounding box; only x0 is used.
    """
    left, _top, _right, _bottom = coords
    return 1 if left >= center_x0 else 0


all_vol_data = [(vol1_char_df, vol1_index, vol1_doc),
                (vol2_char_df, vol2_index, vol2_doc),
                (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, doc in all_vol_data:
    # Strict genus / epithet pattern flags. Rows outside the index pages are
    # always False, so the row-wise apply only runs on the index pages
    # instead of on the whole book.
    in_index = vol_char_df['page_num'].isin(vol_index)
    index_rows = vol_char_df[in_index]
    for flag_col, matcher in (('genus_index_pat_match', genus_match),
                              ('epithet_index_pat_match', epithet_match)):
        vol_char_df[flag_col] = False
        if not index_rows.empty:
            vol_char_df.loc[in_index, flag_col] = index_rows.apply(matcher, axis = 1).astype(bool)

    # Column number (0 = left, 1 = right), decided page by page from that
    # page's own centre. Previously every iteration overwrote 'col_num' for
    # the whole volume, so only the last index page's centre was in effect.
    # Rows outside the index pages keep the default 0.
    vol_char_df['col_num'] = 0
    for page_num in tqdm(vol_index):
        on_page = vol_char_df['page_num'] == page_num
        if not on_page.any():
            continue  # nothing extracted for this page
        # centre is based on the x extent of the line boxes on this page
        center_x0 = get_center_x0(vol_char_df, page_num)
        vol_char_df.loc[on_page, 'col_num'] = vol_char_df.loc[on_page, 'line_bbox'].apply(
            lambda coords : get_col_num(coords, center_x0))

100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 22/22 [00:13<00:00,  1.58it/s]
100%|██████████| 28/28 [00:14<00:00,  1.90it/s]


#### testing if col num correctly assigned

In [18]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "index_col_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "index_col_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "index_col_vol3")]

# Per-character columns; leaving them out lets drop_duplicates() collapse the
# character rows that share a line box.
per_char_cols = ["char_num", "char", "char_origin", "char_bbox", "char_x0", "char_y0", "char_x1", "char_y1", "pruned_char_x0", "pruned_char_y0", "pruned_char_x1", "pruned_char_y1"]

for vol_char_df, vol_index, vol_doc, output_name in all_vol_data:
    keep_cols = vol_char_df.columns.difference(per_char_cols, sort=False).tolist()
    image_list = []
    for page_num in tqdm(vol_index):
        # Rasterise the page and open it as a PIL image to draw on.
        page_bytes = vol_doc.get_page_pixmap(page_num,matrix=mat).tobytes()
        image = Image.open(io.BytesIO(page_bytes))
        draw = ImageDraw.Draw(image)

        temp_df = vol_char_df.loc[vol_char_df["page_num"] == page_num, keep_cols].drop_duplicates()

        # Column 0 line boxes in coral, column 1 line boxes in blue.
        for col, hex_colour in ((0, "#FF7F50"), (1, "#003399")):
            for box in temp_df.loc[temp_df['col_num'] == col, 'line_bbox']:
                # PDF points -> pixels at TARGET_DPI
                x0, y0, x1, y1 = [v*TARGET_DPI/ 72 for v in box]
                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb(hex_colour), width=5)

        image_list.append(image)

    # one multi-page PDF per volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])


100%|██████████| 23/23 [00:04<00:00,  5.59it/s]
100%|██████████| 22/22 [00:03<00:00,  5.56it/s]
100%|██████████| 28/28 [00:04<00:00,  5.97it/s]


### Genus / epithet flagging 
Flag pages where the number of strict genus or epithet pattern matches is less than 3 in a column.

In [25]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "strickt_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "strickt_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "strickt_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data:
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])

    # Each flag is (number of matches, 1-based page position within the index, column).
    genus_flag_list = []
    epithet_flag_list = []
    for page_num in tqdm(vol_index):
        on_page = vol_char_df['page_num'] == page_num
        genus_db = vol_char_df.loc[on_page & (vol_char_df['genus_index_pat_match'] == True),
                                   word_level_cols].drop_duplicates()
        epithet_db = vol_char_df.loc[on_page & (vol_char_df['epithet_index_pat_match'] == True),
                                     word_level_cols].drop_duplicates()

        # Counts are checked per column (half page), not for the entire page.
        rel_page = page_num - vol_index[0] + 1
        for col in range(2):
            num_genus_col = genus_db[genus_db["col_num"] == col].shape[0]
            num_epithet_col = epithet_db[epithet_db["col_num"] == col].shape[0]
            if num_genus_col <= 2:
                genus_flag_list.append((num_genus_col, rel_page, col))
            if num_epithet_col <= 2:
                epithet_flag_list.append((num_epithet_col, rel_page, col))

    flagged_pages = {flag[1] for flag in genus_flag_list + epithet_flag_list}
    num_flag_pages = len(flagged_pages)
    if num_flag_pages > 0:
        print("***FLAGS***")
        print(f" number of pages to check: {num_flag_pages}")
        if genus_flag_list:
            print("  genera")
            for count, flagged_page, flagged_col in genus_flag_list:
                print(f"\t number of genera: {count}, page number: {flagged_page}, column number: {flagged_col}")
        if epithet_flag_list:
            print("  epithets")
            for count, flagged_page, flagged_col in epithet_flag_list:
                print(f"\t number of epithets: {count}, page number: {flagged_page}, column number: {flagged_col}")

100%|██████████| 23/23 [00:00<00:00, 78.39it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 1, page number: 2, column number: 0
	 number of genera: 2, page number: 15, column number: 1
	 number of genera: 0, page number: 20, column number: 1
	 number of genera: 1, page number: 23, column number: 0
  epithets
	 number of epithets: 2, page number: 23, column number: 1


100%|██████████| 22/22 [00:00<00:00, 75.92it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 2, page number: 4, column number: 0
	 number of genera: 1, page number: 4, column number: 1
	 number of genera: 0, page number: 5, column number: 0
	 number of genera: 1, page number: 12, column number: 0
	 number of genera: 2, page number: 14, column number: 1


100%|██████████| 28/28 [00:00<00:00, 86.63it/s]

***FLAGS***
 number of pages to check: 7
  genera
	 number of genera: 1, page number: 2, column number: 1
	 number of genera: 0, page number: 6, column number: 0
	 number of genera: 1, page number: 21, column number: 0
	 number of genera: 1, page number: 22, column number: 0
	 number of genera: 2, page number: 24, column number: 1
	 number of genera: 2, page number: 26, column number: 0
	 number of genera: 0, page number: 26, column number: 1
	 number of genera: 2, page number: 28, column number: 0





### match  based on coordinates

In [26]:
def is_coord_match(x, x_ref_left, x_ref_right, margin):
    """Tell whether the left edge of box `x` lies within `margin` of either reference x.

    `x` is a bounding box whose first element is its left edge (x0). The
    bounds are inclusive on both sides.
    """
    left_edge = x[0]
    for anchor in (x_ref_left, x_ref_right):
        if anchor - margin <= left_edge <= anchor + margin:
            return True
    return False

#### epithet

In [45]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data:
    vol_char_df["epithet_coord_match"] = False

    is_strict_epithet = vol_char_df["epithet_index_pat_match"] == True
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])
    # Tolerance: 1.25 x the mean character width over all strict epithet
    # matches of the volume. It does not depend on the page, so it is
    # computed once per volume.
    margin = 1.25 * vol_char_df[is_strict_epithet]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()

    for page_num in tqdm(vol_index):
        on_page = vol_char_df["page_num"] == page_num
        epithet_df = vol_char_df.loc[on_page & is_strict_epithet, word_level_cols].drop_duplicates()

        # For every strict epithet on the page, count the strict epithets whose
        # left edge lies within `margin` of its own left edge and record the
        # mean left edge of that group, separately per column.
        x0_values = epithet_df["word_bbox"].apply(lambda b : b[0])
        candidates = [[], []]
        for x_ref, col in zip(x0_values, epithet_df["col_num"]):
            near = x0_values[(x0_values >= x_ref - margin) & (x0_values <= x_ref + margin)]
            candidates[col].append((near.shape[0], near.mean()))

        # The largest group of each column gives that column's epithet
        # indentation; -1 marks a column without any strict epithet match.
        mean_left_epithet = max(candidates[0], default = [-1, -1])[1]
        mean_right_epithet = max(candidates[1], default = [-1, -1])[1]

        # The "both missing" case must be tested first: it was previously
        # placed after the "either missing" test and could never be reached,
        # so such pages were compared against an anchor of -1.
        if mean_left_epithet == -1 and mean_right_epithet == -1:
            vol_char_df.loc[on_page, "epithet_coord_match"] = False
        elif mean_left_epithet == -1 or mean_right_epithet == -1:
            # only one column has an estimate: use it as the single anchor
            mean_valid_col = max(mean_left_epithet, mean_right_epithet)
            vol_char_df.loc[on_page, "epithet_coord_match"] = vol_char_df.loc[on_page, "pruned_word_bbox"].apply(
                lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        else:
            vol_char_df.loc[on_page, "epithet_coord_match"] = vol_char_df.loc[on_page, "pruned_word_bbox"].apply(
                lambda x : is_coord_match(x, mean_left_epithet, mean_right_epithet, margin))

100%|██████████| 23/23 [00:01<00:00, 19.76it/s]
100%|██████████| 22/22 [00:01<00:00, 20.05it/s]
100%|██████████| 28/28 [00:01<00:00, 23.06it/s]


In [47]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "epithet_coord_match_pruned_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "epithet_coord_match_pruned_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "epithet_coord_match_pruned_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data:
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])
    image_list = []

    for page_num in tqdm(vol_index):
        # Rasterise the page and open it as a PIL image to draw on.
        page_bytes = doc.get_page_pixmap(page_num,matrix=mat).tobytes()
        image = Image.open(io.BytesIO(page_bytes))
        draw = ImageDraw.Draw(image)

        on_page = vol_char_df['page_num'] == page_num
        epithet_coord_db = vol_char_df.loc[on_page & (vol_char_df['epithet_coord_match'] == True),
                                           word_level_cols].drop_duplicates()
        epithet_db = vol_char_df.loc[on_page & (vol_char_df['epithet_index_pat_match'] == True),
                                     word_level_cols].drop_duplicates()

        # Drawn in this order: coordinate matches (pruned boxes) in coral,
        # width 5, then strict pattern matches in blue, width 3.
        overlays = ((epithet_coord_db["pruned_word_bbox"], "#FF7F50", 5),
                    (epithet_db['word_bbox'], "#003399", 3))
        for boxes, hex_colour, line_width in overlays:
            for box in boxes:
                # PDF points -> pixels at TARGET_DPI
                x0, y0, x1, y1 = [v*TARGET_DPI/ 72 for v in box]
                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb(hex_colour), width=line_width)

        image_list.append(image)

    # one multi-page PDF per volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.49it/s]
100%|██████████| 22/22 [00:04<00:00,  5.45it/s]
100%|██████████| 28/28 [00:04<00:00,  5.82it/s]


#### Genus coord match

In [58]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data:
    vol_char_df["genus_coord_match"] = False

    is_strict_genus = vol_char_df["genus_index_pat_match"] == True
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])
    # Tolerance: 1.25 x the mean character width over all strict genus
    # matches of the volume. It does not depend on the page, so it is
    # computed once per volume.
    margin = 1.25 * vol_char_df[is_strict_genus]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()

    for page_num in tqdm(vol_index):
        on_page = vol_char_df["page_num"] == page_num
        genus_df = vol_char_df.loc[on_page & is_strict_genus, word_level_cols].drop_duplicates()

        # For every strict genus on the page, count the strict genera whose
        # left edge lies within `margin` of its own left edge and record the
        # mean left edge of that group, separately per column.
        x0_values = genus_df["word_bbox"].apply(lambda b : b[0])
        candidates = [[], []]
        for x_ref, col in zip(x0_values, genus_df["col_num"]):
            near = x0_values[(x0_values >= x_ref - margin) & (x0_values <= x_ref + margin)]
            candidates[col].append((near.shape[0], near.mean()))

        # The largest group of each column gives that column's genus
        # indentation; -1 marks a column without any strict genus match.
        mean_left_genus = max(candidates[0], default = [-1, -1])[1]
        mean_right_genus = max(candidates[1], default = [-1, -1])[1]

        # The "both missing" case must be tested first: it was previously
        # placed after the "either missing" test and could never be reached,
        # so such pages were compared against an anchor of -1.
        if mean_left_genus == -1 and mean_right_genus == -1:
            vol_char_df.loc[on_page, "genus_coord_match"] = False
        elif mean_left_genus == -1 or mean_right_genus == -1:
            # only one column has an estimate: use it as the single anchor
            mean_valid_col = max(mean_left_genus, mean_right_genus)
            vol_char_df.loc[on_page, "genus_coord_match"] = vol_char_df.loc[on_page, "pruned_word_bbox"].apply(
                lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        else:
            vol_char_df.loc[on_page, "genus_coord_match"] = vol_char_df.loc[on_page, "pruned_word_bbox"].apply(
                lambda x : is_coord_match(x, mean_left_genus, mean_right_genus, margin))

100%|██████████| 23/23 [00:00<00:00, 45.68it/s]
100%|██████████| 22/22 [00:00<00:00, 44.01it/s]
100%|██████████| 28/28 [00:00<00:00, 52.69it/s]


In [59]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "genus_coord_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "genus_coord_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "genus_coord_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data:
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])
    image_list = []

    for page_num in tqdm(vol_index):
        # Rasterise the page and open it as a PIL image to draw on.
        page_bytes = doc.get_page_pixmap(page_num,matrix=mat).tobytes()
        image = Image.open(io.BytesIO(page_bytes))
        draw = ImageDraw.Draw(image)

        on_page = vol_char_df['page_num'] == page_num
        genus_coord_db = vol_char_df.loc[on_page & (vol_char_df['genus_coord_match'] == True),
                                         word_level_cols].drop_duplicates()
        epithet_db = vol_char_df.loc[on_page & (vol_char_df['epithet_coord_match'] == True),
                                     word_level_cols].drop_duplicates()

        # Drawn in this order: genus coordinate matches in coral, width 5,
        # then epithet coordinate matches in dark blue, width 3.
        overlays = ((genus_coord_db['word_bbox'], "#FF7F50", 5),
                    (epithet_db['word_bbox'], "#000099", 3))
        for boxes, hex_colour, line_width in overlays:
            for box in boxes:
                # PDF points -> pixels at TARGET_DPI
                x0, y0, x1, y1 = [v*TARGET_DPI/ 72 for v in box]
                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb(hex_colour), width=line_width)

        image_list.append(image)

    # one multi-page PDF per volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.57it/s]
100%|██████████| 22/22 [00:04<00:00,  5.33it/s]
100%|██████████| 28/28 [00:05<00:00,  5.58it/s]


### extract potential genus / epithet matches

In [173]:
def potential_genus_match(row):
    """Candidate genus: at the genus indentation but not at the epithet one.

    The word must additionally not be all uppercase, not be numeric, and not
    contain "Flore". `row` must provide 'genus_coord_match',
    'epithet_coord_match' and 'word'.
    """
    at_genus_indent_only = row['genus_coord_match'] == True and row['epithet_coord_match'] == False
    if not at_genus_indent_only:
        return False
    token = row['word']
    return not (token.isupper() or token.isnumeric() or "Flore" in token)

def potential_epithet_match(row):
    """Candidate epithet: at the epithet indentation, not all uppercase, not numeric.

    `row` must provide 'epithet_coord_match' and 'word'.
    """
    if not row['epithet_coord_match'] == True:
        return False
    token = row['word']
    return not (token.isupper() or token.isnumeric())

In [174]:
# Add the two candidate flags to every volume, row by row.
for vol_df in (vol1_char_df, vol2_char_df, vol3_char_df):
    vol_df['potential_genus_match'] = vol_df.apply(potential_genus_match, axis = 1)
    vol_df['potential_epithet_match'] = vol_df.apply(potential_epithet_match, axis = 1)

In [175]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "GE_potential_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "GE_potential_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "GE_potential_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data:
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])
    image_list = []

    for page_num in tqdm(vol_index):
        # Rasterise the page and open it as a PIL image to draw on.
        page_bytes = doc.get_page_pixmap(page_num,matrix=mat).tobytes()
        image = Image.open(io.BytesIO(page_bytes))
        draw = ImageDraw.Draw(image)

        on_page = vol_char_df['page_num'] == page_num
        genus_db = vol_char_df.loc[on_page & (vol_char_df['potential_genus_match'] == True),
                                   word_level_cols].drop_duplicates()
        epithet_db = vol_char_df.loc[on_page & (vol_char_df['potential_epithet_match'] == True),
                                     word_level_cols].drop_duplicates()

        # Drawn in this order: potential genera in coral, width 5, then
        # potential epithets in dark blue, width 3.
        overlays = ((genus_db['word_bbox'], "#FF7F50", 5),
                    (epithet_db['word_bbox'], "#000099", 3))
        for boxes, hex_colour, line_width in overlays:
            for box in boxes:
                # PDF points -> pixels at TARGET_DPI
                x0, y0, x1, y1 = [v*TARGET_DPI/ 72 for v in box]
                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb(hex_colour), width=line_width)

        image_list.append(image)

    # one multi-page PDF per volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.38it/s]
100%|██████████| 22/22 [00:03<00:00,  5.53it/s]
100%|██████████| 28/28 [00:04<00:00,  5.79it/s]


### infra species

In [184]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

# NOTE(review): the next cell re-initialises "infra_coord_match" and recomputes
# it from the potential_* flags, so under Restart & Run All the result of this
# cell is overwritten.
for vol_char_df, vol_index in all_vol_data:
    vol_char_df["infra_coord_match"] = False

    # Masks and tolerance that do not depend on the page are built once per volume.
    is_epithet = vol_char_df["epithet_coord_match"] == True
    is_genus = vol_char_df["genus_coord_match"] == True
    is_first_word = vol_char_df["word_num"] == 0
    in_column = [vol_char_df["col_num"] == col for col in (0, 1)]

    # Tolerance: 1.25 x the mean character width over all epithet / genus coordinate matches.
    margin = 1.25 * vol_char_df[is_epithet | is_genus]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()

    for page_num in tqdm(vol_index):
        on_page = vol_char_df["page_num"] == page_num

        # Expected left edge of an infraspecific entry, per column: the
        # epithet indentation plus one more genus -> epithet indentation step.
        infra_x0 = []
        for col_mask in in_column:
            mean_epithet = vol_char_df.loc[on_page & col_mask & is_epithet, "word_bbox"].apply(lambda x : x[0]).mean()
            mean_genus = vol_char_df.loc[on_page & col_mask & is_genus, "word_bbox"].apply(lambda x : x[0]).mean()
            if math.isnan(mean_genus):
                # No genus in this column of the page: take the step from the
                # whole volume's means for this column instead.
                mean_genus_all = vol_char_df.loc[col_mask & is_genus, "word_bbox"].apply(lambda x : x[0]).mean()
                mean_epithet_all = vol_char_df.loc[col_mask & is_epithet, "word_bbox"].apply(lambda x : x[0]).mean()
                mean_tab = mean_epithet_all - mean_genus_all
            else:
                mean_tab = mean_epithet - mean_genus
            infra_x0.append(mean_epithet + mean_tab)

        # Only the first word of a line can start an infraspecific entry.
        target_rows = on_page & is_first_word
        vol_char_df.loc[target_rows, "infra_coord_match"] = vol_char_df.loc[target_rows, "pruned_word_bbox"].apply(
            lambda x : is_coord_match(x, infra_x0[0], infra_x0[1], margin))

100%|██████████| 23/23 [00:01<00:00, 18.72it/s]
100%|██████████| 22/22 [00:01<00:00, 18.23it/s]
100%|██████████| 28/28 [00:01<00:00, 20.92it/s]


In [194]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data:
    vol_char_df["infra_coord_match"] = False

    # Masks and tolerance that do not depend on the page are built once per volume.
    is_epithet = vol_char_df["potential_epithet_match"] == True
    is_genus = vol_char_df["potential_genus_match"] == True
    is_first_word = vol_char_df["word_num"] == 0
    in_column = [vol_char_df["col_num"] == col for col in (0, 1)]

    # Tolerance: 1.25 x the mean character width over all potential epithet / genus matches.
    margin = 1.25 * vol_char_df[is_epithet | is_genus]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()

    for page_num in tqdm(vol_index):
        on_page = vol_char_df["page_num"] == page_num

        # Expected left edge of an infraspecific entry, per column: the
        # epithet indentation plus one more genus -> epithet indentation step.
        infra_x0 = []
        for col_mask in in_column:
            mean_epithet = vol_char_df.loc[on_page & col_mask & is_epithet, "word_bbox"].apply(lambda x : x[0]).mean()
            mean_genus = vol_char_df.loc[on_page & col_mask & is_genus, "word_bbox"].apply(lambda x : x[0]).mean()
            if math.isnan(mean_genus):
                # No genus in this column of the page: take the step from the
                # whole volume's means for this column instead.
                mean_genus_all = vol_char_df.loc[col_mask & is_genus, "word_bbox"].apply(lambda x : x[0]).mean()
                mean_epithet_all = vol_char_df.loc[col_mask & is_epithet, "word_bbox"].apply(lambda x : x[0]).mean()
                mean_tab = mean_epithet_all - mean_genus_all
            else:
                mean_tab = mean_epithet - mean_genus
            infra_x0.append(mean_epithet + mean_tab)

        # Only the first word of a line can start an infraspecific entry.
        target_rows = on_page & is_first_word
        vol_char_df.loc[target_rows, "infra_coord_match"] = vol_char_df.loc[target_rows, "word_bbox"].apply(
            lambda x : is_coord_match(x, infra_x0[0], infra_x0[1], margin))

100%|██████████| 23/23 [00:01<00:00, 19.55it/s]
100%|██████████| 22/22 [00:01<00:00, 18.09it/s]
100%|██████████| 28/28 [00:01<00:00, 20.81it/s]


In [203]:
def potential_author_match_infra_coord(word):
    """Tell whether a word looks like author text rather than an infraspecific entry.

    A word is treated as author text when it is not an infraspecific marker
    (var, subsp, ssp, spp, x, ×; compared case-insensitively) and it either
    starts with an uppercase letter or is one of the listed Latin connectives
    (et, in, non, &, er, nec, mult, ex, fil).

    Parameters
    ----------
    word : str
        The word to classify. An empty string returns False.

    Returns
    -------
    bool
    """
    # NOTE(review): inside a character class "|" is a literal pipe and "\b" a
    # backspace, so these patterns also accept a trailing "|" after the
    # keyword. Left unchanged to keep the classification of existing data.
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$"
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"

    if re.search(infra_symbols, word.lower()) is not None:
        return False
    is_latin_connective = re.search(latin_connectives, word) is not None
    # word[:1] instead of word[0] so that an empty word does not raise IndexError
    return word[:1].isupper() or is_latin_connective

In [230]:
# Spot check: "fil." is one of the listed Latin connectives, so it is classified as author text.
potential_author_match_infra_coord("fil.")

True

In [204]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

# A potential infraspecific entry sits at the infra indentation and is not
# classified as author text.
for vol_char_df, _ in all_vol_data:
    at_infra_indent = vol_char_df["infra_coord_match"].eq(True)
    looks_like_author = vol_char_df['word'].apply(potential_author_match_infra_coord)
    vol_char_df["potential_infra_match"] = at_infra_indent & looks_like_author.eq(False)

In [224]:
def has_infra_symbols(word):
    """Tell whether `word` is an infraspecific marker (var, subsp, ssp, spp, x, ×).

    The match is case-sensitive and allows one optional trailing character
    from the pattern's character class (e.g. "var.").
    """
    marker_pattern = re.compile(r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$")
    found = marker_pattern.search(word)
    return found is not None

In [235]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_data:
    # Dropping the per-character columns lets drop_duplicates() collapse the
    # character rows of a word into a single row.
    word_level_cols = ~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])

    # Masks that do not depend on the page are built once per volume. In
    # particular the marker test was previously re-applied to every word of
    # the volume for each page.
    at_infra_indent = vol_char_df['infra_coord_match'] == True
    is_potential_infra = vol_char_df['potential_infra_match'] == True
    is_infra_marker = vol_char_df['word'].apply(has_infra_symbols) == True

    image_list = []

    for page_num in tqdm(vol_index):
        # Rasterise the page and open it as a PIL image to draw on.
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)

        on_page = vol_char_df['page_num'] == page_num
        infra_coord_db = vol_char_df.loc[on_page & at_infra_indent, word_level_cols].drop_duplicates()
        infra_db = vol_char_df.loc[on_page & is_potential_infra, word_level_cols].drop_duplicates()
        with_infra_symbols = vol_char_df.loc[on_page & at_infra_indent & is_infra_marker,
                                             word_level_cols].drop_duplicates()

        # Three nested outlines, outermost first:
        #   blue, width 7, padded by 5 px  -> word at the infra indentation
        #   coral, width 5, padded by 3 px -> potential infra match (not author text)
        #   dark red, width 3, no padding  -> word is an infraspecific marker
        overlays = ((infra_coord_db['word_bbox'], 5, "#003399", 7),
                    (infra_db['word_bbox'], 3, "#FF7F50", 5),
                    (with_infra_symbols['word_bbox'], 0, "#990000", 3))
        for boxes, pad, hex_colour, line_width in overlays:
            for coord in boxes:
                # PDF points -> pixels at TARGET_DPI
                x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
                draw.rectangle((x0-pad, y0-pad, x1+pad, y1+pad), fill=None, outline=ImageColor.getrgb(hex_colour), width=line_width)

        image_list.append(image)

    # one multi-page PDF per volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [00:25<00:00,  1.11it/s]
100%|██████████| 22/22 [00:23<00:00,  1.06s/it]
100%|██████████| 23/23 [00:22<00:00,  1.03it/s]


### functions for author matching 
Used to detect anomalies in the epithet and infraspecies indentation.

In [216]:
# Volume 1: potential infra matches whose word is not an infraspecific marker.
vol1_char_df.loc[
    vol1_char_df['potential_infra_match'].eq(True)
    & vol1_char_df['word'].apply(has_infra_symbols).eq(False),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word


In [218]:
# Volume 2: potential infra matches whose word is not an infraspecific marker.
vol2_char_df.loc[
    vol2_char_df['potential_infra_match'].eq(True)
    & vol2_char_df['word'].apply(has_infra_symbols).eq(False),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word


In [219]:
# Volume 3: potential infra matches whose word is not an infraspecific marker.
vol3_char_df.loc[
    vol3_char_df['potential_infra_match'].eq(True)
    & vol3_char_df['word'].apply(has_infra_symbols).eq(False),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word
1559345,557,(3
1559874,557,f.
1561502,558,fa
1566359,560,deris
1570483,562,cock
1576678,565,f.
1578491,566,picha
1581443,567,adoxifolium
1582167,568,fil.
1584378,569,yar.


In [220]:
def potential_author_match_epithet_coord(word):
    """Tell whether a word at the epithet indentation looks like author text.

    A word is treated as author text when it is one of the listed Latin
    connectives (et, in, non, &, er, nec, mult, ex, fil, f) or when it starts
    with an uppercase letter and is not the hybrid sign "X".

    Parameters
    ----------
    word : str
        The word to classify. An empty string returns False.

    Returns
    -------
    bool
    """
    # NOTE(review): inside a character class "|" is a literal pipe, so these
    # patterns also accept a trailing "|" after the keyword. Left unchanged to
    # keep the classification of existing data.
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$|^\s?f[\s|.]?$"
    if re.search(latin_connectives, word) is not None:
        return True
    is_hybrid = word == "X"
    # word[:1] instead of word[0] so that an empty word does not raise IndexError
    return word[:1].isupper() and (not is_hybrid)

In [221]:
# Volume 1: potential epithet matches whose word looks like author text.
vol1_char_df.loc[
    vol1_char_df['potential_epithet_match'].eq(True)
    & vol1_char_df['word'].apply(potential_author_match_epithet_coord),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word
1734317,624,"J.d,IlLIlU."
1753028,633,Phoenicia
1753527,633,Syriacus
1755122,634,Jilicaulis


In [222]:
# Volume 2: potential epithet matches whose word looks like author text.
vol2_char_df.loc[
    vol2_char_df['potential_epithet_match'].eq(True)
    & vol2_char_df['word'].apply(potential_author_match_epithet_coord),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word
1915513,706,Hbanoticus
1922530,710,Hppii
1937158,716,Ma


In [223]:
# Volume 3: potential epithet matches whose word looks like author text.
vol3_char_df.loc[
    vol3_char_df['potential_epithet_match'].eq(True)
    & vol3_char_df['word'].apply(potential_author_match_epithet_coord),
    ["page_num", "word"],
].drop_duplicates()

Unnamed: 0,page_num,word
1554632,555,Krascheninnikovii
1565497,560,Wagenitz
1566524,560,Fritsch
1575185,564,Holub
1575488,565,Holub
1577207,565,et
1578956,566,Eichwaldii
1579044,566,Schrank
1582101,568,Kuntze
1583001,568,Kuntze


Flag a line if it contains two genera, or one or more genera together with one or more epithets.