In [1]:
import fitz
import numpy as np
import pandas as pd
from tqdm import tqdm

import io
from PIL import Image, ImageDraw, ImageFont, ImageColor

import math
import re

### IMPORTING BOOKS

In [5]:
vol1_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf'
vol2_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf'
vol3_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf'

vol1_doc = fitz.open(vol1_path)
vol2_doc = fitz.open(vol2_path)
vol3_doc = fitz.open(vol3_path)

vol1_pages = [vol1_doc[i] for i in range(vol1_doc.page_count)]
vol2_pages = [vol2_doc[i] for i in range(vol2_doc.page_count)]
vol3_pages = [vol3_doc[i] for i in range(vol3_doc.page_count)]

In [6]:
vol1_char_df = pd.read_pickle("../input/char_df/vol1_df.pkl")
vol2_char_df = pd.read_pickle("../input/char_df/vol2_df.pkl")
vol3_char_df = pd.read_pickle("../input/char_df/vol3_df.pkl")

vol1_index = list(range(616, 639)) #inclusive
vol2_index = list(range(703, 725))
vol3_index = list(range(555, 583))

#### Setting Global parameters

In [8]:
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

### Finding strict matching genera, epithet, and column numbers

In [28]:
def genus_match(row):
    word_rspace_removed = row['word'].rstrip()
    return row['word_num'] == 0 and \
           word_rspace_removed.isalpha() and \
           word_rspace_removed[0].isupper() and word_rspace_removed[1:].islower()
           
def epithet_match(row):
    word_rspace_removed = row['word'].rstrip()
    return row['word_num'] == 0 and \
           word_rspace_removed.isalpha() and \
           word_rspace_removed.islower()

In [29]:
#rightmost point of any bounding box:
def get_center_x0(vol_char_df, page_num, bias = 30):
    """WARNING: Bias = 30 large bias causes miscatagorization in page number in book"""
    df = vol_char_df[vol_char_df['page_num'] == page_num]
    
    right_bound = df['line_bbox'].apply(lambda x : x[2]).max() 
    #leftmost point of any bounding box:
    left_bound = df['line_bbox'].apply(lambda x : x[0]).min()

    return 0.5*(right_bound + left_bound) - bias


def get_col_num(coords, center_x0):
    x0, y0, x1, y1 = coords
    return int(x0 >= center_x0)

In [34]:
all_vol_data_col_num = [(vol1_char_df, vol1_index, vol1_doc),
                        (vol2_char_df, vol2_index, vol2_doc),
                        (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, doc in all_vol_data_col_num: 
    #for each volume check if genus pattern / epithet pattern exists within the index part of the book
    vol_char_df['genus_index_pat_match'] = (vol_char_df['page_num'].isin(vol_index)) & (vol_char_df.apply(genus_match, axis = 1))
    vol_char_df['epithet_index_pat_match'] = (vol_char_df['page_num'].isin(vol_index)) & (vol_char_df.apply(epithet_match, axis = 1))
    
    for page_num in tqdm(vol_index):
        center_x0 = get_center_x0(vol_char_df, page_num)
        #find center based on x0 coordinate of each line
        vol_char_df['col_num'] = vol_char_df['line_bbox'].apply(lambda coords : get_col_num(coords, center_x0)) 

100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
100%|██████████| 22/22 [00:14<00:00,  1.51it/s]
100%|██████████| 28/28 [00:14<00:00,  1.89it/s]


### Genus / epithet flagging 
flagging pages where number of strict genus or epithet patern matches is less than 3 per column

In [35]:
all_vol_data_flagg_strict_match = [(vol1_char_df, vol1_index, vol1_doc, "strickt_match_vol1"),
                                   (vol2_char_df, vol2_index, vol2_doc, "strickt_match_vol2"),
                                   (vol3_char_df, vol3_index, vol3_doc, "strickt_match_vol3")]

for vol_char_df, vol_index, vol_doc, output_name in all_vol_data_flagg_strict_match: 
    #for each volume 
    image_list = []
    genus_flag_list = []
    epithet_flag_list = []
    for page_num in tqdm(vol_index):
        pix_map = vol_doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)

        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num)
                                & (vol_char_df['genus_index_pat_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus pattern match flag should check with half page and not entire page:
        for col in range(2):
            num_genus_col = genus_db[genus_db["col_num"] == col].shape[0]
            num_epithet_col = epithet_db[epithet_db["col_num"] == col].shape[0]
            if num_genus_col <= 2:
                genus_flag_list.append((num_genus_col, page_num - vol_index[0] + 1, col))
            if num_epithet_col <= 2:
                epithet_flag_list.append((num_epithet_col, page_num - vol_index[0] + 1, col))

        for coord in genus_db['word_bbox']:
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        for coord in epithet_db['word_bbox']:
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=5)

        image_list.append(image)

    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])    
    
    num_flag_pages = len(set([g[1] for g in genus_flag_list] + [e[1] for e in epithet_flag_list]))
    if num_flag_pages > 0: 
        print("***FLAGS***")
        print(f" number of pages to check: {num_flag_pages}")
        if genus_flag_list:
            print("  genera")
            [print(f"\t number of genera: {g_flag[0]}, page number: {g_flag[1]}, column number: {g_flag[2]}") for g_flag in genus_flag_list]
        if epithet_flag_list:
            print("  epithets")
            [print(f"\t number of epithets: {e_flag[0]}, page number: {e_flag[1]}, column number: {e_flag[2]}") for e_flag in epithet_flag_list]

100%|██████████| 23/23 [00:04<00:00,  5.28it/s]


***FLAGS***
 number of pages to check: 3
  genera
	 number of genera: 1, page number: 2, column number: 0
	 number of genera: 0, page number: 20, column number: 1
	 number of genera: 1, page number: 23, column number: 0
  epithets
	 number of epithets: 2, page number: 23, column number: 1


100%|██████████| 22/22 [00:04<00:00,  5.40it/s]


***FLAGS***
 number of pages to check: 2
  genera
	 number of genera: 2, page number: 4, column number: 0
	 number of genera: 1, page number: 4, column number: 1
	 number of genera: 0, page number: 5, column number: 0


100%|██████████| 28/28 [00:04<00:00,  5.74it/s]


***FLAGS***
 number of pages to check: 7
  genera
	 number of genera: 1, page number: 2, column number: 1
	 number of genera: 1, page number: 6, column number: 0
	 number of genera: 1, page number: 21, column number: 0
	 number of genera: 1, page number: 22, column number: 0
	 number of genera: 2, page number: 24, column number: 1
	 number of genera: 0, page number: 26, column number: 1
	 number of genera: 2, page number: 28, column number: 0


Based on flags need to make sure: 
- first find epithet coord match 
- then find genus coord match s.t. word is not in epithet coord match

### match based on coordinates

In [37]:
def is_coord_match(x, x_ref_left, x_ref_right, margin):
    return (x_ref_left - margin <= x[0] and x[0] <= x_ref_left + margin) or (x_ref_right - margin <= x[0] and x[0] <= x_ref_right + margin)

#### epithets

In [38]:
all_vol_data_coord_match = [(vol1_char_df, vol1_index),
                            (vol2_char_df, vol2_index),
                            (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data_coord_match: 
    vol_char_df["epithet_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["epithet_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        epithet_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["epithet_index_pat_match"] == True)]
        epithet_df = epithet_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_epithet_2dic = [{}, {}]
        
        for i in range(epithet_df.shape[0]):
            e_index = str(page_num) + "_" + str(i)
            p0 = epithet_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = epithet_df['col_num'].iloc[i]

            ref_neighbors_df = epithet_df[(epithet_df["page_num"] == page_num) & 
                                          (epithet_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]
            
            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            page_epithet_2dic[col][e_index] = (num_neighbors, mean_neighbors)
        
        mean_left_epithet = max(page_epithet_2dic[0].values(), default = [-1, -1])[1]
        mean_right_epithet = max(page_epithet_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_epithet == -1 or mean_right_epithet == -1:
            mean_valid_col = max(mean_left_epithet, mean_right_epithet)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_epithet == -1 and mean_right_epithet == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet, mean_right_epithet, margin))

100%|██████████| 23/23 [00:01<00:00, 21.53it/s]
100%|██████████| 22/22 [00:01<00:00, 20.30it/s]
100%|██████████| 28/28 [00:01<00:00, 23.50it/s]


In [39]:
all_vol_data_epithet_coord_match_test = [(vol1_char_df, vol1_index, vol1_doc, "epithet_coord_match_pruned_vol1"),
                                         (vol2_char_df, vol2_index, vol2_doc, "epithet_coord_match_pruned_vol2"),
                                         (vol3_char_df, vol3_index, vol3_doc, "epithet_coord_match_pruned_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data_epithet_coord_match_test: 
    #for each volume 
    image_list = []
    
    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        epithet_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['epithet_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #epithet Coord is orange-pinkish, 5
        for coord in epithet_coord_db["pruned_word_bbox"] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        #epithet is blue, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.48it/s]
100%|██████████| 22/22 [00:03<00:00,  5.53it/s]
100%|██████████| 28/28 [00:04<00:00,  5.78it/s]


In [40]:
# Reminder:
# all_vol_data_coord_match = [(vol1_char_df, vol1_index),
#                             (vol2_char_df, vol2_index),
#                             (vol3_char_df, vol3_index)]
# DOES NOT CHECK IF COORD IS SAME AS EPITHET UNTIL NEXT SECTION!

for vol_char_df, vol_index in all_vol_data_coord_match: 
    #genus and not epithet
    vol_char_df["genus_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["genus_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        genus_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) &
                                    (vol_char_df["genus_index_pat_match"] == True)]
        genus_df = genus_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_genus_2dic = [{}, {}]
        
        epithet_left_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                              (vol_char_df["page_num"] == page_num) &
                                              (vol_char_df["col_num"] == 0)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_right_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                               (vol_char_df["page_num"] == page_num) &
                                               (vol_char_df["col_num"] == 1)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_coord_mean_list = [epithet_left_coord_mean, epithet_right_coord_mean]

        for i in range(genus_df.shape[0]):
            g_index = str(page_num) + "_" + str(i)
            p0 = genus_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = genus_df['col_num'].iloc[i]

            ref_neighbors_df = genus_df[(genus_df["page_num"] == page_num) & 
                                        (genus_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]

            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            if mean_neighbors > epithet_coord_mean_list[col]: 
                mean_neighbors = -1
            page_genus_2dic[col][g_index] = (num_neighbors, mean_neighbors)
        
        mean_left_genus = max(page_genus_2dic[0].values(), default = [-1, -1])[1]
        mean_right_genus = max(page_genus_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_genus == -1 or mean_right_genus == -1:
            mean_valid_col = max(mean_left_genus, mean_right_genus)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_genus == -1 and mean_right_genus == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_genus, mean_right_genus, margin))

100%|██████████| 23/23 [00:00<00:00, 26.22it/s]
100%|██████████| 22/22 [00:00<00:00, 27.81it/s]
100%|██████████| 28/28 [00:00<00:00, 30.38it/s]


In [41]:
all_vol_data_genus_coord_match_test = [(vol1_char_df, vol1_index, vol1_doc, "genus_coord_match_vol1"),
                                       (vol2_char_df, vol2_index, vol2_doc, "genus_coord_match_vol2"),
                                       (vol3_char_df, vol3_index, vol3_doc, "genus_coord_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data_genus_coord_match_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        genus_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['genus_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_coord_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.35it/s]
100%|██████████| 22/22 [00:03<00:00,  5.55it/s]
100%|██████████| 28/28 [00:05<00:00,  5.57it/s]


#### improving the coord matches 
takes genus coming before epithet into account now

In [56]:
def potential_genus_match(row):
    word_rspace_removed = row['word'].rstrip()
    return row['genus_coord_match'] == True and \
           row['epithet_coord_match'] == False and \
           word_rspace_removed.find("Flore") == -1 and \
           ((word_rspace_removed.isupper() == False and \
             word_rspace_removed.isnumeric() == False) or \
            ((word_rspace_removed == 'X') or (word_rspace_removed =='×')))
           # removing this for-    hg now ... and row['genus_mean_coord'] < row['epithet_mean_coord'] #important to check this only when epithet_coord_match is false?

def potential_epithet_match(row):
    word_rspace_removed = row['word'].rstrip()
    return row['epithet_coord_match'] == True and \
           ((word_rspace_removed.isupper() == False and \
             word_rspace_removed.isnumeric() == False) or \
            (word_rspace_removed == 'X') or (word_rspace_removed =='×'))

In [48]:
vol1_char_df['potential_genus_match'] = vol1_char_df.apply(potential_genus_match, axis = 1)
vol1_char_df['potential_epithet_match'] = vol1_char_df.apply(potential_epithet_match, axis = 1)

vol2_char_df['potential_genus_match'] = vol2_char_df.apply(potential_genus_match, axis = 1)
vol2_char_df['potential_epithet_match'] = vol2_char_df.apply(potential_epithet_match, axis = 1)

vol3_char_df['potential_genus_match'] = vol3_char_df.apply(potential_genus_match, axis = 1)
vol3_char_df['potential_epithet_match'] = vol3_char_df.apply(potential_epithet_match, axis = 1)

In [49]:
all_vol_data_GE_potential_match_test = [(vol1_char_df, vol1_index, vol1_doc, "GE_potential_match_vol1"),
                                        (vol2_char_df, vol2_index, vol2_doc, "GE_potential_match_vol2"),
                                        (vol3_char_df, vol3_index, vol3_doc, "GE_potential_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data_GE_potential_match_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['potential_genus_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_epithet_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.48it/s]
100%|██████████| 22/22 [00:04<00:00,  5.47it/s]
100%|██████████| 28/28 [00:04<00:00,  5.64it/s]


### SOME HARDCODING PARTS:

In [77]:
vol1_char_df[vol1_char_df['word'].str.contains('d,IlLIlU')]

Unnamed: 0,vol_num,page_num,block_num,block_num_absolute,block_bbox,line_num,line_wmode,line_dir,line_bbox,span_num,...,char,char_origin,char_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,epithet_coord_match,genus_coord_match,potential_genus_match,potential_epithet_match


based on this image output in volumen 1:
 ![Erianthus](Erianthus.png)



In [78]:
weird_old_char_vol1 = vol1_char_df.loc[1734312:1734328]
weird_old_char_vol1['word']

1734312        Bieb.
1734313    Erianthus
1734314    Erianthus
1734315    Erianthus
1734316    Erianthus
1734317    Erianthus
1734318    Erianthus
1734319    Erianthus
1734320    Erianthus
1734321    Erianthus
1734322    Erianthus
1734323    Erianthus
1734324    Erianthus
1734325    Erianthus
1734326    Erianthus
1734327    Erianthus
1734328       hostii
Name: word, dtype: object

In [79]:
weird_old_char_vol1['word_num']

1734312    2
1734313    0
1734314    0
1734315    0
1734316    0
1734317    0
1734318    0
1734319    0
1734320    0
1734321    0
1734322    0
1734323    0
1734324    0
1734325    0
1734326    0
1734327    0
1734328    0
Name: word_num, dtype: int64

In [80]:
#manually fixing the OCR error for J_JI J.d,IlLIlU. hostii Griseb.
vol1_char_df.loc[1734313:1734327, 'word'] = 'Erianthus'
vol1_char_df.loc[1734313:1734327, 'word_num'] = 0
vol1_char_df.loc[1734313:1734327, 'pruned_word'] = 'Erianthus'
temp_word_x0 = vol1_char_df.loc[1734313:1734327, 'word_bbox'].apply(lambda x : x[0]).min()
temp_word_y0 = vol1_char_df.loc[1734313:1734327, 'word_bbox'].apply(lambda x : x[1]).min()
temp_word_x1 = vol1_char_df.loc[1734313:1734327, 'word_bbox'].apply(lambda x : x[2]).max()
temp_word_y1 = vol1_char_df.loc[1734313:1734327, 'word_bbox'].apply(lambda x : x[3]).max()
vol1_char_df.loc[1734313:1734327, 'word_bbox'] =vol1_char_df.loc[1734313:1734327, 'word_bbox'].apply(lambda x : (temp_word_x0, temp_word_y0, temp_word_x1, temp_word_y1))

vol1_char_df.loc[1734313:1734327, 'potential_epithet_match'] = False
vol1_char_df.loc[1734313:1734327, 'potential_genus_match'] = True

### Infra species

In [82]:
# Reminder:
# all_vol_data_coord_match = [(vol1_char_df, vol1_index),
#                             (vol2_char_df, vol2_index),
#                             (vol3_char_df, vol3_index)]
for vol_char_df, vol_index in all_vol_data_coord_match: 
    vol_char_df["infra_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):

        margin = 1.25 * vol_char_df[(vol_char_df["potential_epithet_match"] == True) | (vol_char_df["potential_genus_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        
        mean_left_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_left_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_left_genus):
            mean_left_genus_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_epithet_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_tab = mean_left_epithet_all - mean_left_genus_all
        else: 
            mean_left_tab = mean_left_epithet - mean_left_genus
        
        mean_right_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_right_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_right_genus):
            mean_right_genus_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_epithet_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_tab = mean_right_epithet_all - mean_right_genus_all
        else: 
            mean_right_tab = mean_right_epithet - mean_right_genus


        vol_char_df.loc[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)  , "infra_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)]["word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet + mean_left_tab, mean_right_epithet + mean_right_tab, margin))

100%|██████████| 23/23 [00:01<00:00, 14.00it/s]
100%|██████████| 22/22 [00:01<00:00, 13.49it/s]
100%|██████████| 28/28 [00:01<00:00, 15.16it/s]


In [92]:
# Takes longer but makes more sense generally. We will skip it here
# def potential_author_match_infra_coord(row):
#     word = row['word']
#     pruned_word = row['pruned_word']
#     lower_word = word.lower()
#     latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$"
#     infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
#     is_latin_connectives = re.search(latin_connectives, word) != None
#     is_infra_symbol = re.search(infra_symbols, lower_word) != None
#     if pruned_word:
#         is_upper_first = pruned_word[0].isupper()
#     else:
#         is_upper_first = False
#     return (not is_infra_symbol) and (is_upper_first or is_latin_connectives)

def potential_author_match_infra_coord(word):
    lower_word = word.lower()
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$"
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    is_latin_connectives = re.search(latin_connectives, word) != None
    is_infra_symbol = re.search(infra_symbols, lower_word) != None
    return (not is_infra_symbol) and (word[0].isupper() or is_latin_connectives)

In [94]:
def has_infra_symbols(word):
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    return re.search(infra_symbols, word) != None

In [99]:
# Reminder:
# all_vol_data_coord_match = [(vol1_char_df, vol1_index),
#                             (vol2_char_df, vol2_index),
#                             (vol3_char_df, vol3_index)]
for vol_char_df, _ in all_vol_data_coord_match:
    vol_char_df["potential_infra_match"] = (vol_char_df['word'].apply(has_infra_symbols)) | \
                                           ((vol_char_df["infra_coord_match"] == True) & (vol_char_df['word'].apply(potential_author_match_infra_coord) == False))

In [100]:
all_vol_dat_infra_match_test = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
                                (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
                                (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_dat_infra_match_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        infra_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['infra_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_infra_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
                                         (vol_char_df['infra_coord_match'] == True) & 
                                         (vol_char_df['word'].apply(has_infra_symbols) == True)
                                        ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                        ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in infra_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-5, y0-5, x1+5, y1+5), fill=None, outline=ImageColor.getrgb("#003399"), width=7)

        for coord in infra_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in with_infra_symbols['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [01:36<00:00,  3.43s/it]
100%|██████████| 22/22 [01:30<00:00,  4.13s/it]
100%|██████████| 23/23 [01:26<00:00,  3.76s/it]


### page num processings

In [101]:
vol1_char_df['index_page_num'] = vol1_char_df['page_num'] - vol1_index[0] + 1
vol2_char_df['index_page_num'] = vol2_char_df['page_num'] - vol2_index[0] + 1
vol3_char_df['index_page_num'] = vol3_char_df['page_num'] - vol3_index[0] + 1

In [102]:
# all_vol_data_col_num = [(vol1_char_df, vol1_index, vol1_doc),
#                         (vol2_char_df, vol2_index, vol2_doc),
#                         (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, vol_doc in all_vol_data_col_num: 
    #for each volume check if genus pattern / epithet pattern exists within the index part of the book
    for page_num in tqdm(vol_index):
        center_x0 = get_center_x0(vol_char_df, page_num, - 30)
        #find center based on x0 coordinate of each line
        vol_char_df['col_num_for_PN'] = vol_char_df['line_bbox'].apply(lambda coords : get_col_num(coords, center_x0)) 

100%|██████████| 23/23 [00:56<00:00,  2.48s/it]
100%|██████████| 22/22 [00:59<00:00,  2.72s/it]
100%|██████████| 28/28 [01:02<00:00,  2.24s/it]


In [103]:
def is_page_num(row):
    return row['pruned_word'].isnumeric()

In [104]:
# Reminder:
# all_vol_data_coord_match = [(vol1_char_df, vol1_index),
#                             (vol2_char_df, vol2_index),
#                             (vol3_char_df, vol3_index)]
for vol_char_df, vol_index in all_vol_data_coord_match: 
    vol_char_df['page_num_index_pat_match'] = (vol_char_df['page_num'].isin(vol_index)) & (vol_char_df.apply(is_page_num, axis = 1))
    vol_char_df["page_num_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["page_num_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        page_num_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["page_num_index_pat_match"] == True)]
        page_num_df = page_num_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_page_num_2dic = [{}, {}]
        
        for i in range(page_num_df.shape[0]):
            e_index = str(page_num) + "_" + str(i)
            p0 = page_num_df['word_bbox'].iloc[i]
            x_ref = p0[2]
            col = page_num_df['col_num_for_PN'].iloc[i]

            ref_neighbors_df = page_num_df[(page_num_df["page_num"] == page_num) & 
                                           (page_num_df["word_bbox"].apply(lambda x : x_ref - margin <= x[2] and x[2] <= x_ref + margin))]
            
            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[2]).mean()
            page_page_num_2dic[col][e_index] = (num_neighbors, mean_neighbors)
        
        mean_left_page_num = max(page_page_num_2dic[0].values(), default = [-1, -1])[1]
        mean_right_page_num = max(page_page_num_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_page_num == -1 or mean_right_page_num == -1:
            mean_valid_col = max(mean_left_page_num, mean_right_page_num)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match([x[2]], mean_valid_col, mean_valid_col, margin))
        elif mean_left_page_num == -1 and mean_right_page_num == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match([x[2]], mean_left_page_num, mean_right_page_num, margin))


100%|██████████| 23/23 [00:04<00:00,  5.52it/s]
100%|██████████| 22/22 [00:04<00:00,  5.02it/s]
100%|██████████| 28/28 [00:03<00:00,  7.58it/s]


In [105]:
all_vol_data_PN_test = [(vol1_char_df, vol1_index, vol1_doc, "potential_page_num_match_vol1"),
                        (vol2_char_df, vol2_index, vol2_doc, "potential_page_num_match_vol2"),
                        (vol3_char_df, vol3_index, vol3_doc, "potential_page_num_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_data_PN_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        page_num_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['page_num_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        # infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
        #                         & (vol_char_df['potential_infra_match'] == True)
        #                         ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
        #                         ].drop_duplicates()

        # with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
        #                                  (vol_char_df['infra_coord_match'] == True) & 
        #                                  (vol_char_df['word'].apply(has_infra_symbols) == True)
        #                                 ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
        #                                 ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in page_num_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)

        # for coord in infra_db['word_bbox'] :
        #     x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
        #     draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # # #epithet is red, 3
        # for coord in with_infra_symbols['word_bbox'] :
        #     x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
        #     draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [00:04<00:00,  6.01it/s]
100%|██████████| 22/22 [00:03<00:00,  5.83it/s]
100%|██████████| 23/23 [00:04<00:00,  5.70it/s]


### pruning char_df and getting index_df

In [107]:
[c for c in vol1_char_df.columns if c.startswith('potential')]

['potential_genus_match', 'potential_epithet_match', 'potential_infra_match']

In [333]:
#making sure page_num is in index
#making sure the genus level word is not all uppercase (a family name)
#making sure the pruned_word is not numeric (removing page_number as it's not in order usually) and removing page_num_coord_match

all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

result = [] 
ignore_word_list = ["NOUVELLE", "Flore", "FLORE", "INDEX", ""]
for vol_char_df, vol_index in all_vol_data:
    curr_result_df = vol_char_df[(vol_char_df['page_num'].isin(vol_index)) &
                                (~((vol_char_df["word"].str.isupper()) & (vol_char_df["word"].apply(lambda x : len(x) > 2)) & (vol_char_df['genus_coord_match'] == True))) & 
                                (~(vol_char_df["pruned_word"].isin(ignore_word_list))) &
                                (~(vol_char_df["pruned_word"].str.isnumeric() & (vol_char_df["word"] != "(3"))) & 
                                (~(vol_char_df["page_num_coord_match"] == True))
                                ].copy()
    result.append(curr_result_df)

vol1_index_df, vol2_index_df, vol3_index_df = result[0], result[1], result[2]

In [289]:
all_vol_data_PN_test = [(vol1_index_df, vol1_index, vol1_doc, "valid_words_vol1"),
                        (vol2_index_df, vol2_index, vol2_doc, "valid_words_vol2"),
                        (vol3_index_df, vol3_index, vol3_doc, "valid_words_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data_PN_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        temp_coords = vol_char_df[vol_char_df['page_num'] == page_num]['word_bbox'].drop_duplicates()
        for coord in temp_coords:
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:03<00:00,  6.01it/s]
100%|██████████| 22/22 [00:03<00:00,  6.13it/s]
100%|██████████| 28/28 [00:04<00:00,  6.29it/s]


In [334]:
#only keeping word level
vol_index_df_list = [vol1_index_df, vol2_index_df, vol3_index_df]
result_df = []
for vol_index_df in vol_index_df_list:
    keep_cols = vol_index_df.columns.difference(["char_num", "char", "char_origin", "char_bbox"], sort=False).tolist()

    vol_index_df = vol_index_df.copy().loc[:,keep_cols].drop_duplicates().reset_index()
    vol_index_df.rename(columns={"index": "char_index"}, inplace = True)
    result_df.append(vol_index_df)

vol1_index_df, vol2_index_df, vol3_index_df = result_df[0], result_df[1], result_df[2]


In [335]:
def has_hybrid_symbols(word):
    infra_symbols = r"^X[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    return re.search(infra_symbols, word) != None

In [338]:
result_df_hybrids = []
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['is_hybrid'] = np.NaN
    vol_index_df.loc[(vol_index_df['potential_infra_match'] == True) | (vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True), 'is_hybrid'] = (vol_index_df['word'].apply(has_hybrid_symbols) == True) & ((vol_index_df['potential_infra_match'] == True) | (vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True))
    
    hybrid_genera_indecies = vol_index_df[(vol_index_df['potential_genus_match'] == True) & (vol_index_df['word'].apply(has_hybrid_symbols) == True)].index + 1
    hybrid_epithet_indecies = vol_index_df[(vol_index_df['potential_epithet_match'] == True) & (vol_index_df['word'].apply(has_hybrid_symbols) == True)].index + 1
    
    vol_index_df.loc[hybrid_epithet_indecies, 'is_hybrid'] = True
    vol_index_df.loc[hybrid_epithet_indecies, 'potential_epithet_match'] = True 

    vol_index_df.loc[hybrid_genera_indecies, 'is_hybrid'] = True
    vol_index_df.loc[hybrid_genera_indecies, 'potential_genus_match'] = True

    drop_list = list(hybrid_epithet_indecies - 1) + list(hybrid_genera_indecies -1)
    
    vol_index_df = vol_index_df[~vol_index_df.index.isin(drop_list)]
    vol_index_df['is_hybrid'].ffill(inplace=True)

    result_df_hybrids.append(vol_index_df)

vol1_index_df, vol2_index_df, vol3_index_df = result_df_hybrids[0], result_df_hybrids[1], result_df_hybrids[2]

In [339]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_genus_names(row):
    if row['potential_genus_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan
        
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_genus'] = vol_index_df.apply(extract_potential_genus_names, axis = 1)
    vol_index_df['closest_genus'].ffill(inplace=True)

In [340]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_epithet_names(row):
    if row['potential_epithet_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_epithet'] = vol_index_df.apply(extract_potential_epithet_names, axis = 1)
    vol_index_df.loc[vol_index_df['potential_genus_match'] == True, 'closest_epithet'] = -1
    vol_index_df['closest_epithet'].ffill(inplace=True)

In [341]:
def extract_potential_infra_type(row):
    if row['potential_infra_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df.loc[(vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True), 'closest_infra_type'] = -1
    vol_index_df['closest_infra_type'] = vol_index_df.apply(extract_potential_infra_type, axis = 1)
    vol_index_df.loc[(vol_index_df['potential_infra_match'] == False) & ((vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True)), 'closest_infra_type'] = -1
    vol_index_df['closest_infra_type'].ffill(inplace=True)

In [342]:
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    infra_name_match_indecies = vol_index_df[vol_index_df['potential_infra_match'] == True].index + 1
    vol_index_df['closest_infra_name'] = np.NaN
    vol_index_df.loc[infra_name_match_indecies, 'closest_infra_name'] = vol_index_df.apply(lambda row : row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num']) , axis = 1)
    vol_index_df['potential_infra_name_match'] = vol_index_df.index.isin(infra_name_match_indecies)
    vol_index_df.loc[(vol_index_df['potential_infra_match'] == True) | (vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True), 'closest_infra_name'] = -1
    vol_index_df['closest_infra_name'].ffill(inplace=True)

In [343]:
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['potential_author_match'] = (vol_index_df['potential_genus_match'] == False) & \
                                             (vol_index_df['potential_epithet_match'] == False) & \
                                             (vol_index_df['potential_infra_match'] == False) & \
                                             (vol_index_df['potential_infra_name_match'] == False)

In [345]:
vol3_index_df.iloc[100:150, 20:]

Unnamed: 0,word,word_bbox,pruned_word,pruned_word_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,epithet_coord_match,genus_coord_match,potential_genus_match,...,col_num_for_PN,page_num_index_pat_match,page_num_coord_match,is_hybrid,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_infra_name_match,potential_author_match
100,micrantha,"(26.639999389648438, 548.4010009765625, 63.269...",micrantha,"(26.639999389648438, 548.4010009765625, 63.269...",False,True,0,True,False,False,...,0,False,False,False,Achillea_555_17_0,micrantha_555_25_0,-1,-1,False,False
101,Willd.,"(65.68165588378906, 547.4830322265625, 89.5377...",Willd,"(65.68165588378906, 547.4830322265625, 87.0438...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,micrantha_555_25_0,-1,-1,False,True
102,non,"(113.14447021484375, 547.4830322265625, 127.13...",non,"(113.14447021484375, 547.4830322265625, 127.13...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,micrantha_555_25_0,-1,-1,False,True
103,Willd.,"(129.73193359375, 547.4830322265625, 153.38996...",Willd,"(129.73193359375, 547.4830322265625, 150.93576...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,micrantha_555_25_0,-1,-1,False,True
104,odorata,"(26.639999389648438, 558.0009765625, 54.857971...",odorata,"(26.639999389648438, 558.0009765625, 54.857971...",False,True,0,True,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,-1,-1,False,False
105,W.,"(58.12398147583008, 557.0830078125, 69.0228958...",W,"(58.12398147583008, 557.0830078125, 66.5350189...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,-1,-1,False,True
106,Koch,"(72.80469512939453, 557.0830078125, 93.1729583...",Koch,"(72.80469512939453, 557.0830078125, 93.1729583...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,-1,-1,False,True
107,subsp.,"(35.7599983215332, 566.6829833984375, 59.72493...",subsp,"(35.7599983215332, 566.6829833984375, 57.20934...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,subsp._555_26_0,-1,False,False
108,kotschyi,"(62.08212661743164, 567.6009521484375, 91.2802...",kotschyi,"(62.08212661743164, 567.6009521484375, 91.2802...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,subsp._555_26_0,kotschyi_555_26_0,True,False
109,(Boiss.),"(93.81856536865234, 566.6829833984375, 120.712...",Boiss,"(96.63214874267578, 566.6829833984375, 115.825...",False,False,0,False,False,False,...,0,False,False,False,Achillea_555_17_0,odorata_555_25_1,subsp._555_26_0,kotschyi_555_26_0,False,True


In [346]:
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df.replace(-1, np.NaN, inplace = True)
    vol_index_df.replace(np.NaN, "",inplace = True)

In [347]:
#author grouping 
# 
author_grouping = ['closest_genus', 'closest_epithet', 'closest_infra_name']
merge_on = ['closest_genus', 'closest_epithet', 'closest_infra_name']
def concatenate(group):
    return group.loc[group['potential_author_match'] == True, 'word'].str.cat(sep=' ')

result_df_authors = [] 
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]: 
    #author_grouping = ['closest_genus', 'closest_epithet']
    #merge_on = ['closest_genus', 'closest_epithet']
    groups = vol_index_df.groupby(author_grouping)
    concatenated = groups.apply(concatenate).reset_index()

    # add the concatenated values to the original dataframe
    result = vol_index_df.merge(concatenated[merge_on + [0]], on=merge_on, how='left').rename(columns={0: 'authors'})
    result_df_authors.append(result)
    
vol1_index_df, vol2_index_df, vol3_index_df = result_df_authors[0], result_df_authors[1], result_df_authors[2]


In [348]:
# for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
#     #vol_index_df.replace("", np.NaN,inplace = True)
#     vol_index_df.replace(np.NaN, "",inplace = True)

In [349]:
all_vol_data_cat_test = [(vol1_index_df, vol1_index, vol1_doc, "catagorized_vol1"),
                         (vol2_index_df, vol2_index, vol2_doc, "catagorized_vol2"),
                         (vol3_index_df, vol3_index, vol3_doc, "catagorized_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data_cat_test: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)

        for col_num in [0, 1]:
            temp_df = vol_char_df[(vol_char_df['page_num'] == page_num) & (vol_char_df['col_num'] == col_num)]
            #genus Coord is orange-pinkish, 5
            for name, group in temp_df.groupby(['closest_genus'])['word_bbox']:
                x0 = (group.apply(lambda x : x[0]).min())*TARGET_DPI/ 72
                y0 = (group.apply(lambda x : x[1]).min())*TARGET_DPI/ 72
                x1 = (group.apply(lambda x : x[2]).max())*TARGET_DPI/ 72
                y1 = (group.apply(lambda x : x[3]).max())*TARGET_DPI/ 72
                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#6939a3"), width=3)

            for name, group in temp_df.groupby(['closest_epithet'])['word_bbox']:
                if name != '':
                    x0 = (group.apply(lambda x : x[0]).min())*TARGET_DPI/ 72
                    y0 = (group.apply(lambda x : x[1]).min())*TARGET_DPI/ 72
                    x1 = (group.apply(lambda x : x[2]).max())*TARGET_DPI/ 72
                    y1 = (group.apply(lambda x : x[3]).max())*TARGET_DPI/ 72
                    draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)

            for name, group in temp_df.groupby(['closest_infra_name'])['word_bbox']:
                if name != '':
                    x0 = (group.apply(lambda x : x[0]).min())*TARGET_DPI/ 72
                    y0 = (group.apply(lambda x : x[1]).min())*TARGET_DPI/ 72
                    x1 = (group.apply(lambda x : x[2]).max())*TARGET_DPI/ 72
                    y1 = (group.apply(lambda x : x[3]).max())*TARGET_DPI/ 72
                    draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#8c690b"), width=3)

            temp_df_author_only = temp_df[temp_df['potential_author_match'] == True]
            for name, group in temp_df_author_only.groupby(['closest_genus', 'closest_epithet', 'closest_infra_name'])['word_bbox']:
                x0 = (group.apply(lambda x : x[0]).min())*TARGET_DPI/ 72
                y0 = (group.apply(lambda x : x[1]).min())*TARGET_DPI/ 72
                x1 = (group.apply(lambda x : x[2]).max())*TARGET_DPI/ 72
                y1 = (group.apply(lambda x : x[3]).max())*TARGET_DPI/ 72

                draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#9e9e9e"), width=3)


        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:05<00:00,  4.58it/s]
100%|██████████| 22/22 [00:04<00:00,  4.66it/s]
100%|██████████| 28/28 [00:06<00:00,  4.63it/s]


In [350]:
def fix_words(word):
    head, sep, tail = word.partition('_')
    return head 

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_genus'] = vol_index_df['closest_genus'].apply(fix_words)
    vol_index_df['closest_epithet'] = vol_index_df['closest_epithet'].apply(fix_words)
    vol_index_df['closest_infra_type'] = vol_index_df['closest_infra_type'].apply(fix_words)
    vol_index_df['closest_infra_name'] = vol_index_df['closest_infra_name'].apply(fix_words)

In [351]:
result_prune_authors_list = []
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    result_prune_authors = vol_index_df[(vol_index_df['potential_genus_match'] == True) |
                                        (vol_index_df['potential_epithet_match'] == True) |
                                        (vol_index_df['potential_infra_name_match'] == True)]
    result_prune_authors_list.append(result_prune_authors)

prune_authors_vol1, prune_authors_vol2, prune_authors_vol3 =  result_prune_authors_list[0], result_prune_authors_list[1], result_prune_authors_list[2]

In [352]:
simplified_vol1 = prune_authors_vol1[['closest_genus',
                                      'closest_epithet',
                                      'closest_infra_type',
                                      'closest_infra_name',
                                      'authors']]
simplified_vol1.to_csv('../output/local/index_output/vol1_index_output.csv')

simplified_vol2 = prune_authors_vol2[['closest_genus',
                                      'closest_epithet',
                                      'closest_infra_type',
                                      'closest_infra_name',
                                      'authors']]
simplified_vol2.to_csv('../output/local/index_output/vol2_index_output.csv')
                                
simplified_vol3 = prune_authors_vol3[['closest_genus',
                                      'closest_epithet',
                                      'closest_infra_type',
                                      'closest_infra_name',
                                      'authors']]
simplified_vol3.to_csv('../output/local/index_output/vol3_index_output.csv')

In [353]:
non_italics_simplified_vol1 = prune_authors_vol1.loc[(prune_authors_vol1['span_flags'] != 6),
                                                     ['closest_genus',
                                                      'closest_epithet',
                                                      'closest_infra_type',
                                                      'closest_infra_name',
                                                      'authors']]
non_italics_simplified_vol1.to_csv('../output/local/index_output/vol1_nonitalics.csv')

non_italics_simplified_vol2 = prune_authors_vol2.loc[(prune_authors_vol2['span_flags'] != 6),
                                                     ['closest_genus',
                                                      'closest_epithet',
                                                      'closest_infra_type',
                                                      'closest_infra_name',
                                                      'authors']]
non_italics_simplified_vol2.to_csv('../output/local/index_output/vol2_nonitalics.csv')

non_italics_simplified_vol3 = prune_authors_vol3.loc[(prune_authors_vol3['span_flags'] != 6),
                                                     ['closest_genus',
                                                      'closest_epithet',
                                                      'closest_infra_type',
                                                      'closest_infra_name',
                                                      'authors']]
non_italics_simplified_vol3.to_csv('../output/local/index_output/vol3_nonitalics.csv')