In [414]:
import fitz
import numpy as np
import pandas as pd
from tqdm import tqdm

import io
from PIL import Image, ImageDraw, ImageFont, ImageColor

import math
import re

In [415]:
vol1_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf'
vol2_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf'
vol3_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf'

vol1_doc = fitz.open(vol1_path)
vol2_doc = fitz.open(vol2_path)
vol3_doc = fitz.open(vol3_path)

vol1_pages = [vol1_doc[i] for i in range(vol1_doc.page_count)]
vol2_pages = [vol2_doc[i] for i in range(vol2_doc.page_count)]
vol3_pages = [vol3_doc[i] for i in range(vol3_doc.page_count)]

In [416]:
vol1_char_df = pd.read_pickle("../input/char_df/vol1_df.pkl")
vol2_char_df = pd.read_pickle("../input/char_df/vol2_df.pkl")
vol3_char_df = pd.read_pickle("../input/char_df/vol3_df.pkl")

vol1_index = list(range(616, 639)) #inclusive
vol2_index = list(range(703, 725))
vol3_index = list(range(555, 583))

In [417]:
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

### finding the columns 
### & checking if a word is a strict match for the genus / epithet pattern

In [418]:
def epithet_match(row):
    return row['word_num'] == 0 and \
           row['word'].isalpha() and \
           row['word'].islower()

def genus_match(row):
    return row['word_num'] == 0 and \
           row['word'].isalpha() and \
           row['word'][0].isupper() and row['word'][1:].islower()

In [419]:
#rightmost point of any bounding box:
def get_center_x0(vol_char_df, page_num, bias = 30):
    """WARNING: large bias causes miscatagorization in page number in book"""
    df = vol_char_df[vol_char_df['page_num'] == page_num]
    
    right_bound = df['line_bbox'].apply(lambda x : x[2]).max() 
    #leftmost point of any bounding box:
    left_bound = df['line_bbox'].apply(lambda x : x[0]).min()

    return 0.5*(right_bound + left_bound) - bias


def get_col_num(coords, center_x0):
    x0, y0, x1, y1 = coords
    return int(x0 >= center_x0)


all_vol_data = [(vol1_char_df, vol1_index, vol1_doc),
                (vol2_char_df, vol2_index, vol2_doc),
                (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, doc in all_vol_data: 
    #for each volume check if genus pattern / epithet pattern exists within the index part of the book
    vol_char_df['genus_index_pat_match'] = vol_char_df.apply(lambda r : r['page_num'] in vol_index and genus_match(r), axis = 1) #does this for whole books which is bad
    vol_char_df['epithet_index_pat_match'] = vol_char_df.apply(lambda r : r['page_num'] in vol_index and epithet_match(r), axis = 1) #does this for whole books which is bad
    
    for page_num in tqdm(vol_index):
        center_x0 = get_center_x0(vol_char_df, page_num)
        #find center based on x0 coordinate of each line
        vol_char_df['col_num'] = vol_char_df['line_bbox'].apply(lambda coords : get_col_num(coords, center_x0)) 

100%|██████████| 23/23 [00:13<00:00,  1.71it/s]
100%|██████████| 22/22 [00:15<00:00,  1.46it/s]
100%|██████████| 28/28 [00:15<00:00,  1.86it/s]


#### testing if col num correctly assigned

In [420]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "index_col_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "index_col_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "index_col_vol3")]

for vol_char_df, vol_index, vol_doc, output_name in all_vol_data:
    image_list = []
    keep_cols = vol_char_df.columns.difference(["char_num", "char", "char_origin", "char_bbox", "char_x0", "char_y0", "char_x1", "char_y1", "pruned_char_x0", "pruned_char_y0", "pruned_char_x1", "pruned_char_y1"], sort=False).tolist()
    for page_num in tqdm(vol_index):
        pix_map = vol_doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)

        temp_df = vol_char_df[vol_char_df["page_num"] == page_num].loc[:, keep_cols].drop_duplicates()

        for coord in temp_df[temp_df['col_num'] == 0]['line_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        for coord in temp_df[temp_df['col_num'] == 1]['line_bbox']:
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=5)
            
        image_list.append(image)
        #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])


100%|██████████| 23/23 [00:04<00:00,  4.85it/s]
100%|██████████| 22/22 [00:04<00:00,  4.88it/s]
100%|██████████| 28/28 [00:05<00:00,  5.04it/s]


### Genus / epithet flagging 
flagging pages where number of strict genus or epithet patern matches is less than 3 per column

In [421]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "strickt_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "strickt_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "strickt_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []
    genus_flag_list = []
    epithet_flag_list = []
    for page_num in tqdm(vol_index):
        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num)
                                & (vol_char_df['genus_index_pat_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus pattern match flag should check with half page and not entire page:
        for col in range(2):
            num_genus_col = genus_db[genus_db["col_num"] == col].shape[0]
            num_epithet_col = epithet_db[epithet_db["col_num"] == col].shape[0]
            if num_genus_col <= 2:
                genus_flag_list.append((num_genus_col, page_num - vol_index[0] + 1, col))
            if num_epithet_col <= 2:
                epithet_flag_list.append((num_epithet_col, page_num - vol_index[0] + 1, col))
    num_flag_pages = len(set([g[1] for g in genus_flag_list] + [e[1] for e in epithet_flag_list]))
    if num_flag_pages > 0: 
        print("***FLAGS***")
        print(f" number of pages to check: {num_flag_pages}")
        if genus_flag_list:
            print("  genera")
            [print(f"\t number of genera: {g_flag[0]}, page number: {g_flag[1]}, column number: {g_flag[2]}") for g_flag in genus_flag_list]
        if epithet_flag_list:
            print("  epithets")
            [print(f"\t number of epithets: {e_flag[0]}, page number: {e_flag[1]}, column number: {e_flag[2]}") for e_flag in epithet_flag_list]

100%|██████████| 23/23 [00:00<00:00, 85.28it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 1, page number: 2, column number: 0
	 number of genera: 2, page number: 15, column number: 1
	 number of genera: 0, page number: 20, column number: 1
	 number of genera: 1, page number: 23, column number: 0
  epithets
	 number of epithets: 2, page number: 23, column number: 1


100%|██████████| 22/22 [00:00<00:00, 78.92it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 2, page number: 4, column number: 0
	 number of genera: 1, page number: 4, column number: 1
	 number of genera: 0, page number: 5, column number: 0
	 number of genera: 1, page number: 12, column number: 0
	 number of genera: 2, page number: 14, column number: 1


100%|██████████| 28/28 [00:00<00:00, 92.06it/s]

***FLAGS***
 number of pages to check: 7
  genera
	 number of genera: 1, page number: 2, column number: 1
	 number of genera: 0, page number: 6, column number: 0
	 number of genera: 1, page number: 21, column number: 0
	 number of genera: 1, page number: 22, column number: 0
	 number of genera: 2, page number: 24, column number: 1
	 number of genera: 2, page number: 26, column number: 0
	 number of genera: 0, page number: 26, column number: 1
	 number of genera: 2, page number: 28, column number: 0





### match  based on coordinates

In [422]:
def is_coord_match(x, x_ref_left, x_ref_right, margin):
    return (x_ref_left - margin <= x[0] and x[0] <= x_ref_left + margin) or (x_ref_right - margin <= x[0] and x[0] <= x_ref_right + margin)

#### epithet

In [423]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["epithet_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["epithet_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        epithet_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["epithet_index_pat_match"] == True)]
        epithet_df = epithet_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_epithet_2dic = [{}, {}]
        
        for i in range(epithet_df.shape[0]):
            e_index = str(page_num) + "_" + str(i)
            p0 = epithet_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = epithet_df['col_num'].iloc[i]

            ref_neighbors_df = epithet_df[(epithet_df["page_num"] == page_num) & 
                                          (epithet_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]
            
            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            page_epithet_2dic[col][e_index] = (num_neighbors, mean_neighbors)
        
        mean_left_epithet = max(page_epithet_2dic[0].values(), default = [-1, -1])[1]
        mean_right_epithet = max(page_epithet_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_epithet == -1 or mean_right_epithet == -1:
            mean_valid_col = max(mean_left_epithet, mean_right_epithet)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_epithet == -1 and mean_right_epithet == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet, mean_right_epithet, margin))

100%|██████████| 23/23 [00:01<00:00, 22.48it/s]
100%|██████████| 22/22 [00:01<00:00, 21.97it/s]
100%|██████████| 28/28 [00:01<00:00, 24.59it/s]


In [424]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "epithet_coord_match_pruned_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "epithet_coord_match_pruned_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "epithet_coord_match_pruned_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []
    
    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        epithet_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['epithet_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #epithet Coord is orange-pinkish, 5
        for coord in epithet_coord_db["pruned_word_bbox"] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        #epithet is blue, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  4.79it/s]
100%|██████████| 22/22 [00:04<00:00,  4.63it/s]
100%|██████████| 28/28 [00:05<00:00,  4.86it/s]


#### Genus coord match

In [425]:
# add something about genus should come before epithet? 
    # assert df[df['epithet_coord_match'] == True]['word_bbox'].apply(lambda x: x[0]).mean() 
    #     >  df[df['genus_coord_match'] == True]['word_bbox'].apply(lambda x: x[0]).mean() 
    # and if False it shouldn't be a genus_coord?

In [426]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    #genus and not epithet
    vol_char_df["genus_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["genus_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        genus_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) &
                                    (vol_char_df["genus_index_pat_match"] == True)]
        genus_df = genus_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_genus_2dic = [{}, {}]
        
        epithet_left_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                              (vol_char_df["page_num"] == page_num) &
                                              (vol_char_df["col_num"] == 0)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_right_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                               (vol_char_df["page_num"] == page_num) &
                                               (vol_char_df["col_num"] == 1)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_coord_mean_list = [epithet_left_coord_mean, epithet_right_coord_mean]

        for i in range(genus_df.shape[0]):
            g_index = str(page_num) + "_" + str(i)
            p0 = genus_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = genus_df['col_num'].iloc[i]

            ref_neighbors_df = genus_df[(genus_df["page_num"] == page_num) & 
                                        (genus_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]

            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            if mean_neighbors > epithet_coord_mean_list[col]: 
                mean_neighbors = -1
            page_genus_2dic[col][g_index] = (num_neighbors, mean_neighbors)
        
        mean_left_genus = max(page_genus_2dic[0].values(), default = [-1, -1])[1]
        mean_right_genus = max(page_genus_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_genus == -1 or mean_right_genus == -1:
            mean_valid_col = max(mean_left_genus, mean_right_genus)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_genus == -1 and mean_right_genus == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_genus, mean_right_genus, margin))

100%|██████████| 23/23 [00:00<00:00, 32.44it/s]
100%|██████████| 22/22 [00:00<00:00, 29.16it/s]
100%|██████████| 28/28 [00:00<00:00, 35.03it/s]


In [427]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "genus_coord_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "genus_coord_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "genus_coord_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        genus_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['genus_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_coord_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  4.75it/s]
100%|██████████| 22/22 [00:04<00:00,  4.82it/s]
100%|██████████| 28/28 [00:05<00:00,  5.04it/s]


### Add column for genus / epithet coord mean for each page

In [428]:
# all_vol_data = [(vol1_char_df, vol1_index),
#                 (vol2_char_df, vol2_index),
#                 (vol3_char_df, vol3_index)]
# for vol_char_df, vol_index in all_vol_data:
#     for page_num in vol_index:
#         for c_i in [0, 1]:
#             genus_mean_coord = vol_char_df[(vol_char_df['page_num'] == page_num) & (vol_char_df['genus_coord_match'] == True) & (vol_char_df['col_num'] == c_i)]['word_bbox'].apply(lambda x: x[0]).mean()
#             epithet_mean_coord = vol_char_df[(vol_char_df['page_num'] == page_num) & (vol_char_df['epithet_coord_match'] == True) & (vol_char_df['col_num'] == c_i)]['word_bbox'].apply(lambda x: x[0]).mean()
        
#             #doing this because you can have no genus in one page but not no genus but an epithet...
#             if np.isnan(genus_mean_coord):
#                 genus_mean_coord == 0
#             if np.isnan(epithet_mean_coord):
#                 epithet_mean_coord = 1

#             vol_char_df.loc[(vol_char_df['page_num'] == page_num) & (vol_char_df['col_num'] == c_i), 'genus_mean_coord'] = genus_mean_coord
#             vol_char_df.loc[(vol_char_df['page_num'] == page_num) & (vol_char_df['col_num'] == c_i), 'epithet_mean_coord'] = epithet_mean_coord

### extract potential genus / epithet matches

In [429]:
def potential_genus_match(row):
    return row['genus_coord_match'] == True and \
           row['epithet_coord_match'] == False and \
           row['word'].isupper() == False and \
           row['word'].isnumeric() == False and \
           row['word'].find("Flore") == -1 
           # removing this for now ... and row['genus_mean_coord'] < row['epithet_mean_coord'] #important to check this only when epithet_coord_match is false?

def potential_epithet_match(row):
    return row['epithet_coord_match'] == True and \
           row['word'].isupper() == False and \
           row['word'].isnumeric() == False

In [430]:
vol1_char_df['potential_genus_match'] = vol1_char_df.apply(potential_genus_match, axis = 1)
vol1_char_df['potential_epithet_match'] = vol1_char_df.apply(potential_epithet_match, axis = 1)

vol2_char_df['potential_genus_match'] = vol2_char_df.apply(potential_genus_match, axis = 1)
vol2_char_df['potential_epithet_match'] = vol2_char_df.apply(potential_epithet_match, axis = 1)

vol3_char_df['potential_genus_match'] = vol3_char_df.apply(potential_genus_match, axis = 1)
vol3_char_df['potential_epithet_match'] = vol3_char_df.apply(potential_epithet_match, axis = 1)

In [431]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "GE_potential_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "GE_potential_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "GE_potential_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['potential_genus_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_epithet_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  4.76it/s]
100%|██████████| 22/22 [00:04<00:00,  4.83it/s]
100%|██████████| 28/28 [00:05<00:00,  5.07it/s]


In [432]:
# all_vol_data = [(vol1_char_df, vol1_index),
#                 (vol2_char_df, vol2_index),
#                 (vol3_char_df, vol3_index)]
# for vol_char_df, vol_index in all_vol_data:
#     for page_num in vol_index: 
#         for col_num in [0,1]:
#             if vol_char_df["genus_mean_coord"]
#                 print(page_num, col_num)

### infra species

In [433]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["infra_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):

        margin = 1.25 * vol_char_df[(vol_char_df["epithet_coord_match"] == True) | (vol_char_df["genus_coord_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        
        mean_left_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_left_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_left_genus):
            mean_left_genus_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_epithet_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_tab = mean_left_epithet_all - mean_left_genus_all
        else: 
            mean_left_tab = mean_left_epithet - mean_left_genus
        
        mean_right_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_right_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_right_genus):
            mean_right_genus_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_epithet_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_tab = mean_right_epithet_all - mean_right_genus_all
        else: 
            mean_right_tab = mean_right_epithet - mean_right_genus

        vol_char_df.loc[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)  , "infra_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet + mean_left_tab, mean_right_epithet + mean_right_tab, margin))

100%|██████████| 23/23 [00:01<00:00, 21.71it/s]
100%|██████████| 22/22 [00:01<00:00, 19.81it/s]
100%|██████████| 28/28 [00:01<00:00, 21.53it/s]


In [434]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["infra_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):

        margin = 1.25 * vol_char_df[(vol_char_df["potential_epithet_match"] == True) | (vol_char_df["potential_genus_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        
        mean_left_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_left_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_left_genus):
            mean_left_genus_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_epithet_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_tab = mean_left_epithet_all - mean_left_genus_all
        else: 
            mean_left_tab = mean_left_epithet - mean_left_genus
        
        mean_right_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_right_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_right_genus):
            mean_right_genus_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_epithet_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_tab = mean_right_epithet_all - mean_right_genus_all
        else: 
            mean_right_tab = mean_right_epithet - mean_right_genus


        vol_char_df.loc[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)  , "infra_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)]["word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet + mean_left_tab, mean_right_epithet + mean_right_tab, margin))

100%|██████████| 23/23 [00:01<00:00, 20.25it/s]
100%|██████████| 22/22 [00:01<00:00, 17.76it/s]
100%|██████████| 28/28 [00:01<00:00, 23.22it/s]


In [435]:
def potential_author_match_infra_coord(word):
    lower_word = word.lower()
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$"
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    is_latin_connectives = re.search(latin_connectives, word) != None
    is_infra_symbol = re.search(infra_symbols, lower_word) != None
    return (not is_infra_symbol) and (word[0].isupper() or is_latin_connectives)

In [436]:
potential_author_match_infra_coord("fil.")

True

In [437]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]
for vol_char_df, _ in all_vol_data:
    vol_char_df["potential_infra_match"] = (vol_char_df["infra_coord_match"] == True) & (vol_char_df['word'].apply(potential_author_match_infra_coord) == False)

In [438]:
def has_infra_symbols(word):
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    return re.search(infra_symbols, word) != None

In [439]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        infra_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['infra_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_infra_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
                                         (vol_char_df['infra_coord_match'] == True) & 
                                         (vol_char_df['word'].apply(has_infra_symbols) == True)
                                        ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                        ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in infra_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-5, y0-5, x1+5, y1+5), fill=None, outline=ImageColor.getrgb("#003399"), width=7)

        for coord in infra_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in with_infra_symbols['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [00:27<00:00,  1.03it/s]
100%|██████████| 22/22 [00:24<00:00,  1.10s/it]
100%|██████████| 23/23 [00:23<00:00,  1.03s/it]


### functions for author matching 
to detect anamolies in epithet and infra indentations

In [440]:
vol1_char_df['index_page_num'] = vol1_char_df['page_num'] - vol1_index[0] + 1
vol2_char_df['index_page_num'] = vol2_char_df['page_num'] - vol2_index[0] + 1
vol3_char_df['index_page_num'] = vol3_char_df['page_num'] - vol3_index[0] + 1

In [441]:
vol1_char_df[(vol1_char_df['potential_infra_match'] == True) & (vol1_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word


In [442]:
vol2_char_df[(vol2_char_df['potential_infra_match'] == True) & (vol2_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word


In [443]:
vol3_char_df[(vol3_char_df['potential_infra_match'] == True) & (vol3_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1559345,3,(3
1559874,3,f.
1561502,4,fa
1566359,6,deris
1570483,8,cock
1576678,11,f.
1578491,12,picha
1581443,13,adoxifolium
1582167,14,fil.
1584378,15,yar.


#### upper case beggining / latin words in epithet coordd

In [444]:
def potential_author_match_epithet_coord(word):
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$|^\s?f[\s|.]?$"
    is_latin_connectives = re.search(latin_connectives, word) != None
    is_hybrid = word == "X"
    return is_latin_connectives or (word[0].isupper() and (not is_hybrid))

In [445]:
vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1734317,9,"J.d,IlLIlU."
1753028,18,Phoenicia
1753527,18,Syriacus
1755122,19,Jilicaulis


In [446]:
vol2_char_df[(vol2_char_df['potential_epithet_match'] == True) & (vol2_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1915513,4,Hbanoticus
1922530,8,Hppii
1937158,14,Ma


In [447]:
vol3_char_df[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1554632,1,Krascheninnikovii
1565497,6,Wagenitz
1566524,6,Fritsch
1575185,10,Holub
1575488,11,Holub
1577207,11,et
1578956,12,Eichwaldii
1579044,12,Schrank
1582101,14,Kuntze
1583001,14,Kuntze


#### epithet coord word has uppper case in the middle (but not the first letter)

In [448]:
def has_upper_not_first(word):
    return word[1:].lower() != word[1:]

In [449]:
vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1716055,1,peregrina(Hack.)
1716156,1,umbeUulata
1734317,9,"J.d,IlLIlU."
1734633,9,elatior'L.
1736494,10,sessUis
1737303,11,pilosaHuds.
1741588,13,phleoides^Vill.)
1747388,15,albaL.
1752078,18,aegUops
1752829,18,glaucaVahl


In [450]:
#vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()
vol1_char_df.loc[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first)) & (vol1_char_df['word'].isin(['J.d,IlLIlU.'])), 'potential_genus_match'] = True
vol1_char_df.loc[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first)) & (vol1_char_df['word'].isin(['J.d,IlLIlU.'])), 'potential_epithet_match'] = False
#vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

In [451]:
# J.d,IlLIlU. -> unidentified genus 


In [452]:
vol2_char_df[(vol2_char_df['potential_epithet_match'] == True) & (vol2_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1914061,4,corîdûpUcaÈu^Sretoï.
1917935,6,securidacaiÇL.)
1941855,17,corymbulosum(Planch.)Reichenb.
1953489,22,aqUatilis


In [453]:
vol3_char_df[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1568523,7,gaiUardotii
1579879,13,albu^L.
1585656,15,sieberiC.
1586349,16,Schiman-Czeika
1597633,21,desertiTUéh.
1603606,24,DOteriifolium
1608206,26,agMmoniifolium
1612787,28,'Abd-el-'asissi


In [454]:
not_epithet_list = ['Schiman-Czeika']
vol3_char_df.loc[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(has_upper_not_first)) & (vol3_char_df['word'].isin(not_epithet_list)), 'potential_epithet_match'] = False

In [455]:
vol3_char_df[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1568523,7,gaiUardotii
1579879,13,albu^L.
1585656,15,sieberiC.
1597633,21,desertiTUéh.
1603606,24,DOteriifolium
1608206,26,agMmoniifolium
1612787,28,'Abd-el-'asissi


potential genus match but name is not alphabetic or is of length < 3

In [456]:
def flag_genus_name(word):
    word_no_space = word.replace(" ", "")
    return ((not word_no_space.isalpha()) or (len(word_no_space) < 3))

In [457]:
vol1_char_df[(vol1_char_df['potential_genus_match'] == True) & (vol1_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()
#skipping over all these 

Unnamed: 0,index_page_num,word
1730943,8,c
1734317,9,"J.d,IlLIlU."
1738656,11,f
1754704,19,j.


In [458]:
vol1_char_df = vol1_char_df.loc[~((vol1_char_df['potential_genus_match'] == True) & (vol1_char_df['word'].apply(flag_genus_name))), :]

In [459]:
vol2_char_df[(vol2_char_df['potential_genus_match'] == True) & (vol2_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()
#VV.1l.* only removing 

Unnamed: 0,index_page_num,word
1921256,7,•Ceratophyllum
1921779,7,Chelidonium^
1939484,16,Jussiaea-
1939606,16,VV.1l.*


In [460]:
vol2_char_df = vol2_char_df.loc[(vol2_char_df['word'] != 'VV.1l.*'),:]

In [461]:
vol3_char_df[(vol3_char_df['potential_genus_match'] == True) & (vol3_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1559728,3,BallotaL.
1569974,8,CordiaL.
1584638,15,x
1601635,23,SolidagoL.


flag if we had 2 genus in the same line or 1 or more genus + 1 or more epithet on the same line

In [462]:
#doesn't pick up all the issues because sometimes when the space if large enough 
# it thinks we're on a "new line"

In [463]:
line_groups = [c for c in vol1_char_df.columns if c.startswith("vol")] + \
              [c for c in vol1_char_df.columns if c.startswith("page")] + \
              [c for c in vol1_char_df.columns if c.startswith("block")] +\
              [c for c in vol1_char_df.columns if c.startswith("line")]
              
line_group_df = vol1_char_df.groupby(line_groups)
temp_line_df = vol1_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word
1762657,638,19,0,Zea
1762660,638,19,0,mays


In [464]:
line_groups = [c for c in vol2_char_df.columns if c.startswith("vol")] + \
              [c for c in vol2_char_df.columns if c.startswith("page")] + \
              [c for c in vol2_char_df.columns if c.startswith("block")] +\
              [c for c in vol2_char_df.columns if c.startswith("line")]
              
line_group_df = vol2_char_df.groupby(line_groups)
temp_line_df = vol2_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word


In [465]:
line_groups = [c for c in vol3_char_df.columns if c.startswith("vol")] + \
              [c for c in vol3_char_df.columns if c.startswith("page")] + \
              [c for c in vol3_char_df.columns if c.startswith("block")] +\
              [c for c in vol3_char_df.columns if c.startswith("line")]
              
line_group_df = vol3_char_df.groupby(line_groups)
temp_line_df = vol3_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word
1584638,569,36,0,x
1584639,569,36,0,Majoranamaracus


### Matching page number

In [466]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc),
                (vol2_char_df, vol2_index, vol2_doc),
                (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, doc in all_vol_data: 
    #for each volume check if genus pattern / epithet pattern exists within the index part of the book
    for page_num in tqdm(vol_index):
        center_x0 = get_center_x0(vol_char_df, page_num, - 30)
        #find center based on x0 coordinate of each line
        vol_char_df['col_num_for_PN'] = vol_char_df['line_bbox'].apply(lambda coords : get_col_num(coords, center_x0)) 

100%|██████████| 23/23 [00:13<00:00,  1.68it/s]
100%|██████████| 22/22 [00:14<00:00,  1.54it/s]
100%|██████████| 28/28 [00:15<00:00,  1.75it/s]


In [467]:
def is_page_num(row):
    return row['pruned_word'].isnumeric()


all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df['page_num_index_pat_match'] = (vol_char_df['page_num'].isin(vol_index)) & (vol_char_df.apply(is_page_num, axis = 1))
    vol_char_df["page_num_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["page_num_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        page_num_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["page_num_index_pat_match"] == True)]
        page_num_df = page_num_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_page_num_2dic = [{}, {}]
        
        for i in range(page_num_df.shape[0]):
            e_index = str(page_num) + "_" + str(i)
            p0 = page_num_df['word_bbox'].iloc[i]
            x_ref = p0[2]
            col = page_num_df['col_num_for_PN'].iloc[i]

            ref_neighbors_df = page_num_df[(page_num_df["page_num"] == page_num) & 
                                           (page_num_df["word_bbox"].apply(lambda x : x_ref - margin <= x[2] and x[2] <= x_ref + margin))]
            
            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[2]).mean()
            page_page_num_2dic[col][e_index] = (num_neighbors, mean_neighbors)
        
        mean_left_page_num = max(page_page_num_2dic[0].values(), default = [-1, -1])[1]
        mean_right_page_num = max(page_page_num_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_page_num == -1 or mean_right_page_num == -1:
            mean_valid_col = max(mean_left_page_num, mean_right_page_num)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match([x[2]], mean_valid_col, mean_valid_col, margin))
        elif mean_left_page_num == -1 and mean_right_page_num == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "page_num_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match([x[2]], mean_left_page_num, mean_right_page_num, margin))


100%|██████████| 23/23 [00:01<00:00, 16.39it/s]
100%|██████████| 22/22 [00:01<00:00, 15.47it/s]
100%|██████████| 28/28 [00:01<00:00, 22.47it/s]


In [468]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_page_num_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "potential_page_num_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "potential_page_num_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        page_num_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['page_num_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        # infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
        #                         & (vol_char_df['potential_infra_match'] == True)
        #                         ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
        #                         ].drop_duplicates()

        # with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
        #                                  (vol_char_df['infra_coord_match'] == True) & 
        #                                  (vol_char_df['word'].apply(has_infra_symbols) == True)
        #                                 ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
        #                                 ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in page_num_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)

        # for coord in infra_db['word_bbox'] :
        #     x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
        #     draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # # #epithet is red, 3
        # for coord in with_infra_symbols['word_bbox'] :
        #     x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
        #     draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [00:05<00:00,  5.14it/s]
100%|██████████| 22/22 [00:04<00:00,  4.95it/s]
100%|██████████| 23/23 [00:04<00:00,  4.85it/s]


### testing highlighting instead of making image:

page.add_highlight_annot(quads)


In [469]:
#vol3_char_df[''].apply()

### marking all values in the dataframe

### index df 

In [470]:
#making sure page_num is in index
#making sure the genus level word is not all uppercase (a family name)
#making sure the pruned_word is not numeric (removing page_number as it's not in order usually)


all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

result = [] 
ignore_word_list = ["NOUVELLE", "Flore", "FLORE", "INDEX", ""]
for vol_char_df, vol_index in all_vol_data:
    curr_result_df = vol_char_df[(vol_char_df['page_num'].isin(vol_index)) &
                                (~((vol_char_df["word"].str.isupper()) & (vol_char_df["word"].apply(lambda x : len(x) > 2)) & (vol_char_df['genus_coord_match'] == True))) & 
                                (~(vol_char_df["pruned_word"].isin(ignore_word_list))) &
                                (~(vol_char_df["pruned_word"].str.isnumeric() & (vol_char_df["word"] != "(3"))) & 
                                (~(vol_char_df["page_num_coord_match"] == True))
                                ].copy()
    result.append(curr_result_df)

vol1_index_df, vol2_index_df, vol3_index_df = result[0], result[1], result[2]

In [471]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_genus_names(row):
    if row['potential_genus_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan
        
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_genus'] = vol_index_df.apply(extract_potential_genus_names, axis = 1)
    vol_index_df['closest_genus'].ffill(inplace=True)

In [472]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_epithet_names(row):
    if row['potential_epithet_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_epithet'] = vol_index_df.apply(extract_potential_epithet_names, axis = 1)
    vol_index_df.loc[vol_index_df['potential_genus_match'] == True, 'closest_epithet'] = -1
    vol_index_df['closest_epithet'].ffill(inplace=True)

In [473]:
def extract_potential_infra_type(row):
    if row['potential_infra_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_infra_type'] = vol_index_df.apply(extract_potential_infra_type, axis = 1)
    vol_index_df.loc[(vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True), 'closest_infra_type'] = -1
    vol_index_df['closest_infra_type'].ffill(inplace=True)

In [474]:
keep_cols = vol3_index_df.columns.difference(["char_num", "char", "char_origin", "char_bbox"], sort=False).tolist()

vol3_index_test = vol3_index_df.copy().loc[:,keep_cols].drop_duplicates().reset_index()
vol3_index_test.rename(columns={"index": "char_index"}, inplace = True)

In [475]:
for vol_index_df in [vol3_index_test]:#[vol1_index_df, vol2_index_df, vol3_index_df]:
    infra_name_match_indecies = vol_index_df[vol_index_df['potential_infra_match'] == True].index + 1
    vol_index_df['closest_infra_name'] = np.NaN
    vol_index_df.loc[infra_name_match_indecies, 'closest_infra_name'] = vol_index_df.apply(lambda row : row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num']) , axis = 1)
    vol_index_df['potential_epithet_name_match'] = vol_index_df.index.isin(infra_name_match_indecies)
    vol_index_df.loc[(vol_index_df['potential_epithet_match'] == True) | (vol_index_df['potential_genus_match'] == True), 'closest_infra_name'] = -1
    vol_index_df['closest_infra_name'].ffill(inplace=True)

In [476]:
vol3_index_test.replace(-1, np.NaN, inplace = True)

In [477]:
vol3_index_test.iloc[:,17:].head(50)

Unnamed: 0,span_origin,span_bbox,word_num,word,word_bbox,pruned_word,pruned_word_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,...,potential_infra_match,index_page_num,col_num_for_PN,page_num_index_pat_match,page_num_coord_match,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match
0,"(16.079999923706055, 168.8800048828125)","(16.079999923706055, 160.3209991455078, 62.117...",0,Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",True,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,,,,False
1,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",1,Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,,,,False
2,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",2,et,"(100.82400512695312, 159.4029998779297, 107.43...",et,"(100.82400512695312, 159.4029998779297, 107.43...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,,,,False
3,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",3,Eig,"(109.93597412109375, 159.4029998779297, 122.56...",Eig,"(109.93597412109375, 159.4029998779297, 122.56...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,,,,False
4,"(23.040000915527344, 178.48001098632812)","(23.040000915527344, 169.92100524902344, 67.03...",0,factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",False,True,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
5,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",1,Warb.,"(69.28821563720703, 169.0030059814453, 93.4471...",Warb,"(69.28821563720703, 169.0030059814453, 90.6840...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
6,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",2,et,"(96.83100891113281, 169.0030059814453, 103.471...",et,"(96.83100891113281, 169.0030059814453, 103.471...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
7,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",3,Eig,"(106.63101959228516, 169.0030059814453, 119.26...",Eig,"(106.63101959228516, 169.0030059814453, 119.26...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
8,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",4,in,"(122.19778442382812, 169.0030059814453, 129.33...",in,"(122.19778442382812, 169.0030059814453, 129.33...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
9,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",5,Eig,"(132.4967498779297, 169.0030059814453, 145.130...",Eig,"(132.4967498779297, 169.0030059814453, 145.130...",False,False,0,...,False,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False


In [478]:
[c for c in vol3_index_test.columns if c.startswith('potential')]

['potential_genus_match',
 'potential_epithet_match',
 'potential_infra_match',
 'potential_epithet_name_match']

In [479]:
vol3_index_test['potential_author_match'] = (vol3_index_test['potential_genus_match'] == False) & \
                                            (vol3_index_test['potential_epithet_match'] == False) & \
                                            (vol3_index_test['potential_infra_match'] == False) & \
                                            (vol3_index_test['potential_epithet_name_match'] == False)

In [480]:
#vol3_index_test[vol3_index_test['potential_infra_match'] == True].index + 1

In [481]:
#vol3_index_test.iloc[:,18:].head(50)
#genus author: genus = "genus" & "closest_epithet = -1 & potential_genus_match = False"
#epithet author: epithet = "epithet" & potential_epithet_match = False & closest_infra = -1 
#closest_infra 

In [482]:
#vol3_index_test = vol3_index_df.copy()
#vol3_index_test['after_potential_infra_match'] = vol3_index_test['potential_infra_match'].shift()

# group_cols = vol3_index_test.columns.difference(["char_num", "char", "char_origin", "char_bbox"], sort=False).tolist()
# vol3_index_test["after_potential_infra_match"] = vol3_index_test.groupby(group_cols)['potential_infra_match'].shift()#.transform('min')

In [483]:
# vol3_index_df[['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match']].drop_duplicates()

In [484]:
# vol3_index_test[['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'after_potential_infra_match']].drop_duplicates().head(50)

In [485]:
# vol3_index_test.loc[vol3_index_test['after_potential_infra_match'] == True, ['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'after_potential_infra_match']].drop_duplicates()

In [486]:
# #df['closest_epithet_v2'] = np.nan
# def extract_potential_infra_names(row):
#     if row['after_potential_infra_match'] == True:
#         return row['word']
#     else:
#         return np.nan

# for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
#     vol_index_df['closest_epithet'] = vol_index_df.apply(extract_potential_epithet_names, axis = 1)
#     df.loc[df['potential_genus_match'] == True, 'closest_epithet_v2'] = -1
#     vol_index_df['closest_epithet'].ffill(inplace=True)

In [487]:
# df.loc[:,['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'closest_genus', 'closest_epithet']]#.drop_duplicates().tail(50)

In [488]:
# type(df.at[i, 'closest_genus'])

In [489]:
# closes_genus = df.at[i, 'closest_genus']
# pd.isnull(closes_genus) == False

In [490]:
# df.loc[df['potential_genus_match'] == True, 'closest_epithet'] = np.nan

In [491]:
# df.loc[:,['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'closest_genus', 'closest_epithet']].drop_duplicates().tail(50)

In [492]:
# all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
#                 (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
#                 (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

# for vol_char_df, vol_index, doc, output_name in all_vol_data: 
#     #for each volume 
#     image_list = []

#     for page_num in tqdm(vol_index):
#         pix_map = doc.get_page_pixmap(page_num,matrix=mat)
#         image = Image.open(io.BytesIO(pix_map.tobytes()))
#         draw = ImageDraw.Draw(image)
        

#         infra_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
#                                      (vol_char_df['infra_coord_match'] == True)
#                             ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                             ].drop_duplicates()

#         infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
#                                 & (vol_char_df['potential_infra_match'] == True)
#                                 ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                                 ].drop_duplicates()

#         with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
#                                          (vol_char_df['infra_coord_match'] == True) & 
#                                          (vol_char_df['word'].apply(has_infra_symbols) == True)
#                                         ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                                         ].drop_duplicates()

#         #genus Coord is orange-pinkish, 5
#         for coord in infra_coord_db['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0-5, y0-5, x1+5, y1+5), fill=None, outline=ImageColor.getrgb("#003399"), width=7)

#         for coord in infra_db['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
#         # #epithet is red, 3
#         for coord in with_infra_symbols['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

#         image_list.append(image)

#     #save pages of the volume
#     image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

In [493]:
vol3_index_test

Unnamed: 0,char_index,vol_num,page_num,block_num,block_num_absolute,block_bbox,line_num,line_wmode,line_dir,line_bbox,...,index_page_num,col_num_for_PN,page_num_index_pat_match,page_num_coord_match,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match,potential_author_match
0,1554027,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,1,0,False,False,Aaronsohnia_555_1_0,,,,False,False
1,1554038,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,1,0,False,False,Aaronsohnia_555_1_0,,,,False,True
2,1554045,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,1,0,False,False,Aaronsohnia_555_1_0,,,,False,True
3,1554047,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,1,0,False,False,Aaronsohnia_555_1_0,,,,False,True
4,1554050,3,555,2,3,"(23.040000915527344, 169.0030059814453, 145.13...",0,0,"(1.0, 0.0)","(23.040000915527344, 169.0030059814453, 145.13...",...,1,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7823,1613066,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,28,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,False
7824,1613076,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,28,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True
7825,1613080,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,28,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True
7826,1613090,3,582,37,55,"(242.8800048828125, 246.01600646972656, 414.28...",0,0,"(1.0, 0.0)","(242.8800048828125, 246.01600646972656, 299.05...",...,28,0,False,False,Zollikoferia_582_35_0,tenuiloba_582_37_0,,,False,False


In [494]:
vol3_index_test.replace(np.NaN, "",inplace = True)

In [495]:
# import pandas as pd

# # create a sample dataframe
# df = pd.DataFrame({
#     'A': [1, 1, 1, 2, 2],
#     'B': [2, 2, 2, 1, 2],
#     'C': [True, False, False, False, True],
#     'D': ['hello', 'hello2', 'world', 'python', 'pandas']
# })
author_grouping = ['closest_genus', 'closest_epithet', 'closest_infra_name']
vol3_index_test['potential_author_match']
# group by 'A' and 'B' columns
groups = vol3_index_test.groupby(author_grouping)

# concatenate 'D' values for each group where 'C' is False
def concatenate(group):
    return group.loc[group['potential_author_match'] == True, 'word'].str.cat(sep=' ')

concatenated = groups.apply(concatenate).reset_index()

# add the concatenated values to the original dataframe
result = vol3_index_test.merge(concatenated[['closest_genus', 'closest_epithet', 'closest_infra_name', 0]], on=['closest_genus', 'closest_epithet', 'closest_infra_name'], how='left').rename(columns={0: 'authors'})

In [496]:
result.iloc[:,20:]

Unnamed: 0,word,word_bbox,pruned_word,pruned_word_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,epithet_coord_match,genus_coord_match,potential_genus_match,...,col_num_for_PN,page_num_index_pat_match,page_num_coord_match,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match,potential_author_match,authors
0,Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",True,False,0,False,True,True,...,0,False,False,Aaronsohnia_555_1_0,,,,False,False,Warburg et Eig
1,Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",False,False,0,False,False,False,...,0,False,False,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
2,et,"(100.82400512695312, 159.4029998779297, 107.43...",et,"(100.82400512695312, 159.4029998779297, 107.43...",False,False,0,False,False,False,...,0,False,False,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
3,Eig,"(109.93597412109375, 159.4029998779297, 122.56...",Eig,"(109.93597412109375, 159.4029998779297, 122.56...",False,False,0,False,False,False,...,0,False,False,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
4,factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",False,True,0,True,False,False,...,0,False,False,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False,False,Warb. et Eig in Eig
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7823,nudicaulis,"(242.8800048828125, 236.28099060058594, 280.00...",nudicaulis,"(242.8800048828125, 236.28099060058594, 280.00...",False,True,1,True,False,False,...,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,False,(L.) Boiss.
7824,(L.),"(282.0889892578125, 235.3629913330078, 296.101...",L,"(285.191650390625, 235.3629913330078, 290.6356...",False,False,1,False,False,False,...,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True,(L.) Boiss.
7825,Boiss.,"(298.3200378417969, 235.3629913330078, 321.646...",Boiss,"(298.3200378417969, 235.3629913330078, 317.378...",False,False,1,False,False,False,...,0,False,False,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True,(L.) Boiss.
7826,tenuiloba,"(242.8800048828125, 246.83200073242188, 272.80...",tenuiloba,"(242.8800048828125, 246.83200073242188, 272.80...",False,True,1,True,False,False,...,0,False,False,Zollikoferia_582_35_0,tenuiloba_582_37_0,,,False,False,Boiss.


In [497]:
def fix_words(word):
    head, sep, tail = word.partition('_')
    return head 

result['closest_genus'] = result['closest_genus'].apply(fix_words)
result['closest_epithet'] = result['closest_epithet'].apply(fix_words)
result['closest_infra_type'] = result['closest_infra_type'].apply(fix_words)
result['closest_infra_name'] = result['closest_infra_name'].apply(fix_words)

In [498]:
result_prune_authors = result[(result['potential_genus_match'] == True) |
                              (result['potential_epithet_match'] == True) |
                              (result['potential_epithet_name_match'] == True)]

In [499]:
[c for c in vol3_index_test.columns if c.startswith('closest')]

['closest_genus',
 'closest_epithet',
 'closest_infra_type',
 'closest_infra_name']

In [500]:
simplified_result = result_prune_authors[['closest_genus',
                                          'closest_epithet',
                                          'closest_infra_type',
                                          'closest_infra_name',
                                          'authors']]

In [501]:
simplified_result.to_csv('vol3_index_output_v2.csv')

In [502]:
non_italics_simplified_result = result_prune_authors.loc[(result_prune_authors['span_flags'] != 6),
                                                     ['closest_genus',
                                                      'closest_epithet',
                                                      'closest_infra_type',
                                                      'closest_infra_name',
                                                      'authors']]

non_italics_simplified_result.to_csv('vol3_nonitalics_index_output_v2.csv')

In [503]:
text = 'closest_infra_name'
head, sep, tail = text.partition('_')

In [504]:
result_prune_authors.columns

Index(['char_index', 'vol_num', 'page_num', 'block_num', 'block_num_absolute',
       'block_bbox', 'line_num', 'line_wmode', 'line_dir', 'line_bbox',
       'span_num', 'span_size', 'span_flags', 'span_font', 'span_color',
       'span_ascender', 'span_descender', 'span_origin', 'span_bbox',
       'word_num', 'word', 'word_bbox', 'pruned_word', 'pruned_word_bbox',
       'genus_index_pat_match', 'epithet_index_pat_match', 'col_num',
       'epithet_coord_match', 'genus_coord_match', 'potential_genus_match',
       'potential_epithet_match', 'infra_coord_match', 'potential_infra_match',
       'index_page_num', 'col_num_for_PN', 'page_num_index_pat_match',
       'page_num_coord_match', 'closest_genus', 'closest_epithet',
       'closest_infra_type', 'closest_infra_name',
       'potential_epithet_name_match', 'potential_author_match', 'authors'],
      dtype='object')

In [505]:
simplified_result[((simplified_result['closest_genus'].str.contains('x')) | (simplified_result['closest_genus'].str.contains('x')) | (simplified_result['closest_genus'].str.contains('×'))) &
                  (simplified_result['closest_genus'].apply(lambda x : len(x)) <= 3)]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
4078,x,,,,
4079,x,Majoranamaracus,,,Rech. fil.
4082,x,zernyi,,,Rech. fil.


In [506]:
simplified_result[((simplified_result['closest_epithet'].str.contains('x')) | (simplified_result['closest_epithet'].str.contains('x')) | (simplified_result['closest_epithet'].str.contains('×'))) &
                  (simplified_result['closest_epithet'].apply(lambda x : len(x)) <= 3)]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
1692,Centauserratula,x.,,,mouterdei Arènes
4404,Nepeta,x,,,campylantha Rech. fil.
4688,Origanum,x,,,adonidis Moût.
4691,Origanum,x,,,barbarae Bornm.
4724,Origanum,x,,,pabotii Moût.
4727,Origanum,x,,,"symeonis Moût,"
5429,Ptilostemon,x,,,pabotii Greuter
5545,Rhaponserratula,x,,,mouterdei (Arènes) Mouterde mss.
6120,Senecio,x,,,berythaeus Camus et Gombault


In [507]:
simplified_result[((simplified_result['closest_infra_type'].str.contains('x')) | (simplified_result['closest_infra_type'].str.contains('x')) | (simplified_result['closest_infra_type'].str.contains('×'))) &
                  (simplified_result['closest_infra_type'].apply(lambda x : len(x)) <= 3)]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
7198,Verbascum,alepense,x,assurense,Bornm. et Hand.-Mazz.
7205,Verbascum,aliciae,x,kotschyi,Boiss. et Hoh.
7263,Verbascum,cedreti,x,lasianthum,Boiss.
7282,Verbascum,gaillardotii,x,sinuatum,L.
7285,Verbascum,gaillardotii,x,tripolitanum,Boiss.
7290,Verbascum,galilaeum,x,sinuatum,L.
7341,Verbascum,leptostachyum,x,ptychophyllum,Boiss.
7344,Verbascum,leptostachyum,x,tropidocarpum,Murb.
7405,Verbascum,sinuatum,x,tripolitanum,Boiss.
7458,Veronica,anagalloides,x,anagallis-aquatica,L.


In [508]:
simplified_result[((simplified_result['closest_infra_name'].str.contains('x')) | (simplified_result['closest_infra_name'].str.contains('x')) | (simplified_result['closest_infra_name'].str.contains('×')))]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
1334,Carthamus,tinctorius,var.,oxyacantha,Alef.
1644,Centaurium,erythraea,var.,laxum,(Boiss.) Moût.
2053,Convolvulus,dorycnium,subsp.,oxysepalus,(Boiss.) Rech. fil.
2058,Convolvulus,dorycnium,var.,oxysepalus,Boiss.
2356,Cuscuta,approximata,var.,approximata,
2425,Cuscuta,planiflora,var.,approximata,(Bab.) Engelm.
2476,Cynoglossum,montanum,subsp.,extra-europaeum,
2727,Erythraea,centaurium,var.,laxa,Boiss.
4243,Micromeria,graeca,subsp.,laxiflora,(Post) Mouterde
4607,Onosma,aleppica,var.,xanthotricha,(Boiss.) Boiss.


In [509]:
simplified_result[~((simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('e×'))) &
    ((simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('×')))]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
3056,Galium,thiebautii,,,x canum
3783,Legousia,speculum-veneris,,,(L.) Chaix
3963,Lippia,nodiflora,,,(L.) Michaux
4696,Origanum,bargyli,,,x syriacum
4707,Origanum,ehrenbergii,,,x syriacum
4712,Origanum,laevigatum,,,x syriacum
4717,Origanum,libanoticum,,,x syriacum
5147,Picris,sprengeriana,,,(L.) Chaix in Villars
5416,Ptilostemon,chamaepeuce,var.,camptolepis,x diacantha subsp.
6144,Senecio,gallicus,,,Chaix in Vill.


In [510]:
# author_split = simplified_result[~((simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('e×'))) &
#     ((simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('×')))]#['authors'].str.split()#.apply(lambda x : x[0] == 'x')

vol3_char_df[(vol3_char_df['word'] == 'x') & (vol3_char_df['potential_genus_match'] == True)]

Unnamed: 0,vol_num,page_num,block_num,block_num_absolute,block_bbox,line_num,line_wmode,line_dir,line_bbox,span_num,...,epithet_coord_match,genus_coord_match,potential_genus_match,potential_epithet_match,infra_coord_match,potential_infra_match,index_page_num,col_num_for_PN,page_num_index_pat_match,page_num_coord_match
1584638,3,569,36,57,"(31.440000534057617, 544.322998046875, 140.030...",0,0,"(1.0, 0.0)","(31.440000534057617, 544.322998046875, 140.030...",0,...,False,True,True,False,False,False,15,0,False,False


In [511]:
vol3_char_df['is_hybrid'] = False

In [512]:
index_end = len(vol3_char_df.loc[1584638+1,'word'])
vol3_char_df.loc[1584638,'potential_genus_match'] = False
vol3_char_df.loc[1584638+1:1584638+index_end,'potential_genus_match'] = True
vol3_char_df.loc[1584638+1:1584638+index_end,'potential_epithet_match'] = False
vol3_char_df.loc[1584638+1:1584638+index_end,'is_hybrid'] = True

In [513]:
vol3_char_df[(vol3_char_df['word'] == 'x') & (vol3_char_df['potential_epithet_match'] == True)]

Unnamed: 0,vol_num,page_num,block_num,block_num_absolute,block_bbox,line_num,line_wmode,line_dir,line_bbox,span_num,...,genus_coord_match,potential_genus_match,potential_epithet_match,infra_coord_match,potential_infra_match,index_page_num,col_num_for_PN,page_num_index_pat_match,page_num_coord_match,is_hybrid
1587139,3,570,50,77,"(237.60000610351562, 117.96300506591797, 351.9...",1,0,"(1.0, 0.0)","(237.60000610351562, 127.08300018310547, 333.8...",0,...,False,False,True,False,False,16,0,False,False,False
1589221,3,571,51,92,"(243.83999633789062, 156.3629913330078, 313.09...",0,0,"(1.0, 0.0)","(243.83999633789062, 156.3629913330078, 306.27...",0,...,False,False,True,False,False,17,0,False,False,False
1589235,3,571,51,92,"(243.83999633789062, 156.3629913330078, 313.09...",1,0,"(1.0, 0.0)","(243.83999633789062, 165.96299743652344, 313.0...",0,...,False,False,True,False,False,17,0,False,False,False
1589443,3,571,57,103,"(244.0800018310547, 282.843017578125, 305.5553...",0,0,"(1.0, 0.0)","(244.0800018310547, 282.843017578125, 305.5553...",0,...,False,False,True,False,False,17,0,False,False,False
1589456,3,571,58,105,"(244.0800018310547, 292.4430236816406, 312.325...",0,0,"(1.0, 0.0)","(244.0800018310547, 292.4430236816406, 312.325...",0,...,False,False,True,False,False,17,0,False,False,False
1594920,3,574,27,44,"(36.0, 326.52301025390625, 208.8193817138672, ...",0,0,"(1.0, 0.0)","(36.0, 326.52301025390625, 106.50386810302734,...",0,...,False,False,True,False,False,20,0,False,False,False
1595798,3,574,65,110,"(241.67999267578125, 261.2430114746094, 376.82...",0,0,"(1.0, 0.0)","(241.67999267578125, 261.2430114746094, 376.82...",0,...,False,False,True,False,False,20,0,False,False,False
1600077,3,576,46,71,"(240.0, 248.08299255371094, 370.65771484375, 2...",2,0,"(1.0, 0.0)","(240.24000549316406, 268.2430114746094, 370.65...",0,...,False,False,True,False,False,22,0,False,False,False


In [514]:
author_split = simplified_result[~((simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('ex')) | (simplified_result['authors'].str.contains('e×'))) &
    ((simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('x')) | (simplified_result['authors'].str.contains('×')))]#['authors'].str.split()#.apply(lambda x : x[0] == 'x')

In [515]:
author_split[author_split['authors'].str.split().apply(lambda x : x[0] == 'x')]

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
3056,Galium,thiebautii,,,x canum
4696,Origanum,bargyli,,,x syriacum
4707,Origanum,ehrenbergii,,,x syriacum
4712,Origanum,laevigatum,,,x syriacum
4717,Origanum,libanoticum,,,x syriacum
5416,Ptilostemon,chamaepeuce,var.,camptolepis,x diacantha subsp.
6158,Senecio,vernalis,,,x leucanthemifolius


In [516]:
author_split = author_split[author_split['authors'].str.split().apply(lambda x : x[0] == 'x')]

In [517]:
author_split['closest_infra_type'] = 'hybrid'
author_split['closest_infra_name'] = author_split['authors'].apply(lambda x : x[1:])
author_split['authors'] = author_split['authors'].apply(lambda x : x[2:])

In [518]:
author_split

Unnamed: 0,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,authors
3056,Galium,thiebautii,hybrid,canum,canum
4696,Origanum,bargyli,hybrid,syriacum,syriacum
4707,Origanum,ehrenbergii,hybrid,syriacum,syriacum
4712,Origanum,laevigatum,hybrid,syriacum,syriacum
4717,Origanum,libanoticum,hybrid,syriacum,syriacum
5416,Ptilostemon,chamaepeuce,hybrid,diacantha subsp.,diacantha subsp.
6158,Senecio,vernalis,hybrid,leucanthemifolius,leucanthemifolius


In [519]:
author_split['authors'].apply(lambda x : x.split()[1:])

3056          []
4696          []
4707          []
4712          []
4717          []
5416    [subsp.]
6158          []
Name: authors, dtype: object