In [2]:
import fitz
import numpy as np
import pandas as pd
from tqdm import tqdm

import io
from PIL import Image, ImageDraw, ImageFont, ImageColor

import math
import re

In [3]:
vol1_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf'
vol2_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf'
vol3_path = '../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf'

vol1_doc = fitz.open(vol1_path)
vol2_doc = fitz.open(vol2_path)
vol3_doc = fitz.open(vol3_path)

vol1_pages = [vol1_doc[i] for i in range(vol1_doc.page_count)]
vol2_pages = [vol2_doc[i] for i in range(vol2_doc.page_count)]
vol3_pages = [vol3_doc[i] for i in range(vol3_doc.page_count)]

In [4]:
vol1_char_df = pd.read_pickle("../input/char_df/vol1_df.pkl")
vol2_char_df = pd.read_pickle("../input/char_df/vol2_df.pkl")
vol3_char_df = pd.read_pickle("../input/char_df/vol3_df.pkl")

vol1_index = list(range(616, 639)) #inclusive
vol2_index = list(range(703, 725))
vol3_index = list(range(555, 583))

In [5]:
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

### finding the columns 
### & checking if a word is a strict match for the genus / epithet pattern

In [6]:
def epithet_match(row):
    return row['word_num'] == 0 and \
           row['word'].isalpha() and \
           row['word'].islower()

def genus_match(row):
    return row['word_num'] == 0 and \
           row['word'].isalpha() and \
           row['word'][0].isupper() and row['word'][1:].islower()

In [7]:
#rightmost point of any bounding box:
def get_center_x0(vol_char_df, page_num, bias = 30):
    """WARNING: large bias causes miscatagorization in page number in book"""
    df = vol_char_df[vol_char_df['page_num'] == page_num]
    
    right_bound = df['line_bbox'].apply(lambda x : x[2]).max() 
    #leftmost point of any bounding box:
    left_bound = df['line_bbox'].apply(lambda x : x[0]).min()

    return 0.5*(right_bound + left_bound) - bias


def get_col_num(coords, center_x0):
    x0, y0, x1, y1 = coords
    return int(x0 >= center_x0)


all_vol_data = [(vol1_char_df, vol1_index, vol1_doc),
                (vol2_char_df, vol2_index, vol2_doc),
                (vol3_char_df, vol3_index, vol3_doc)]

for vol_char_df ,vol_index, doc in all_vol_data: 
    #for each volume check if genus pattern / epithet pattern exists within the index part of the book
    vol_char_df['genus_index_pat_match'] = vol_char_df.apply(lambda r : r['page_num'] in vol_index and genus_match(r), axis = 1) #does this for whole books which is bad
    vol_char_df['epithet_index_pat_match'] = vol_char_df.apply(lambda r : r['page_num'] in vol_index and epithet_match(r), axis = 1) #does this for whole books which is bad
    
    for page_num in tqdm(vol_index):
        center_x0 = get_center_x0(vol_char_df, page_num)
        #find center based on x0 coordinate of each line
        vol_char_df['col_num'] = vol_char_df['line_bbox'].apply(lambda coords : get_col_num(coords, center_x0)) 

100%|██████████| 23/23 [00:13<00:00,  1.72it/s]
100%|██████████| 22/22 [00:14<00:00,  1.56it/s]
100%|██████████| 28/28 [00:14<00:00,  1.91it/s]


#### testing if col num correctly assigned

In [8]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "index_col_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "index_col_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "index_col_vol3")]

for vol_char_df, vol_index, vol_doc, output_name in all_vol_data:
    image_list = []
    keep_cols = vol_char_df.columns.difference(["char_num", "char", "char_origin", "char_bbox", "char_x0", "char_y0", "char_x1", "char_y1", "pruned_char_x0", "pruned_char_y0", "pruned_char_x1", "pruned_char_y1"], sort=False).tolist()
    for page_num in tqdm(vol_index):
        pix_map = vol_doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)

        temp_df = vol_char_df[vol_char_df["page_num"] == page_num].loc[:, keep_cols].drop_duplicates()

        for coord in temp_df[temp_df['col_num'] == 0]['line_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        for coord in temp_df[temp_df['col_num'] == 1]['line_bbox']:
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=5)
            
        image_list.append(image)
        #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])


100%|██████████| 23/23 [00:04<00:00,  5.58it/s]
100%|██████████| 22/22 [00:03<00:00,  5.59it/s]
100%|██████████| 28/28 [00:04<00:00,  5.87it/s]


### Genus / epithet flagging 
flagging pages where number of strict genus or epithet patern matches is less than 3 per column

In [9]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "strickt_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "strickt_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "strickt_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []
    genus_flag_list = []
    epithet_flag_list = []
    for page_num in tqdm(vol_index):
        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num)
                                & (vol_char_df['genus_index_pat_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus pattern match flag should check with half page and not entire page:
        for col in range(2):
            num_genus_col = genus_db[genus_db["col_num"] == col].shape[0]
            num_epithet_col = epithet_db[epithet_db["col_num"] == col].shape[0]
            if num_genus_col <= 2:
                genus_flag_list.append((num_genus_col, page_num - vol_index[0] + 1, col))
            if num_epithet_col <= 2:
                epithet_flag_list.append((num_epithet_col, page_num - vol_index[0] + 1, col))
    num_flag_pages = len(set([g[1] for g in genus_flag_list] + [e[1] for e in epithet_flag_list]))
    if num_flag_pages > 0: 
        print("***FLAGS***")
        print(f" number of pages to check: {num_flag_pages}")
        if genus_flag_list:
            print("  genera")
            [print(f"\t number of genera: {g_flag[0]}, page number: {g_flag[1]}, column number: {g_flag[2]}") for g_flag in genus_flag_list]
        if epithet_flag_list:
            print("  epithets")
            [print(f"\t number of epithets: {e_flag[0]}, page number: {e_flag[1]}, column number: {e_flag[2]}") for e_flag in epithet_flag_list]

100%|██████████| 23/23 [00:00<00:00, 82.67it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 1, page number: 2, column number: 0
	 number of genera: 2, page number: 15, column number: 1
	 number of genera: 0, page number: 20, column number: 1
	 number of genera: 1, page number: 23, column number: 0
  epithets
	 number of epithets: 2, page number: 23, column number: 1


100%|██████████| 22/22 [00:00<00:00, 78.12it/s]


***FLAGS***
 number of pages to check: 4
  genera
	 number of genera: 2, page number: 4, column number: 0
	 number of genera: 1, page number: 4, column number: 1
	 number of genera: 0, page number: 5, column number: 0
	 number of genera: 1, page number: 12, column number: 0
	 number of genera: 2, page number: 14, column number: 1


100%|██████████| 28/28 [00:00<00:00, 90.61it/s]

***FLAGS***
 number of pages to check: 7
  genera
	 number of genera: 1, page number: 2, column number: 1
	 number of genera: 0, page number: 6, column number: 0
	 number of genera: 1, page number: 21, column number: 0
	 number of genera: 1, page number: 22, column number: 0
	 number of genera: 2, page number: 24, column number: 1
	 number of genera: 2, page number: 26, column number: 0
	 number of genera: 0, page number: 26, column number: 1
	 number of genera: 2, page number: 28, column number: 0





### match  based on coordinates

In [10]:
def is_coord_match(x, x_ref_left, x_ref_right, margin):
    return (x_ref_left - margin <= x[0] and x[0] <= x_ref_left + margin) or (x_ref_right - margin <= x[0] and x[0] <= x_ref_right + margin)

#### epithet

In [11]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["epithet_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["epithet_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        epithet_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["epithet_index_pat_match"] == True)]
        epithet_df = epithet_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_epithet_2dic = [{}, {}]
        
        for i in range(epithet_df.shape[0]):
            e_index = str(page_num) + "_" + str(i)
            p0 = epithet_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = epithet_df['col_num'].iloc[i]

            ref_neighbors_df = epithet_df[(epithet_df["page_num"] == page_num) & 
                                          (epithet_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]
            
            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            page_epithet_2dic[col][e_index] = (num_neighbors, mean_neighbors)
        
        mean_left_epithet = max(page_epithet_2dic[0].values(), default = [-1, -1])[1]
        mean_right_epithet = max(page_epithet_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_epithet == -1 or mean_right_epithet == -1:
            mean_valid_col = max(mean_left_epithet, mean_right_epithet)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_epithet == -1 and mean_right_epithet == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "epithet_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet, mean_right_epithet, margin))

100%|██████████| 23/23 [00:01<00:00, 22.33it/s]
100%|██████████| 22/22 [00:01<00:00, 21.02it/s]
100%|██████████| 28/28 [00:01<00:00, 23.81it/s]


In [12]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "epithet_coord_match_pruned_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "epithet_coord_match_pruned_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "epithet_coord_match_pruned_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []
    
    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        epithet_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['epithet_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_index_pat_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #epithet Coord is orange-pinkish, 5
        for coord in epithet_coord_db["pruned_word_bbox"] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)

        #epithet is blue, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#003399"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.60it/s]
100%|██████████| 22/22 [00:03<00:00,  5.66it/s]
100%|██████████| 28/28 [00:04<00:00,  5.89it/s]


#### Genus coord match

In [13]:
# add something about genus should come before epithet? 
    # assert df[df['epithet_coord_match'] == True]['word_bbox'].apply(lambda x: x[0]).mean() 
    #     >  df[df['genus_coord_match'] == True]['word_bbox'].apply(lambda x: x[0]).mean() 
    # and if False it shouldn't be a genus_coord?

In [42]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    #genus and not epithet
    vol_char_df["genus_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):
        margin = 1.25 * vol_char_df[(vol_char_df["genus_index_pat_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        genus_char_df = vol_char_df[(vol_char_df["page_num"] == page_num) &
                                    (vol_char_df["genus_index_pat_match"] == True)]
        genus_df = genus_char_df.loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin", "char_bbox"])].drop_duplicates()
        page_genus_2dic = [{}, {}]
        
        epithet_left_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                              (vol_char_df["page_num"] == page_num) &
                                              (vol_char_df["col_num"] == 0)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_right_coord_mean = vol_char_df[(vol_char_df["epithet_coord_match"] == True) &
                                               (vol_char_df["page_num"] == page_num) &
                                               (vol_char_df["col_num"] == 1)
                                             ]['pruned_word_bbox'].apply(lambda x : x[0]).mean()
        epithet_coord_mean_list = [epithet_left_coord_mean, epithet_right_coord_mean]

        for i in range(genus_df.shape[0]):
            g_index = str(page_num) + "_" + str(i)
            p0 = genus_df['word_bbox'].iloc[i]
            x_ref = p0[0]
            col = genus_df['col_num'].iloc[i]

            ref_neighbors_df = genus_df[(genus_df["page_num"] == page_num) & 
                                        (genus_df["word_bbox"].apply(lambda x : x_ref - margin <= x[0] and x[0] <= x_ref + margin))]

            num_neighbors = ref_neighbors_df.shape[0]
            mean_neighbors = ref_neighbors_df["word_bbox"].apply(lambda x : x[0]).mean()
            if mean_neighbors > epithet_coord_mean_list[col]: 
                mean_neighbors = -1
            page_genus_2dic[col][g_index] = (num_neighbors, mean_neighbors)
        
        mean_left_genus = max(page_genus_2dic[0].values(), default = [-1, -1])[1]
        mean_right_genus = max(page_genus_2dic[1].values(), default = [-1, -1])[1]

        if mean_left_genus == -1 or mean_right_genus == -1:
            mean_valid_col = max(mean_left_genus, mean_right_genus)
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_valid_col, mean_valid_col, margin))
        elif mean_left_genus == -1 and mean_right_genus == -1:
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : False)
        else: 
            vol_char_df.loc[(vol_char_df["page_num"] == page_num) , "genus_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_genus, mean_right_genus, margin))

100%|██████████| 23/23 [00:00<00:00, 29.64it/s]
100%|██████████| 22/22 [00:00<00:00, 27.50it/s]
100%|██████████| 28/28 [00:00<00:00, 33.39it/s]


In [43]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "genus_coord_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "genus_coord_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "genus_coord_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        genus_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['genus_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['epithet_coord_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.57it/s]
100%|██████████| 22/22 [00:03<00:00,  5.62it/s]
100%|██████████| 28/28 [00:04<00:00,  5.79it/s]


### Add column for genus / epithet coord mean for each page

In [26]:
# all_vol_data = [(vol1_char_df, vol1_index),
#                 (vol2_char_df, vol2_index),
#                 (vol3_char_df, vol3_index)]
# for vol_char_df, vol_index in all_vol_data:
#     for page_num in vol_index:
#         for c_i in [0, 1]:
#             genus_mean_coord = vol_char_df[(vol_char_df['page_num'] == page_num) & (vol_char_df['genus_coord_match'] == True) & (vol_char_df['col_num'] == c_i)]['word_bbox'].apply(lambda x: x[0]).mean()
#             epithet_mean_coord = vol_char_df[(vol_char_df['page_num'] == page_num) & (vol_char_df['epithet_coord_match'] == True) & (vol_char_df['col_num'] == c_i)]['word_bbox'].apply(lambda x: x[0]).mean()
        
#             #doing this because you can have no genus in one page but not no genus but an epithet...
#             if np.isnan(genus_mean_coord):
#                 genus_mean_coord == 0
#             if np.isnan(epithet_mean_coord):
#                 epithet_mean_coord = 1

#             vol_char_df.loc[(vol_char_df['page_num'] == page_num) & (vol_char_df['col_num'] == c_i), 'genus_mean_coord'] = genus_mean_coord
#             vol_char_df.loc[(vol_char_df['page_num'] == page_num) & (vol_char_df['col_num'] == c_i), 'epithet_mean_coord'] = epithet_mean_coord

### extract potential genus / epithet matches

In [44]:
def potential_genus_match(row):
    return row['genus_coord_match'] == True and \
           row['epithet_coord_match'] == False and \
           row['word'].isupper() == False and \
           row['word'].isnumeric() == False and \
           row['word'].find("Flore") == -1 
           # removing this for now ... and row['genus_mean_coord'] < row['epithet_mean_coord'] #important to check this only when epithet_coord_match is false?

def potential_epithet_match(row):
    return row['epithet_coord_match'] == True and \
           row['word'].isupper() == False and \
           row['word'].isnumeric() == False

In [45]:
vol1_char_df['potential_genus_match'] = vol1_char_df.apply(potential_genus_match, axis = 1)
vol1_char_df['potential_epithet_match'] = vol1_char_df.apply(potential_epithet_match, axis = 1)

vol2_char_df['potential_genus_match'] = vol2_char_df.apply(potential_genus_match, axis = 1)
vol2_char_df['potential_epithet_match'] = vol2_char_df.apply(potential_epithet_match, axis = 1)

vol3_char_df['potential_genus_match'] = vol3_char_df.apply(potential_genus_match, axis = 1)
vol3_char_df['potential_epithet_match'] = vol3_char_df.apply(potential_epithet_match, axis = 1)

In [47]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "GE_potential_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "GE_potential_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "GE_potential_match_vol3")]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        genus_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['potential_genus_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        epithet_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_epithet_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in genus_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in epithet_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#000099"), width=3)
        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 23/23 [00:04<00:00,  5.52it/s]
100%|██████████| 22/22 [00:03<00:00,  5.66it/s]
100%|██████████| 28/28 [00:04<00:00,  5.78it/s]


In [None]:
# all_vol_data = [(vol1_char_df, vol1_index),
#                 (vol2_char_df, vol2_index),
#                 (vol3_char_df, vol3_index)]
# for vol_char_df, vol_index in all_vol_data:
#     for page_num in vol_index: 
#         for col_num in [0,1]:
#             if vol_char_df["genus_mean_coord"]
#                 print(page_num, col_num)

### infra species

In [48]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["infra_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):

        margin = 1.25 * vol_char_df[(vol_char_df["epithet_coord_match"] == True) | (vol_char_df["genus_coord_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        
        mean_left_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_left_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_left_genus):
            mean_left_genus_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_epithet_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_tab = mean_left_epithet_all - mean_left_genus_all
        else: 
            mean_left_tab = mean_left_epithet - mean_left_genus
        
        mean_right_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_right_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_right_genus):
            mean_right_genus_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["genus_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_epithet_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["epithet_coord_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_tab = mean_right_epithet_all - mean_right_genus_all
        else: 
            mean_right_tab = mean_right_epithet - mean_right_genus

        vol_char_df.loc[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)  , "infra_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)]["pruned_word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet + mean_left_tab, mean_right_epithet + mean_right_tab, margin))

100%|██████████| 23/23 [00:01<00:00, 19.57it/s]
100%|██████████| 22/22 [00:01<00:00, 19.74it/s]
100%|██████████| 28/28 [00:01<00:00, 22.34it/s]


In [49]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]

for vol_char_df, vol_index in all_vol_data: 
    vol_char_df["infra_coord_match"] = vol_char_df["word_bbox"].apply(lambda x : False)
    for page_num in tqdm(vol_index):

        margin = 1.25 * vol_char_df[(vol_char_df["potential_epithet_match"] == True) | (vol_char_df["potential_genus_match"] == True)]["char_bbox"].apply(lambda x : x[2] - x[0]).mean()
        
        mean_left_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_left_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_left_genus):
            mean_left_genus_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_epithet_all = vol_char_df[(vol_char_df["col_num"] == 0) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_left_tab = mean_left_epithet_all - mean_left_genus_all
        else: 
            mean_left_tab = mean_left_epithet - mean_left_genus
        
        mean_right_epithet = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        mean_right_genus = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
        if math.isnan(mean_right_genus):
            mean_right_genus_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_genus_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_epithet_all = vol_char_df[(vol_char_df["col_num"] == 1) & (vol_char_df["potential_epithet_match"] == True)]["word_bbox"].apply(lambda x : x[0]).mean()
            mean_right_tab = mean_right_epithet_all - mean_right_genus_all
        else: 
            mean_right_tab = mean_right_epithet - mean_right_genus


        vol_char_df.loc[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)  , "infra_coord_match"] = vol_char_df[(vol_char_df["page_num"] == page_num) & (vol_char_df["word_num"] == 0)]["word_bbox"].apply(lambda x : is_coord_match(x, mean_left_epithet + mean_left_tab, mean_right_epithet + mean_right_tab, margin))

100%|██████████| 23/23 [00:01<00:00, 21.99it/s]
100%|██████████| 22/22 [00:01<00:00, 19.73it/s]
100%|██████████| 28/28 [00:01<00:00, 22.96it/s]


In [50]:
def potential_author_match_infra_coord(word):
    lower_word = word.lower()
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$"
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    is_latin_connectives = re.search(latin_connectives, word) != None
    is_infra_symbol = re.search(infra_symbols, lower_word) != None
    return (not is_infra_symbol) and (word[0].isupper() or is_latin_connectives)

In [51]:
potential_author_match_infra_coord("fil.")

True

In [52]:
all_vol_data = [(vol1_char_df, vol1_index),
                (vol2_char_df, vol2_index),
                (vol3_char_df, vol3_index)]
for vol_char_df, _ in all_vol_data:
    vol_char_df["potential_infra_match"] = (vol_char_df["infra_coord_match"] == True) & (vol_char_df['word'].apply(potential_author_match_infra_coord) == False)

In [53]:
def has_infra_symbols(word):
    infra_symbols = r"^var[\s|.|\b]?$|^subsp[\s|.|\b]?$|^ssp[\s|.|\b]?$|^spp[\s|.|\b]?$|^x[\s|.|\b]?$|^×[\s|.|\b]?$"
    return re.search(infra_symbols, word) != None

In [54]:
all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
                (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
                (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

for vol_char_df, vol_index, doc, output_name in all_vol_data: 
    #for each volume 
    image_list = []

    for page_num in tqdm(vol_index):
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        

        infra_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
                                     (vol_char_df['infra_coord_match'] == True)
                            ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                            ].drop_duplicates()

        infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
                                & (vol_char_df['potential_infra_match'] == True)
                                ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                ].drop_duplicates()

        with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
                                         (vol_char_df['infra_coord_match'] == True) & 
                                         (vol_char_df['word'].apply(has_infra_symbols) == True)
                                        ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
                                        ].drop_duplicates()

        #genus Coord is orange-pinkish, 5
        for coord in infra_coord_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-5, y0-5, x1+5, y1+5), fill=None, outline=ImageColor.getrgb("#003399"), width=7)

        for coord in infra_db['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
        # #epithet is red, 3
        for coord in with_infra_symbols['word_bbox'] :
            x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
            draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

        image_list.append(image)

    #save pages of the volume
    image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

100%|██████████| 28/28 [00:24<00:00,  1.12it/s]
100%|██████████| 22/22 [00:22<00:00,  1.03s/it]
100%|██████████| 23/23 [00:21<00:00,  1.05it/s]


### functions for author matching 
to detect anamolies in epithet and infra indentations

In [55]:
vol1_char_df['index_page_num'] = vol1_char_df['page_num'] - vol1_index[0] + 1
vol2_char_df['index_page_num'] = vol2_char_df['page_num'] - vol2_index[0] + 1
vol3_char_df['index_page_num'] = vol3_char_df['page_num'] - vol3_index[0] + 1

In [56]:
vol1_char_df[(vol1_char_df['potential_infra_match'] == True) & (vol1_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word


In [57]:
vol2_char_df[(vol2_char_df['potential_infra_match'] == True) & (vol2_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word


In [58]:
vol3_char_df[(vol3_char_df['potential_infra_match'] == True) & (vol3_char_df['word'].apply(has_infra_symbols) == False)][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1559345,3,(3
1559874,3,f.
1561502,4,fa
1566359,6,deris
1570483,8,cock
1576678,11,f.
1578491,12,picha
1581443,13,adoxifolium
1582167,14,fil.
1584378,15,yar.


#### upper case beggining / latin words in epithet coordd

In [59]:
def potential_author_match_epithet_coord(word):
    latin_connectives = r"^\s?et[\s|.]?$|^\s?in[\s|.]?$|^\s?non[\s|.]?$|^\s?&[\s|.]?$|^\s?er[\s|.]?$|^\s?nec[\s|.]?$|^\s?mult[\s|.]?$|^\s?ex[\s|.]?$|^\s?fil[\s|.]?$|^\s?f[\s|.]?$"
    is_latin_connectives = re.search(latin_connectives, word) != None
    is_hybrid = word == "X"
    return is_latin_connectives or (word[0].isupper() and (not is_hybrid))

In [60]:
vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1734317,9,"J.d,IlLIlU."
1753028,18,Phoenicia
1753527,18,Syriacus
1755122,19,Jilicaulis


In [61]:
vol2_char_df[(vol2_char_df['potential_epithet_match'] == True) & (vol2_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1915513,4,Hbanoticus
1922530,8,Hppii
1937158,14,Ma


In [62]:
vol3_char_df[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(potential_author_match_epithet_coord))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1554632,1,Krascheninnikovii
1565497,6,Wagenitz
1566524,6,Fritsch
1575185,10,Holub
1575488,11,Holub
1577207,11,et
1578956,12,Eichwaldii
1579044,12,Schrank
1582101,14,Kuntze
1583001,14,Kuntze


#### epithet coord word has uppper case in the middle (but not the first letter)

In [63]:
def has_upper_not_first(word):
    return word[1:].lower() != word[1:]

In [64]:
vol1_char_df[(vol1_char_df['potential_epithet_match'] == True) & (vol1_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1716055,1,peregrina(Hack.)
1716156,1,umbeUulata
1734317,9,"J.d,IlLIlU."
1734633,9,elatior'L.
1736494,10,sessUis
1737303,11,pilosaHuds.
1741588,13,phleoides^Vill.)
1747388,15,albaL.
1752078,18,aegUops
1752829,18,glaucaVahl


In [65]:
vol2_char_df[(vol2_char_df['potential_epithet_match'] == True) & (vol2_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1914061,4,corîdûpUcaÈu^Sretoï.
1917935,6,securidacaiÇL.)
1941855,17,corymbulosum(Planch.)Reichenb.
1953489,22,aqUatilis


In [66]:
vol3_char_df[(vol3_char_df['potential_epithet_match'] == True) & (vol3_char_df['word'].apply(has_upper_not_first))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1568523,7,gaiUardotii
1579879,13,albu^L.
1585656,15,sieberiC.
1586349,16,Schiman-Czeika
1597633,21,desertiTUéh.
1603606,24,DOteriifolium
1608206,26,agMmoniifolium
1612787,28,'Abd-el-'asissi


potential genus match but name is not alphabetic or is of length < 3

In [67]:
def flag_genus_name(word):
    word_no_space = word.replace(" ", "")
    return ((not word_no_space.isalpha()) or (len(word_no_space) < 3))

In [68]:
vol1_char_df[(vol1_char_df['potential_genus_match'] == True) & (vol1_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1730943,8,c
1738656,11,f
1754704,19,j.


In [70]:
vol2_char_df[(vol2_char_df['potential_genus_match'] == True) & (vol2_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1921256,7,•Ceratophyllum
1921779,7,Chelidonium^
1939484,16,Jussiaea-
1939606,16,VV.1l.*


In [71]:
vol3_char_df[(vol3_char_df['potential_genus_match'] == True) & (vol3_char_df['word'].apply(flag_genus_name))][["index_page_num", "word"]].drop_duplicates()

Unnamed: 0,index_page_num,word
1559728,3,BallotaL.
1569974,8,CordiaL.
1584638,15,x
1601635,23,SolidagoL.


flag if we had 2 genus in the same line or 1 or more genus + 1 or more epithet on the same line

In [72]:
#doesn't pick up all the issues because sometimes when the space if large enough 
# it thinks we're on a "new line"

In [73]:
line_groups = [c for c in vol1_char_df.columns if c.startswith("vol")] + \
              [c for c in vol1_char_df.columns if c.startswith("page")] + \
              [c for c in vol1_char_df.columns if c.startswith("block")] +\
              [c for c in vol1_char_df.columns if c.startswith("line")]
              
line_group_df = vol1_char_df.groupby(line_groups)
temp_line_df = vol1_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word
1754704,634,29,0,j.
1754706,634,29,0,kali
1762657,638,19,0,Zea
1762660,638,19,0,mays


In [76]:
line_groups = [c for c in vol2_char_df.columns if c.startswith("vol")] + \
              [c for c in vol2_char_df.columns if c.startswith("page")] + \
              [c for c in vol2_char_df.columns if c.startswith("block")] +\
              [c for c in vol2_char_df.columns if c.startswith("line")]
              
line_group_df = vol2_char_df.groupby(line_groups)
temp_line_df = vol2_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word


In [77]:
line_groups = [c for c in vol3_char_df.columns if c.startswith("vol")] + \
              [c for c in vol3_char_df.columns if c.startswith("page")] + \
              [c for c in vol3_char_df.columns if c.startswith("block")] +\
              [c for c in vol3_char_df.columns if c.startswith("line")]
              
line_group_df = vol3_char_df.groupby(line_groups)
temp_line_df = vol3_char_df[line_group_df['potential_genus_match'].transform('any') & line_group_df['potential_epithet_match'].transform('any')]
temp_line_df[(temp_line_df['potential_genus_match'] == True) | (temp_line_df['potential_epithet_match'] == True)][["page_num", "block_num", "line_num", "word"]].drop_duplicates()

Unnamed: 0,page_num,block_num,line_num,word
1584638,569,36,0,x
1584639,569,36,0,Majoranamaracus


### testing highlighting instead of making image:

page.add_highlight_annot(quads)


In [None]:
vol3_char_df[''].apply()

### marking all values in the dataframe

In [82]:
import pandas as pd
import numpy as np

# Create example dataframe
df = pd.DataFrame({'A': [1, 4, 3, 6, 8],
                   'is_B': [False, True, False, True, False]})

# Find the row indices where 'is_B' is True
is_B_indices = df[df['is_B'] == True].index

# Define a function to find the closest 'is_B' row above and return the corresponding 'A' value
def find_closest_A_above_B(row):
    try:
        # Get the index of the next 'is_B' row above
        closest_index = df.loc[:row.name][df['is_B'] == True].iloc[-1].name
        # Return the corresponding 'A' value
        return df.loc[closest_index, 'A']
    except IndexError:
        # If there is no 'is_B' row above, return NaN
        return np.nan

# Add a new column 'closest_A_above_B' initialized with NaNs using the 'apply' method
df['closest_A_above_B'] = df.apply(find_closest_A_above_B, axis=1)

df

  closest_index = df.loc[:row.name][df['is_B'] == True].iloc[-1].name


Unnamed: 0,A,is_B,closest_A_above_B
0,1,False,
1,4,True,4.0
2,3,False,4.0
3,6,True,6.0
4,8,False,6.0


In [83]:
vol1_char_df.columns

Index(['vol_num', 'page_num', 'block_num', 'block_num_absolute', 'block_bbox',
       'line_num', 'line_wmode', 'line_dir', 'line_bbox', 'span_num',
       'span_size', 'span_flags', 'span_font', 'span_color', 'span_ascender',
       'span_descender', 'span_origin', 'span_bbox', 'word_num', 'word',
       'word_bbox', 'pruned_word', 'pruned_word_bbox', 'char_num', 'char',
       'char_origin', 'char_bbox', 'genus_index_pat_match',
       'epithet_index_pat_match', 'col_num', 'epithet_coord_match',
       'genus_coord_match', 'genus_mean_coord', 'epithet_mean_coord',
       'potential_genus_match', 'potential_epithet_match', 'infra_coord_match',
       'potential_infra_match', 'index_page_num'],
      dtype='object')

In [85]:
import pandas as pd
import numpy as np

# Create example dataframe
df = pd.DataFrame({'A': [1, 4, 3, 6, 8],
                   'is_B': [False, True, False, True, False]})

# Find the row indices where 'is_B' is True

df = vol1_char_df.copy()
genus_indecies = df[df['potential_genus_match'] == True].index

# Define a function to find the closest 'is_B' row above and return the corresponding 'A' value
def find_closest_A_above_B(row):
    try:
        # Get the index of the next 'is_B' row above
        closest_index = df.loc[:row.name][df['potential_genus_match'] == True].iloc[-1].name
        # Return the corresponding 'A' value
        return df.loc[closest_index, 'word']
    except IndexError:
        # If there is no 'is_B' row above, return NaN
        return np.nan

# Add a new column 'closest_A_above_B' initialized with NaNs using the 'apply' method

tqdm.pandas()
df['closest_genus'] = df.progress_apply(find_closest_A_above_B, axis=1)

  closest_index = df.loc[:row.name][df['potential_genus_match'] == True].iloc[-1].name
  8%|▊         | 147427/1764644 [05:44<1:02:58, 428.01it/s]


KeyboardInterrupt: 

In [86]:
import pandas as pd
import numpy as np

# Create example dataframe
df = pd.DataFrame({'A': [1, 4, 3, 6, 8],
                   'is_B': [False, True, False, True, False]})

# Find the row indices where 'is_B' is True

df = vol1_char_df.copy()
genus_indecies = df[df['potential_genus_match'] == True].index

# Initialize an empty list to store the indices of the closest 'is_B' row above for each row in the dataframe
closest_index_list = []

# Loop over each row in the dataframe
for i, row in tqdm(df.iterrows()):
    # Use a binary search to find the closest index above with 'is_B' set to True
    closest_index = genus_indecies.searchsorted(i)
    # If the index is out of range, set it to the last index
    closest_index = closest_index if closest_index < len(genus_indecies) else len(genus_indecies)-1
    # Add the index to the list
    closest_index_list.append(genus_indecies[closest_index])
    
# Add a new column 'closest_A_above_B' to the dataframe
df['closest_genus'] = df.loc[closest_index_list, 'word'].values

# Print the updated dataframe

1764644it [00:41, 42396.73it/s]


        vol_num  page_num  block_num  block_num_absolute  \
0             1         0          0                   0   
1             1         0          0                   0   
2             1         0          0                   0   
3             1         0          0                   0   
4             1         0          0                   0   
...         ...       ...        ...                 ...   
1764639       1       641          5                   7   
1764640       1       641          5                   7   
1764641       1       641          5                   7   
1764642       1       641          5                   7   
1764643       1       641          5                   7   

                                                block_bbox  line_num  \
0        (110.87999725341797, 127.92001342773438, 344.0...         0   
1        (110.87999725341797, 127.92001342773438, 344.0...         0   
2        (110.87999725341797, 127.92001342773438, 344.0...     

In [None]:
str.isnumeric

In [173]:
import numpy as np
import pandas as pd

# create sample data
data = {'A': [1, 4, 3, 6, 8, 5], 'is_B': [False, True, False, True, False, True]}
df = pd.DataFrame(data)

df = vol1_char_df[(vol1_char_df['page_num'].isin(vol1_index)) & ~((vol1_char_df["word"].str.isupper()) & (vol1_char_df["word"].str.len()>2)) & (~vol1_char_df["pruned_word"].str.isnumeric())].copy()

# find index of rows where is_B is True
genus_indecies = df.index[df['potential_genus_match'] == True]

# initialize closest_A_above_B column to NaN
df['closest_genus'] = np.nan

# loop over is_B rows and update closest_A_above_B column
for i in tqdm(genus_indecies):
    prev_rows = df.loc[:i][df['potential_genus_match'] == True]
    if (not prev_rows.empty) and (df.at[i, 'word'].isupper() == False):
        closest_index = prev_rows.index[-1]
        df.at[i, 'closest_genus'] = df.at[closest_index, 'word']

# forward fill the closest_A_above_B column to fill NaN values
#df.loc[(df["word"].str.isupper()) & (df["word"].str.len()>2),'closest_genus'] = -1
df['closest_genus'].ffill(inplace=True)


  prev_rows = df.loc[:i][df['potential_genus_match'] == True]
100%|██████████| 3578/3578 [00:13<00:00, 261.20it/s]


In [172]:
df.loc[:,['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'closest_genus']].drop_duplicates().head(50)

Unnamed: 0,word_num,word,word_bbox,pruned_word,pruned_word_bbox,potential_genus_match,potential_epithet_match,potential_infra_match,closest_genus
1715579,0,A,"(115.44000244140625, 131.1444091796875, 121.17...",A,"(115.44000244140625, 131.1444091796875, 121.17...",False,False,False,
1715580,0,cilicica,"(42.47999954223633, 153.70440673828125, 68.823...",cilicica,"(42.47999954223633, 153.70440673828125, 68.823...",False,True,False,
1715588,1,Ant.,"(71.73958587646484, 153.70440673828125, 88.752...",Ant,"(71.73958587646484, 153.70440673828125, 85.994...",False,False,False,
1715592,2,et,"(92.64039611816406, 153.70440673828125, 99.590...",et,"(92.64039611816406, 153.70440673828125, 99.590...",False,False,False,
1715594,3,Ky,"(102.99232482910156, 153.70440673828125, 114.1...",Ky,"(102.99232482910156, 153.70440673828125, 114.1...",False,False,False,
1715596,0,Acanthophyllum,"(28.559999465942383, 163.30438232421875, 93.46...",Acanthophyllum,"(28.559999465942383, 163.30438232421875, 93.46...",True,False,False,Acanthophyllum
1715610,0,kurdicum,"(42.2400016784668, 172.90438842773438, 78.5574...",kurdicum,"(42.2400016784668, 172.90438842773438, 78.5574...",False,True,False,Acanthophyllum
1715618,1,Boiss.,"(81.9963607788086, 172.90438842773438, 102.457...",Boiss,"(81.9963607788086, 172.90438842773438, 100.229...",False,False,False,Acanthophyllum
1715624,2,et,"(106.75651550292969, 172.90438842773438, 113.4...",et,"(106.75651550292969, 172.90438842773438, 113.4...",False,False,False,Acanthophyllum
1715626,3,Hausskn.,"(116.87992095947266, 172.90438842773438, 151.8...",Hausskn,"(116.87992095947266, 172.90438842773438, 149.2...",False,False,False,Acanthophyllum


In [181]:
import numpy as np
import pandas as pd

# # create sample data
# data = {'A': [1, 4, 3, 6, 8, 5], 'is_B': [False, True, False, True, False, True]}
# df = pd.DataFrame(data)

# df = vol1_char_df[vol1_char_df['page_num'].isin(vol1_index)].copy()

# # find index of rows where is_B is True
epithet_indecies = df.index[df['potential_epithet_match'] == True]

# initialize closest_A_above_B column to NaN
df['closest_epithet'] = np.nan

# loop over is_B rows and update closest_A_above_B column
for i in tqdm(epithet_indecies):
    prev_rows = df.loc[:i][df['potential_epithet_match'] == True]
    if (not prev_rows.empty):
        closest_index = prev_rows.index[-1]
        df.at[i, 'closest_epithet'] = df.at[closest_index, 'word']

# forward fill the closest_A_above_B column to fill NaN values
df.loc[df['potential_genus_match'] == True, 'closest_epithet'] = -1
df['closest_epithet'].ffill(inplace=True)


  prev_rows = df.loc[:i][df['potential_epithet_match'] == True]
100%|██████████| 17824/17824 [01:33<00:00, 190.52it/s]


### index df 

In [359]:
#making sure page_num is in index
#making sure the genus level word is not all uppercase (a family name)
#making sure the pruned_word is not numeric (removing page_number as it's not in order usually)

vol1_index_df = vol1_char_df[(vol1_char_df['page_num'].isin(vol1_index)) &
                             (~((vol1_char_df["word"].str.isupper()) & (vol1_char_df['genus_coord_match'] == True))) & 
                             (~vol1_char_df["pruned_word"].str.isnumeric())].copy()

vol2_index_df = vol2_char_df[(vol2_char_df['page_num'].isin(vol2_index)) &
                             (~((vol2_char_df["word"].str.isupper()) & (vol2_char_df['genus_coord_match'] == True))) & 
                             (~vol2_char_df["pruned_word"].str.isnumeric())].copy()

vol3_index_df = vol3_char_df[(vol3_char_df['page_num'].isin(vol3_index)) &
                             (~((vol3_char_df["word"].str.isupper()) & (vol3_char_df['genus_coord_match'] == True))) & 
                             (~vol3_char_df["pruned_word"].str.isnumeric())].copy()

In [360]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_genus_names(row):
    if row['potential_genus_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan
        
for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_genus'] = vol_index_df.apply(extract_potential_genus_names, axis = 1)
    vol_index_df['closest_genus'].ffill(inplace=True)

In [361]:
#df['closest_epithet_v2'] = np.nan
def extract_potential_epithet_names(row):
    if row['potential_epithet_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_epithet'] = vol_index_df.apply(extract_potential_epithet_names, axis = 1)
    vol_index_df.loc[vol_index_df['potential_genus_match'] == True, 'closest_epithet'] = -1
    vol_index_df['closest_epithet'].ffill(inplace=True)

In [362]:
def extract_potential_infra_type(row):
    if row['potential_infra_match'] == True:
        return row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num'])
    else:
        return np.nan

for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
    vol_index_df['closest_infra_type'] = vol_index_df.apply(extract_potential_infra_type, axis = 1)
    vol_index_df.loc[vol_index_df['potential_epithet_match'] == True, 'closest_infra_type'] = -1
    vol_index_df['closest_infra_type'].ffill(inplace=True)

In [363]:
keep_cols = vol3_index_df.columns.difference(["char_num", "char", "char_origin", "char_bbox"], sort=False).tolist()

vol3_index_test = vol3_index_df.copy().loc[:,keep_cols].drop_duplicates().reset_index()
vol3_index_test.rename(columns={"index": "char_index"}, inplace = True)

In [364]:
for vol_index_df in [vol3_index_test]:#[vol1_index_df, vol2_index_df, vol3_index_df]:
    infra_name_match_indecies = vol_index_df[vol_index_df['potential_infra_match'] == True].index + 1
    vol_index_df['closest_infra_name'] = np.NaN
    vol_index_df.loc[infra_name_match_indecies, 'closest_infra_name'] = vol_index_df.apply(lambda row : row['word'] + "_" + str(row['page_num']) + "_" + str(row['block_num']) + "_" + str(row['line_num']) , axis = 1)
    vol_index_df['potential_epithet_name_match'] = vol_index_df.index.isin(infra_name_match_indecies)
    vol_index_df.loc[vol_index_df['potential_epithet_match'] == True, 'closest_infra_name'] = -1
    vol_index_df['closest_infra_name'].ffill(inplace=True)

In [365]:
vol3_index_test.replace(-1, np.NaN, inplace = True)

In [366]:
vol3_index_test.iloc[:,17:].head(50)

Unnamed: 0,span_origin,span_bbox,word_num,word,word_bbox,pruned_word,pruned_word_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,...,potential_genus_match,potential_epithet_match,infra_coord_match,potential_infra_match,index_page_num,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match
0,"(188.39999389648438, 100.96002197265625)","(188.39999389648438, 87.27102661132812, 229.41...",0,INDEX,"(188.39999389648438, 87.27102661132812, 229.41...",INDEX,"(188.39999389648438, 87.27102661132812, 229.41...",False,False,0,...,False,False,False,False,1,,,,,False
1,"(16.079999923706055, 168.8800048828125)","(16.079999923706055, 160.3209991455078, 62.117...",0,Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",True,False,0,...,True,False,False,False,1,Aaronsohnia_555_1_0,,,,False
2,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",1,Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,,,,False
3,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",2,et,"(100.82400512695312, 159.4029998779297, 107.43...",et,"(100.82400512695312, 159.4029998779297, 107.43...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,,,,False
4,"(62.11798095703125, 168.8800048828125)","(62.11798095703125, 159.4029998779297, 122.569...",3,Eig,"(109.93597412109375, 159.4029998779297, 122.56...",Eig,"(109.93597412109375, 159.4029998779297, 122.56...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,,,,False
5,"(23.040000915527344, 178.48001098632812)","(23.040000915527344, 169.92100524902344, 67.03...",0,factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",factorovskyi,"(23.040000915527344, 169.92100524902344, 67.03...",False,True,0,...,False,True,False,False,1,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
6,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",1,Warb.,"(69.28821563720703, 169.0030059814453, 93.4471...",Warb,"(69.28821563720703, 169.0030059814453, 90.6840...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
7,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",2,et,"(96.83100891113281, 169.0030059814453, 103.471...",et,"(96.83100891113281, 169.0030059814453, 103.471...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
8,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",3,Eig,"(106.63101959228516, 169.0030059814453, 119.26...",Eig,"(106.63101959228516, 169.0030059814453, 119.26...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False
9,"(67.0320053100586, 178.48001098632812)","(67.0320053100586, 169.0030059814453, 145.1301...",4,in,"(122.19778442382812, 169.0030059814453, 129.33...",in,"(122.19778442382812, 169.0030059814453, 129.33...",False,False,0,...,False,False,False,False,1,Aaronsohnia_555_1_0,factorovskyi_555_2_0,,,False


In [367]:
[c for c in vol3_index_test.columns if c.startswith('potential')]

['potential_genus_match',
 'potential_epithet_match',
 'potential_infra_match',
 'potential_epithet_name_match']

In [368]:
vol3_index_test['potential_author_match'] = (vol3_index_test['potential_genus_match'] == False) & \
                                            (vol3_index_test['potential_epithet_match'] == False) & \
                                            (vol3_index_test['potential_infra_match'] == False) & \
                                            (vol3_index_test['potential_epithet_name_match'] == False)

In [369]:
#vol3_index_test[vol3_index_test['potential_infra_match'] == True].index + 1

In [370]:
#vol3_index_test.iloc[:,18:].head(50)
#genus author: genus = "genus" & "closest_epithet = -1 & potential_genus_match = False"
#epithet author: epithet = "epithet" & potential_epithet_match = False & closest_infra = -1 
#closest_infra 

In [371]:
#vol3_index_test = vol3_index_df.copy()
#vol3_index_test['after_potential_infra_match'] = vol3_index_test['potential_infra_match'].shift()

# group_cols = vol3_index_test.columns.difference(["char_num", "char", "char_origin", "char_bbox"], sort=False).tolist()
# vol3_index_test["after_potential_infra_match"] = vol3_index_test.groupby(group_cols)['potential_infra_match'].shift()#.transform('min')

In [372]:
# vol3_index_df[['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match']].drop_duplicates()

In [373]:
# vol3_index_test[['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'after_potential_infra_match']].drop_duplicates().head(50)

In [374]:
# vol3_index_test.loc[vol3_index_test['after_potential_infra_match'] == True, ['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'after_potential_infra_match']].drop_duplicates()

In [375]:
# #df['closest_epithet_v2'] = np.nan
# def extract_potential_infra_names(row):
#     if row['after_potential_infra_match'] == True:
#         return row['word']
#     else:
#         return np.nan

# for vol_index_df in [vol1_index_df, vol2_index_df, vol3_index_df]:
#     vol_index_df['closest_epithet'] = vol_index_df.apply(extract_potential_epithet_names, axis = 1)
#     df.loc[df['potential_genus_match'] == True, 'closest_epithet_v2'] = -1
#     vol_index_df['closest_epithet'].ffill(inplace=True)

In [376]:
# df.loc[:,['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'closest_genus', 'closest_epithet']]#.drop_duplicates().tail(50)

In [377]:
# type(df.at[i, 'closest_genus'])

In [378]:
# closes_genus = df.at[i, 'closest_genus']
# pd.isnull(closes_genus) == False

In [379]:
# df.loc[df['potential_genus_match'] == True, 'closest_epithet'] = np.nan

In [380]:
# df.loc[:,['word_num','word','word_bbox','pruned_word', 'pruned_word_bbox', 'potential_genus_match', 'potential_epithet_match', 'potential_infra_match', 'closest_genus', 'closest_epithet']].drop_duplicates().tail(50)

In [381]:
# all_vol_data = [(vol1_char_df, vol1_index, vol1_doc, "potential_infra_match_vol1"),
#                 (vol2_char_df, vol2_index, vol2_doc, "potential_infra_match_vol2"),
#                 (vol3_char_df, vol3_index, vol3_doc, "potential_infra_match_vol3")][::-1]

# for vol_char_df, vol_index, doc, output_name in all_vol_data: 
#     #for each volume 
#     image_list = []

#     for page_num in tqdm(vol_index):
#         pix_map = doc.get_page_pixmap(page_num,matrix=mat)
#         image = Image.open(io.BytesIO(pix_map.tobytes()))
#         draw = ImageDraw.Draw(image)
        

#         infra_coord_db = vol_char_df[(vol_char_df['page_num'] == page_num) & 
#                                      (vol_char_df['infra_coord_match'] == True)
#                             ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                             ].drop_duplicates()

#         infra_db = vol_char_df[(vol_char_df['page_num'] == page_num) 
#                                 & (vol_char_df['potential_infra_match'] == True)
#                                 ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                                 ].drop_duplicates()

#         with_infra_symbols = vol_char_df[(vol_char_df['page_num'] == page_num) &
#                                          (vol_char_df['infra_coord_match'] == True) & 
#                                          (vol_char_df['word'].apply(has_infra_symbols) == True)
#                                         ].loc[:,~vol_char_df.columns.isin(["char_num", "char", "char_origin",	"char_bbox"])
#                                         ].drop_duplicates()

#         #genus Coord is orange-pinkish, 5
#         for coord in infra_coord_db['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0-5, y0-5, x1+5, y1+5), fill=None, outline=ImageColor.getrgb("#003399"), width=7)

#         for coord in infra_db['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0-3, y0-3, x1+3, y1+3), fill=None, outline=ImageColor.getrgb("#FF7F50"), width=5)
            
#         # #epithet is red, 3
#         for coord in with_infra_symbols['word_bbox'] :
#             x0, y0, x1, y1 = [f*TARGET_DPI/ 72 for f in coord]
#             draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb("#990000"), width=3)

#         image_list.append(image)

#     #save pages of the volume
#     image_list[0].save('../output/local/'+output_name+'.pdf' ,save_all=True, append_images=image_list[1:])

In [382]:
vol3_index_test

Unnamed: 0,char_index,vol_num,page_num,block_num,block_num_absolute,block_bbox,line_num,line_wmode,line_dir,line_bbox,...,potential_epithet_match,infra_coord_match,potential_infra_match,index_page_num,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match,potential_author_match
0,1554022,3,555,0,0,"(188.39999389648438, 87.27102661132812, 229.41...",0,0,"(1.0, 0.0)","(188.39999389648438, 87.27102661132812, 229.41...",...,False,False,False,1,,,,,False,True
1,1554027,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,False,False,False,1,Aaronsohnia_555_1_0,,,,False,False
2,1554038,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,False,False,False,1,Aaronsohnia_555_1_0,,,,False,True
3,1554045,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,False,False,False,1,Aaronsohnia_555_1_0,,,,False,True
4,1554047,3,555,1,1,"(16.079999923706055, 159.4029998779297, 122.56...",0,0,"(1.0, 0.0)","(16.079999923706055, 159.4029998779297, 122.56...",...,False,False,False,1,Aaronsohnia_555_1_0,,,,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8124,1613066,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,True,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,False
8125,1613076,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,False,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True
8126,1613080,3,582,36,54,"(242.8800048828125, 235.3629913330078, 414.587...",0,0,"(1.0, 0.0)","(242.8800048828125, 235.3629913330078, 321.646...",...,False,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True
8127,1613090,3,582,37,55,"(242.8800048828125, 246.01600646972656, 414.28...",0,0,"(1.0, 0.0)","(242.8800048828125, 246.01600646972656, 299.05...",...,True,False,False,28,Zollikoferia_582_35_0,tenuiloba_582_37_0,,,False,False


In [383]:
vol3_index_test.replace(np.NaN, "",inplace = True)

In [384]:
# import pandas as pd

# # create a sample dataframe
# df = pd.DataFrame({
#     'A': [1, 1, 1, 2, 2],
#     'B': [2, 2, 2, 1, 2],
#     'C': [True, False, False, False, True],
#     'D': ['hello', 'hello2', 'world', 'python', 'pandas']
# })
author_grouping = ['closest_genus', 'closest_epithet', 'closest_infra_name']
vol3_index_test['potential_author_match']
# group by 'A' and 'B' columns
groups = vol3_index_test.groupby(author_grouping)

# concatenate 'D' values for each group where 'C' is False
def concatenate(group):
    return group.loc[group['potential_author_match'] == True, 'word'].str.cat(sep=' ')

concatenated = groups.apply(concatenate).reset_index()

# add the concatenated values to the original dataframe
result = vol3_index_test.merge(concatenated[['closest_genus', 'closest_epithet', 'closest_infra_name', 0]], on=['closest_genus', 'closest_epithet', 'closest_infra_name'], how='left').rename(columns={0: 'authors'})

In [385]:
result.iloc[:,20:]

Unnamed: 0,word,word_bbox,pruned_word,pruned_word_bbox,genus_index_pat_match,epithet_index_pat_match,col_num,epithet_coord_match,genus_coord_match,genus_mean_coord,...,infra_coord_match,potential_infra_match,index_page_num,closest_genus,closest_epithet,closest_infra_type,closest_infra_name,potential_epithet_name_match,potential_author_match,authors
0,INDEX,"(188.39999389648438, 87.27102661132812, 229.41...",INDEX,"(188.39999389648438, 87.27102661132812, 229.41...",False,False,0,False,False,16.312258,...,False,False,1,,,,,False,True,INDEX
1,Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",Aaronsohnia,"(16.079999923706055, 160.3209991455078, 62.117...",True,False,0,False,True,16.312258,...,False,False,1,Aaronsohnia_555_1_0,,,,False,False,Warburg et Eig
2,Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",Warburg,"(64.11976623535156, 159.4029998779297, 98.6549...",False,False,0,False,False,16.312258,...,False,False,1,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
3,et,"(100.82400512695312, 159.4029998779297, 107.43...",et,"(100.82400512695312, 159.4029998779297, 107.43...",False,False,0,False,False,16.312258,...,False,False,1,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
4,Eig,"(109.93597412109375, 159.4029998779297, 122.56...",Eig,"(109.93597412109375, 159.4029998779297, 122.56...",False,False,0,False,False,16.312258,...,False,False,1,Aaronsohnia_555_1_0,,,,False,True,Warburg et Eig
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8124,nudicaulis,"(242.8800048828125, 236.28099060058594, 280.00...",nudicaulis,"(242.8800048828125, 236.28099060058594, 280.00...",False,True,1,True,False,234.04114,...,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,False,(L.) Boiss.
8125,(L.),"(282.0889892578125, 235.3629913330078, 296.101...",L,"(285.191650390625, 235.3629913330078, 290.6356...",False,False,1,False,False,234.04114,...,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True,(L.) Boiss.
8126,Boiss.,"(298.3200378417969, 235.3629913330078, 321.646...",Boiss,"(298.3200378417969, 235.3629913330078, 317.378...",False,False,1,False,False,234.04114,...,False,False,28,Zollikoferia_582_35_0,nudicaulis_582_36_0,,,False,True,(L.) Boiss.
8127,tenuiloba,"(242.8800048828125, 246.83200073242188, 272.80...",tenuiloba,"(242.8800048828125, 246.83200073242188, 272.80...",False,True,1,True,False,234.04114,...,False,False,28,Zollikoferia_582_35_0,tenuiloba_582_37_0,,,False,False,Boiss.


In [405]:
def fix_words(word):
    head, sep, tail = word.partition('_')
    return head 

result['closest_genus'] = result['closest_genus'].apply(fix_words)
result['closest_epithet'] = result['closest_epithet'].apply(fix_words)
result['closest_infra_type'] = result['closest_infra_type'].apply(fix_words)
result['closest_infra_name'] = result['closest_infra_name'].apply(fix_words)

In [406]:
result_prune_authors = result[(result['potential_genus_match'] == True) |
                              (result['potential_epithet_match'] == True) |
                              (result['potential_epithet_name_match'] == True)]

In [407]:
[c for c in vol3_index_test.columns if c.startswith('closest')]

['closest_genus',
 'closest_epithet',
 'closest_infra_type',
 'closest_infra_name']

In [408]:
simplified_result = result_prune_authors[['closest_genus',
                                          'closest_epithet',
                                          'closest_infra_type',
                                          'closest_infra_name',
                                          'authors']]

In [409]:
simplified_result.to_csv('vol3_index_output.csv')

In [410]:
non_italics_simplified_result = result_prune_authors.loc[(result_prune_authors['span_flags'] != 6),
                                                     ['closest_genus',
                                                      'closest_epithet',
                                                      'closest_infra_type',
                                                      'closest_infra_name',
                                                      'authors']]

non_italics_simplified_result.to_csv('vol3_nonitalics_index_output.csv')

In [411]:
text = 'closest_infra_name'
head, sep, tail = text.partition('_')

In [412]:
result_prune_authors.columns

Index(['char_index', 'vol_num', 'page_num', 'block_num', 'block_num_absolute',
       'block_bbox', 'line_num', 'line_wmode', 'line_dir', 'line_bbox',
       'span_num', 'span_size', 'span_flags', 'span_font', 'span_color',
       'span_ascender', 'span_descender', 'span_origin', 'span_bbox',
       'word_num', 'word', 'word_bbox', 'pruned_word', 'pruned_word_bbox',
       'genus_index_pat_match', 'epithet_index_pat_match', 'col_num',
       'epithet_coord_match', 'genus_coord_match', 'genus_mean_coord',
       'epithet_mean_coord', 'potential_genus_match',
       'potential_epithet_match', 'infra_coord_match', 'potential_infra_match',
       'index_page_num', 'closest_genus', 'closest_epithet',
       'closest_infra_type', 'closest_infra_name',
       'potential_epithet_name_match', 'potential_author_match', 'authors'],
      dtype='object')