In [265]:
import io
import fitz
import re
from PIL import Image, ImageDraw, ImageFont, ImageColor
import operator
import pandas as pd 
import numpy as np
import math
from tqdm import tqdm

# ROI Plotting 

In [266]:
def plot_genus_blocks(page_df, draw, color = '#6c899e', w = 3):
    try:
        genus_list = page_df['draw_genus'].unique()
    except:
        print("no GENUS found")
        return 

    for g in genus_list:
        temp_df = page_df[(page_df['draw_genus'] == g)]
        g_x0 = temp_df['x0'].min()
        g_y0 = temp_df['y0'].min()
        g_x1 = temp_df['x1'].max()
        g_y1 = temp_df['y1'].max()

        draw.rectangle((g_x0, g_y0, g_x1, g_y1), fill=None, outline=ImageColor.getrgb(color), width = w)
        
def plot_epithet_blocks(page_df, draw, color = '#660066', w = 3):
    try:
        epithet_list = page_df['draw_epithet'].unique()
    except:
        print("no EPITHET found")
        return 
    
    for e in epithet_list:
        temp_df = page_df[(page_df['draw_epithet'] == e)]
        e_x0 = temp_df['x0'].min()
        e_y0 = temp_df['y0'].min()
        e_x1 = temp_df['x1'].max()
        e_y1 = temp_df['y1'].max()

        draw.rectangle((e_x0, e_y0, e_x1, e_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_author_blocks(page_df, draw, color = '#a3a3a3', w = 1):
    try:
        author_list = page_df['draw_author'].unique()
    except:
        print("no AUTHOR found")
        return 

    for a in author_list:
        temp_df = page_df[(page_df['draw_author'] == a)]
        e_x0 = temp_df['x0'].min()
        e_y0 = temp_df['y0'].min()
        e_x1 = temp_df['x1'].max()
        e_y1 = temp_df['y1'].max()

        draw.rectangle((e_x0, e_y0, e_x1, e_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_infra_blocks(page_df, draw, color = '#ff6289', w = 1):
    try:
        infra_list = page_df['draw_infra'].unique()
    except:
        print("no INFRA Spp. found")
        return 

    for infra_spp in infra_list:
        temp_df = page_df[(page_df['draw_infra'] == infra_spp)]
        e_x0 = temp_df['x0'].min()
        e_y0 = temp_df['y0'].min()
        e_x1 = temp_df['x1'].max()
        e_y1 = temp_df['y1'].max()

        draw.rectangle((e_x0, e_y0, e_x1, e_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

# regex based boolean functions

In [267]:
def valid(word):
    """
    valid words are words that are:
    - at least 2 characters
        - unless it's x (symbol for hybrid)
    """
    return (not bool(re.search(r"[0-9]+[,.]?", word))) and (len(word) > 1 or word == 'x' or word == 'X' or word == '×' or word == r'\u00D7')

In [268]:
def is_genus(word):
    """
    A word in the index might be a genus if it satisfies the following properties:
    - letters: french alphabet + at most one hyphen (which is not first or last letter)
        - first letter upper case
        - all but first lowecase 
    in regex: ^[A-ZÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]{1}[a-zàâäèéêëîïôœùûüÿç]*[-]?[a-zàâäèéêëîïôœùûüÿç]+$ #ignoring strict beggining and end cause of noise
        * based on the current expression it'd also be at least 2 letters long
    """
    regex = r"[A-ZÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\u00D7]{1}[a-zàâäèéêëîïôœùûüÿç]*[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    return re.search(regex, word)
    

def is_epithet(word):
    """
    A word in the index might be an epithet if it satisfies the following properties:
    - letters: french alphabet + at most one hyphen (which is not first or last letter)
        - all letters lowecase 
    in regex: ^[a-zàâäèéêëîïôœùûüÿç]+[-]?[a-zàâäèéêëîïôœùûüÿç]+$ #ignoring strict beggining and end cause of noise 
        * based on the current expression it'd also be at least 2 letters long
    """
    regex = r"[a-zàâäèéêëîïôœùûüÿç\u00D7]+[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    return re.search(regex, word)
    
def is_hybrid(word):
    regex = r"^(([Xx\u00D7])|([Xx\u00D7]\.))$"
    return re.search(regex, word)

def is_infra(word):
    regex = r"^(var\.)|(subsp\.)"
    return re.search(regex, word)

# Pre-processing

In [269]:
def preprocessing(pages, page_num, indent_err = 15):
    
    #initiate dataframe
    page_df = pd.DataFrame(pages[page_num].get_text_words(), columns =['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'])
    
    #add page number to dataframe
    page_df['page_num'] = np.array([page_num]*page_df.shape[0])
    #updating coordinates to represent target DPI
    page_df['x0'], page_df['y0'], page_df['x1'], page_df['y1'] = page_df['in_x0']*TARGET_DPI/ 72, page_df['in_y0']*TARGET_DPI/ 72, page_df['in_x1']*TARGET_DPI/ 72, page_df['in_y1']*TARGET_DPI/ 72
    #get x corner coordinates 
    x_min = page_df['x0'].min()
    x_max = page_df['x1'].max()

    #invalid words dataframe -- for error checking
    pruned_words_df = page_df[~page_df["word"].apply(valid)].reset_index()
    #prune out invalid words (based on function valid)
    page_df = page_df[page_df["word"].apply(valid)].reset_index()
    
    indent_groups = []
    blocks = page_df['block_no'].unique()
    for b in blocks:
        lines = page_df[page_df['block_no'] == b]['line_no'].unique()
        for l in lines:
            #reset word_no values (useful for cases where word that was originally at 0th index was pruned out)
            cond = (page_df['line_no'] == l) & (page_df['block_no'] == b)
            num_words = len(page_df[cond]['word_no'])
            page_df.loc[cond, 'word_no'] = np.arange(num_words).astype(int) #this is slowww
            #set column number (0 or 1)
            x_0 = page_df[cond]['x0'].min()
            #THIS DOESN'T WORK AAAA -- issue was with line no thing
            if not np.isnan(x_0):
                page_df.loc[cond, 'col_no'] = np.array([int(x_0 > ((x_min + x_max) / 2))]*num_words).astype(int)

                #initiate indent groups -- only first word should get an indent_group value 
                new_group = True
                for g_i in range(len(indent_groups)):
                    g = indent_groups[g_i]
                    g_arr = np.array(g)
                    if x_0 <= np.mean(g_arr) + indent_err and x_0 >= np.mean(g_arr) - indent_err:
                        g.append(x_0)
                        new_group = False
                        page_df.loc[cond, 'indent_group'] = np.array([g_i]*num_words).astype(int)
                if new_group:
                    indent_groups.append([x_0])
                    g_i = len(indent_groups) - 1
                    page_df.loc[cond, 'indent_group'] = np.array([g_i]*num_words).astype(int)


    #return updated page_df, pruned_words_df, indent groups
    return page_df.reset_index(), pruned_words_df, indent_groups

#https://stackoverflow.com/questions/53468558/adding-image-to-pandas-dataframe

# Finding indentations associated with genus, epithet, infra

In [270]:
types = ['genus', 'epithet', 'infra', 'author', 'misc.']
def n_leftmost_indent(df, n):
    """return a tuple with at most 3 elements each element itself is a tuple containing indent group, mean, group len"""
    indent_groups = [(g, df[(df['indent_group'] == g) & (df['word_no'] == 0)]['x0'].mean(), len(df[(df['indent_group'] == g) & (df['word_no'] == 0)]['x0'])) for g in df['indent_group'].unique()]
    indent_groups.sort(key = lambda x : x[1])
    return indent_groups[:n]

In [271]:
def get_genusEpithetInfra_indent(col_df):
    leftmost_3_indents = n_leftmost_indent(col_df, 3) 
    min_gap = 25
    max_gap = 50

    # possibly not specific enough
    # first identifying indent based don distance from one another only
    if len(leftmost_3_indents) == 3:
        if leftmost_3_indents[0][1] < max_gap:
            leftmost_3_indents = leftmost_3_indents[1:]
        elif ((leftmost_3_indents[1][1] - leftmost_3_indents[0][1]) > max_gap or \
            (leftmost_3_indents[1][1] - leftmost_3_indents[0][1]) < min_gap): #comparing first two (if satisfied last two will be checked in next if block)
            leftmost_3_indents = [max(leftmost_3_indents[1:], key = lambda x : x[2])] + [leftmost_3_indents[2]]
        elif (leftmost_3_indents[2][1] - leftmost_3_indents[1][1]) > max_gap or \
            (leftmost_3_indents[2][1] - leftmost_3_indents[1][1]) < min_gap: #comparing last two
            leftmost_3_indents = [leftmost_3_indents[0]] + [max(leftmost_3_indents[1:], key = lambda x : x[2])]

    if len(leftmost_3_indents) == 2:
        if leftmost_3_indents[0][1] < max_gap:
            leftmost_3_indents = leftmost_3_indents[1]
        elif (leftmost_3_indents[1][1] - leftmost_3_indents[0][1]) > max_gap or (leftmost_3_indents[1][1] - leftmost_3_indents[0][1]) < min_gap:
            leftmost_3_indents = [max(leftmost_3_indents, key = lambda x : x[2])]

    has_genus, has_epithet, has_infra = False, False, False
    genus_indent, epithet_indent, infra_indent = -1, -1, -1
    if len(leftmost_3_indents) == 3:
        has_genus, has_epithet, has_infra = True, True, True
        genus_indent, epithet_indent, infra_indent = [el[0] for el in leftmost_3_indents]
    elif len(leftmost_3_indents) == 2:
        if col_df[col_df['indent_group'] == leftmost_3_indents[1][0]]['word'].apply(is_infra).any():
            has_genus, has_epithet, has_infra = False, True, True
            epithet_indent, infra_indent = [el[0] for el in leftmost_3_indents]
        else:
            has_genus, has_epithet, has_infra = True, True, False
            genus_indent, epithet_indent = [el[0] for el in leftmost_3_indents]
    elif len(leftmost_3_indents) == 2: 
        has_genus, has_epithet, has_infra = False, True, False
        epithet_indent = leftmost_3_indents[0][0]

    return genus_indent, epithet_indent, infra_indent, leftmost_3_indents

# Processing column dataframes

In [272]:
def process_col(col_df, genus, epithet, draw_genus, draw_epithet, draw_infra = np.NaN):
    genus_indent, epithet_indent, infra_indent, indent_3_left = get_genusEpithetInfra_indent(col_df)
    blocks = col_df['block_no'].unique()
    start_word_cond = -1 
    author = ''
    #draw_infra = np.NaN
    
    for b in blocks:
        lines = col_df[col_df['block_no'] == b]['line_no'].unique()
        for l in lines:
            cond = (col_df['line_no'] == l) & (col_df['block_no'] == b)
            words = col_df[cond]['word_no'].unique()
            process_hybrid = False
            process_infra = False
            
            col_df = col_df.copy()
            for w in words:
                word_cond = (col_df['line_no'] == l) & (col_df['block_no'] == b) & (col_df['word_no'] == w) 
                word = col_df[word_cond]['word'].item()
                #print(word)
            
                if w == 0:
                    infra = ''
                    if author != '':
                        col_df.loc[start_word_cond, 'author'] = author
                        author = ''
                    
                    start_word_cond = word_cond
                    start_l = l 
                    start_b = b 

                    indent_group = col_df[word_cond]['indent_group'].item()
                    
                    if is_hybrid(word):
                        process_hybrid = True
                        misc = word
                        author = ''
                        #col_df.loc[start_word_cond, 'misc.'] = misc
                    #now only gotta say INDENT AND satisfies these paterns
                    #print(indent_group, genus_indent)
                    else: 
                        if indent_group == genus_indent:
                            if not ''.join(e for e in word if e.isalpha()).isupper():
                                genus = word
                                misc = ''
                                author = ''
                                infra = ''
                                epithet = ''
                                draw_genus = genus
                                col_df.loc[start_word_cond, 'genus'] = genus
                                col_df.loc[start_word_cond, 'taxon rank'] = 'genus'
                                if not is_genus(word):
                                    col_df.loc[start_word_cond, 'error_check'] = True
                                
                            else: 
                                genus = ''
                                misc = ''
                                author = ''
                                infra = ''
                                epithet = ''
                                draw_genus = ''
                        elif indent_group == epithet_indent:
                            epithet = word
                            misc = ''
                            infra = ''
                            author = ''
                            col_df.loc[start_word_cond, 'genus'] = genus
                            col_df.loc[start_word_cond, 'epithet'] = epithet
                            col_df.loc[start_word_cond, 'taxon rank'] = 'species'
                            if not is_epithet(word):
                                col_df.loc[start_word_cond, 'error_check'] = True
                            draw_epithet = str(genus) + '_' + str(epithet) +'_' + str(start_b) + '_' + str(start_l)
                            
                        elif indent_group == infra_indent:
                            process_infra = True
                            misc = word
                            author = ''
                            #col_df.loc[start_word_cond, 'misc.'] = misc
                            if not (is_infra(word) or is_hybrid(word)):
                                col_df.loc[start_word_cond, 'error_check'] = True
                    
                elif process_infra:
                    start_word_cond = word_cond
                    start_l = l 
                    start_b = b 
                    infra = word 
                    col_df.loc[start_word_cond, 'genus'] = genus
                    col_df.loc[start_word_cond, 'epithet'] = epithet
                    col_df.loc[start_word_cond, 'infra'] = infra
                    #col_df.loc[start_word_cond, 'misc.'] = misc
                    col_df.loc[start_word_cond, 'taxon rank'] = misc
                    draw_infra = str(infra) + '_'+str(start_b)+'_'+str(start_l)
                    process_infra = False
                    
                elif process_hybrid:
                    start_word_cond = word_cond
                    start_l = l 
                    start_b = b 
                    if indent_group == genus_indent:
                        genus = word
                        epithet = ''
                        infra = ''
                        author = ''
                        draw_genus = genus
                        col_df.loc[start_word_cond, 'genus'] = genus
                        col_df.loc[start_word_cond, 'taxon rank'] = 'genus - hybrid'
                        if not is_genus(word):
                            col_df.loc[start_word_cond, 'error_check'] = True
                            
                    elif indent_group == epithet_indent:
                        epithet = word
                        author = ''
                        infra = ''
                        col_df.loc[start_word_cond, 'genus'] = genus
                        col_df.loc[start_word_cond, 'epithet'] = epithet
                        col_df.loc[start_word_cond, 'taxon rank'] = 'species - hybrid'
                        draw_epithet = str(genus) + '_' + str(epithet) +'_' + str(start_b) + '_' + str(start_l)
                        if not is_epithet(word):
                            col_df.loc[start_word_cond, 'error_check'] = True
                    elif indent_group == infra_indent:
                        infra = word
                        col_df.loc[start_word_cond, 'genus'] = genus
                        col_df.loc[start_word_cond, 'epithet'] = epithet
                        col_df.loc[start_word_cond, 'infra'] = infra
                        col_df.loc[start_word_cond, 'taxon rank'] = 'hybrid'

                    #col_df.loc[start_word_cond, 'misc.'] = 'x'
                    process_hybrid = False
                else:
                    author = author + word + ' '
                    col_df.loc[word_cond, 'draw_author'] = 'author_'+str(start_b)+'_'+str(start_l)
                
                if genus:
                    col_df.loc[word_cond, 'draw_genus'] = draw_genus
                if epithet:
                    col_df.loc[word_cond, 'draw_epithet'] = draw_epithet
                if infra: 
                    col_df.loc[word_cond, 'draw_infra'] = draw_infra

    #Last author
    if author != '':
        col_df.loc[start_word_cond, 'author'] = author
    
    return col_df, genus, epithet, draw_genus, draw_epithet

# the Results

In [280]:
#pre-processing 
def preprocessing_pages(pages, index):
    pre_df_dict = {}
    pruned_dict = {}

    for page_num in tqdm(index):
        #print(page_num, type(page_num))
        page_df, pruned_df, indent_group = preprocessing(pages, page_num)
        pre_df_dict[page_num] = page_df
        pruned_dict[page_num] = pruned_df
    return pre_df_dict, pruned_dict

In [281]:
#processing columns 
def processing_pages(pre_df_dict, index):
    genus = np.NaN
    epithet = np.NaN
    draw_genus = np.NaN
    draw_epithet = np.NaN
    processed_df_dict = {}

    for page_num in tqdm(index):
        #print(page_num)
        col_df_list = []
        #process the pre-processed dfs
        page_df = pre_df_dict[page_num]

        #processing each column
        for c in page_df['col_no'].unique():
            col_df = page_df[page_df['col_no'] == c]
            col_df, genus, epithet, draw_genus, draw_epithet = process_col(col_df, genus, epithet, draw_genus, draw_epithet)
            col_df_list.append(col_df)

        page_df = pd.concat(col_df_list, axis = 0)
        processed_df_dict[page_num] = page_df
    
    return processed_df_dict

In [289]:
#drawing ROI boxes in PDF
def plot_ROI(processed_df_dict, index):
    result_ims = []
    for page_num in tqdm(index):
        page_df = processed_df_dict[page_num]
        
        pix_map = doc.get_page_pixmap(page_num,matrix=mat)
        image = Image.open(io.BytesIO(pix_map.tobytes()))
        draw = ImageDraw.Draw(image)
        
        plot_genus_blocks(col_df, draw)
        plot_epithet_blocks(col_df, draw)
        plot_author_blocks(col_df, draw)
        plot_infra_blocks(col_df, draw)

        result_ims.append(image)
    return result_ims

In [296]:
#Saving pdfs
def save_outputs(processed_df_dict, index, output_name, make_csv = True, make_ROI_pdf = True, make_html = True, pruned = True):
    df_list = [processed_df_dict[df_index] for df_index in processed_df_dict]
    df = pd.concat(df_list, axis = 0)
    print("merged dataframe")

    pruned_df = df[(~df['genus'].isnull())]
    pruned_df = pruned_df[["page_num", "genus", "epithet", "infra" ,"author", "taxon rank"]]
    print("pruned dataframe")

    if make_csv:
        df.to_csv('../output/index/CSV/'+ output_name +'.csv', index = False)
        print("made .csv file")
        if pruned:
            pruned_df.to_csv('../output/index/CSV/'+ output_name +'_pruned.csv', index = False)

    if make_html: 
        df.to_html('../output/index/'+ output_name +'.html')
        print("made .hrml file")
        if pruned: 
            pruned_df.to_html('../output/index/'+ output_name +'_pruned.html')

    if make_ROI_pdf:
        print("making .pdf file of ROIs")
        result_ims = plot_ROI(processed_df_dict, index)
        result_ims[0].save('../output/index/PDF/'+ output_name +'_ROI.pdf',save_all=True, append_images=result_ims[1:])

In [299]:
#some global variables
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

def process_index_pages(pdf_dir, index, output_name, make_csv = True, make_ROI_pdf = True, make_html = True, pruned = True):
    #importing all pages
    doc = fitz.open(pdf_dir)
    pages = [doc[i] for i in range(doc.pageCount)]
    #print(index)
    print("initiating dataframe for", output_name)
    pre_df_dict, pruned_dict = preprocessing_pages(pages, index)
    print("processing dataframe for", output_name)
    processed_df_dict = processing_pages(pre_df_dict, index)
    print("saving results for", output_name)
    save_outputs(processed_df_dict, index, output_name, make_csv = True, make_ROI_pdf = True, make_html = True, pruned = True)

In [300]:
vol3_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf"
vol3_index = list(range(555, 583))
vol3_output = 'vol3_index'
process_index_pages(vol3_dir, vol3_index, vol3_output)

initiating dataframe for vol3_index


100%|██████████| 28/28 [00:09<00:00,  2.96it/s]


processing dataframe for vol3_index


100%|██████████| 28/28 [00:23<00:00,  1.17it/s]


saving results for vol3_index
merged dataframe
pruned dataframe
made .csv file
made .hrml file
making .pdf file of ROIs


100%|██████████| 28/28 [00:08<00:00,  3.27it/s]


In [301]:
vol2_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf"
vol2_index = list(range(703, 725))
vol2_output = 'vol2_index'
process_index_pages(vol2_dir, vol2_index, vol2_output)

initiating dataframe for vol2_index


100%|██████████| 22/22 [00:06<00:00,  3.15it/s]


processing dataframe for vol2_index


100%|██████████| 22/22 [00:18<00:00,  1.20it/s]

saving results for vol2_index
merged dataframe





KeyError: 'genus'

In [302]:
vol1_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf"
vol1_index = range(616, 639)
vol1_output = 'vol1_index'
process_index_pages(vol1_dir, vol1_index, vol1_output)

initiating dataframe for vol1_index


100%|██████████| 23/23 [00:07<00:00,  2.94it/s]


processing dataframe for vol1_index


100%|██████████| 23/23 [00:16<00:00,  1.44it/s]


saving results for vol1_index
merged dataframe
pruned dataframe
made .csv file
made .hrml file
making .pdf file of ROIs


  0%|          | 0/23 [00:00<?, ?it/s]


IndexError: page not in document