In [96]:
import io
import fitz
import re
from PIL import Image, ImageDraw, ImageFont, ImageColor
import operator
import pandas as pd 
import numpy as np
import math
from tqdm import tqdm

# ROI Plotting

In [102]:
def plot_genus_blocks(page_df, draw, color = '#6c899e', w = 3):
    genus_list = page_df['genus'].unique()
    for c in page_df['col'].unique():
        for g in genus_list:
            temp_df = page_df[(page_df['col'] == c) & (page_df['genus'] == g)]
            g_x0 = temp_df['x0'].min()
            g_y0 = temp_df['y0'].min()
            g_x1 = temp_df['x1'].max()
            g_y1 = temp_df['y1'].max()

            draw.rectangle((g_x0, g_y0, g_x1, g_y1), fill=None, outline=ImageColor.getrgb(color), width = w)
        
def plot_epitet_blocks(page_df, draw, color = '#54081f', w = 2):
    epitet_list = page_df['epitet'].unique()
    for c in page_df['col'].unique():
        for s in epitet_list:
            temp_df = page_df[(page_df['col'] == c) & (page_df['epitet'] == s)]
            for g in temp_df['genus'].unique():
                    
                temp_g_df = temp_df[(temp_df['genus']) == g]
                s_x0 = temp_g_df['x0'].min()
                s_y0 = temp_g_df['y0'].min()
                s_x1 = temp_g_df['x1'].max()
                s_y1 = temp_g_df['y1'].max()

                draw.rectangle((s_x0, s_y0, s_x1, s_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_blocks(page_df, draw, color = '#4a3757', w = 2):
    block_list = page_df['block_no'].unique()
    for i in block_list:
        df_groupped = page_df[page_df['block_no'] == i]
        x0_arr = df_groupped['x0'].min()
        y0_arr = df_groupped['y0'].min()
        x1_arr = df_groupped['x1'].max()
        y1_arr = df_groupped['y1'].max()

        draw.rectangle((x0_arr, y0_arr, x1_arr, y1_arr), fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_author_blocks(page_df, draw, color = '#a3a3a3', w = 1):
    epitet_list = page_df['epitet'].unique()
    for c in page_df['col'].unique():
        for s in epitet_list:
            temp_df = page_df[(page_df['col'] == c) & (page_df['epitet'] == s)]
            for g in temp_df['genus'].unique():
                temp_g_df = temp_df[(temp_df['genus'] == g) & (temp_df['word'] != s)]
                a_x0 = temp_g_df['x0'].min()
                a_y0 = temp_g_df['y0'].min()
                a_x1 = temp_g_df['x1'].max()
                a_y1 = temp_g_df['y1'].max()
                
                draw.rectangle((a_x0, a_y0, a_x1, a_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_sub_blocks(page_df, draw, color = '#ff6289', w = 1):
    sub_list = page_df[(~page_df['sub'].isnull())]['sub'].unique()
    for c in page_df['col'].unique():
        for s in sub_list:
            temp_df = page_df[(page_df['col'] == c) & (page_df['sub'] == s)]
            for b in temp_df['block_no'].unique():
                temp_b_df = temp_df[(temp_df['block_no'] == b)]
                for l in temp_b_df['line_no'].unique():
                    temp_l_df = temp_b_df[temp_b_df['line_no'] == l]
                    a_x0 = temp_l_df['x0'].min()
                    a_y0 = temp_l_df['y0'].min()
                    a_x1 = temp_l_df['x1'].max()
                    a_y1 = temp_l_df['y1'].max()
                
                    draw.rectangle((a_x0, a_y0, a_x1, a_y1), fill=None, outline=ImageColor.getrgb(color), width = w)

# Vol3 Index

In [103]:
pdf_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf"
doc = fitz.open(pdf_dir)
pages = [doc[i] for i in range(doc.pageCount)]
index = list(range(555, 583))

TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

indent_groups = []
indent_err = 15

In [104]:
def process_df(page_num, genus = np.NaN, genus_block_no = np.NaN):
    def initiate_groups(row):
        #return row
        x_0, y_0, x_1, y_1 = row['x0'], row['y0'], row['x1'], row['y1']
        word_no = row['word_no']
        word = row['word']
        alphnum_word = ''.join(e for e in word if e.isalnum())
        alph_word = ''.join(e for e in word if e.isalpha())
        
        if row['line_no'] == 0 and (word.lower() == 'nouvelle' or word.lower() == 'flore'):
            return 
        if (word_no == 0) and (not alphnum_word.isnumeric()) and alph_word:
            #word_no == 0 => the word is a Family, Genus, Species
            new_group = True
            for g in indent_groups:
                g_arr = np.array(g)[:,0]
                if (x_0, y_0, x_1, y_1)[0] <= np.mean(g_arr) + indent_err and (x_0, y_0, x_1, y_1)[0] >= np.mean(g_arr) - indent_err:
                    g.append((x_0, y_0, x_1, y_1))
                    new_group = False
            if new_group:
                indent_groups.append([(x_0, y_0, x_1, y_1)])

    def get_indent_group(row):
        x_0, y_0, x_1, y_1 = row['x0'], row['y0'], row['x1'], row['y1']
        word_no = row['word_no']
        word = row['word']
        alphnum_word = ''.join(e for e in word if e.isalnum())
        
        if (len(word) > 1) and ((not alphnum_word.isnumeric()) and (word_no == 0)): 
            for g_i, g in enumerate(indent_groups):
                g_arr = np.array(g)[:,0]
                if (x_0, y_0, x_1, y_1)[0] <= np.mean(g_arr) + indent_err and (x_0, y_0, x_1, y_1)[0] >= np.mean(g_arr) - indent_err:
                    return g_i

    def get_col(row): 
        x_0, y_0, x_1, y_1 = row['x0'], row['y0'], row['x1'], row['y1']
        return int(x_0 > ((x_min + x_max) / 2))

    def n_leftmost_indent(indent_groups, n):
        indent_means = []
        for g_i, g in enumerate(indent_groups):
            g_arr = np.array(g)[:,0]
            indent_means.append([g_i, np.mean(g_arr)])

        n_smallest = indent_means.sort(key = lambda x : x[1])[:n] #n smallest
        return [el[0] for el in n_smallest]

    def process_col(row):
        nonlocal genus
        nonlocal x_0, y_0, x_1, y_1 
        nonlocal epitet
        nonlocal sub
        nonlocal genus_block_no
        word_no = row['word_no']
        block_no = row['block_no']
        line_no = row['line_no']
        word = row['word']
        row['epitet'] = np.NaN
        row['genus'] = np.NaN
        row['sub'] = np.NaN
        
        alph_word = ''.join(e for e in word if e.isalpha())

        if line_no == 0 and (word.lower() == 'nouvelle' or word.lower() == 'flore'):
            return row

        if (not (word.isupper() and word_no == 0)) and len(word) > 1  and alph_word:
            if word_no == 0: #epitet, or genus
                x_0, y_0, x_1, y_1 = row['x0'], row['y0'], row['x1'], row['y1']

            alphnum_word = ''.join(e for e in word if e.isalnum())
            if (not alphnum_word.isnumeric()): 
                if  x_0 <= g_x0 + indent_err and x_0 >= g_x0 - indent_err:
                    if word_no == 0:
                        genus = word
                        row['genus'] = genus
                        genus_block_no = row['block_no']
                    elif word_no != 0 and line_no == 0 and block_no == genus_block_no: #info on same line as genus
                        #epitet = word
                        #print(genus, alphnum_word)
                        row['epitet'] = np.NaN
                        row['genus'] = genus
                        #row['author']"""
                        row['sub'] = np.NaN
                    else:
                        row['epitet'] = epitet
                        row['genus'] = genus
                        row['sub'] = np.NaN
                elif x_0 <= s_x0 + indent_err and x_0 >= s_x0 - indent_err:
                    if word_no == 0:
                        epitet = word
                        sub = ''
                    row['epitet'] = epitet
                    row['genus'] = genus
                    row['sub'] = np.NaN
                elif x_0 <= sub_x0 + indent_err and x_0 >= sub_x0 - indent_err:
                    #word_no == 0 and (word == 'var.' or word == 'subsp.' or word == 'x' or word == 'X'): #assuming at least genus line exists
                    if word_no == 0:
                        sub = word
                    row['epitet'] = epitet
                    row['genus'] = genus
                    #print(word_no)
                    row['sub'] = sub + " " + genus + " " + epitet

        return row

    page_df = pd.DataFrame(pages[page_num].get_text_words(), columns =['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'])
    page_df['page_num'] = np.array([page_num]*page_df.shape[0])
    page_df['x0'], page_df['y0'], page_df['x1'], page_df['y1'] = page_df['in_x0']*TARGET_DPI/ 72, page_df['in_y0']*TARGET_DPI/ 72, page_df['in_x1']*TARGET_DPI/ 72, page_df['in_y1']*TARGET_DPI/ 72
    indent_groups = []
    indent_err = 15
    sub = ''
    x_min = page_df['x0'].min()
    y_min = page_df['y0'].min()
    x_max = page_df['x1'].max()
    y_max = page_df['y1'].max()
    
    page_df.apply(initiate_groups, axis = 1)
    page_df['indent_group'] = page_df.apply(get_indent_group, axis = 1)

    page_df['col'] = page_df.apply(get_col, axis = 1)
    
    x_0, y_0, x_1, y_1, epitet = np.NaN, np.NaN, np.NaN, np.NaN, np.NaN
    for c in page_df['col'].unique():
        
        # what if just one largest? what would that even mean? Hm ... (like what if it's all)
        col_df = page_df[(page_df['col'] == c)]
        col_indent_groups =  list(col_df[~col_df['indent_group'].isnull()]['indent_group'].unique())
        #n_leftmost = n_leftmost_indent(col_indent_groups, 2)
        #col_df[col_df['indent_group'] == s_indent]
        #if len(n_leftmost) == 2:

        s_x0, g_x0, sub_x0 = float('inf'), float('inf'), float('inf')
        s_indent, g_indent, sub_indent = -1, -1, -1

        for g in col_indent_groups:
            mean_x0 = col_df[col_df['indent_group'] == g]['x0'].mean()
            if g_x0 > mean_x0:
                s_indent, g_indent = g_indent, g
                s_x0, g_x0 = g_x0, mean_x0 
            elif s_x0 > mean_x0: #and g_x0 <= mean_x0
                s_indent = g
                s_x0 = mean_x0
            elif sub_x0 > mean_x0:#and g_x0 <= mean_x0 and s_x0 <= mean_x0
                sub_indent = g
                sub_x0 = mean_x0


        #if col_df[col_df['indent_group'] == s_indent]['word'].str.contains('var.|subsp.').any():
        
        #print(genus, s_x0, g_x0)


        species_indent_df = col_df[col_df['indent_group'] == s_indent]
        if (species_indent_df['word'] == 'var.').any() or (species_indent_df['word'] == 'subsp.').any() or (species_indent_df['word'] == 'x').any() or (species_indent_df['word'] == 'X').any():
            s_x0, g_x0, sub_x0 = g_x0, float('inf'), s_x0
            s_indent, g_indent, sub_indent = g_indent, -1, s_indent
            #print("no genus in a column of page", page_num)

        if (s_x0  < g_x0): #the swap thing doesn't account for sub_indent level just yet ... 
            s_x0, g_x0 = g_x0, s_x0
            s_indent, g_indent = g_indent, s_indent

        if s_indent == -1: #subspecies must not exist in this case so won't worry about it 
            s_x0, g_x0 = g_x0, float('inf')
            s_indent, g_indent = g_indent, -1

        col_df = col_df.apply(process_col, axis = 1)
        page_df.loc[col_df.index, ['genus', 'epitet', 'sub']] = col_df.loc[col_df.index, ['genus', 'epitet', 'sub']]
        #print("g_x0, s_x0, g_indent, s_indent:", g_x0, s_x0, g_indent, s_indent)
    #print("genus", genus)

    return page_df, genus

In [105]:
def get_author(page_df):
    #pruned_df = page_df[(~page_df['genus'].isnull())].reset_index()
    epitet_names = page_df[~page_df['epitet'].isnull()]['epitet'].unique()
    genus_names = page_df[~page_df['genus'].isnull()]['genus'].unique()
    sub_names = page_df[~page_df['sub'].isnull()]['sub'].unique()
    for i in range(len(page_df['x0'])):
        word = page_df.loc[i, 'word']
        sub = page_df.loc[i, 'sub']
        line_no = page_df.loc[i, 'line_no']
        block_no = page_df.loc[i, 'block_no']
        if word in epitet_names:
            #print(word, i)
            s = page_df.loc[i, 'word']
            g = page_df.loc[i, 'genus']
            sub_df = page_df[(page_df['genus'] == g) & (page_df['epitet'] == s) & (page_df['word'] != s)]
            
            merged = sub_df.groupby('epitet')['word'].agg(' '.join).reset_index()
            
            concat_str = np.NaN
            if len(merged.index):
                concat_str = merged['word'].item()
            
            page_df.loc[i, 'author'] = concat_str
            page_df.loc[i, 'sub_type'] = np.NaN
            page_df.loc[i, 'sub_name'] = np.NaN

        if word in genus_names:
            g = page_df.loc[i, 'genus']
            g_block_no = page_df.loc[i, 'block_no']
            sub_df = page_df[(page_df['genus'] == g) & (page_df['block_no'] == g_block_no) & (page_df['word_no'] != 0) & (page_df['line_no'] == 0)]
            merged = sub_df.groupby('genus')['word'].agg(' '.join).reset_index()
            concat_str = np.NaN
            if len(merged.index):
                #print(g, word)
                concat_str = merged['word'].item()
            page_df.loc[i, 'author'] = concat_str
            page_df.loc[i, 'sub_type'] = np.NaN
            page_df.loc[i, 'sub_name'] = np.NaN
        if sub in sub_names:
            #print(sub.split(' '))
            sub_type, g, s = sub.split(' ')#[0]
            #s = page_df.loc[i, 'word']
            #g = page_df.loc[i, 'genus']
            sub_df = page_df[(page_df['genus'] == g) & (page_df['epitet'] == s) & \
                            (page_df['word'] != sub_type) & (page_df['sub'] == sub) & (page_df['line_no'] == line_no) & (page_df['block_no'] == block_no)]
            merged = sub_df.groupby('sub')['word'].agg(' '.join).reset_index()
            concat_str = np.NaN
            name, aut = np.NaN, np.NaN
            if len(merged.index):
                #print(g, word)
                concat_str = merged['word'].item()
                #print(concat_str)
                #print(concat_str.split(' ', 1))
                if len(concat_str.split(' ', 1)) == 2:
                    name, aut = concat_str.split(' ', 1)
                elif len(concat_str.split(' ', 1)) == 1:
                    name = concat_str
            page_df.loc[i, 'sub_type'] = sub_type
            page_df.loc[i, 'sub_name'] = name
            page_df.loc[i, 'sub_author'] = aut


In [107]:
genus = np.NaN
result_ims = []
df_list = []

for page_num in tqdm(index):
    page_df, genus = process_df(page_num, genus)
    get_author(page_df)
    
    pix_map = doc.get_page_pixmap(page_num,matrix=mat)
    image = Image.open(io.BytesIO(pix_map.tobytes()))
    draw = ImageDraw.Draw(image)

    plot_genus_blocks(page_df, draw, w = 4)
    plot_epitet_blocks(page_df, draw, w = 3)
    plot_author_blocks(page_df, draw, w = 2)
    plot_sub_blocks(page_df, draw, w = 1)

    df_list.append(page_df)
    result_ims.append(image)
    
result_ims[0].save('../output/index/PDF/vol3_withSub_ROI.pdf',save_all=True, append_images=result_ims[1:])


100%|██████████| 28/28 [00:50<00:00,  1.80s/it]


In [108]:
df = pd.concat(df_list, axis = 0)
df.to_csv('../output/index/CSV/vol3_sub.csv')

## TESTING + OLDER CODE

In [94]:
df[(df['page_num'] == 555) & ~df['sub'].isnull()]

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,...,y1,indent_group,col,genus,epitet,sub,author,sub_type,sub_name,sub_author
44,34.560001,310.602997,49.255562,322.608978,var.,7,0,0,555,144.000006,...,1344.204076,3.0,0,Acantholimon,libanoticum,var. Acantholimon libanoticum,,var.,ulicinum,(Willd.) Boiss.
45,51.979561,311.520996,83.479561,322.509979,ulicinum,7,0,1,555,216.581504,...,1343.79158,,0,Acantholimon,libanoticum,var. Acantholimon libanoticum,(Schultes) Boiss.,var.,ulicinum,(Willd.) Boiss.
46,87.474213,310.602997,116.923759,322.608978,(Willd.),7,0,2,555,364.475886,...,1344.204076,,0,Acantholimon,libanoticum,var. Acantholimon libanoticum,,var.,ulicinum,(Willd.) Boiss.
47,120.010582,310.602997,141.510406,322.608978,Boiss.,7,0,3,555,500.044092,...,1344.204076,,0,Acantholimon,libanoticum,var. Acantholimon libanoticum,,var.,ulicinum,(Willd.) Boiss.
65,35.040001,403.483032,58.603977,415.489014,subsp.,13,0,0,555,146.000004,...,1731.204224,3.0,0,Achillea,aleppica,subsp. Achillea aleppica,,subsp.,aleppica,
66,62.510517,403.483032,94.09745,415.489014,aleppica,13,0,1,555,260.460488,...,1731.204224,,0,Achillea,aleppica,subsp. Achillea aleppica,DC. subsp.,subsp.,aleppica,
111,35.759998,566.682983,59.724937,578.688965,subsp.,15,0,0,555,148.999993,...,2411.20402,3.0,0,Achillea,odorata,subsp. Achillea odorata,,subsp.,kotschyi,(Boiss.) Bornm.
112,62.082127,567.600952,91.280205,578.589966,kotschyi,15,0,1,555,258.675528,...,2410.791524,,0,Achillea,odorata,subsp. Achillea odorata,Boiss.,subsp.,kotschyi,(Boiss.) Bornm.
113,93.818565,566.682983,120.712914,578.688965,(Boiss.),15,0,2,555,390.910689,...,2411.20402,,0,Achillea,odorata,subsp. Achillea odorata,,subsp.,kotschyi,(Boiss.) Bornm.
114,123.612633,566.682983,152.078033,578.688965,Bornm.,15,0,3,555,515.052636,...,2411.20402,,0,Achillea,odorata,subsp. Achillea odorata,,subsp.,kotschyi,(Boiss.) Bornm.


In [33]:
sub_df = df[~df['sub'].isnull()]

In [35]:
sub_df[(sub_df['word'] != sub_df['sub'])]

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,y0,x1,y1,indent_group,col,genus,epitet,sub,author


In [36]:
df

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,y0,x1,y1,indent_group,col,genus,epitet,sub,author
0,188.399994,87.271027,229.411743,104.613022,INDEX,0,0,0,555,784.999975,363.629278,955.882263,435.887591,0.0,0,,,,
1,16.080000,160.320999,62.117981,171.309998,Aaronsohnia,1,0,0,555,67.000000,668.004163,258.824921,713.791656,1.0,0,Aaronsohnia,,,Warburg et Eig
2,64.119766,159.403000,98.654915,171.409012,Warburg,1,0,1,555,267.165693,664.179166,411.062145,714.204216,,0,Aaronsohnia,,,
3,100.824005,159.403000,107.433250,171.409012,et,1,0,2,555,420.100021,664.179166,447.638543,714.204216,,0,Aaronsohnia,,,
4,109.935974,159.403000,122.569366,171.409012,Eig,1,0,3,555,458.066559,664.179166,510.705694,714.204216,,0,Aaronsohnia,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,298.320038,235.362991,319.418884,247.369003,Boiss.,36,0,2,582,1243.000158,980.679131,1330.912018,1030.704180,,1,Zollikoferia,nudicaulis,,
130,401.028564,235.362991,414.587616,247.369003,535,36,1,0,582,1670.952352,980.679131,1727.448400,1030.704180,,1,,,,
131,242.880005,246.832001,272.801727,256.600006,tenuiloba,37,0,0,582,1012.000020,1028.466670,1136.673864,1069.166692,5.0,1,Zollikoferia,tenuiloba,,Boiss.
132,276.240021,246.016006,297.073578,256.687988,Boiss.,37,0,1,582,1151.000086,1025.066694,1237.806575,1069.533285,,1,Zollikoferia,tenuiloba,,


In [12]:
df[(~df['sub'].isnull()) & (df['page_num'] == 555)]

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,y0,x1,y1,indent_group,col,genus,epitet,sub,author
44,34.560001,310.602997,49.255562,322.608978,var.,7,0,0,555,144.000006,1294.179153,205.231508,1344.204076,3.0,0,Acantholimon,libanoticum,var.,
65,35.040001,403.483032,58.603977,415.489014,subsp.,13,0,0,555,146.000004,1681.179301,244.183238,1731.204224,3.0,0,Achillea,aleppica,subsp.,
111,35.759998,566.682983,59.724937,578.688965,subsp.,15,0,0,555,148.999993,2361.179097,248.853906,2411.20402,3.0,0,Achillea,odorata,subsp.,
115,36.0,576.28302,50.69556,588.289001,var.,15,1,0,555,150.0,2401.17925,211.231502,2451.204173,3.0,0,Achillea,odorata,var.,
202,239.039993,275.563019,253.588593,287.569,var.,29,0,0,555,995.999972,1148.179245,1056.619136,1198.204168,7.0,1,Ajuga,chia,var.,
205,239.039993,285.163025,253.37915,297.169006,var.,29,1,0,555,995.999972,1188.17927,1055.74646,1238.204193,7.0,1,Ajuga,chia,var.,
260,239.520004,469.963013,263.084015,481.968994,subsp.,33,0,0,555,998.000018,1958.17922,1096.183395,2008.204142,7.0,1,Alkanna,orientalis,subsp.,
264,239.759995,480.04303,254.423996,492.049011,var.,33,1,0,555,998.999977,2000.179291,1060.099983,2050.204213,7.0,1,Alkanna,orientalis,var.,


In [15]:
df[(df['block_no'] == 7) & (df['page_num'] == 555)]

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,y0,x1,y1,indent_group,col,genus,epitet,sub,author
44,34.560001,310.602997,49.255562,322.608978,var.,7,0,0,555,144.000006,1294.179153,205.231508,1344.204076,3.0,0,Acantholimon,libanoticum,var.,
45,51.979561,311.520996,83.479561,322.509979,ulicinum,7,0,1,555,216.581504,1298.00415,347.831504,1343.79158,,0,,,,
46,87.474213,310.602997,116.923759,322.608978,(Willd.),7,0,2,555,364.475886,1294.179153,487.182331,1344.204076,,0,,,,
47,120.010582,310.602997,141.510406,322.608978,Boiss.,7,0,3,555,500.044092,1294.179153,589.626694,1344.204076,,0,,,,


In [None]:
for i in sub_df['page_num']:
    for b in sub_df[sub_df['page_num'] == i]['block_no']:
        for l in sub_df[(sub_df['page_num'] == i) & (sub_df['block_no'] == b)]['line_no']:
            str_sub = ''
            for w in sub_df[(sub_df['page_num'] == i) & (sub_df['block_no'] == b) & (sub_df['line_no'] == l)]['word_no']:
                word = sub_df[(sub_df['page_num'] == i) & (sub_df['block_no'] == b) & (sub_df['line_no'] == l) & (sub_df['word_no'] == w)]['word']
                if w == 0 and (word == 'var.' or word == 'subsp.' or word == 'x' or word == 'X'): #assuming at least genus line exists
                    sub = word
                else: 
                    sub = word
                    str_sub += word + ' '


In [13]:
df[(df['sub'] == 'x') | (df['sub'] == 'X')]

Unnamed: 0,in_x0,in_y0,in_x1,in_y1,word,block_no,line_no,word_no,page_num,x0,y0,x1,y1,indent_group,col,genus,epitet,sub,author


In [None]:
def get_sub(page_df):
    #pruned_df = page_df[(~page_df['genus'].isnull())].reset_index()
    epitet_names = page_df[~page_df['epitet'].isnull()]['epitet'].unique()
    genus_names = page_df[~page_df['genus'].isnull()]['genus'].unique()
    sub_names = ['var.', 'subsp.', 'x', 'X'] 

    for i in range(len(page_df['x0'])):
        word = page_df.loc[i, 'word']
        if word in sub_names:
            #print(word, i)
            s = page_df.loc[i, 'word']
            g = page_df.loc[i, 'genus']
            sub_df = page_df[(page_df['genus'] == g) & (page_df['epitet'] == s) & (page_df['word'] != s)]
            
            merged = sub_df.groupby('epitet')['word'].agg(' '.join).reset_index()
            
            concat_str = np.NaN
            if len(merged.index):
                concat_str = merged['word'].item()
            
            page_df.loc[i, 'author'] = concat_str

        if word in genus_names:
            g = page_df.loc[i, 'genus']
            g_block_no = page_df.loc[i, 'block_no']
            sub_df = page_df[(page_df['genus'] == g) & (page_df['block_no'] == g_block_no) & (page_df['word_no'] != 0) & (page_df['line_no'] == 0)]
            merged = sub_df.groupby('genus')['word'].agg(' '.join).reset_index()
            concat_str = np.NaN
            if len(merged.index):
                #print(g, word)
                concat_str = merged['word'].item()
            page_df.loc[i, 'author'] = concat_str

In [10]:
author_pruned_df = df[(~df['author'].isnull()) | (df['word'] == df['genus'])]
simple_genus_species_author = author_pruned_df[["genus", "epitet", "author"]]
simple_genus_species_author.to_csv('../output/index/CSV/vol3_simplified.csv', index = False)