In [66]:
import re

from PyPDF2 import PdfReader
import pandas as pd
fern_species = pd.read_excel("Species list.xlsx")


In [47]:
from tqdm.auto import tqdm

conditional_newline = lambda sp: '\s*\n?'.join(sp.split(' ')).replace('.', '\.').replace('(', '\(').replace(')', '\)').replace('[', '\[').replace(']', '\]')
sp_descriptions = pd.DataFrame()
for fam in tqdm(fern_species.Family.unique()):
    text = ""
    with open(f"descriptions/{fam}.pdf", "rb") as pdf_file:
        read_pdf = PdfReader(pdf_file)
        for i, page in enumerate(read_pdf.pages):
            if i < 5:
                continue
            tx = page.extract_text()
            if tx.split("\n")[0].startswith('References'):
                # print(f"Breaking at page {i}")
                break
            text += page.extract_text()
    text = re.sub(r'[^\S\r\n]+', ' ', text)

    fam_species = fern_species[fern_species.Family.eq(fam)].Species.unique()
    sp_dict = {}
    for i, species in enumerate(fam_species):
        species = species
        if (cnt := len(re.findall(conditional_newline(species), text))) == 0:
            print(f"{fam} | {species} wrongly appearing (count: {cnt})")
            foo
        # start = text.find(species)
        # start is the first character of the species name assuming to find it in the text
        # with potentially some '\n' characters in between
        start = re.search(conditional_newline(species), text).start()
        end = [re.search(conditional_newline(sp), text).start() for sp in fam_species]
        end = min([e for e in end if e and e > start] + [len(text)])
        name_sp = species.split(')')[0] + ')'
        if name_sp not in sp_dict.keys():
            sp_dict[name_sp] = text[start:end]
        else:
            sp_dict[name_sp] += '\n' + text[start:end]

    sp_descriptions = pd.concat([sp_descriptions, pd.DataFrame(sp_dict.items(), columns=['Species', 'Description']).assign(Family=fam)], ignore_index=True)


def desc_to_sections(desc):
    """
    Split the description into sections:
    "Etymology", "Vernacular name", "Distribution", "Altitudinal range", "Biostatus", "Habitat", "First record ", "Recognition ", "Cytology", "Hybridisation", "Notes "
    not all the sections will be present in all descriptions, if present, they start with the section name followed by a colon, up to the next named section or the end of the description
    """
    sections = ["Etymology", "Vernacular name", "Distribution", "Altitudinal range", "Biostatus", "Habitat", "First record", "Recognition", "Cytology", "Hybridisation", "Notes"]
    section_dict = {}
    for i, section in enumerate(sections):
        if i == len(sections) - 1:
            section_dict[section] = desc
        else:
            start = re.search(fr"{section}s?:", desc)
            start = start.end() if start else -1
            if start == -1:
                continue
            end = re.search(fr"{sections[i+1]}s?:", desc)
            end = end.start() if end else -1

            this_desc = desc[start:end].strip()

            section_dict[section] = re.sub(r'\n+', ' ', this_desc.strip())
    return pd.Series(section_dict)

extra = sp_descriptions.Description.apply(desc_to_sections)  
sp_descriptions = pd.concat([sp_descriptions, extra], axis=1).drop(columns='Description')

  0%|          | 0/32 [00:00<?, ?it/s]

In [50]:
sp_descriptions.to_excel("fern_descriptions.xlsx", index=False)

# Extract features

In [67]:
sp_descriptions = pd.read_excel("fern_descriptions.xlsx").set_index('Species')

In [68]:
extra_features_df = pd.read_excel("../Words before and after traits_v2.xlsx", sheet_name="FernPrecedingWords")
extra_features_df = extra_features_df[
    :extra_features_df[extra_features_df.Rhizome.str.startswith('Sentences that talk a', na=False)].index[0]
    ].map(lambda s:s.lower().strip() if type(s) == str else s)

extra_features = extra_features_df.to_dict('list')
extra_features = {k.title().replace(' ', ''): [x for x in v if str(x) != 'nan'] for k, v in extra_features.items()}

In [131]:
unit = '[m|c|d|μ]?m'
number = r"(\d+\.?\d*)"
full_regex = rf"(({number}\s?-\s?)?{number})?({number}\s?-\s?)?{number}\s*{unit}(-wide)?(-long)?"

In [161]:
tmp = []
anomalies = set() # species with anomalies

def extract_features(i, feats:list):
	features = {}
	for feat in feats:
		if len(feat) < 1:
			continue
		feat = feat.replace(',', ' ')
		measures = re.finditer(full_regex, feat)
		for measure in measures:
			found = None
			for key, values in extra_features.items():
				if key.startswith('Habit'):
					# categorical feature, append all the values present in feat
					features['Habit'] = ';'.join([v for v in values if v.lower() in re.split(r'[^\w]', feat.lower())])
					continue
				if key.startswith('Venation'):
					# categorical feature, store feat as it is
					# features['Venation'] = feat
					continue

				feat = feat[:-1] if feat[-1] in ['.', ';'] else feat # remove any of .; at the end of the sentence
				matched_word = list(re.finditer( r'\b('+ '|'.join([w for w in set(values)]) + r')\b', feat.lower()))
				
				if any(matched_word):
				# "*Secondary* pinnae decreasing very gradually in length along each ~primary~ pinna to the distal end..." is supposed to be Secondary
					if key == 'PrimaryPinnae' and 'secondary' in feat.lower() and features.get('PrimaryPinnae') is not None:
						continue
					matched_word = [w for w in matched_word if w.span()[0] < measure.span()[0]]
					if not any(matched_word):
						continue
					matched_word = sorted(matched_word, key=lambda word: word.span()[1] - measure.span()[0])[0]
					this_distance = abs(matched_word.span()[1] - measure.span()[0])
					# this_distance = abs(word_match_position - measure_position)

					if key == 'Stipe':
						hair_or_scale_position = list(re.finditer(r'(hair|scale)', feat.lower()))
						# C1: Se nella frase dello stipe trovi le keyword "hair", "hairs", "scale", "scales", i valori dopo queste keyword vanno ignorati.
						if any(hair_or_scale_position) and hair_or_scale_position[0].start() < measure.start():
							continue
					if found:
						if (any([w in feat.lower() for w in ['achene', 'cypsela']]) and {key, found[0]} == {'FruitSize', 'SeedSize'}) or\
						   (any([w in feat.lower() for w in ['stigma-style']]) and {key, found[0]} == {'StigmaSize', 'StyleSize'}) or\
						   (any([w in feat.lower() for w in ['floret']]) and {key, found[0]} == {'RayFloretsSize', 'DiskFloretSize'}):
							pass
							# print(f'OK>> Multiple features found ({found}, {key}) in "{feat}"')
						else:
							if this_distance >= found[1]:
								continue
							features[found[0]].remove(found[2])
							anomalies.add(i)

					found = (key, this_distance, measure.group())
					if key in features:
						features[key].append(measure.group())
					else:
						features[key] = [measure.group()]
	return pd.Series(features)



In [162]:
def string_preprocessing(s):
	s = s.replace('\xa0', ' ').replace('×', 'x').replace('–', '-').replace('·', '.') # remove non-breaking space and replace multiplication sign with x
	s = re.sub(r'(?<=xcluding)\s+[\w-]+', ' ', s) # remove each word following "excluding" (Mericarps (excluding style) 2.5-3.0 mm should point to "Mericarps")
	s = re.sub(fr'-?\(-?{number}-?\)-?', '', s) # remove all parentesis surrounding a number and the number inside (e.g. (-1.5) --> **)
	# s = s.replace('--', '-').replace('-.', '-').replace('..', '.')
	# s = s.replace('(', '').replace(')', '')
	# s = re.sub(r'\s(c|ca|o)\.', ' foo ', s) # remove all ' c.'
	s = re.sub(rf'(?<=\d)\s+(?={unit})', '', s) # remove all spaces before measures (mm, cm, dm, m, these strings only if padded by a space)
	s = re.sub('m long;?', 'm-long', s) # remove space between measure and "long" (e.g. 2 mm long --> 2 mm-long)
	s = re.sub('m wide;?', 'm-wide', s) # remove space between measure and "wide" (e.g. 2 mm wide --> 2 mm-wide)
	s = re.sub(r'\s*-\s*', '-', s) # remove spaces around hyphens
	s = re.sub(r'(?<=\d)\s*\.(?=\d)', '.', s) # remove spaces before dot if followed and preceded by a number
	s = re.sub(r'(?<=\s)\.(?=\d)', '0.', s) # add a 0 before a dot if it is preceded by a space and a "not number" and followed by a number (e.g. foo .5 --> foo 0.5)
	# s = re.sub(r'(?<=[\dm])\s*x\s*(?=\d+)', 'x', s) # remove spaces around x in formulas
	s = re.sub(rf'(;\s*)({full_regex})', r' \2', s) # point to any ';' preceding a measure (full_regex) and remove it, without removing the measure
	return s

In [158]:
delimiter = r'(?<! c)\.(?!\d)'
tmp = sp_descriptions.Etymology.fillna('') + ' ' + sp_descriptions['Vernacular name'].fillna('')
feats = tmp[tmp.index.str.contains('Paesia scaberula')].apply(string_preprocessing).str.split(delimiter).iloc[0]
# feats = tmp[tmp.index.str.contains('Cyathea milnei Hook')].apply(string_preprocessing).str.split(delimiter).iloc[0]
[f for f in feats if len(f) > 18]

['From the Latin scaberulus (roughish), a reference to the slightly scabrid stipe and rachis',
 ' hard fern; lace fern; mātā; mātātā; pig fern; ring fern; scented fern; sticky pig fern Rhizomes long-creeping, 1-4mm diameter, with stipes arising 10-165mm apart; bearing chestnut-brown, multicellular, non-glandular hairs up to 4mm-long',
 ' Fronds 150-1175mm-long',
 ' Stipes 25-560mm-long, 1-2.5mm diameter, red-brown or chestnut-brown, bearing red-brown, multicellular, non-glandular hairs up to 5mm-long',
 ' Rachises chestnut-brown, becoming yellow-brown distally, zigzag, bearing red-brown, multicellular, non-glandular hairs up to 2mm-long, and very short yellowish glandular hairs up to 0.1mm-long (becoming white with age)',
 ' Laminae 3-4-pinnate, ovate or elliptic, tapering to a short pinnatisect apex, 100-780mm-long, 45-430mm-wide, yellow-green on both surfaces, coriaceous; abaxial surface of costae bearing red-brown, multicellular, non-glandular hairs up to 1mm-long costae and both la

In [163]:
features = sp_descriptions.Etymology.fillna('') + ' ' + sp_descriptions['Vernacular name'].fillna('')
# delimiter any of '. ', '.\n' and similar (all the dots not followed by numbers and not preceded by ' c', enforced with a negative lookbehind)
delimiter = r'(?<! c)\.(?!\d)'
features = features.apply(string_preprocessing).str.split(delimiter).reset_index().apply(lambda x: extract_features(x.Species, x[0]), axis=1)
features = features.map(lambda x: '; '.join(x) if not isinstance(x, float) else x)
features.index = sp_descriptions.index
# features[features.notna().sum(axis=1) > 0].to_csv('processed_features_fern.csv')

In [7]:
features_old = pd.read_csv('processed_features_fern.csv').set_index('Species')

In [159]:
features[features.index.str.contains('Paesia scaberula ')]

Unnamed: 0_level_0,FertileFronds,Frond,Habit,Indusia,Laminae,PrimaryPinnae,QuaternaryPinnae,Rachis,Rhizome,SecondaryPinnae,Sori,Sporangia,Spores,SterileFronds,Stipe,TertiaryPinnae
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Paesia scaberula (A.Rich.),,150-1175mm-long; 25-265mm-long; 11-90mm-wide,,,100-780mm-long; 45-430mm-wide; 1mm-long,9-60mm-long,,2mm-long; 0.1mm-long,1-4mm,4-23mm-wide,,,,,10-165mm; 4mm-long; 25-560mm-long; 1-2.5mm; 5m...,3-13mm-long; 1-4mm-wide


In [160]:
features_old[features_old.index.str.contains('Paesia scaberula ')]

Unnamed: 0_level_0,FertileFronds,Frond,Habit**,Laminae,PrimaryPinnae,Rachis,Rhizome,SecondaryPinnae,Sori,Sporangia,Spores,SterileFronds,Stipe,TertiaryPinnae,Venation*
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Paesia scaberula (A.Rich.),,150-1175mm-long; 25-560mm-long; 1-2.5mm; 5mm-l...,1-4mm; 10-165mm; 4mm-long,100-780mm-long; 45-430mm-wide; 1mm-long,25-265mm-long; 11-90mm-wide,,,9-60mm-long; 4-23mm-wide,,,,,,3-13mm-long; 1-4mm-wide,
