In [2]:
import re

from PyPDF2 import PdfReader
import pandas as pd
fern_species = pd.read_excel("Species list.xlsx")


In [47]:
from tqdm.auto import tqdm

conditional_newline = lambda sp: '\s*\n?'.join(sp.split(' ')).replace('.', '\.').replace('(', '\(').replace(')', '\)').replace('[', '\[').replace(']', '\]')
sp_descriptions = pd.DataFrame()
for fam in tqdm(fern_species.Family.unique()):
    text = ""
    with open(f"descriptions/{fam}.pdf", "rb") as pdf_file:
        read_pdf = PdfReader(pdf_file)
        for i, page in enumerate(read_pdf.pages):
            if i < 5:
                continue
            tx = page.extract_text()
            if tx.split("\n")[0].startswith('References'):
                # print(f"Breaking at page {i}")
                break
            text += page.extract_text()
    text = re.sub(r'[^\S\r\n]+', ' ', text)

    fam_species = fern_species[fern_species.Family.eq(fam)].Species.unique()
    sp_dict = {}
    for i, species in enumerate(fam_species):
        species = species
        if (cnt := len(re.findall(conditional_newline(species), text))) == 0:
            print(f"{fam} | {species} wrongly appearing (count: {cnt})")
            foo
        # start = text.find(species)
        # start is the first character of the species name assuming to find it in the text
        # with potentially some '\n' characters in between
        start = re.search(conditional_newline(species), text).start()
        end = [re.search(conditional_newline(sp), text).start() for sp in fam_species]
        end = min([e for e in end if e and e > start] + [len(text)])
        name_sp = species.split(')')[0] + ')'
        if name_sp not in sp_dict.keys():
            sp_dict[name_sp] = text[start:end]
        else:
            sp_dict[name_sp] += '\n' + text[start:end]

    sp_descriptions = pd.concat([sp_descriptions, pd.DataFrame(sp_dict.items(), columns=['Species', 'Description']).assign(Family=fam)], ignore_index=True)


def desc_to_sections(desc):
    """
    Split the description into sections:
    "Etymology", "Vernacular name", "Distribution", "Altitudinal range", "Biostatus", "Habitat", "First record ", "Recognition ", "Cytology", "Hybridisation", "Notes "
    not all the sections will be present in all descriptions, if present, they start with the section name followed by a colon, up to the next named section or the end of the description
    """
    sections = ["Etymology", "Vernacular name", "Distribution", "Altitudinal range", "Biostatus", "Habitat", "First record", "Recognition", "Cytology", "Hybridisation", "Notes"]
    section_dict = {}
    for i, section in enumerate(sections):
        if i == len(sections) - 1:
            section_dict[section] = desc
        else:
            start = re.search(fr"{section}s?:", desc)
            start = start.end() if start else -1
            if start == -1:
                continue
            end = re.search(fr"{sections[i+1]}s?:", desc)
            end = end.start() if end else -1

            this_desc = desc[start:end].strip()

            section_dict[section] = re.sub(r'\n+', ' ', this_desc.strip())
    return pd.Series(section_dict)

extra = sp_descriptions.Description.apply(desc_to_sections)  
sp_descriptions = pd.concat([sp_descriptions, extra], axis=1).drop(columns='Description')

  0%|          | 0/32 [00:00<?, ?it/s]

In [50]:
sp_descriptions.to_excel("fern_descriptions.xlsx", index=False)

# Extract features

In [27]:
sp_descriptions = pd.read_excel("fern_descriptions.xlsx").set_index('Species')

In [4]:
extra_features_df = pd.read_excel("../Words before and after traits_v2.xlsx", sheet_name="FernPrecedingWords")
extra_features_df = extra_features_df[
    :extra_features_df[extra_features_df.Rhizome.str.startswith('Sentences that talk a', na=False)].index[0]
    ].map(lambda s:s.lower().strip() if type(s) == str else s)

extra_features = extra_features_df.to_dict('list')
extra_features = {k.title().replace(' ', ''): [x for x in v if str(x) != 'nan'] for k, v in extra_features.items()}

In [42]:
unit = '[m|c|d|µ]?m'
number = r"(\d+\.?\d*)"
full_regex = rf"(({number}\s?-\s?)?{number})?({number}\s?-\s?)?{number}\s*{unit}(-wide)?(-long)?"

In [43]:
print(full_regex)

(((\d+\.?\d*)\s?-\s?)?(\d+\.?\d*))?((\d+\.?\d*)\s?-\s?)?(\d+\.?\d*)\s*[m|c|d|µ]?m(-wide)?(-long)?


In [44]:
tmp = []
anomalies = set() # species with anomalies

def extract_features(i, feats:list):
	features = {}
	for feat in feats:
		if len(feat) < 1:
			continue
		feat = feat.replace(',', ' ')
		measures = re.finditer(full_regex, feat)
		for measure in measures:
			found = None
			for key, values in extra_features.items():

				feat = feat[:-1] if feat[-1] in ['.', ';'] else feat # remove any of .; at the end of the sentence
				matched_word = list(re.finditer( r'\b('+ '|'.join([w for w in set(values)]) + r')\b', feat.lower()))
				
				if any(matched_word):	
					if key != 'Stature': # TODO: Caso up to 3m, petiole 2mm???
						matched_word = [w for w in matched_word if w.span()[0] < measure.span()[0]]
						if not any(matched_word):
							continue # if not stature and measure appears before the word, skip
					matched_word = sorted(matched_word, key=lambda word: word.span()[1] - measure.span()[0])[0]

					this_distance = abs(matched_word.span()[1] - measure.span()[0])
					# this_distance = abs(word_match_position - measure_position)
					if found:
						if (any([w in feat.lower() for w in ['achene', 'cypsela']]) and {key, found[0]} == {'FruitSize', 'SeedSize'}) or\
						   (any([w in feat.lower() for w in ['stigma-style']]) and {key, found[0]} == {'StigmaSize', 'StyleSize'}) or\
						   (any([w in feat.lower() for w in ['floret']]) and {key, found[0]} == {'RayFloretsSize', 'DiskFloretSize'}):
							pass
							# print(f'OK>> Multiple features found ({found}, {key}) in "{feat}"')
						else:
							if this_distance >= found[1]:
								continue
							else:
								# print(found, key, matched_word, measure)
								features[found[0]].remove(found[2])
							anomalies.add(i)
					found = (key, this_distance, measure.group())
					
					if key in features:
						features[key].append(measure.group())
					else:
						features[key] = [measure.group()]
	return pd.Series(features)



In [45]:
def string_preprocessing(s):
	s = s.replace('\xa0', ' ').replace('×', 'x').replace('–', '-').replace('·', '.') # remove non-breaking space and replace multiplication sign with x
	s = re.sub(r'(?<=xcluding)\s+[\w-]+', ' ', s) # remove each word following "excluding" (Mericarps (excluding style) 2.5-3.0 mm should point to "Mericarps")
	s = re.sub(fr'-?\(-?{number}-?\)-?', '', s) # remove all parentesis surrounding a number and the number inside (e.g. (-1.5) --> **)
	# s = s.replace('--', '-').replace('-.', '-').replace('..', '.')
	# s = s.replace('(', '').replace(')', '')
	# s = re.sub(r'\s(c|ca|o)\.', ' foo ', s) # remove all ' c.'
	s = re.sub(r'(?<=\d)\s+(?=[cmd]?m)', '', s) # remove all spaces before measures (mm, cm, dm, m, these strings only if padded by a space)
	s = re.sub('m long;?', 'm-long', s) # remove space between measure and "long" (e.g. 2 mm long --> 2 mm-long)
	s = re.sub('m wide;?', 'm-wide', s) # remove space between measure and "wide" (e.g. 2 mm wide --> 2 mm-wide)
	s = re.sub(r'\s*-\s*', '-', s) # remove spaces around hyphens
	s = re.sub(r'(?<=\d)\s*\.(?=\d)', '.', s) # remove spaces before dot if followed and preceded by a number
	s = re.sub(r'(?<=\s)\.(?=\d)', '0.', s) # add a 0 before a dot if it is preceded by a space and a "not number" and followed by a number (e.g. foo .5 --> foo 0.5)
	# s = re.sub(r'(?<=[\dm])\s*x\s*(?=\d+)', 'x', s) # remove spaces around x in formulas
	s = re.sub(rf'(;\s*)({full_regex})', r' \2', s) # point to any ';' preceding a measure (full_regex) and remove it, without removing the measure
	# now all measures are supposed to have no spaces between number and unit and spaces around them

	# s = re.sub(rf'(?<=\d{unit})(\s*long,?\s*)(?={number}{unit})', r'x', s) # remove any 'long' after a measure ("2 mm long X 3 mm wide" --> "2 mm x 3 mm wide")
	# s = re.sub(rf'(?<=\d)([m|c|d]m|m(?!m))(?!x)', r'\1 ', s) # fix situation in which a measure is not followed by a space, in the case, add that space
	# s = re.sub(r'(?<![\d\sx\.-])(\d)', r' \1', s) # fix the situation in which a measure (the whole number and measure) is not preceded by a space. In the case, add a space before the measure
	# s = re.sub(rf'(?<=\s)-(?=\d)', '', s) # remove '-' at the beginning of a measure (e.g. -1.5 --> 1.5)
	# s = re.sub(r'(?<=\d\.\d+)(\.\d?)', '', s) # fix the error in which there is a doubled dot in a number (e.g. 1.5.2), in the case, remove the second dot and the eventual numbers after it
	# s = re.sub(r'(?<=[a-ln-z])-(?=\d)', ' ', s) # remove all '-' preceded by a letter (different from m) and followed by a number (e.g. to-250mm --> to 250mm)
	# s = re.sub(r'(?<![a-z])(l|I)(?=[\s\.-]|\d)', '1', s) # replace all 'l' or "I" characters which should be '1' (e.g. l.5 --> 1.5). This should be followed by a space, a dot, a hyphen, or a number and not preceded by a letter
	return s

In [46]:
features = sp_descriptions.Etymology.fillna('') + ' ' + sp_descriptions['Vernacular name'].fillna('')
features = features.str.split(r'\s\.').reset_index()
features.apply(lambda x: extract_features(x.Species, x[0]), axis=1)

Unnamed: 0,FertileFronds,Frond,Habit**,Laminae,PrimaryPinnae,Rachis,Rhizome,SecondaryPinnae,Sori,Sporangia,Spores,SterileFronds,Stipe,TertiaryPinnae,Venation*
0,"[100 mm, 70 mm, 15 mm]",[380 mm],,"[170 mm, 130 mm]","[80 mm, 20 mm]",,[],,,,,[],[210 mm],,
1,,[950 mm],,"[690 mm, 400 mm, 1.5 mm]","[200 mm, 50 mm]","[8 mm, 1.5 mm]","[12 mm, 4 mm]","[27 mm, 10 mm]",[],,,,"[310 mm, 13 mm, 3 mm]","[6 mm, 3 mm]",[1 mm]
2,,[550 mm],,"[350 mm, 350 mm]","[200 mm, 60 mm]",[],[],"[40 mm, 12 mm]",,,,,[220 mm],,[2 mm]
3,,"[910 mm, 480 mm, 9 mm, 2.5 mm, 0.5 mm, 8 m]","[260 mm, 7 mm]","[510 mm, 270 mm, 1.5 mm, 1 mm]","[195 mm, 75 mm]","[0.5 mm, 1.5 mm]","[5 mm, 2.5 mm]","[40 mm, 10 mm]",[],,,,"[40 mm, 6 mm, 2 mm]",[1 mm],"[3 mm, 400 m, 400 m]"
4,,,,[],,,[],,,,,,,"[38 mm, 12 mm, 11 mm, 5 mm]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,,"[1450 mm, 970 mm]",[115 mm],"[580 mm, 215 mm, 0.4 mm]","[150 mm, 18 mm, 10 mm, 6 mm]",,[],,"[0.9 mm, 600 m, 600 m]",,,,"[30 mm, 4 mm, 3 mm, 1.5 mm]",,
266,,"[1200 mm, 2000 mm]","[5 mm, 10 mm, 1 mm]","[1000 mm, 700 mm, 0.6 mm, 2 mm]","[450 mm, 200 mm]",,[],"[120 mm, 40 mm]","[0.4 mm, 120 m, 120 m]",,,,[750 mm],"[24 mm, 6 mm]",[0.1 mm]
267,,[1.89 m],,"[1390 mm, 1500 mm, 400 mm, 160 mm, 50 mm, 0.2 ...","[220 mm, 27 mm, 34 mm, 70 mm, 17 mm, 7 mm]",,"[8 mm, 4 mm, 2070 mm, 200 mm]","[53 mm, 24 mm]",,,,,[770 mm],,
268,,,,,,,,"[10 m, 40 m]",,,,,,,


In [48]:
features = sp_descriptions.Etymology.fillna('') + ' ' + sp_descriptions['Vernacular name'].fillna('')
features = features.apply(string_preprocessing).str.split(r'\s\.').reset_index().apply(lambda x: extract_features(x.Species, x[0]), axis=1)
features = features.map(lambda x: '; '.join(x) if not isinstance(x, float) else x)
features.index = sp_descriptions.index
features[features.notna().sum(axis=1) > 0].to_csv('processed_features_fern.csv')

  features = features.applymap(lambda x: '; '.join(x) if not isinstance(x, float) else x)


In [71]:
sp_descriptions

Unnamed: 0,Species,Family,Etymology,Distribution,Altitudinal range,Biostatus,Habitat,First record,Recognition,Notes,Vernacular name,Cytology,Hybridisation
0,Anemia phyllitidis (L.),Anemiaceae,From the Greek phyllitidis (like Phyllitis ). ...,North Island: Northland,10 m. Known as a cultivation escape from one l...,Exotic; casual.,"Reported as growing on a steep, south-facing b...","Ogle et al. (2021). Voucher AK 327905, 2008.","New Zealand material, comprising one known col...","Anemia phyllitidis (L.) Sw., Syn. Fil. 155 (18...",,,
1,Athyrium filix-femina (L.),Athyriaceae,From the Latin filix (fern) and femina (female...,North Island: Auckland. South Island: Canterbu...,0–30 m. A widespread northern temperate specie...,Exotic; fully naturalised.,Athyrium filix-femina occurs most frequently o...,"Molloy (1976, p. 16). Voucher CHR 172587, WELT...",Athyrium filix-femina is a naturalised plant c...,"Athyrium filix-femina (L.) Roth, Tent. Fl. Ger...","lady fern Rhizomes erect, sometimes forming sh...",,
2,Athyrium otophorum (Miq.),Athyriaceae,From the Greek otos (ear) and phorus (bearing)...,North Island: Northland.,10 m. Known from one locality in Kerikeri. Occ...,Exotic; casual.,Recorded as sporadically self-sown along the e...,"Heenan et al. (2004, p. 802). Voucher AK 28399...","In New Zealand Athyrium otophorum has a short,...","Athyrium otophorum (Miq.) Koidz., Fl. Symb. Or...",,The base chromosome number in Deparia is x = 4...,
3,Deparia petersenii subsp. congrua (Brack.),Athyriaceae,From the Latin congruus (agreeable). Rhizomes ...,"North Island: Northland, Auckland, Volcanic Pl...",0–400 m. Deparia petersenii subsp. congrua occ...,Indigenous (Non-endemic).,Deparia petersenii subsp. congrua is a terrest...,,In New Zealand Deparia petersenii subsp. congr...,Deparia petersenii subsp. congrua (Brack.) M.K...,,n = 82 (Brownlie 1961). Notes: This species ha...,
4,Deparia petersenii (Kunze),Athyriaceae,"From the Greek diplasios (double), a reference...",North Island: Volcanic Plateau.,10 m. Deparia petersenii subsp. petersenii is ...,Exotic; casual.,Recorded from concrete and brick walls in a ga...,"New record. Voucher AK 363422, 2016.",Very similar to subsp. congrua but distinguish...,"Deparia petersenii (Kunze) M.Kato, Bot. Mag. (...",,The base chromosome number in Diplazium is x =...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,Cyclosorus interruptus (Willd.),Thelypteridaceae,From the Latin interruptus (interrupted). Rhiz...,"North Island: Northland, Auckland, Volcanic Pl...",0–600 m. Cyclosorus interruptus occurs in coas...,Indigenous (Non-endemic). The species was give...,"Occurs in swamps, on peaty soils and lake marg...",,Cyclosorus interruptus is recognised by its lo...,"Cyclosorus interruptus (Willd.) H.Itô, Bot. Ma...",,"n = 36 ( Brownlie 1961 , as Cyclosorus gongylo...",
266,Macrothelypteris torresiana (Gaudich.),Thelypteridaceae,Named in honour of Luís Vaz de Torres (b. 1565...,North Island: Northland Kermadec Islands,30–120 m. In New Zealand Macrothelypteris torr...,Indigenous (Non-endemic). The species was give...,Macrothelypteris torresiana has been recorded ...,,Macrothelypteris torresiana is recognised by i...,"Macrothelypteris torresiana (Gaudich.) Ching, ...",,2n = 124 ( de Lange et al. 2004 ). Notes: Name...,
267,Pakau pennigera (G.Forst.),Thelypteridaceae,From the Latin pennigerus (with feathery leave...,"North Island: Northland, Auckland, Volcanic Pl...",0–700 m. Pakau pennigera occurs in lowland are...,Indigenous (Non-endemic).,"A terrestrial fern that occurs under kauri, po...",,Pakau pennigera is recognised by its erect rhi...,Pakau pennigera (G.Forst.) S.E.Fawc. & A.R.Sm....,feather fern; gully fern; piupiu; pākau; pākau...,"n = 72 ( Brownlie 1954 , as Cyclosorus pennige...",
268,Pseudophegopteris aurita (Hook.),Thelypteridaceae,"From the Latin auritus (long-eared), a referen...",North Island: Northland.,c. 10 m. Pseudophegopteris aurita has been rec...,Exotic; casual.,"Recorded as a cultivation escape, growing on a...","Ogle et al. (2021) . AK 327895, 2008.",In New Zealand Pseudophegopteris aurita is rec...,"Pseudophegopteris aurita (Hook.) Ching, Acta P...",,The base chromosome number in Thelypteris is x...,
