In [1]:
import numpy as np
import pandas as pd
# import re
import regex as re

species = pd.read_excel('plant_info.xlsx').set_index('SpeciesName')
words = pd.read_excel('Words before and after traits.xlsx')
species['Features'] = species['Features'].fillna('')

In [2]:
extra_features_df = pd.read_excel("Words before and after traits.xlsx", sheet_name="Words", skiprows=1)
extra_features_df = extra_features_df[:extra_features_df[extra_features_df.Stature.eq('Following words')].index[0]].applymap(lambda s:s.lower() if type(s) == str else s)

extra_features = extra_features_df.to_dict('list')
extra_features = {k.title().replace(' ', ''): [x for x in v if str(x) != 'nan'] for k, v in extra_features.items()}

In [3]:
all_words = extra_features_df.to_numpy().flatten()
# duplicated words
duplicate_words = []
for word in set(all_words):
	if (all_words == word).sum() > 1:
		duplicate_words.append(word)
print(duplicate_words)

['cypsela', 'achene', 'fruits']


In [4]:
words_to_remove = []
# remove all words starting with petal, flower, fruit, seed with a letter after different from s. Select the entire word up to the first space or punctuation (,;.) excluded
reg_exp = r'(petal|flower|fruit|seed)[a-rt-z]+[^\s;,.\)]'
for i, row in species.Features.items():
	# print(row)
	for word in re.finditer(reg_exp, row):
		words_to_remove.append(word.group())

words_to_remove = list(set(words_to_remove))

## Errors
- ✅ **"Stigma" and "style" are both present within the string:** Place values in both stigma and style categories
- ✅ **"Cypselae" is present within the string:** Place in both seed and fruit categories
- ✅ **"Cypsela" is present within the string:** Place in both seed and fruit categories
- **"(" and ")" are detected within the string:** Remove them, along with everything within them. Then repeat search.
- ✅ **"petiole" or "petioles" are contained in the string:** Place values in petiol size, ignoring all other words
- ✅ **"Anther" or "Anthers" are contained in the string:** Place values in anther size, ignoring all other words
- ✅ **"Pedicel" or "Pedicels" are contained in the string:** Place values in pedicel size, ignoring all other words
- ✅ **"Calyx" is present within the string:** Place values in calyx size, ignoring all other words
- ✅ **word is a derivation of a key word (e.g. fruit-ing, flower-ing, leaf-y, anti-petal-ous):** Ignore/remove that word from the string. This should not apply if such words are part of multiple word key (e.g. "flowering stem")
- ✅ **word for fruit size is used, except cypsela or achene:** Ignore other words and place in fruit size
- ✅ **"achene" or "achenes" are contained in the string:** Place in both seed and fruit categories
- <s>**"hermaphrodite", "hermaphroditic", "male" or "female" are found within the string:** Store the word, then remove it, and repeat the search. Place what is found in the appropriate trait column, but add the stored word in.</s>


In [7]:
unit = '[m|c|d]?m'
# 150–400x100–300mm
number = r"(\d+\.?\d*)"
# full_regex = rf"(({number}-)?{number}{unit}?x)?{number}(-{number})?{unit}" ## Supposed to be correct
"""anomaies:
	0.05-0.35-1mx1.5-3-6mm
"""
full_regex = rf"({number}(-{number})?(-{number})?{unit}?x)?({number}(-{number})?(-{number})?{unit}?x)?{number}(-{number})?(-{number})?(-{number})?{unit}"

In [40]:
tmp = []
anomalies = set() # species with anomalies

def extract_features(i, feats:list, wordmeasure_distance=100): # TODO: automatic wordmeasure_distance
	features = {}
	for feat in feats:
		if len(feat) < 1:
			continue
		found = False
		for key, values in extra_features.items():
			feat = feat.replace(',', ' ')
			# remove any of .,; after one of the measures (cm, mm, dm, m), keeping the measure
			feat = re.sub(rf'{unit}[\.,;]', lambda x: x.group()[:-1], feat)
			if any([w == wf for w in set(values) for wf in feat.lower().split()]):
				if 'calyx' in feat.lower() and key != 'CalyxSize' or 'petiole' in feat.lower() and key != 'PetioleSize' or 'anther' in feat.lower() and key != 'AntherSize' or 'pedicel' in feat.lower() and key != 'PedicelSize':
					continue
				word_match = [w.lower() for w in feat.split() if w.lower() in set(values)][0]
				
				if found=='InflorescenceSize' and key == 'FlowerSize': # if inflorencence was already found, skip flower (e.g., "flower stem" associated with inflorescence only)
					continue
				if found=='FlowerSize' and key in ['StamenSize']: # avoid matching cases such as "Flowers large, white, about 8mm across, 4-petalled with 6 __stamens__"
					continue
				if found == 'FruitSize' and not any([w.lower() in feat.lower() for w in ['achene', 'cypsela']]):
					continue
				# measure is a number and a unit of measurement (e.g. 1.5 mm)
				# can match forms like 1.5-2.5 mm, 1.5 – 2.5 mm, 1.5×2.5 mm
				measure = re.search(full_regex, feat)
				if measure:
					word_match_position = np.where(np.array(feat.lower().split()) == word_match)[0][0]
					measure_position = np.where(np.array(feat.lower().split()) == measure.group())[0][0]

					if abs(word_match_position - measure_position) > wordmeasure_distance:
						continue
					if found:
						if (any([w.lower() in feat.lower() for w in ['achene', 'cypsela']]) and {key, found[0]} == {'FruitSize', 'SeedSize'}) or\
							{key, found[0]} == {'StigmaSize', 'StyleSize'}:
							pass
							# print(f'OK>> Multiple features found ({found}, {key}) in "{feat}"')
						else:
							this_distance = abs(word_match_position - measure_position)
							if this_distance > found[1]:
								continue
							else:
								features[found[0]].remove(found[2])
							tmp.append(f'({i}) Multiple features found ({found[0]}, {key}) in "{feat}"')
							anomalies.add(i)
					found = (key, abs(word_match_position - measure_position), measure.group())
					measure = measure.group()
					# print(key,[w.lower() for w in set(values)|set([key]) if w.lower() in feat.lower()], measure)
					
					if key in features:
						features[key].append(measure)
					else:
						features[key] = [measure]
			# primary features, if key is in petiole, anther, pedicel, calix
	return pd.Series(features)

In [41]:
species.Features.apply(string_preprocessing).str.split(r'(?<!\sc)[.;]\s|;').reset_index()

Unnamed: 0,SpeciesName,Features
0,abrotanella-rosulata,[]
1,abrotanella-spathulata,[]
2,aciphylla-dieffenbachii,"[Tufted, dioecious, perennial with extremely s..."
3,aciphylla-traversii,"[Stout, tufted, dioecious ? gynodioecious pere..."
4,astelia-chathamica,"[Robust tufted plant, Leaves 60-200x4-10cm ., ..."
...,...,...
2009,dacrydium-cupressinum,"[Dioecious conifer 35m tall, Adult trees with..."
2010,podocarpus-acutifolius,"[Shrub or small tree up to 15m tall, Trunk of..."
2011,podocarpus-cunninghamii,[]
2012,podocarpus-nivalis,"[Prostrate to suberect, spreading woody shrub ..."


In [43]:
def string_preprocessing(s):
	s = s.replace('\xa0', ' ').replace('×', 'x').replace('–', '-') # remove non-breaking space and replace multiplication sign with x
	s = re.sub(fr'-?\(-?{number}-?\)-?', '', s) # remove all parentesis surrounding a number (e.g. (-1.5))
	s = s.replace('--', '-').replace('-.', '-').replace('..', '.')
	s = s.replace('(', '').replace(')', '')
	s = re.sub(r'\s(c|ca|o)\.', ' ', s) # remove all ' c.'
	s = re.sub(r'\s+(?=[cmd]?m)', '', s) # remove all spaces before measures (mm, cm, dm, m, these strings only if padded by a space)
	s = re.sub(r'\s*-\s*', '-', s) # remove spaces around hyphens
	s = re.sub(r'(?<=\d)\s*\.(?=\d)', '.', s) # remove spaces before dot if followed and preceded by a number
	s = re.sub(r'(?<=\s)\.(?=\d)', '0.', s) # add a 0 before a dot if it is preceded by a space and a "not number" and followed by a number (e.g. foo .5 --> foo 0.5)
	s = re.sub(r'(?<=[\dm])\s*x\s*(?=\d+)', 'x', s) # remove spaces around x in formulas
	# now all measures are supposed to have no spaces between number and unit and spaces around them
	s = re.sub(rf'(?<=\d)({unit})(?!\s*x)', r'\1 ', s) # fix situation in which a measure is not followed by a space, in the case, add that space
	s = re.sub(r'(?<![\d\sx\.-])(\d)', r' \1', s) # fix the situation in which a measure (the whole number and measure) is not preceded by a space. In the case, add a space before the measure
	s = re.sub(r'(?<=\d\.\d+)(\.\d?)', '', s) # fix the error in which there is a doubled dot in a number (e.g. 1.5.2), in the case, remove the second dot and the eventual numbers after it
	s = re.sub(r'(?<=[a-ln-z])-(?=\d)', ' ', s) # remove all '-' preceded by a letter (different from m) and followed by a number (e.g. to-250mm --> to 250mm)
	s = re.sub(r'(?<![a-z])(l|I)(?=[\s\.-]|\d)', '1', s) # replace all 'l' or "I" characters which should be '1' (e.g. l.5 --> 1.5). This should be followed by a space, a dot, a hyphen, or a number and not preceded by a letter
	s = re.sub('|'.join(words_to_remove), '', s, flags=re.IGNORECASE) # remove words to remove
	return s

features = species.Features.apply(string_preprocessing).str.split(r'(?<!\sc)[.;]\s|;').reset_index().apply(lambda x: extract_features(x.SpeciesName, x.Features), axis=1)
features = features.applymap(lambda x: '; '.join(x) if not isinstance(x, float) else x)
features.index = species.index

In [49]:
features_stored = pd.read_csv('processed_features.csv', index_col=0)

In [47]:
features.xs('aciphylla-traversii')

AntherSize                            NaN
CalyxSize                             NaN
CorollaSize                           NaN
FlowerSize                            NaN
FruitSize                        8-10x6mm
Glumes                                NaN
InflorescenceSize     1.0x0.4m; 50mm; 8mm
LeafSize             0.15-0.46x0.15-0.8mm
Lemma                                 NaN
Lodicules                             NaN
OvarySize                             NaN
Palea                                 NaN
PedicelSize                           NaN
PetalSize                       2.0-2.5mm
PetioleSize                     0.1-0.25m
SeedSize                              NaN
StamenSize                            NaN
Stature                               NaN
StigmaSize                            NaN
StyleSize                             NaN
TubeSize                              NaN
Name: aciphylla-traversii, dtype: object

In [50]:
features_stored.xs('aciphylla-traversii')

LeafSize             0.15-0.46x0.15-0.8mm
InflorescenceSize           1.0x0.4m; 8mm
PetalSize                       2.0-2.5mm
FruitSize                        8-10x6mm
PetioleSize                     0.1-0.25m
FlowerSize                            NaN
PedicelSize                           NaN
Glumes                                NaN
Lemma                                 NaN
Palea                                 NaN
AntherSize                            NaN
OvarySize                             NaN
SeedSize                              NaN
Stature                               NaN
StyleSize                             NaN
StigmaSize                            NaN
StamenSize                            NaN
CorollaSize                           NaN
CalyxSize                             NaN
TubeSize                              NaN
Lodicules                             NaN
Name: aciphylla-traversii, dtype: object

In [55]:
feature[feature.apply(lambda x: isinstance(x, str) and string_preprocessing(match.group()).strip() in x.split('; '))].empty

True

In [59]:
[(meas, feat, '#00') for meas, feat in zip(feature, feature.index) if isinstance(meas, str) ]

[('0.7m; 150-400x100-300mm', 'LeafSize', '#00'),
 ('1.0x0.6m; 10mm', 'InflorescenceSize', '#00'),
 ('2mm; 1.5mm', 'PetalSize', '#00'),
 ('15x10mm', 'FruitSize', '#00')]

In [50]:
for sp, feature in processed_features.iterrows():
	features_text = species.loc[sp, 'Features'].replace('\xa0', ' ').replace('×', 'x').replace('–', '-')
	if features_text == '':
		continue
	# measure pattern with negative lookahead for a number (e.g. 1.5 mm) with any combination of spaces, hyphens, and dots, numbers, parentesis in between
	measures_in_text = re.finditer(r'\d[\d\.\s\(\)x-]*[cmd]?m\s?(?![\d-\(\)])', features_text)
	detected_features = {}
	for match in measures_in_text:
		# if '1.5 mm' in match.group():
		# 	foo
		try:
			print("? ", match.group())
			which_feature = feature[feature.apply(lambda x: isinstance(x, str) and string_preprocessing(match.group()).strip() in x.split('; '))].index[0]
			print('>>', which_feature, match.group())
			if which_feature in detected_features:
				detected_features[which_feature].append((match.group(), match.start(), match.end()))
			else:
				detected_features[which_feature] = [(match.group(), match.start(), match.end())]
		except:
			pass
			print('xx', string_preprocessing(match.group()))
	break


?  0.7m 
>> LeafSize 0.7m 
?  150-400 x 100-300 mm
>> LeafSize 150-400 x 100-300 mm
?  90 x 5 mm
xx  90x5mm 
?  1.0 x 0.6m 
>> InflorescenceSize 1.0 x 0.6m 
?  50-150 mm 
xx  50-150mm  
?  10 mm 
>> InflorescenceSize 10 mm 
?  2 mm 
>> PetalSize 2 mm 
?  1.5 mm 
>> PetalSize 1.5 mm 
?  15 x 10 mm
>> FruitSize 15 x 10 mm
?  2 m
xx  2m 
?  3 mm 
xx  3mm  


In [52]:
feature

LeafSize             0.7m; 150-400x100-300mm
InflorescenceSize             1.0x0.6m; 10mm
PetalSize                         2mm; 1.5mm
FruitSize                            15x10mm
PetioleSize                              NaN
FlowerSize                               NaN
PedicelSize                              NaN
Glumes                                   NaN
Lemma                                    NaN
Palea                                    NaN
AntherSize                               NaN
OvarySize                                NaN
SeedSize                                 NaN
Stature                                  NaN
StyleSize                                NaN
StigmaSize                               NaN
StamenSize                               NaN
CorollaSize                              NaN
CalyxSize                                NaN
TubeSize                                 NaN
Lodicules                                NaN
Name: aciphylla-dieffenbachii, dtype: object

In [38]:
feature

LeafSize             0.7m; 150-400x100-300mm
InflorescenceSize             1.0x0.6m; 10mm
PetalSize                         2mm; 1.5mm
FruitSize                            15x10mm
PetioleSize                              NaN
FlowerSize                               NaN
PedicelSize                              NaN
Glumes                                   NaN
Lemma                                    NaN
Palea                                    NaN
AntherSize                               NaN
OvarySize                                NaN
SeedSize                                 NaN
Stature                                  NaN
StyleSize                                NaN
StigmaSize                               NaN
StamenSize                               NaN
CorollaSize                              NaN
CalyxSize                                NaN
TubeSize                                 NaN
Lodicules                                NaN
Name: aciphylla-dieffenbachii, dtype: object

In [33]:
# find all the occurrences in features_text of the pattern starting with a number and ending with a measure (e.g. 1.5 mm) with any combination of spaces, hyphens, and dots, numbers, parentesis in between
# re.findall(r'\d[\d\.\s\(\)-]*[cmd]?m', features_text)

for r in re.finditer(r'\d[\d\.\s\(\)-]*[cmd]?m', features_text):
	print(r.start(), r.end(), r.group())

153 157 0.7m
279 285 300 mm
333 337 5 mm
408 412 0.6m
510 516 150 mm
599 604 10 mm
796 800 2 mm
888 894 1.5 mm
921 926 10 mm
1016 1019 2 m
1078 1082 3 mm


In [46]:
detected_features

{}

In [10]:
for species_name, feature in processed_features.iterrows():
	features_text = species.loc[species_name, 'Features']
	if features_text == '': continue
	
	# write the text with the highlighted words
	# detected_features = dictionary assigning each feature the position (start, end) in the text, if present
	detected_features = {feat: re.search(feat, features_text) for feat in feature.index if feature[feat]}
	# sort the features by the starting position in the text
	detected_features = {k: v for k, v in sorted(detected_features.items(), key=lambda item: item[1].start())}

AttributeError: 'NoneType' object has no attribute 'start'

Diminutive, tufted, stoloniferous, glabrescent to glabrous, brownish green perennial grass, up to -250 mm tall, culms overtopping leaves; branching extravaginal; leaf-blades persistent. Leaf-sheath membranous, glabrous, ribbed, keeled. Ligule 0.5-1.5 mm, entire, tapered, glabrous throughout. Leaf-blade 10-35 × 1-2 mm, flat or folded, subcoriaceous, smooth, but midrib scabrid near curved tip; margins finely scabrid. Culm 30-200 mm, very slender, erect or geniculate at base, internodes glabrous. Panicle 10-30, ± open or contracted, with few, ovate spikelets; rachis, branches and pedicels slender with sparse, scattered prickle-teeth. Spikelets 2.0-3.5 mm, 2-3-flowered, light green, tinged purple. Glumes unequal, submembranous with hyaline margins, a few prickle-teeth on midnerve near tip; lower 1.5-2 mm, 1-nerved, narrow-lanceolate, acute, upper 2.0-2.5 mm, (1-)3-nerved, elliptic-oblong, subobtuse to obtuse. Lemma 2.0-2.5 mm, 5-nerved, elliptic-ovate, obtuse, glabrous, but midnerve with short crinkled hairs to c.½ length and sparsely prickle-toothed near tip, lateral nerves with a few hairs near base. Palea 1.5-1.8 mm, keels minutely scabrid, interkeel glabrous. Callus with a few wispy hairs. Rachilla c.0.5 mm, glabrous. Lodicules c.0.1 mm. Anthers 0.3-0.4 mm. Ovary 0.4-0.5 mm; stigma-styles 0.8-1 mm. Seeds c.1.0 × 0.5 mm


In [50]:
species[species.Features.str.contains('petalled')].Features.str.replace('\xa0', '').str.replace('×', 'x').str.replace('|'.join(words_to_remove), '', regex=True).str.split(r'(?<!\sc)[.;]\s').iloc[:1].apply(extract_features)

Unnamed: 0_level_0,SeedSize
SpeciesName,Unnamed: 1_level_1
cardamine-bilobata,1mm


In [29]:
features[features.Gender.notna()].Gender.value_counts()

Gender
{male}                   291
{hermaphrodite}           60
{hermaphrodite, male}     37
{hermaphroditic}           1
Name: count, dtype: int64

In [30]:
features.join(species.Features).to_excel('features.xlsx')

In [None]:
print(species[species.index.str.startswith('carpodetus')].Features.str.split(r'(?<!\sc)[.]\s').values[0][0])
species[species.index.str.startswith('carpodetus')].Features\
	.str.replace('\xa0', '').str.replace('×', 'x')\
	.str.split(r'(?<!\sc)[.]\s').apply(extract_features).values[0]


Monoecious small tree up to 10 m tall
Stature ['tall', 'up to'] 10m
Leaf_Size ['leaves'] 10-30mm x 10-20mm
Leaf_Size ['leaves'] 40-60mm x 20-30mm
Petiole_Size ['petiole', 'petioles c.', 'petioles'] 10mm
Flower_PedicelSize ['pedicels'] 10mm
Inflorescence_Size ['panicle', 'panicles'] 50 x 50mm
Flower_Size ['flower', 'flowers'] 50 x 50mm
Flower_LobesSize ['lobes'] 50 x 50mm
Flower_PetalSize ['petal'] 50 x 50mm
Fruit_Size ['capsule', 'fruit'] 4-6mm
Fruit_Size ['capsule'] 1-2mm
Seed_Size ['seed'] 1-2mm


{'Stature': '10m',
 'Leaf_Size': '10-30mm x 10-20mm; 40-60mm x 20-30mm',
 'Petiole_Size': '10mm',
 'Flower_PedicelSize': '10mm',
 'Inflorescence_Size': '50 x 50mm',
 'Flower_Size': '50 x 50mm',
 'Flower_LobesSize': '50 x 50mm',
 'Flower_PetalSize': '50 x 50mm',
 'Fruit_Size': '4-6mm; 1-2mm',
 'Seed_Size': '1-2mm'}

In [74]:
species[species.index.str.startswith('carpodetus')].Features\
	.str.replace('\xa0', '').str.replace('×', 'x').str.split(r'(?<!\sc)[.]\s').values[0]

['Monoecious small tree up to 10m tall',
 'Trunk slender, bark rough, corky, mottled grey-white, often knobbled due to insect boring',
 'Juvenile plants with distinctive zig-zag branching which is retained to a lesser degree in branchlets of adult',
 'Leaves broad-elliptic to broad-ovate or suborbicular; dark green, marbled; membranous becoming thinly coriaceous; margin serrately toothed; tip acute to obtuse',
 'Juvenile leaves 10-30mm x 10-20mm',
 'Adult leaves 40-60mm x 20-30mm',
 'Petioles c. 10mm; petioles, peduncles and pedicels pubescent; lenticels prominent',
 'Flowers in panicles at branchlet tips; panicles to 50 x 50mm; flowers 5-6mm diam.; calyx lobes c. 1mm long, triangular-attenuate; petals white, ovate, acute, 3-4mm long',
 'Stamens 5-6, alternating with petals; filaments short',
 'Stigma capitate, tip dark; ovules many',
 'Fruit an indehiscent subfleshy-fleshy capsule, 4-6mm diam., black when mature; cupped in remains of calyx',
 'Seeds many per capsule, in 3-5 locules, s

In [5]:
print(species[species.SpeciesName.str.startswith('carpodetus')].Features.values[0].split('. '))

# features are presented in Features as plain text. They are separated by commas, semicolomns or dots.
# The interesting feautures are in the form
# <name_feature> (about) <value_feature> <unit_feature> (x <value_feature> <unit_feature>)
# e.g. 
# Flowers in panicles at branchlet tips; panicles to 50 x 50 mm; flowers 5-6 mm diam.

['Monoecious small tree up to 10\xa0m tall', 'Trunk slender, bark rough, corky, mottled grey-white, often knobbled due to insect boring', 'Juvenile plants with distinctive zig-zag branching which is retained to a lesser degree in branchlets of adult', 'Leaves broad-elliptic to broad-ovate or suborbicular; dark green, marbled; membranous becoming thinly coriaceous; margin serrately toothed; tip acute to obtuse', 'Juvenile leaves 10-30\xa0mm x 10-20\xa0mm', 'Adult leaves 40-60\xa0mm x 20-30mm', 'Petioles c', '10\xa0mm; petioles, peduncles and pedicels pubescent; lenticels prominent', 'Flowers in panicles at branchlet tips; panicles to 50 x 50\xa0mm; flowers 5-6\xa0mm diam.; calyx lobes c', '1\xa0mm long, triangular-attenuate; petals white, ovate, acute, 3-4\xa0mm long', 'Stamens 5-6, alternating with petals; filaments short', 'Stigma capitate, tip dark; ovules many', 'Fruit an indehiscent subfleshy-fleshy capsule, 4-6\xa0mm diam., black when mature; cupped in remains of calyx', 'Seeds ma

- Monoecious small tree up to ``10 m`` tall. 
- Trunk slender, bark rough, corky, mottled grey-white, often knobbled due to insect boring. 
- Juvenile plants with distinctive zig-zag branching which is retained to a lesser degree in branchlets of adult. 
- Leaves broad-elliptic to broad-ovate or suborbicular; dark green, marbled; membranous becoming thinly coriaceous; margin serrately toothed; tip acute to obtuse. 
- Juvenile leaves ``10-30 mm`` x ``10-20 mm``. 
- Adult leaves ``40-60 mm`` x ``20-30mm``. 
- Petioles c. ``10 mm``; petioles, peduncles and pedicels pubescent; lenticels prominent. 
- Flowers in panicles at branchlet tips; panicles to 50 x 50 mm; flowers 5-6 mm diam.; calyx lobes c. 1 mm long, triangular-attenuate; petals white, ovate, acute, 3-4 mm long. 
- Stamens 5-6, alternating with petals; filaments short. 
- Stigma capitate, tip dark; ovules many. 
- Fruit an indehiscent subfleshy-fleshy capsule, 4-6 mm diam., black when mature; cupped in remains of calyx. 
- Seeds many per capsule, in 3-5 locules, small, 1-2 mm long; testa reticulate.

In [15]:
species.loc[4, 'Features']

'Robust tufted plant. Leaves 60–200 × 4–10\xa0cm., keeled near the sheath, less so in main part of lamina; sheath-base white, with close scales on both surfaces; lamina adaxially silvered green with a metallic sheen, covered with a thin but long-persistent clear pellicle that lifts off in strips from old leaves; abaxial surface with a pale grey satiny indumentum of appressed scales with very little wool, the single main costa on each side of midrib little if at all stronger than midrib and not prominent. Inflorescence large and erect, most parts shaggy with narrow scales at least when young; lower spathes long; racemes numerous, all spathes except the smallest subtending sub-inflorescences of 2–3 or more racemes. Flowers pedicellate and usually well-spaced; male flower pale, tepals to 8 × 2.5–3\xa0mm., strongly reflexed soon after flower opens; perianth-tube very short, drooping around pedicel and so exposing the base of the pistillode; outer tepals scaly externally; female flower colo

In [13]:
species.Features.str.lower().str.split('fruit').str[-1].str[:6]

0          NaN
1          NaN
2        15 × 
3        8–10 
4        about
         ...  
2009    dioeci
2010    shrub 
2011       NaN
2012    prostr
2013       NaN
Name: Features, Length: 2014, dtype: object

**TODO:**
foglie: se non c'è giovane e adulto: inserire lo stesso in entrambi

## IDEA

- Dataset con valori mancanti
- Imputing missing values (dataset per mainland + dataset per isola, ora solo mainland?)
- Confronto distribuzioni isola con mainland

altrimenti
- Metric uncoupling