In [1]:
import sys

sys.path.append('..')

In [2]:
import csv
import string
from pathlib import Path

import pandas as pd
import regex as re
from tqdm import tqdm

In [3]:
DATA = Path('..') / '..' / 'data' / 'species'
VOCAB = Path('..') / '..' / 'mimosa' / 'vocabulary'

TEXT = DATA / 'Barneby_1991_Sensitivae_Censitae_sci_names.txt'
PDF = DATA / 'Barneby_1991_Sensitivae_Censitae_sci_names.pdf'
CSV = VOCAB / 'taxa.csv'

In [4]:
# !pdftotext $PDF

Manually remove headers and footers.

Join lines so that one taxon reference is on one line. Joining lines is easy because all references end in line numbers.

In [5]:
TAXA = {
    'genus': set(),
    'section': set(),
    'series': set(),
    'subseries': set(),
    'species': set(),
    'subspecies': set(),
    'variant': set(),
}

In [6]:
with open(CSV) as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        TAXA[row['label']].add(row['pattern'])

In [7]:
LINES = []
with open(TEXT) as in_file:
    for ln in in_file.readlines():
        parts = ln.split('=')
        LINES += [p.strip() for p in parts]

In [8]:
for level, names in TAXA.items():
    print(f'{level:<10} {len(names)}')

genus      2
section    0
series     0
subseries  0
species    1058
subspecies 0
variant    0


In [9]:
def labeled_taxa(label):
    label = re.escape(label)
    taxa = set()
    for ln in LINES:
        for word in re.findall(fr' {label} \s+ ( [\w-]+ )', ln, flags=re.X):
            if len(word) > 1:
                taxa.add(word)
    return taxa

In [10]:
TAXA['section']    |= labeled_taxa('sect.')
TAXA['series']     |= labeled_taxa('ser.')
TAXA['subseries']  |= labeled_taxa('subser.')
TAXA['subspecies'] |= labeled_taxa('subsp.')
TAXA['variant']    |= labeled_taxa('var.')

In [11]:
for level, names in TAXA.items():
    print(f'{level:<10} {len(names)}')

genus      2
section    11
series     91
subseries  37
species    1058
subspecies 32
variant    398


In [12]:
TAXA['section']

{'Ameria',
 'Batocaulon',
 'Calothamnos',
 'Eu-mimosa',
 'Eumimosa',
 'Habbasia',
 'Mimadenia',
 'Mimosa',
 'Rhodostachya',
 'Stachyomima',
 'Vulgares'}

In [13]:
def has_punct(word):
    return any(c in string.punctuation for c in word)

In [14]:
def get_species(words):
    if has_punct(words[0]):
        return '', ''

    species = words[0] if words[0].islower() else ''

    if species and words[1].islower() and not has_punct(words[1]):
        subsp = words[1]
    else:
        subsp = ''

    return species, subsp

In [15]:
for i, ln in enumerate(LINES):
    taxon = ln.split() + ['', '', '']

    genus, species, subsp = '', '', ''

    if has_punct(taxon[0]) and taxon[0] != 'M.':
        pass

    elif taxon[0] == 'M.' or taxon[0].istitle():
        genus = taxon[0] if taxon[0] != 'M.' else ''
        species, subsp = get_species(taxon[1:])

    elif taxon[0].islower():
        species, subsp = get_species(taxon)

    if len(genus) > 2:
        TAXA['genus'].add(genus)
    if len(species) > 2:
        TAXA['species'].add(species)
    if len(subsp) > 2:
        TAXA['subspecies'].add(subsp)

In [16]:
levels = list(TAXA.keys())

while levels:
    hi = levels.pop(0)
    for lo in levels:
        TAXA[lo] -= TAXA[hi]

In [17]:
for level, names in TAXA.items():
    print(f'{level:<10} {len(names)}')

genus      87
section    10
series     82
subseries  0
species    1249
subspecies 30
variant    143


In [23]:
TAXA['series']

{'Acanthocarpae',
 'Acantholobae',
 'Adversae',
 'Affines',
 'Andinae',
 'Antrorsae',
 'Aphyllae',
 'Asperatae',
 'Auriculatae',
 'Axillares',
 'Bahamenses',
 'Bipennatulae',
 'Bipinnatae',
 'BoUvianae',
 'Brevipedes',
 'Caesalpiniifoliae',
 'Campicolae',
 'Ceratoniae',
 'Cordistipulae',
 'Dicerasteae',
 'Diffusae',
 'Discobolae',
 'Dolentes',
 'Dryandroideae',
 'Dutranae',
 'Echinocaulae',
 'Ephedroideae',
 'Farinosae',
 'Filipedes',
 'Glanduliferae',
 'Glandulosae',
 'Hirsutae',
 'Lactifluae',
 'Lanatae',
 'Leiocarpae',
 'Lepidotae',
 'Leptopodae',
 'Leptostachyae',
 'Leucaenoideae',
 'Lundianae',
 'Macrocalycinae',
 'Meticulosae',
 'Microcarpae',
 'Microcephalae',
 'Modestae',
 'Morongianae',
 'Myriadeniae',
 'Myriophyllae',
 'Neptunioideae',
 'Nothacaciae',
 'Obstrigosae',
 'Pachycarpae',
 'Paucifoliatae',
 'Pectinatae',
 'Pedunculosae',
 'Piresianae',
 'Plurijugae',
 'Pogocephalae',
 'Polycarpae',
 'Polycephalae',
 'Pseudocymosae',
 'Pudicae',
 'Quadrivalves',
 'Ramentaceae',
 'Ra

In [19]:
data = {'label': [], 'pattern': []}
for level, taxa in TAXA.items():
    for taxon in sorted(taxa):
        data['label'].append(level)
        data['pattern'].append(taxon)

In [20]:
levels = """
    genus
    section
    series
    subseries
    species
    subspecies
    variant
    sect.
    ser.
    subser.
    subsp.
    var.
    sect
    ser
    subser
    subsp
    var
""".split()
data['pattern'] += levels
data['label'] += ['level'] * len(levels)

In [21]:
df = pd.DataFrame(data=data)
df.to_csv(CSV, index=False)