In [1]:
from pathlib import Path

import regex
import pandas as pd
from ipywidgets import interact
from IPython.display import display, display_html, HTML
import matplotlib.pyplot as plt
from PIL import Image

from pylib.label_babel.parsers.admin_unit import ADMIN_UNIT
from pylib.label_babel.parsers.collector import COLLECTOR
from pylib.label_babel.parsers.label_date import LABEL_DATE
from pylib.label_babel.parsers.plant_taxon import PLANT_TAXON, PLANT_FAMILY

In [2]:
# --- Input data -----------------------------------------------------------
# Label Babel CSV exports live under ./data/label_babel.
DATA_DIR = Path('.', 'data', 'label_babel')

VALID_CSV = DATA_DIR.joinpath('label_babel_valid_2020-01-02.csv')
TRAIN_CSV = DATA_DIR.joinpath('label_babel_train_2020-01-02.csv')
NAMES_CSV = DATA_DIR.joinpath('NYBG_Collectors_2020-01-02.csv')
# Note: this file sits directly in ./data, not in DATA_DIR.
NAME_PARTS_CSV = Path('.', 'data', 'name_parts.csv')

# --- Label images ---------------------------------------------------------
# Images are stored in a sibling checkout, split by label category.
IMAGE_DIR = Path('..', 'notes-from-nature', 'label-babel', 'data')
TYPE_DIR = IMAGE_DIR.joinpath('typewritten')
HAND_DIR = IMAGE_DIR.joinpath('handwritten')

# --- What to analyse ------------------------------------------------------
# Switch INPUT_CSV to VALID_CSV to browse the validation set instead.
INPUT_CSV = TRAIN_CSV
INPUT_COL = 'predicted_text'

# Column names used when looking up a row's image.
SUB_ID = 'subject_id'
PRED_CAT = 'predicted_category'

In [3]:
# Load every column as text and disable NA detection so empty cells stay
# empty strings rather than becoming NaN.
df = pd.read_csv(
    INPUT_CSV,
    dtype=str,
    na_filter=False,
)

In [4]:
def _truthy_attr(items, name):
    """Return the truthy values of attribute ``name`` across ``items``, in order."""
    return [value for item in items if (value := getattr(item, name))]


def _print_parsed_fields(sub_id, pred_cat, text):
    """Run every label parser over ``text`` and print one line per field."""
    # All parsing happens before any printing, so a parser error leaves
    # the output empty instead of half-written.
    label_date = [d.value for d in LABEL_DATE.parse(text)]
    admin_unit = ADMIN_UNIT.parse(text)
    us_state = _truthy_attr(admin_unit, 'us_state')
    us_county = _truthy_attr(admin_unit, 'us_county')
    family = [f.value for f in PLANT_FAMILY.parse(text)]
    taxon = [t.value for t in PLANT_TAXON.parse(text)]
    collector = COLLECTOR.parse(text)
    col_name = _truthy_attr(collector, 'col_name')
    col_no = _truthy_attr(collector, 'col_no')

    print(f'Subject ID:   {sub_id}')
    print(f'Category:     {pred_cat}')
    print(f'Date:         {label_date}')
    print(f'US state:     {us_state}')
    print(f'US county:    {us_county}')
    print(f'Family:       {family}')
    print(f'Taxon:        {taxon}')
    print(f'Collector:    {col_name}')
    print(f'Collector no: {col_no}')
    print()


def show_results(idx):
    """Print the fields parsed from one label's text and show its image.

    Looks up row ``idx`` of the module-level ``df``, runs the date,
    admin-unit, plant-family, plant-taxon and collector parsers over the
    row's ``INPUT_COL`` text, prints what they found followed by the raw
    text, and displays the label image with matplotlib.

    Args:
        idx: Index label of the row in ``df`` (looked up with ``df.loc``).

    Returns:
        None. All results are printed or displayed.

    Raises:
        FileNotFoundError: If the label image does not exist on disk.
    """
    row = df.loc[idx]
    text = row[INPUT_COL]
    sub_id = int(row[SUB_ID])
    pred_cat = row[PRED_CAT]

    # Images are split into two folders by predicted category; anything
    # that is not 'typewritten' is looked up in the handwritten folder.
    image_dir = TYPE_DIR if pred_cat == 'typewritten' else HAND_DIR
    image_path = image_dir / f'{sub_id}.jpg'

    # The context manager releases the file handle on every call, which
    # matters because the slider in `interact` re-runs this repeatedly.
    with Image.open(image_path) as image:
        _print_parsed_fields(sub_id, pred_cat, text)

        _, ax = plt.subplots(1, 1, figsize=(10, 10))
        ax.set_axis_off()
        ax.imshow(image)

        print(text)

        plt.show()

In [5]:
# Slider over every row of df; the trailing ';' hides the widget's repr.
interact(show_results, idx=(0, len(df) - 1));

interactive(children=(IntSlider(value=2432, description='idx', max=4864), Output()), _dom_classes=('widget-int…

In [7]:
# Collector name records, read as text with NA detection disabled so
# blank name parts stay empty strings.
dfn = pd.read_csv(
    NAMES_CSV,
    dtype=str,
    na_filter=False,
)

dfn.head()

Unnamed: 0,irn,SummaryData,Title,First,Middle,Last,Suffix,FullName,BriefName,CitedName,TaxonomicName,OtherNames,BirthDate,DeathDate,Location,Roles,PeriodOfActivityNotes
0,247901,Person - L. M. A (L M. A); Collector,,L,M,A,,L M. A,L. M. A,,L. M. A,,,,,Collector,
1,167839,Person - B. V. A.; Collector; Mexico,,B.,V.,A.,,B. V. A.,B. V. A.,B. V. A.,,,,,Mexico,Collector,"Collected on Herb. MEXU, Distrito Federal (197..."
2,247253,Person - R. Alvarado A. (Ramón Alvarado A.); C...,,Ramón,Alvarado,A.,,Ramón Alvarado A.,R. Alvarado A.,"Alvarado A, Ramón",,Ramón Alvarado,,,,Collector,
3,137564,Person - A. A. da Luz; Collector; Brazil,,A.,,A. da Luz,,A. A. da Luz,A. A. da Luz,"Luz, A. A. da",,,,,Brazil,Collector,
4,51290,Person - J. Aalbers; Collector; South Africa,,J.,,Aalbers,,J. Aalbers,J. Aalbers,"Aalbers, J.",,,,,South Africa,Collector,


In [None]:
# Collect every distinct lower-cased name part (two or more letters) found
# in the name columns of the collectors table.
names = set()
# Split on runs of non-alphabetic characters.
# NOTE(review): IGNORECASE looks redundant for a negated [[:alpha:]] class;
# kept unchanged pending confirmation.
splitter = regex.compile(r'[^[:alpha:]]+', flags=regex.IGNORECASE)

# Walk the columns directly instead of dfn.iterrows(), which builds a
# Series per row just to read these five cells.
for col in ('Title', 'First', 'Middle', 'Last', 'Suffix'):
    for value in dfn[col]:
        # len(part) > 1 drops both the empty strings produced by the split
        # and single-letter initials.
        names.update(part.lower()
                     for part in splitter.split(value)
                     if len(part) > 1)

len(names)

In [None]:
# One-column frame of the collected name parts, in sorted order.
data = dict(name=sorted(names))
df2 = pd.DataFrame(data=data)
df2.head()

In [None]:
# Uncomment to write the name parts in df2 out to NAME_PARTS_CSV.
# df2.to_csv(NAME_PARTS_CSV, index=False)

In [8]:
# Distinct lower-cased, whitespace-separated tokens in the 'Roles' column.
# Splitting is on whitespace only, so punctuation stays attached to tokens.
# Iterating the column directly avoids dfn.iterrows(), which builds a
# Series per row just to read one cell.
roles = {word.lower() for value in dfn['Roles'] for word in value.split()}

len(roles)

123

In [10]:
# Print all role tokens on one line, alphabetically, separated by spaces.
print(*sorted(roles))

(american 15th 1st 26th a agriculturist anatomist and army articles/journals artist asian assistant astronomer author authors biochemist biologist botanical botanist botanist/plant brazil british carolina cartographer central civil collection collector collector, collector. collector; columbia conservator contributor- corator corps.) curator curatorial databases department determiner determiner, director director, division, east ecologist ecoparks, editor entomologist entrepreneur exhibition expedition explorer extractor florida former fundación gardener geographer geologist georeferencer georgia greenhouses grower head healer herbarium horticulturalist horticulturist. housemother iabl illustrator in informant inventor iowa keeper lichens, main maine manager manager, mathematician mexico michigan missionary mushroomobserver.org naturalist navy nolen not nova nurseryman of ornithologist photographer plant preparator professor project researcher royal scotia ship south sponsor staff stud

In [9]:
# Distinct lower-cased, whitespace-separated tokens in the 'Suffix' column.
# Iterating the column directly avoids dfn.iterrows(), which builds a
# Series per row just to read one cell.
suffixes = {word.lower() for value in dfn['Suffix'] for word in value.split()}

suffixes

{'filho', 'ii', 'iii', 'jr.', 'sr.'}

In [22]:
# Layout experiment: render two HTML blocks side by side in a cell's output
# by turning the output container into a flex row.
# NOTE(review): `.output_html` is presumably the notebook front end's class
# for HTML outputs — confirm it exists in the front end being used.
css = """
.output_html {
    display: flex;
    flex-direction: row;
}
.output_html .row {
    color: blue;
    margin: 0 20px;
}
"""
html = """
<div class="row">Hello, one!</div>
<div class="row">Hello, two!</div>
"""
# Inline the stylesheet ahead of the markup so both travel in one output.
page = HTML('<style>' + css + '</style>' + html)
display(page)
# Alternative kept for reference:
# display_html('<h1>Hello, world!</h1>', raw=True)