# Download lists of mimosa species and create a combined list

In [1]:
import sys

sys.path.append('..')

In [2]:
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from mimosa.pylib import consts

In [3]:
# Input data lives in the ``data`` directory one level above the working directory.
DATA_DIR = Path('..', 'data')
# Sub-directory holding the species-list pages read by the cells below.
SPECIES_DIR = DATA_DIR.joinpath('species')

In [4]:
# Running set of taxon names; each source section below adds its names to it.
TAXONS = set()

## Source: Catalog of Life

There seems to be no API on this site, so I searched for "mimosa", set the rows per page to "1000", and downloaded the result pages directly. We're only interested in the text, so everything else can be discarded. The table layout seems to have 5 columns:
1. blank
2. the scientific name
3. accepted or a synonym
4. taxonomic level
5. parent phylogeny: kingdom & family

In [5]:
# The two saved search-result pages inside the species directory.
PAGE1 = SPECIES_DIR.joinpath('mimosa_1.html')
PAGE2 = SPECIES_DIR.joinpath('mimosa_2.html')

In [6]:
# Collect species names from the saved Catalog of Life result pages.
#
# Every table row with exactly 5 cells is treated as a result row:
#   (blank, scientific name, accepted/synonym, taxonomic level, parents).
# Only rows whose parents mention Fabaceae and whose level is 'species'
# contribute to TAXONS.
for path in [PAGE1, PAGE2]:

    with open(path) as in_file:
        page = in_file.read()

    soup = BeautifulSoup(page, features='lxml')

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')

        # Rows that do not have the 5-column result layout are skipped.
        if len(tds) != 5:
            continue

        _, taxon, _, level, parents = [td.get_text() for td in tds]

        if 'Fabaceae' not in parents:
            continue

        if level == 'species':
            # Keep only the first two whitespace-separated words of the name.
            TAXONS.add(' '.join(taxon.split()[:2]))
        elif level not in ('genus', 'subspecies', 'variety'):
            # Genus, subspecies and variety rows are recognised but not
            # collected; any other level is unexpected, so fail loudly and
            # say which row caused it.
            raise ValueError(
                f'Unexpected taxonomic level {level!r} for taxon {taxon!r}')

In [7]:
# Display how many distinct names have been collected so far.
len(TAXONS)

902

## Source: Wikispecies

In [8]:
# Saved Wikispecies page inside the species directory.
PAGE = SPECIES_DIR.joinpath('Mimosa_Wikispecies.html')

In [9]:
# Pull candidate names out of the italic elements of the saved page.
page = PAGE.read_text()

soup = BeautifulSoup(page, features='lxml')

names = set()
for italic in soup.find_all('i'):
    text = italic.get_text()

    # Only italic text starting with 'M' is considered.
    if not text.startswith('M'):
        continue

    # Turn non-breaking spaces into plain spaces, then expand 'M.' to 'Mimosa'.
    text = text.replace('\xa0', ' ').replace('M.', 'Mimosa')

    # Keep two-word names only.
    if len(text.split()) == 2:
        names.add(text)

TAXONS.update(names)

len(TAXONS)

1056

## Create a vocabulary file

In [10]:
# The second word of every collected name becomes a 'species' pattern.
specifics = [name.split()[1] for name in TAXONS]
specifics.sort()

# One row per pattern, with columns in the order label, pattern, attr.
df = pd.DataFrame({'label': 'species', 'pattern': specifics, 'attr': ''})

# Append a single row for the genus itself after the species rows.
df.loc[len(df)] = ['genus', 'Mimosa', '']

df.tail()

Unnamed: 0,label,pattern,attr
1052,species,zacapana,
1053,species,zimapanensis,
1054,species,zygia,
1055,species,zygophylla,
1056,genus,Mimosa,


In [12]:
# Write the vocabulary table to 'taxa.csv' under consts.VOCAB_DIR,
# leaving the DataFrame index out of the file.
out_path = consts.VOCAB_DIR / 'taxa.csv'

df.to_csv(out_path, index=False)