# Build a vocabulary

In [1]:
import sys
sys.path.append('..')

In [2]:
import sqlite3
from collections import defaultdict
from pathlib import Path
from zipfile import ZipFile

import chardet
import nltk
import regex as re
from tqdm import tqdm

from digi_leap.pylib import db
from digi_leap.pylib import vocab

In [3]:
# Directory holding the raw input files, relative to the working directory.
DATA = Path('..') / 'data' / 'raw'

# Database file queried by idigbio_data() (table occurrence_raw).
IDIGBIO = DATA / 'occurrence_raw_idigbio_2021-02.sqlite3.db'
# Database file queried by the itis_*() functions.
ITIS = DATA / 'ITIS.sqlite'
# Directory containing the zip archives opened by read_nltk().
NLTK = DATA / 'nltk_corpora'
# Target vocabulary database written by to_vocab_db().
VOCAB = vocab.VOCAB_DB

In [4]:
# Global tally mapping each word to its occurrence count; filled by
# text_to_vocab() and written out by to_vocab_db().
WORDS = defaultdict(int)

## Common functions

In [5]:
def text_to_vocab(text):
    """Count the non-numeric words of ``text`` in the global ``WORDS`` tally.

    Args:
        text: Text to split into words with ``vocab.text_to_words``. ``None``
            (e.g. a NULL database column) is ignored.

    Returns:
        None. ``WORDS`` is updated in place.
    """
    # Several callers pass raw column values without filtering NULLs in SQL,
    # so skip missing values here rather than relying on the tokenizer.
    if text is None:
        return
    for word in vocab.text_to_words(text):
        if not vocab.is_number(word):
            WORDS[word] += 1

## Get NLTK corpora

In [6]:
# Archive members whose file stem appears here are not read by read_nltk().
SKIPS = {'README', 'LICENSE'}

In [7]:
def read_nltk(zip_file):
    """Add the words of every file in one corpus archive to the vocabulary.

    Prints the archive name and each member name as progress output.
    Directories and members whose stem is in ``SKIPS`` are ignored.

    Args:
        zip_file: File name of a zip archive located in the ``NLTK`` directory.
    """
    print(f'\n{zip_file}')
    with ZipFile(NLTK / zip_file) as zippy:
        for info in zippy.infolist():
            if info.is_dir() or Path(info.filename).stem in SKIPS:
                continue
            print(info.filename)
            # Open the member in a context manager so its handle is closed
            # as soon as the file has been read.
            with zippy.open(info.filename) as member:
                for ln in member:
                    # The encoding is detected per line.
                    enc = chardet.detect(ln)['encoding']  # Slow
                    # chardet reports None when it cannot decide; fall back to
                    # UTF-8. Undecodable bytes are replaced instead of aborting
                    # the whole load; cleanly decodable lines are unaffected.
                    text_to_vocab(ln.decode(enc or 'utf-8', errors='replace'))

In [8]:
# Load each corpus archive in turn; the order only affects the printed log.
for archive in (
    'words.zip',
    'gazetteers.zip',
    'gutenberg.zip',
    'names.zip',
    'webtext.zip',
):
    read_nltk(archive)


words.zip
words/en
words/en-basic

gazetteers.zip
gazetteers/caprovinces.txt
gazetteers/countries.txt
gazetteers/isocountries.txt
gazetteers/mexstates.txt
gazetteers/nationalities.txt
gazetteers/uscities.txt
gazetteers/usstateabbrev.txt
gazetteers/usstates.txt

gutenberg.zip
gutenberg/austen-emma.txt
gutenberg/austen-persuasion.txt
gutenberg/austen-sense.txt
gutenberg/bible-kjv.txt
gutenberg/blake-poems.txt
gutenberg/bryant-stories.txt
gutenberg/burgess-busterbrown.txt
gutenberg/carroll-alice.txt
gutenberg/chesterton-ball.txt
gutenberg/chesterton-brown.txt
gutenberg/chesterton-thursday.txt
gutenberg/edgeworth-parents.txt
gutenberg/melville-moby_dick.txt
gutenberg/milton-paradise.txt
gutenberg/shakespeare-caesar.txt
gutenberg/shakespeare-hamlet.txt
gutenberg/shakespeare-macbeth.txt
gutenberg/whitman-leaves.txt

names.zip
names/female.txt
names/male.txt

webtext.zip
webtext/firefox.txt
webtext/grail.txt
webtext/overheard.txt
webtext/pirates.txt
webtext/singles.txt
webtext/wine.txt


## Get ITIS data

In [9]:
def itis_names():
    """Add the expert and author name columns of the ITIS database to the vocabulary."""
    # "union" (not "union all") yields each distinct value once.
    query = """
        select expert as name from experts
        union select reference_author from publications
        union select shortauthor from strippedauthor
        union select short_author from taxon_authors_lkp
        """
    records = db.select_records(ITIS, query)
    for record in tqdm(records):
        text_to_vocab(record[0])

In [10]:
def itis_taxa():
    """Add the four unit-name columns of ITIS taxonomic_units to the vocabulary."""
    # The data is normalized, so plain "union" is used rather than "union all".
    query = """
        select unit_name1 as name from taxonomic_units where unit_name1 is not null
        union select unit_name2 from taxonomic_units where unit_name2 is not null
        union select unit_name3 from taxonomic_units where unit_name3 is not null
        union select unit_name4 from taxonomic_units where unit_name4 is not null
        """
    records = db.select_records(ITIS, query)
    for record in tqdm(records):
        text_to_vocab(record[0])

In [11]:
def itis_vernaculars():
    """Add every ITIS vernacular name to the vocabulary."""
    query = 'select vernacular_name from vernaculars'
    records = db.select_records(ITIS, query)
    for record in tqdm(records):
        text_to_vocab(record[0])

In [12]:
# Run the three ITIS loaders; each adds its words to the global WORDS tally.
itis_names()
itis_taxa()
itis_vernaculars()

168721it [00:02, 79171.79it/s]
291801it [00:01, 159727.94it/s]
131983it [00:01, 98755.24it/s]


## Get iDigBio data

In [13]:
# Column names selected from the occurrence_raw table by idigbio_data().
# They are split into four lists by theme and concatenated into COLUMNS below.

# Taxon-related columns.
TAXON_COLS = """
    accepted_name_usage class family genus group
    higher_classification infraspecific_epithet
    kingdom order original_name_usage phylum
    previous_identifications scientific_name
    specific_epithet subgenus taxon_rank
    verbatim_taxon_rank vernacular_name parent_name_usage
    verbatim_scientific_name
    """.split()

# Columns naming people or authorities.
NAME_COLS = """
    georeferenced_by identified_by location_according_to
    name_according_to recorded_by scientific_name_authorship
    record_entered_by 
    """.split()

# Geographic and coordinate columns.
PLACE_COLS = """
    continent country country_code county island
    island_group municipality higher_geography
    state_province verbatim_coordinate_system
    verbatim_coordinates verbatim_elevation
    verbatim_latitude verbatim_longitude water_body
    georeference_protocol verbatim_srs
    """.split()

# Remaining free-text and descriptive columns.
WORD_COLS = """
    dataset_name dwc_rights_holder dwc_verbatim_event_date
    establishment_means event_remarks georeference_remarks
    georeference_sources habitat identification_remarks
    information_withheld life_stage locality location_remarks
    occurrence_remarks organism_remarks
    georeference_verification_status name_published_in
    owner_institution_code preparations reproductive_condition
    rights_holder sampling_protocol sex taxon_remarks
    taxonomic_status type_status verbatim_event_date
    verbatim_locality
    """.split()

# Every column read by idigbio_data(), in selection order.
COLUMNS = TAXON_COLS + NAME_COLS + PLACE_COLS + WORD_COLS

In [14]:
def idigbio_data():
    """Add the text of every COLUMNS field of each occurrence_raw row to the vocabulary."""
    # Backtick-quote each column name; COLUMNS contains SQL keywords such as
    # "order" and "group".
    quoted = ', '.join(f"`{name}`" for name in COLUMNS)
    query = f"select {quoted} from occurrence_raw"
    records = db.select_records(IDIGBIO, query)
    for record in tqdm(records):
        fields = dict(record)
        for name in COLUMNS:
            text_to_vocab(fields[name])

In [None]:
# Scan the whole occurrence_raw table; the recorded run took roughly 1h50m.
idigbio_data()

18220777it [1:49:44, 2928.15it/s]

## Write to the vocab database

In [None]:
def to_vocab_db():
    """Write the accumulated WORDS counts to the vocabulary database.

    The vocab table is created with ``drop=True`` before the (word, count)
    pairs are inserted.
    """
    db.create_vocab_table(VOCAB, drop=True)
    db.insert_vocabulary_words(VOCAB, list(WORDS.items()))

In [None]:
# Persist the word counts gathered by the cells above.
to_vocab_db()