# Build a vocabulary

In [1]:
import sys
sys.path.append('..')

In [21]:
import sqlite3
from collections import Counter, defaultdict
from pathlib import Path
from zipfile import ZipFile

import chardet
import nltk
import pandas as pd
import regex as re
from tqdm import tqdm

from digi_leap.pylib import db
from digi_leap.pylib import vocab

In [3]:
# Directory holding the raw inputs used to build the vocabulary.
DATA = Path('..', 'data', 'raw')

# Input sources read by the cells below.
IDIGBIO = DATA.joinpath('occurrence_raw_idigbio_2021-02.sqlite3.db')
ITIS = DATA.joinpath('ITIS.sqlite')
NLTK = DATA.joinpath('nltk_corpora')

# Destination database; its location is defined by the project's vocab module.
VOCAB = vocab.VOCAB_DB

In [4]:
# Running word -> count tally; text_to_vocab() adds to it on every call.
WORDS = Counter()

## Common functions

In [5]:
def text_to_vocab(text):
    """Add the words found in a piece of text to the global WORDS counter.

    A word is a maximal run of regex word characters. Tokens that
    vocab.is_number() flags are left out of the tally.

    Args:
        text: The string to tokenize. None or an empty string is ignored,
            so callers can pass database values straight through even when
            a column is NULL.
    """
    # Database rows may hold NULLs; re.findall() raises TypeError on None.
    if not text:
        return
    words = (w for w in re.findall(r'\w+', text) if not vocab.is_number(w))
    WORDS.update(words)

## Get NLTK corpora

In [6]:
# File stems of zip members that read_nltk() skips.
SKIPS = {'README', 'LICENSE'}

In [7]:
def read_nltk(zip_file):
    """Add every word in a zipped corpus to the vocabulary.

    Directory entries and members whose file stem is in SKIPS are ignored.
    Every other member is read line by line; each line's encoding is guessed
    with chardet, decoded, and passed to text_to_vocab(). The archive name
    and each processed member name are printed as progress output.

    Args:
        zip_file: Name of a zip archive located inside the NLTK directory.
    """
    print(f'\n{zip_file}')
    zip_path = NLTK / zip_file
    with ZipFile(zip_path) as zippy:
        for info in zippy.infolist():
            if info.is_dir() or Path(info.filename).stem in SKIPS:
                continue
            print(info.filename)
            # Close each member explicitly rather than leaving the handle
            # open until it is garbage collected.
            with zippy.open(info.filename) as member:
                for ln in member:
                    # Slow: the encoding is guessed separately for every line.
                    # chardet reports None when it cannot decide, which would
                    # make bytes.decode() raise TypeError, so fall back to UTF-8.
                    enc = chardet.detect(ln)['encoding'] or 'utf-8'
                    # A wrong guess should not abort the whole run; undecodable
                    # bytes become U+FFFD, which is not a word character.
                    text_to_vocab(ln.decode(enc, errors='replace'))

In [8]:
# read_nltk('words.zip')
# read_nltk('gazetteers.zip')
# read_nltk('gutenberg.zip')
# read_nltk('names.zip')
# read_nltk('webtext.zip')

## Get ITIS data

In [9]:
def itis_names():
    """Add the words in the ITIS expert and author name columns to the vocabulary.

    The query unions one name column from each of the experts, publications,
    strippedauthor, and taxon_authors_lkp tables.
    """
    query = """
        select expert as name from experts
        union select reference_author from publications
        union select shortauthor from strippedauthor
        union select short_author from taxon_authors_lkp
        """
    rows = db.build_select(ITIS, query)
    for record in tqdm(rows):
        # The query returns a single column, so the name is always at index 0.
        text_to_vocab(record[0])

In [10]:
def itis_taxa():
    """Add the words in the four unit_name columns of ITIS taxonomic_units to the vocabulary."""
    # The data is normalized, so a plain "union" is used rather than "union all".
    query = """
        select unit_name1 as name from taxonomic_units where unit_name1 is not null
        union select unit_name2 from taxonomic_units where unit_name2 is not null
        union select unit_name3 from taxonomic_units where unit_name3 is not null
        union select unit_name4 from taxonomic_units where unit_name4 is not null
        """
    rows = db.build_select(ITIS, query)
    for record in tqdm(rows):
        # Single-column result: the unit name is at index 0.
        text_to_vocab(record[0])

In [11]:
def itis_vernaculars():
    """Add the words in every ITIS vernacular_name value to the vocabulary."""
    query = """select vernacular_name from vernaculars"""
    rows = db.build_select(ITIS, query)
    for record in tqdm(rows):
        # Single-column result: the vernacular name is at index 0.
        text_to_vocab(record[0])

In [12]:
# itis_names()
# itis_taxa()
# itis_vernaculars()

## Get iDigBio data

In [14]:
# Names of the occurrence_raw columns that idigbio_data() reads, grouped by
# the kind of text they appear to hold (grouping inferred from the variable
# names -- confirm against the iDigBio schema). Each block is a
# whitespace-separated list that .split() turns into a list of column names.

# Columns that appear to describe the taxon.
TAXON_COLS = """
    accepted_name_usage class family genus group
    higher_classification infraspecific_epithet
    kingdom order original_name_usage phylum
    previous_identifications scientific_name
    specific_epithet subgenus taxon_rank
    verbatim_taxon_rank vernacular_name parent_name_usage
    verbatim_scientific_name
    """.split()

# Columns that appear to hold names of people or authorities.
NAME_COLS = """
    georeferenced_by identified_by location_according_to
    name_according_to recorded_by scientific_name_authorship
    record_entered_by 
    """.split()

# Columns that appear to hold place and coordinate text.
PLACE_COLS = """
    continent country country_code county island
    island_group municipality higher_geography
    state_province verbatim_coordinate_system
    verbatim_coordinates verbatim_elevation
    verbatim_latitude verbatim_longitude water_body
    georeference_protocol verbatim_srs
    """.split()

# Remaining free-text and remark columns.
WORD_COLS = """
    dataset_name dwc_rights_holder dwc_verbatim_event_date
    establishment_means event_remarks georeference_remarks
    georeference_sources habitat identification_remarks
    information_withheld life_stage locality location_remarks
    occurrence_remarks organism_remarks
    georeference_verification_status name_published_in
    owner_institution_code preparations reproductive_condition
    rights_holder sampling_protocol sex taxon_remarks
    taxonomic_status type_status verbatim_event_date
    verbatim_locality
    """.split()

# Every column selected by idigbio_data(), in query order.
COLUMNS = TAXON_COLS + NAME_COLS + PLACE_COLS + WORD_COLS

In [15]:
def idigbio_data():
    """Add the words in every COLUMNS field of each occurrence_raw row to the vocabulary."""
    # Backtick-quote each column name before splicing it into the query.
    quoted = ', '.join(f"`{name}`" for name in COLUMNS)
    query = f"""select {quoted} from occurrence_raw"""
    for record in tqdm(db.build_select(IDIGBIO, query)):
        fields = dict(record)
        for name in COLUMNS:
            text_to_vocab(fields[name])

In [16]:
# idigbio_data()

## Write to the vocab database

In [17]:
def to_vocab_db():
    """Create the vocab table (passing drop=True) and insert the (word, count) pairs from WORDS."""
    db.create_vocab_table(VOCAB, drop=True)
    db.insert_vocabulary_words(VOCAB, list(WORDS.items()))

In [18]:
# to_vocab_db()

## Look at the character set

In [29]:
# Reload the saved vocabulary so the cells below can run without rebuilding it.
# Note that this rebinds WORDS from a Counter to a plain dict built from the
# first two columns of each vocab row.
#
# Using a sqlite3 connection as a context manager only commits or rolls back
# the transaction; it does not close the connection, so close it explicitly.
cxn = sqlite3.connect(VOCAB)
try:
    sql = 'select * from vocab'
    WORDS = {r[0]: r[1] for r in cxn.execute(sql)}
finally:
    cxn.close()

In [30]:
# Collect every distinct character used by any word in the vocabulary.
# The loop previously iterated over the undefined name `words`; the
# vocabulary lives in WORDS. Only the keys are needed here.
chars = set()
for word in WORDS:
    chars.update(word)

print(sorted(chars))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ª', 'µ', 'º', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'ā', 'ă', 'ą', 'ć', 'ĉ', 'ċ', 'č', 'ď', 'đ', 'ē', 'ĕ', 'ė', 'ę', 'ě', 'ĝ', 'ğ', 'ġ', 'ħ', 'ĩ', 'ī', 'ĭ', 'į', 'ı', 'ķ', 'ĺ', 'ļ', 'ľ', 'ł', 'ń', 'ņ', 'ň', 'ŋ', 'ō', 'ŏ', 'ő', 'œ', 'ŕ', 'ř', 'ś', 'ŝ', 'ş', 'š', 'ţ', 'ť', 'ũ', 'ū', 'ŭ', 'ů', 'ű', 'ŵ', 'ŷ', 'ź', 'ż', 'ž', 'ſ', 'ƒ', 'ơ', 'ư', 'ǀ', 'ǁ', 'ǎ', 'ǝ', 'ǥ', 'ǫ', 'ǿ', 'ȃ', 'ȋ', 'ș', 'ț', 'ȳ', 'ə', 'ɛ', 'ɶ', 'ʻ', 'ʼ', 'ʽ', 'ʿ', 'ˆ', '̀', '́', '̂', '̃', '̄', '̆', '̇', '̈', '̊', '̋', '̌', '̍', '̜', '̢', '̦', '̧', '̩', '̬', '̱', '̲', '͠', '͡', 'ͦ', 'ͱ', 'ʹ', 'ͻ', 'ΐ', 'ά', 'έ', 'ή', 'ί', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ω', 'ό', 'ύ', 'ώ', 'а', 'б', 'в', 'г',

## Remove words with certain characters

In [32]:
# Keep only the words whose characters all sort between 'a' and 'ž' inclusive.
old = [*WORDS.items()]
new = [
    (word, count)
    for word, count in old
    if 'a' <= min(word) and max(word) <= 'ž'
]

In [33]:
# Vocabulary size before and after the character filter, one per line.
print(len(old), len(new), sep='\n')

3038513
3037124


## Look at how often characters are used

In [39]:
# Total uses of each character: every occurrence of a character in a word
# counts once per occurrence of that word.
chars = defaultdict(int)
for word, freq in new:
    for char in word:
        chars[char] += freq

In [40]:
# One "character total" line per character, in first-seen order.
for pair in chars.items():
    print(*pair)

a 4037155287
l 1778385944
i 2576731874
m 1113043937
n 2124650937
r 2168575361
d 1158749086
v 297950477
k 229055867
w 242184169
o 2312754508
f 437132114
u 1099590274
b 508137107
c 1445873493
t 1972650864
e 3223351923
y 487128387
s 1995752635
h 847848271
x 94522787
g 581379950
z 77450151
j 102260471
p 966797908
q 42696521
å 4961264
ô 1610442
é 14101442
è 1129784
î 99838
æ 709600
ß 18698
ü 2702049
ä 5612947
ö 6614731
à 483978
ã 7476540
â 2840961
á 20488858
ø 4002871
ó 13543907
ç 3714903
ñ 1328256
ú 1283447
í 11208184
ë 160571
ê 1424301
ý 20719
ï 74536
ò 7803
ì 68566
û 66797
õ 242365
č 13588
ě 13309
œ 8064
ğ 1882
š 33523
ą 1037
ū 19616
þ 1660
ŕ 13
ń 1337
µ 16216
ř 2478
ġ 23
ħ 1917
ă 2436
ł 3880
ž 3237
ð 15412
ę 221
ś 1774
ĕ 1763
ć 2334
ō 3100
ı 1861
ş 2594
ù 20564
į 6
ż 972
ā 9502
º 50255
ŏ 574
ī 3984
ľ 62
ţ 1174
ů 158
ė 104
ň 3745
ŷ 10
ª 43946
ÿ 2340
ŋ 5
ē 180
ő 367
ď 115
ű 32
đ 574
ĩ 713
ũ 288
ĭ 63
ź 93
ŝ 4
ŭ 446
ť 42
ĉ 11
ŵ 408
ĺ 5
ĝ 2
ļ 8
ċ 36
ķ 2
ņ 2


In [43]:
# Characters used at least 10,000 times, shown as one sorted string.
use = [char for char, total in chars.items() if total >= 10_000]
''.join(sorted(use))

'abcdefghijklmnopqrstuvwxyzªµºßàáâãäåæçèéêëìíîïðñóôõöøùúûüýčěšū'