# Build locality terms from the iDigBio data

This is data that I gleaned from a raw iDigBio dump for gazetteer input.

In [1]:
import csv
import html
import sqlite3
import unicodedata as uni
from collections import defaultdict, namedtuple
from multiprocessing import Pool
from pathlib import Path

import pandas as pd
import regex as re
import spacy
from flora.pylib.traits import terms as f_terms
from tqdm.notebook import tqdm
from traiter.pylib import term_util as tu
from traiter.pylib.pipes import extensions, tokenizer
from traiter.pylib.traits import terms as t_terms

In [2]:
# You will definitely need to change these directories
INPUT_DATA_DIR = Path("..", "..", "..", "digi-leap", "gazetteer", "data")
INPUT_DB = INPUT_DATA_DIR.joinpath("gazetteer_04_idigbio_2020-03-30.db")

# Columns read from every input gazetteer record
FIELDS = [
    "locality",
    "continent",
    "country",
    "countryCode",
    "county",
    "higherGeography",
    "island",
    "islandGroup",
    "locationRemarks",
    "municipality",
    "stateProvince",
    "waterBody",
]

# Where this notebook keeps its own working files and results
DATA_DIR = Path("..", "data", "idigbio")
TEMP_DIR = DATA_DIR.joinpath("temp")
DB = DATA_DIR.joinpath("localities.sqlite")
LOCALITIES = DATA_DIR.joinpath("locality_terms.csv")

PROCESSES = 16  # Number of parallel processes

# Spacy POS tags to skip
POS_ALIAS = ["ADP", "CCONJ", "DET", "NUM", "PUNCT", "SCONJ", "SYM"]

In [3]:
# Result of normalizing one raw locality value:
#   loc   - the cleaned locality text ("" when the value was rejected)
#   add   - amount to add to the locality's hit count
#   error - amount to add to the running error count
Location = namedtuple("Location", ["loc", "add", "error"])

## Setup spaCy

In [4]:
# Run traiter's extension setup before the pipeline is loaded
# NOTE(review): presumably this registers custom spaCy extensions -- confirm in traiter
extensions.add_extensions()
# Load the medium English model, leaving out its "ner" component
nlp = spacy.load("en_core_web_md", exclude=["ner"])
# Let traiter adjust this pipeline's tokenizer (the details live in traiter)
tokenizer.setup_tokenizer(nlp)

## Basic locality normalization

See [here](https://en.wikipedia.org/wiki/Unicode_character_property) for a description of the character class abbreviations.

The raw localities are rough, so we perform some simple normalization steps to improve them.

In [5]:
# A normalized locality with this many characters or fewer is rejected
TOO_SHORT = 3

# A run of punctuation characters; interpolated into the verbose patterns below
PUNCT = """[&%$#!*,/;.:?'"_-]+"""

# Ordered (compiled pattern, replacement) pairs used to clean up a locality.
# NOTE(review): the first rule strips every bracket character, so when the rules
# are applied in this order the later rules that require a literal "(" can no
# longer match -- confirm that this is intended.
SUBS = [
    # Aggressively remove brackets
    (re.compile(r" [()\[\]\{\}]+ ", flags=re.X), " "),
    # Like (...)
    (re.compile(rf"\(+ {PUNCT} \)+", flags=re.X), " "),
    # Like (9)
    (re.compile(r"\(+ \s* \d* \s* \)+ ", flags=re.X), " "),
    # Lat/long: two number-like runs separated by spaces or commas
    (re.compile(r"\(? [\d.-]+ [\s,]+ [\d.-]+ \)?", flags=re.X), " "),
    # CSV delimiters? The question marks are odd, I admit
    (re.compile(r"[.,?]{2,}"), " "),
    # Enclosing quotes, brackets, or slashes: keep only what is inside them
    (re.compile(r"""^ [({\['"/] \s* (.+) \s* [\]})'"/] $""", flags=re.X), r"\1"),
    # Leading PUNCT
    (re.compile(rf"^( \s* {PUNCT} \s* )+", flags=re.X), " "),
    # Trailing PUNCT
    (re.compile(rf"( \s* {PUNCT} \s* )+ $", flags=re.X), " "),
    # Handle contractions & possessives: drop the space before 's or 't
    (re.compile(r" \s ( '[st] ) ", flags=re.X), r"\1"),
    # Handle abbreviations: rejoin a short word (1-4 letters) with its detached period
    (re.compile(r" ([\p{L}\p{M}]{1,4}) \s ( \. ) ", flags=re.X), r"\1\2"),
    # Handle periods: put a space between a longer word (5+ letters) and its period
    (re.compile(r" ([\p{L}\p{M}]{5,}) ( \. ) ", flags=re.X), r"\1 \2"),
    # Remove backslashes
    (re.compile(r"\\", flags=re.X), ""),
]

# Unicode general categories whose characters get replaced by a space
CONTROLS = ["Cc", "Cf", "Cs", "Co", "Cn"]  # Control, format, surrogate, private use, unassigned
SYMBOLS = ["Sc"]  # Currency symbols
SEPARATORS = ["Zl", "Zp"]  # Line & paragraph separators
REMOVE = [*CONTROLS, *SYMBOLS, *SEPARATORS]

In [6]:
def substitute(loc, subs):
    """Apply the substitutions over and over until the text stops changing.

    Args:
        loc: The text to clean up.
        subs: An ordered sequence of (compiled pattern, replacement) pairs.

    Returns:
        The text after a full pass over the substitutions changes nothing. The
        text is stripped after every single substitution. An empty string is
        returned as is, without trying any substitution.
    """
    if loc == "":
        return loc

    while True:
        before = loc

        for pattern, replacement in subs:
            loc = pattern.sub(replacement, loc).strip()

        if loc == before:
            return loc

In [7]:
def normalize_location(loc):
    """Clean up one raw locality value.

    The value is HTML-unescaped and lower cased, characters in the REMOVE
    categories are blanked out, the text is NFKD normalized, the SUBS
    substitutions are applied, and runs of whitespace are collapsed.

    Args:
        loc: The raw locality value.

    Returns:
        Location(loc=<cleaned text>, add=1, error=0) on success, or
        Location(loc="", add=0, error=1) when the cleanup raises a ValueError or
        TypeError or when the cleaned text has TOO_SHORT characters or fewer.
    """
    rejected = Location(loc="", add=0, error=1)

    try:
        # Replace HTML entities & lower case the string
        text = html.unescape(loc).lower()

        # Blank out control characters & some punct
        text = "".join(" " if uni.category(ch) in REMOVE else ch for ch in text)

        # Split characters into their compatibility decomposition
        text = uni.normalize("NFKD", text)

        # Do some replacements
        text = substitute(text, SUBS)

        # Normalize spaces
        text = " ".join(text.split())

    except (ValueError, TypeError):
        return rejected

    # Too short
    if len(text) <= TOO_SHORT:
        return rejected

    # Add it
    return Location(loc=text, add=1, error=0)

In [8]:
def normalize():
    """Normalize every locality field of the input gazetteer and tally the results.

    Reads each record of the "gazetteer" table in INPUT_DB, normalizes the value
    of every column named in FIELDS, and counts how often each normalized
    locality occurs. The tallies replace the "normalized" table (columns
    "locality" and "hits") in DB.

    Returns:
        The number of non-empty field values that normalize_location() rejected.
    """
    from contextlib import closing

    normals = defaultdict(int)
    errors = 0

    # sqlite3's own context manager only ends the transaction, closing() is
    # what actually releases the connection
    with closing(sqlite3.connect(INPUT_DB)) as cxn:
        cxn.row_factory = sqlite3.Row

        total = cxn.execute("""select count(*) from gazetteer""").fetchone()[0]

        for rec in tqdm(cxn.execute("""select * from gazetteer"""), total=total):
            for field in FIELDS:
                loc = rec[field]

                # Test the value, not the column name: NULL and empty columns
                # hold no locality and are not normalization errors
                if loc is None or loc == "":
                    continue

                norm = normalize_location(loc)
                if norm.loc:
                    normals[norm.loc] += norm.add
                errors += norm.error

    # Naming the columns keeps the table schema intact even with no localities
    df = pd.DataFrame(list(normals.items()), columns=["locality", "hits"])

    # The inner "cxn" context commits on success and rolls back on an error
    with closing(sqlite3.connect(DB)) as cxn, cxn:
        df.to_sql("normalized", cxn, index=False, if_exists="replace")

    return errors


# normalize()

## Get locality words

In [9]:
# Matches a leading run of digits and number-related punctuation in a token
NUM_RE = re.compile(r"^ [\d.°/,\'\"+-]+ ", flags=re.X)

In [10]:
def get_words_proc(limit, offset):
    """Count the locality words in one chunk of the "normalized" table.

    Every phrase in the chunk is run through the spaCy pipeline. Tokens with a
    part of speech in POS_ALIAS, punctuation, and quotes are skipped. A leading
    run of numeric characters is stripped from each remaining token, and what is
    left is dropped when it is a single character (or empty) or a known taxon
    word. The surviving words are tallied, weighted by the phrase's hit count,
    and written to TEMP_DIR / "words_<offset>.csv" with the columns "pattern"
    and "hits".

    Args:
        limit: The maximum number of rows in the chunk.
        offset: The number of rows to skip before the chunk starts.

    Returns:
        The number of phrases that spaCy could not parse (ValueError).
    """
    from contextlib import closing

    errors = 0
    all_words = defaultdict(int)

    # Only the monomial taxon terms are loaded
    taxa_dir = Path(f_terms.__file__).parent
    monomial_terms = taxa_dir / "monomial_terms.zip"

    # Candidates are single tokens, so index the taxon terms word by word
    taxa = {
        word
        for term in tu.read_terms([monomial_terms])
        for word in term["pattern"].split()
    }

    # sqlite3's own context manager only ends the transaction, closing() is
    # what actually releases the connection
    with closing(sqlite3.connect(DB)) as cxn:
        # Without an explicit order, limit/offset paging is not guaranteed to
        # split the table into disjoint chunks
        rows = cxn.execute(
            """select locality, hits from normalized
                order by rowid limit ? offset ?""",
            (limit, offset),
        )

        for phrase, hits in rows:
            try:
                doc = nlp(phrase)
            except ValueError:
                errors += 1
                continue

            for token in doc:
                if token.pos_ in POS_ALIAS or token.is_punct or token.is_quote:
                    continue

                word = NUM_RE.sub("", token.lower_)

                if len(word) <= 1 or word in taxa:
                    continue

                all_words[word] += hits

    # Naming the columns keeps the CSV header intact even for an empty chunk
    df = pd.DataFrame(list(all_words.items()), columns=["pattern", "hits"])

    # The temp directory may not exist yet; exist_ok makes this safe when
    # several workers get here at the same time
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    df.to_csv(TEMP_DIR / f"words_{offset}.csv", index=False)

    return errors

In [11]:
def get_words(processes=None, limit=1_000_000):
    """Extract the locality words from the "normalized" table in parallel.

    The table is split into chunks of `limit` rows and every chunk is handed to
    get_words_proc() in a worker process. Each worker writes its own CSV file.

    Args:
        processes: The number of worker processes. Defaults to the module-level
            PROCESSES setting.
        limit: The number of rows per chunk.

    Returns:
        The total number of errors reported by the workers.
    """
    from contextlib import closing

    if processes is None:
        processes = PROCESSES

    # sqlite3's own context manager only ends the transaction, closing() is
    # what actually releases the connection
    with closing(sqlite3.connect(DB)) as cxn:
        count = cxn.execute("""select count(*) from normalized""").fetchone()[0]

    offsets = range(0, count, limit)

    with Pool(processes=processes) as pool, tqdm(total=len(offsets)) as bar:
        results = [
            pool.apply_async(
                get_words_proc,
                args=(limit, offset),
                callback=lambda _: bar.update(),
            )
            for offset in offsets
        ]

        # Collect inside the "with" because leaving it terminates the pool
        return sum(r.get() for r in results)


# Extract the words in parallel; the value shown below is the total error count
get_words()

  0%|          | 0/12 [00:00<?, ?it/s]

0

In [12]:
def write_words():
    """Merge the per-chunk word counts and save the totals.

    Sums the "hits" of every "pattern" over all TEMP_DIR / "words_*.csv" files,
    prints the number of distinct words and the total number of hits, and then
    writes the totals to the "words" table in DB (replacing it) and to the
    LOCALITIES CSV file.
    """
    from contextlib import closing

    words = defaultdict(int)

    for path in tqdm(sorted(TEMP_DIR.glob("words_*.csv"))):
        # pandas writes CSV files as UTF-8 by default, so do not rely on the
        # platform's encoding; the csv module also asks for newline=""
        with open(path, encoding="utf-8", newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                words[row["pattern"]] += int(row["hits"])

    print(f"{len(words)=}")
    print(f"{sum(words.values())=}")

    # Naming the columns keeps the schema intact even when there are no words
    df = pd.DataFrame(list(words.items()), columns=["pattern", "hits"])

    # closing() releases the connection, the inner "cxn" context commits
    with closing(sqlite3.connect(DB)) as cxn, cxn:
        df.to_sql("words", cxn, index=False, if_exists="replace")

    df.to_csv(LOCALITIES, index=False)


# Merge the per-chunk CSV files into the final word table and CSV file
write_words()

  0%|          | 0/12 [00:00<?, ?it/s]

len(words)=1285747
sum(words.values())=190769141
