# Build locality terms from the BELS gazetteer

BELS has a different goal than Digi-Leap: BELS is trying to pinpoint latitudes and longitudes from strings, whereas Digi-Leap is parsing strings that may then be fed into BELS. I'm taking the data from BELS and trying to reduce it to a smaller number of patterns that can be fed into spaCy rule parsers.

In [1]:
import csv
import html
import sqlite3
import unicodedata as uni
from collections import defaultdict, namedtuple
from multiprocessing import Pool
from pathlib import Path

import pandas as pd
import regex as re
import spacy
from flora.pylib.traits import terms as f_terms
from tqdm.notebook import tqdm
from traiter.pylib import term_util as tu
from traiter.pylib.pipes import extensions, tokenizer
from traiter.pylib.traits import terms as t_terms

In [2]:
# Root of the data directory, relative to the current working directory
DATA_DIR = Path("..") / "data"

# NOTE(review): not referenced anywhere in the visible code; get_aliases() uses
# its own local process count -- confirm whether this constant is still needed
PROCESSES = 16  # Number of parallel processes

In [3]:
# Result of normalizing one raw locality string:
#   loc   = the normalized text, or "" when normalization failed
#   add   = amount to add to the locality's hit count (1 on success, 0 on failure)
#   error = 1 when normalization failed, otherwise 0
Location = namedtuple("Location", "loc add error")

## BELS localities given to me

In [4]:
BELS = DATA_DIR / "bels"  # Directory for all of the BELS files
BELS_DB = BELS / "localities.sqlite"  # SQLite database holding every intermediate table
BELS_ORI = BELS / "original"  # The raw *.csv.gz files read by get_raw_data()
BELS_TEMP = BELS / "temp"  # Per-chunk aliased_*.csv files written by the alias workers

LOCALITIES = BELS / "locality_terms.csv"  # Final vocabulary written by write_words()

## Setup spaCy

In [5]:
# Build the spaCy pipeline used by the aliasing and cleaning steps: the medium
# English model loaded with its NER component excluded, then passed through the
# traiter tokenizer setup.
# NOTE(review): add_extensions() presumably registers traiter's custom spaCy
# extension attributes -- confirm in traiter.pylib.pipes.extensions
extensions.add_extensions()
nlp = spacy.load("en_core_web_md", exclude=["ner"])
tokenizer.setup_tokenizer(nlp)

## Save raw data

In [6]:
def get_raw_data():
    """Copy the raw BELS localities into the "raw" table of the BELS database.

    Every ``*.csv.gz`` file in BELS_ORI is read in sorted order and its
    "v_locality" column is written to the table. The first file replaces any
    existing "raw" table and every later file is appended to it.
    """
    csv_paths = sorted(BELS_ORI.glob("*.csv.gz"))

    with sqlite3.connect(BELS_DB) as cxn:
        for i, csv_path in enumerate(tqdm(csv_paths)):
            localities = pd.read_csv(csv_path)["v_locality"]

            # Start a fresh table with the first file, then add to it
            mode = "append" if i else "replace"
            localities.to_sql("raw", cxn, index=False, if_exists=mode)


# get_raw_data()

## Basic locality normalization

See [here](https://en.wikipedia.org/wiki/Unicode_character_property) for a description of the character class abbreviations.

The raw localities are rough, so perform some simple steps to improve them.

In [7]:
# Normalized localities with this many characters or fewer are rejected
TOO_SHORT = 3

# A run of one or more punctuation characters; embedded in the patterns below
PUNCT = """[&%$#!*,/;.:?'"_-]+"""

# Ordered (compiled regex, replacement) pairs. substitute() applies the whole
# list repeatedly until the string stops changing.
SUBS = [
    # Aggressively remove brackets
    # NOTE(review): with re.X the spaces in the pattern are ignored, so this
    # matches every run of bracket characters, not only space-delimited ones
    (re.compile(r" [()\[\]\{\}]+ ", flags=re.X), " "),
    # Like (...)
    (re.compile(rf"\(+ {PUNCT} \)+", flags=re.X), " "),
    # Like (9)
    (re.compile(r"\(+ \s* \d* \s* \)+ ", flags=re.X), " "),
    # Lat/long
    (re.compile(r"\(? [\d.-]+ [\s,]+ [\d.-]+ \)?", flags=re.X), " "),
    # CSV delimiters? The question marks are odd, I admit
    (re.compile(r"[.,?]{2,}"), " "),
    # Enclosing quotes
    (re.compile(r"""^ [({\['"/] \s* (.+) \s* [\]})'"/] $""", flags=re.X), r"\1"),
    # Leading PUNCT
    (re.compile(rf"^( \s* {PUNCT} \s* )+", flags=re.X), " "),
    # Trailing PUNCT
    (re.compile(rf"( \s* {PUNCT} \s* )+ $", flags=re.X), " "),
    # Handle contractions & possessives: rejoin a detached 's or 't
    (re.compile(r" \s ( '[st] ) ", flags=re.X), r"\1"),
    # Handle abbreviations: rejoin a short (1-4 letter) word with its period
    (re.compile(r" ([\p{L}\p{M}]{1,4}) \s ( \. ) ", flags=re.X), r"\1\2"),
    # Handle periods: separate a period from a longer (5+ letter) word
    (re.compile(r" ([\p{L}\p{M}]{5,}) ( \. ) ", flags=re.X), r"\1 \2"),
    # Remove backslashes
    (re.compile(r"\\", flags=re.X), ""),
]

# Unicode general categories of the characters that get replaced with a space
# Unicode "Other" categories: control, format, surrogate, private use, unassigned
CONTROLS = " Cc Cf Cs Co Cn ".split()
SYMBOLS = " Sc ".split()  # Currency symbols
SEPARATORS = " Zl Zp ".split()  # Line & paragraph separators
REMOVE = CONTROLS + SYMBOLS + SEPARATORS

# spaCy POS tags: tokens with one of these tags are replaced by the tag itself
POS_ALIAS = " CCONJ DET NUM SCONJ ".split()

In [8]:
def substitute(loc, subs):
    """Apply every (regex, replacement) pair in subs until loc stops changing.

    After each individual substitution the string is stripped of leading and
    trailing whitespace. An empty string is returned as is, without applying
    any of the substitutions.
    """
    if loc == "":
        return loc

    while True:
        before = loc

        for pattern, replacement in subs:
            loc = pattern.sub(replacement, loc).strip()

        # A full pass that changes nothing means we have reached a fixed point
        if loc == before:
            return loc

In [9]:
def normalize_location(loc):
    """Clean up one raw locality string.

    The string has its HTML entities unescaped, is lower-cased, has every
    character whose Unicode category is in REMOVE replaced with a space, is
    NFKD-normalized, is run through the SUBS substitutions, and finally has its
    whitespace collapsed to single spaces.

    Returns:
        Location(loc=<cleaned text>, add=1, error=0) on success, or
        Location(loc="", add=0, error=1) when the cleaned text has TOO_SHORT or
        fewer characters or when a step raised a ValueError or TypeError.
    """
    failure = Location(loc="", add=0, error=1)

    try:
        # Replace HTML entities & lower case the string
        text = html.unescape(loc).lower()

        # Blank out control characters & some symbols
        text = "".join(" " if uni.category(ch) in REMOVE else ch for ch in text)

        # Compatibility decomposition of the characters
        text = uni.normalize("NFKD", text)

        # Do the replacements, then normalize the spaces
        text = substitute(text, SUBS)
        text = " ".join(text.split())

    except (ValueError, TypeError):
        return failure

    if len(text) <= TOO_SHORT:
        return failure

    return Location(loc=text, add=1, error=0)

In [10]:
def normalize():
    """Normalize every raw locality and save the counts to "normalized".

    Each "v_locality" value in the "raw" table is passed through
    normalize_location(). Identical normalized strings are merged and their
    hits are summed. The result replaces the "normalized" table, which has the
    columns "locality" and "hits".

    Returns:
        The number of raw localities that could not be normalized.
    """
    counts = defaultdict(int)
    failures = 0

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row

        total = cxn.execute("""select count(*) from raw""").fetchone()[0]
        cursor = cxn.execute("""select * from raw""")

        for row in tqdm(cursor, total=total):
            result = normalize_location(row["v_locality"])
            failures += result.error

            if result.loc:
                counts[result.loc] += result.add

        records = [{"locality": loc, "hits": n} for loc, n in counts.items()]
        frame = pd.DataFrame(records)
        frame.to_sql("normalized", cxn, index=False, if_exists="replace")

    return failures


# normalize()

## Alias localities

### Get words that get replaced in the BELS noun phrases

There are sets of common terms and patterns stored in CSV files that are used to categorize word or phrase types, like colors or dates. We're going to use them to replace terms in the BELS noun phrases with hypernyms. For instance:

- Replace `12 North Main Street` with `<num> <dir> main street`

The hope is to cut down on the total number of patterns stored.

In [11]:
def get_tokens():
    """Build the lookup of term pattern -> label used to alias tokens.

    Terms are read from the CSV files that sit next to the traiter and flora
    term modules. Unit terms are kept only when their "factor_cm" is greater
    than 30.0, and the other terms are kept unless their label is in the
    ignore set.

    Returns:
        A dict that maps each kept term's "pattern" to its "label".
    """
    traiter_dir = Path(t_terms.__file__).parent
    flora_dir = Path(f_terms.__file__).parent

    unit_csvs = [
        traiter_dir / "unit_distance_terms.csv",
        traiter_dir / "unit_length_terms.csv",
    ]

    other_csvs = [
        traiter_dir / "about_terms.csv",
        traiter_dir / "color_terms.csv",
        traiter_dir / "direction_terms.csv",
        traiter_dir / "elevation_terms.csv",
        traiter_dir / "geocoordinate_terms.csv",
        traiter_dir / "habitat_terms.csv",
        traiter_dir / "month_terms.csv",
        traiter_dir / "name_terms.csv",
        traiter_dir / "numeric_terms.csv",
        traiter_dir / "us_location_terms.csv",
        flora_dir / "missing_terms.csv",
        flora_dir / "rank_terms.csv",
    ]

    # Labels we do not want to alias
    ignore = set(
        """
        numeric_units bad_habitat roman color_missing not_trs not_name
    """.split()
    )

    # Units: skip anything smaller than a foot
    tokens = {
        term["pattern"]: term["label"]
        for term in tu.read_terms(unit_csvs)
        if float(term["factor_cm"]) > 30.0
    }

    # Everything else, minus the ignored labels
    tokens |= {
        term["pattern"]: term["label"]
        for term in tu.read_terms(other_csvs)
        if term["label"] not in ignore
    }

    return tokens


TOKENS = get_tokens()

## Alias the noun phrases

In [12]:
# An aliased pattern is dropped when its un-aliased tokens total this many
# characters or fewer
TOO_FEW = 1
# Words longer than this lose a trailing period when building the vocabulary
DOT_LIMIT = 5
# A vocabulary word is either letters/marks with an optional 's or 't and an
# optional trailing period, or a word that may contain periods but must start
# and end with a letter/mark
WORD_RE = re.compile(
    r"^ ( [\p{L}\p{M}]+ ('[st])? \.? | [\p{L}\p{M}]+ [\p{L}\p{M}.]+ [\p{L}\p{M}]+ ) $",
    flags=re.X,
)

In [13]:
def get_aliases_proc(limit, offset):
    """Alias one chunk of the normalized localities and save it to a CSV file.

    Every token of a locality is replaced as follows:
      - a token found in TOKENS becomes "<label>",
      - a token whose POS tag is in POS_ALIAS becomes "<pos>" (lower-cased),
      - punctuation and quotes are kept verbatim,
      - any other token is lower-cased.
    Localities whose "other" tokens total TOO_FEW or fewer characters are
    dropped. Identical patterns are merged and their hits are summed. The
    result is written to BELS_TEMP / "aliased_<offset>.csv" with the columns
    "phrase" and "hits".

    Args:
        limit: The maximum number of rows of "normalized" in this chunk.
        offset: The number of rows of "normalized" to skip before this chunk.

    Returns:
        The number of localities that spaCy rejected with a ValueError.
    """
    aliased = defaultdict(int)
    errors = 0

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row
        # Every chunk is read over its own connection and SQLite leaves the row
        # order undefined without an ORDER BY, so order by rowid to make sure
        # the limit/offset chunks neither overlap nor skip rows.
        rows = list(
            cxn.execute(
                """select * from normalized order by rowid limit ? offset ?""",
                (limit, offset),
            )
        )

    for phrase, hits in rows:
        try:
            doc = nlp(phrase)
        except ValueError:
            errors += 1
            continue

        pattern = []
        k = 0  # Count of characters in the tokens that were not aliased

        for token in doc:
            if hypernym := TOKENS.get(token.lower_):
                pattern.append(f"<{hypernym}>")

            elif token.pos_ in POS_ALIAS:
                pattern.append(f"<{token.pos_.lower()}>")

            elif token.is_punct or token.is_quote:
                pattern.append(token.text)

            else:
                k += len(token)
                pattern.append(token.lower_)

        if k <= TOO_FEW:  # Not enough non-token characters
            continue

        pattern = " ".join(pattern)

        aliased[pattern] += hits

    batch = [{"phrase": k, "hits": v} for k, v in aliased.items()]
    df = pd.DataFrame(batch)

    csv_path = BELS_TEMP / f"aliased_{offset}.csv"
    df.to_csv(csv_path, index=False)

    return errors

In [14]:
def get_aliases(processes=12, limit=1_000_000):
    """Alias all of the normalized localities using a pool of worker processes.

    The "normalized" table is cut into chunks of `limit` rows and each chunk is
    handed to get_aliases_proc() in a worker process. The progress bar advances
    once per finished chunk.

    Args:
        processes: The number of worker processes in the pool.
        limit: The number of rows of "normalized" per chunk.

    Returns:
        The total number of errors reported by the workers.
    """
    with sqlite3.connect(BELS_DB) as cxn:
        count = cxn.execute("""select count(*) from normalized""").fetchone()[0]

    # One offset per chunk; its length is also the progress bar's total
    offsets = range(0, count, limit)

    with Pool(processes=processes) as pool, tqdm(total=len(offsets)) as bar:
        results = [
            pool.apply_async(
                get_aliases_proc,
                args=(limit, offset),
                callback=lambda _: bar.update(),
            )
            for offset in offsets
        ]

        # Wait for every chunk before the pool is torn down on exit
        return sum(r.get() for r in results)


# get_aliases()

In [15]:
def save_aliases():
    """Merge the per-chunk alias CSV files into the "aliases" table.

    Every BELS_TEMP / "aliased_*.csv" file is read in sorted order. Each phrase
    is passed through substitute() with SUBS, identical phrases are merged, and
    their hits are summed. The count of distinct phrases and the total hits are
    printed, and the result replaces the "aliases" table, which has the columns
    "phrase" and "hits".
    """
    phrases = defaultdict(int)

    for path in tqdm(sorted(BELS_TEMP.glob("aliased_*.csv"))):
        # The files were written by pandas, which defaults to UTF-8, so do not
        # rely on the locale's encoding. The csv module also wants newline=""
        # so that it can handle the line endings itself.
        with open(path, encoding="utf-8", newline="") as csv_file:
            reader = csv.DictReader(csv_file)

            for row in reader:
                phrase = substitute(row["phrase"], SUBS)
                phrases[phrase] += int(row["hits"])

    print(f"{len(phrases)=}")
    print(f"{sum(phrases.values())=}")

    batch = [{"phrase": k, "hits": v} for k, v in phrases.items()]
    df = pd.DataFrame(batch)

    with sqlite3.connect(BELS_DB) as cxn:
        df.to_sql("aliases", cxn, index=False, if_exists="replace")


# save_aliases()

## Get locality vocabulary

In [16]:
def get_vocabulary():
    """Count the words used in the aliased phrases and save them to "words".

    Each phrase in the "aliases" table is split on whitespace. A word longer
    than DOT_LIMIT characters loses its trailing period, and only words that
    match WORD_RE are counted. Every counted word gets the hits of each phrase
    it appears in. The result replaces the "words" table, which has the columns
    "pattern" and "hits".
    """
    word_hits = defaultdict(int)

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row

        total = cxn.execute("""select count(*) from aliases""").fetchone()[0]
        cursor = cxn.execute("""select * from aliases""")

        for phrase, hits in tqdm(cursor, total=total):
            for word in phrase.split():
                # Long words are not abbreviations, so drop the period
                if len(word) > DOT_LIMIT and word.endswith("."):
                    word = word[:-1]

                if WORD_RE.match(word):
                    word_hits[word] += hits

        records = [{"pattern": word, "hits": n} for word, n in word_hits.items()]
        frame = pd.DataFrame(records)

        frame.to_sql("words", cxn, index=False, if_exists="replace")


# get_vocabulary()

## Remove taxon names & other traits

In [17]:
# Adpositions with fewer hits than this are removed from the vocabulary
POS_LIMIT = 1000


def clean_words():
    """Remove taxon names and unwanted parts of speech from the vocabulary.

    Rows of the "words" table are skipped when the pattern is a known taxon
    term. The remaining words are dropped when they are a taxon term, are a
    single character, are tagged ADP with fewer than POS_LIMIT hits, or are
    tagged AUX, CCONJ, DET, NUM, or SCONJ. The survivors replace the "cleaned"
    table, which has the columns "pattern" and "hits".

    Returns:
        The number of words that spaCy rejected with a ValueError.
    """
    skip_pos = ("AUX", "CCONJ", "DET", "NUM", "SCONJ")

    failures = 0
    kept = defaultdict(int)

    flora_dir = Path(f_terms.__file__).parent
    taxon_files = [
        flora_dir / "binomial_terms.zip",
        flora_dir / "monomial_terms.zip",
    ]
    taxa = {term["pattern"] for term in tu.read_terms(taxon_files)}

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row

        total = cxn.execute("""select count(*) from words""").fetchone()[0]
        cursor = cxn.execute("""select * from words""")

        for words, hits in tqdm(cursor, total=total):
            if words in taxa:
                continue

            for word in set(words.split()):
                if word in taxa or len(word) <= 1:
                    continue

                try:
                    doc = nlp(word)
                except ValueError:
                    failures += 1
                    continue

                pos = doc[0].pos_

                # Rare adpositions go, common ones stay
                if pos == "ADP" and hits < POS_LIMIT:
                    continue

                if pos in skip_pos:
                    continue

                kept[word] += hits

        records = [{"pattern": word, "hits": n} for word, n in kept.items()]
        frame = pd.DataFrame(records)

        frame.to_sql("cleaned", cxn, index=False, if_exists="replace")

    return failures


# clean_words()

In [21]:
def split_words(hits=1):
    """Split the dotted patterns in "cleaned" into their separate parts.

    Each pattern is split on periods. Every non-empty part before the last one
    is stored with a period appended, and a non-empty last part is stored as
    is. A part gets the hits of every pattern it came from. The result replaces
    the "cleaned" table, which has the columns "pattern" and "hits".

    Args:
        hits: Not used. The original loop variable shadowed this parameter, so
            it never had an effect; it is kept only so that existing calls
            continue to work.
    """
    split = defaultdict(int)

    with sqlite3.connect(BELS_DB) as cxn:
        rows = cxn.execute("""select * from cleaned""")

        # Use a name other than "hits" so the parameter is not clobbered
        for pattern, count in rows:
            # str.split always yields at least one part, so "last" always exists
            *abbrevs, last = pattern.split(".")

            for abbrev in abbrevs:
                if abbrev:
                    split[f"{abbrev}."] += count

            if last:
                split[last] += count

        batch = [{"pattern": k, "hits": v} for k, v in split.items()]
        df = pd.DataFrame(batch)

        df.to_sql("cleaned", cxn, index=False, if_exists="replace")


split_words()

In [22]:
def write_words(hits=1):
    """Write the final locality vocabulary to the LOCALITIES CSV file.

    Args:
        hits: Only patterns in the "cleaned" table with more hits than this are
            written. The patterns are written in sorted order.
    """
    query = "select pattern from cleaned where hits > ? order by pattern"

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row
        vocabulary = pd.read_sql(query, cxn, params=[hits])
        vocabulary.to_csv(LOCALITIES, index=False)


write_words()

# Parked code that may be useful later

## Get patterns

## Fix pattern issues

## Merge patterns

## Try building spacy patterns