# Build locality terms from the BELS gazetteer

BELS has a different goal than Digi-Leap. BELS is trying to pinpoint latitudes and longitudes from strings, whereas Digi-Leap is parsing strings that may then be fed into BELS. I'm taking the data from BELS and trying to reduce it to a smaller number of patterns that can be fed into spaCy rule parsers.

In [1]:
import csv
import html
import sqlite3
import unicodedata as uni
from collections import defaultdict, namedtuple
from multiprocessing import Pool
from pathlib import Path

import pandas as pd
import regex as re
import spacy
from tqdm.notebook import tqdm

# from traiter.pylib import const
from traiter.pylib import term_util as tu
from traiter.pylib.pipes import extensions, tokenizer
from traiter.pylib.traits import terms as t_terms

In [2]:
# Data directory, located one level above the current working directory
DATA_DIR = Path("..") / "data"

# NOTE(review): only the commented-out alias_phrases() cell references this;
# get_phrases() uses its own hard-coded process count.
PROCESSES = 16  # Number of parallel processes

In [3]:
# Result of normalizing one locality string (see normalize_location):
#   loc   - the normalized text, or "" when normalization failed
#   add   - amount to add to the locality's hit count (1 on success, 0 on failure)
#   error - amount to add to the error count (0 on success, 1 on failure)
Location = namedtuple("Location", "loc add error")

## BELS localities given to me

In [4]:
BELS = DATA_DIR / "bels"  # Everything BELS related lives under here
BELS_DB = BELS / "localities.sqlite"  # SQLite database holding the working tables
BELS_ORI = BELS / "original"  # The *.csv.gz files read by get_raw_data()
BELS_TEMP = BELS / "temp"  # Per-chunk phrases_<offset>.csv files

## Setup spaCy

In [5]:
# NOTE(review): presumably registers traiter's custom spaCy extension
# attributes -- confirm against traiter.pylib.pipes.extensions.
extensions.add_extensions()
# The named-entity recognizer is excluded from the loaded pipeline.
nlp = spacy.load("en_core_web_md", exclude=["ner"])
# Apply traiter's tokenizer customizations to this pipeline (details are in traiter).
tokenizer.setup_tokenizer(nlp)

## Save raw data

In [6]:
def get_raw_data():
    """Copy the `v_locality` column of every BELS CSV into the `raw` table.

    The gzipped CSV files in BELS_ORI are processed in sorted order. The first
    file replaces any existing `raw` table in BELS_DB and every later file is
    appended to it.
    """
    csv_paths = sorted(BELS_ORI.glob("*.csv.gz"))

    with sqlite3.connect(BELS_DB) as cxn:
        for i, csv_path in enumerate(tqdm(csv_paths)):
            # Start from a fresh table, then keep adding to it
            mode = "append" if i else "replace"

            localities = pd.read_csv(csv_path)["v_locality"]
            localities.to_sql("raw", cxn, index=False, if_exists=mode)


# get_raw_data()

## Basic locality normalization

See [here](https://en.wikipedia.org/wiki/Unicode_character_property) for a description of the character class abbreviations.

The raw localities are rough, so perform some simple steps to improve them.

In [20]:
# Module-level error counter. The functions in this notebook keep their own
# local `errors` counters and return them instead of updating this one.
errors = 0

# Normalized localities with this many characters or fewer are rejected
too_short = 3

# A run of one or more punctuation characters; interpolated into the
# patterns below
punct = """[&%$#!*,/;.:?'"_-]+"""

# (compiled pattern, replacement) pairs, applied in this order by replace().
# Patterns compiled with re.X ignore the literal whitespace inside them.
subs = [
    # Like (...): parentheses that enclose nothing but punctuation
    (re.compile(rf"\(+ {punct} \)+", flags=re.X), " "),
    # Like (9): parentheses that enclose only optional digits and whitespace
    (re.compile(r"\(+ \s* \d* \s* \)+ ", flags=re.X), " "),
    # Lat/long: two runs of digits, dots, or hyphens separated by spaces or
    # commas, optionally parenthesized.
    # NOTE(review): this also matches any two adjacent plain numbers, e.g. "12 34"
    (re.compile(r"\(? [\d.-]+ [\s,]+ [\d.-]+ \)?", flags=re.X), " "),
    # CSV delimiters? The question marks are odd, I admit
    (re.compile(r"[.,?]{2,}"), " "),
    # Enclosing quotes: strip one opening and one closing quote/bracket/slash
    # when they wrap the entire string, keeping what is inside
    (re.compile(r"""^ [({\['"/] \s* (.+) \s* [\]})'"/] $""", flags=re.X), r"\1"),
    # Leading punct
    (re.compile(rf"^( \s* {punct} \s* )+", flags=re.X), " "),
    # Trailing punct
    (re.compile(rf"( \s* {punct} \s* )+ $", flags=re.X), " "),
]

# The more aggressive list: drop every bracket first, then apply all of `subs`
subs2 = [
    # Aggressively remove brackets
    (re.compile(r" [()\[\]\{\}]+ ", flags=re.X), " "),
]
subs2 += subs

# Character classes (Unicode general categories) that normalize_location()
# replaces with a space
controls = " Cc Cf Cs Co Cn ".split()  # Control, format, surrogate, private use, unassigned
symbols = " Sc ".split()  # Currency symbols
separators = " Zl Zp ".split()  # Line & paragraph separators
remove = controls + symbols + separators

In [8]:
def replace(loc, subs):
    """Apply substitutions to a string until it stops changing.

    Args:
        loc: The string to clean up.
        subs: An iterable of (compiled pattern, replacement) pairs. They are
            applied in order, and the string is stripped after each one.

    Returns:
        The string after a full pass over `subs` leaves it unchanged. An empty
        string is returned as is, without applying any substitution.
    """
    if loc == "":
        return loc

    while True:
        before = loc

        for pattern, replacement in subs:
            loc = pattern.sub(replacement, loc).strip()

        # A whole pass changed nothing, so we have reached a fixed point
        if loc == before:
            return loc

In [9]:
def normalize_location(loc):
    """Clean up one raw locality string.

    The steps are: unescape HTML entities, lower case the text, replace every
    character whose Unicode category is in `remove` with a space, apply NFKD
    normalization, run the `subs2` substitutions, and collapse whitespace.

    Args:
        loc: The raw locality text.

    Returns:
        Location(loc=<normalized text>, add=1, error=0) on success. When a
        step raises ValueError or TypeError, or when the result has
        `too_short` or fewer characters, Location(loc="", add=0, error=1).
    """
    failure = Location(loc="", add=0, error=1)

    try:
        # HTML entities become characters, then everything is lower cased
        text = html.unescape(loc).lower()

        # Blank out control characters, currency symbols & odd separators
        text = "".join(" " if uni.category(ch) in remove else ch for ch in text)

        # Unicode compatibility decomposition
        text = uni.normalize("NFKD", text)

        # subs2 already includes every entry of subs, so one pass covers both
        text = replace(text, subs2)

        # Collapse whitespace runs into single spaces
        text = " ".join(text.split())

    except (ValueError, TypeError):
        return failure

    if len(text) <= too_short:
        return failure

    return Location(loc=text, add=1, error=0)

In [10]:
# normalize_location('"aasvogelberg, an steinigen oertern')

In [11]:
def normalize():
    """Normalize every raw locality and store the distinct results.

    Reads each `v_locality` from the `raw` table, normalizes it with
    normalize_location(), and counts how often each normalized string occurs.
    The counts replace the `normalized` table (columns `locality` and `hits`).

    Returns:
        The number of raw localities that could not be normalized.
    """
    hits_by_locality = defaultdict(int)
    error_count = 0

    with sqlite3.connect(BELS_DB) as cxn:
        cxn.row_factory = sqlite3.Row

        for row in tqdm(cxn.execute("""select * from raw""")):
            result = normalize_location(row["v_locality"])

            error_count += result.error
            if result.loc:
                hits_by_locality[result.loc] += result.add

        records = [
            {"locality": locality, "hits": hits}
            for locality, hits in hits_by_locality.items()
        ]
        table = pd.DataFrame(records)
        table.to_sql("normalized", cxn, index=False, if_exists="replace")

    return error_count


# normalize()

## Alias localities

### Get words that get replaced in the BELS noun phrases

There are sets of common terms and patterns stored in CSV files that are used to categorize word or phrase types like colors or dates. We're going to use them to replace terms in the BELS noun phrases with hypernyms. For instance:

- Replace `12 North Main Street` with `<num> <dir> main street`

The hope is to cut down on the total number of patterns stored.

In [12]:
def get_tokens():
    """Build the lookup of term patterns to their hypernym labels.

    The terms come from the CSV files that sit next to the traiter `terms`
    module. Unit terms are kept only when their `factor_cm` is above 30, and
    terms from the other files are kept unless their label is in a short
    ignore list.

    Returns:
        A dict that maps each term's `pattern` to its `label`.
    """
    term_dir = Path(t_terms.__file__).parent
    tokens = {}

    # ---------------------------
    # Units: skip anything smaller than a foot
    unit_csvs = [
        term_dir / "unit_distance_terms.csv",
        term_dir / "unit_length_terms.csv",
    ]

    for term in tu.read_terms(unit_csvs):
        if float(term["factor_cm"]) > 30.0:
            tokens[term["pattern"]] = term["label"]

    # ---------------------------
    # Everything else, minus the labels we don't want
    other_csvs = [
        term_dir / "color_terms.csv",
        term_dir / "direction_terms.csv",
        term_dir / "habitat_terms.csv",
        term_dir / "numeric_terms.csv",
    ]

    unwanted = """ numeric_units bad_habitat roman color_missing """.split()

    for term in tu.read_terms(other_csvs):
        if term["label"] not in unwanted:
            tokens[term["pattern"]] = term["label"]

    return tokens


TOKENS = get_tokens()

## Gather noun phrases

Use spaCy's language model to identify noun phrases and then replace words in them as needed.

In [13]:
def get_phrases_proc(limit, offset):
    """Extract noun phrases from one page of the `normalized` table.

    This is the worker run in each pool process by get_phrases(). It parses
    every locality in its page with spaCy, normalizes each noun chunk with
    normalize_location(), counts the resulting phrases, and writes the counts
    to BELS_TEMP / "phrases_<offset>.csv" (columns `phrase` and `hits`).

    Args:
        limit: Maximum number of rows in the page.
        offset: Number of rows to skip before the page starts. Also used to
            name the output file.

    Returns:
        The number of errors: localities spaCy could not parse plus noun
        chunks that failed normalization.
    """
    phrases = defaultdict(int)
    errors = 0

    # The connection's context manager only commits, it never closes, so
    # close explicitly to avoid leaking a handle in every worker.
    cxn = sqlite3.connect(BELS_DB)
    try:
        cxn.row_factory = sqlite3.Row
        # Without an ORDER BY the row order is undefined, so the pages taken
        # by different workers are not guaranteed to be disjoint or complete.
        rows = cxn.execute(
            """select * from normalized order by rowid limit ? offset ?""",
            (limit, offset),
        ).fetchall()
    finally:
        cxn.close()

    for row in rows:
        # NOTE(review): the locality's own hit count is not used, so each
        # phrase is counted once per distinct locality it appears in. Confirm
        # this is intended; the commented-out alias cell weights by hits.
        loc, _hits = row

        try:
            doc = nlp(loc)
        except ValueError:
            errors += 1
            continue

        for chunk in doc.noun_chunks:
            norm = normalize_location(chunk.text)
            if norm.loc:
                phrases[norm.loc] += norm.add
            errors += norm.error

    batch = [{"phrase": k, "hits": v} for k, v in phrases.items()]
    df = pd.DataFrame(batch)

    # Make sure the output directory is there; workers may race to create it
    BELS_TEMP.mkdir(parents=True, exist_ok=True)

    csv_path = BELS_TEMP / f"phrases_{offset}.csv"
    df.to_csv(csv_path, index=False)

    return errors

In [14]:
def get_phrases():
    """Run get_phrases_proc() over the `normalized` table in parallel.

    The table is split into pages of one million rows, and each page is
    handed to a pool of 12 worker processes. The progress bar advances once
    per finished page.

    Returns:
        The total number of errors reported by all of the workers.
    """
    processes = 12
    limit = 1_000_000

    with sqlite3.connect(BELS_DB) as cxn:
        count = cxn.execute("""select count(*) from normalized""").fetchone()[0]

    with Pool(processes=processes) as pool, tqdm() as bar:
        jobs = [
            pool.apply_async(
                get_phrases_proc,
                args=(limit, offset),
                callback=lambda _: bar.update(),
            )
            for offset in range(0, count, limit)
        ]

        # Wait for every page while the pool is still alive
        return sum(job.get() for job in jobs)


get_phrases()

0it [00:00, ?it/s]

11096238

In [15]:
def save_phrases():
    """Merge the per-page phrase CSV files into the `phrases` table.

    Sums the `hits` of every `phrase` across all BELS_TEMP / "phrases_*.csv"
    files, prints the number of distinct phrases and the total hit count, and
    replaces the `phrases` table in BELS_DB with the merged counts.
    """
    phrases = defaultdict(int)

    for path in tqdm(sorted(BELS_TEMP.glob("phrases_*.csv"))):
        # pandas wrote these files as UTF-8, so read them back the same way
        # rather than with the locale's default encoding. The csv module also
        # wants its files opened with newline="".
        with open(path, encoding="utf-8", newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                phrases[row["phrase"]] += int(row["hits"])

    print(f"{len(phrases)=}")
    print(f"{sum(phrases.values())=}")

    batch = [{"phrase": k, "hits": v} for k, v in phrases.items()]
    df = pd.DataFrame(batch)

    # The connection's context manager only commits or rolls back, so the
    # connection is closed explicitly afterwards.
    cxn = sqlite3.connect(BELS_DB)
    try:
        with cxn:
            df.to_sql("phrases", cxn, index=False, if_exists="replace")
    finally:
        cxn.close()


save_phrases()

  0%|          | 0/36 [00:00<?, ?it/s]

len(phrases)=20370305
sum(phrases.values())=87833445


In [16]:
# def alias_phrases_proc(path):
#     aliasd = defaultdict(int)
#     errors = 0

#     df = pd.read_csv(path)

#     for _, row in df.iterrows():
#         try:
#             phrase = str(row["phrase"])

#             phrase = [c for c in phrase if uni.category(c) not in CHAR_CLASS]

#             phrase = "".join(phrase)
#             phrase = " ".join(phrase.split())

#             doc = nlp(phrase)
#         except ValueError:
#             errors += 1
#             continue

#         pattern = []
#         k = 0

#         for token in doc:
#             if hypernym := TOKENS.get(token.lower_):
#                 pattern.append(f"<{hypernym}>")

#             elif token.like_num:
#                 pattern.append("<num>")

#             elif token.is_punct or token.is_quote:
#                 pattern.append(token.text)

#             else:
#                 k += len(token)
#                 pattern.append(token.lower_)

#         if k <= 1:  # Not enuf non-token characters
#             continue

#         pattern = " ".join(pattern)
#         aliasd[pattern] += row["hits"]

#     batch = [{"phrase": k, "hits": v} for k, v in aliasd.items()]
#     df = pd.DataFrame(batch)

#     csv_path = BELS_ALIASED / f"{path.stem}.csv"
#     df.to_csv(csv_path, index=False)

#     return errors

In [17]:
# def alias_phrases():
#     paths = sorted(BELS_PHRASES.glob("*.csv"))

#     results = []

#     with Pool(processes=PROCESSES) as pool, tqdm(total=len(paths)) as bar:
#         for path in paths:
#             results.append(
#                 pool.apply_async(
#                     alias_phrases_proc,
#                     args=(path,),
#                     callback=lambda _: bar.update(),
#                 )
#             )
#         return sum(r.get() for r in results)


# # alias_phrases()

In [18]:
# def save_aliased():
#     aliased = defaultdict(int)

#     for path in tqdm(sorted(BELS_ALIASED.glob("*.csv"))):
#         with open(path) as csv_file:
#             reader = csv.DictReader(csv_file)

#             for row in reader:
#                 aliased[row["phrase"]] += int(row["hits"])

#     print(f"{len(aliased)=}")
#     print(f"{sum(aliased.values())=}")

#     batch = [{"phrase": k, "hits": v} for k, v in aliased.items()]
#     df = pd.DataFrame(batch)

#     with sqlite3.connect(BELS_DB) as cxn:
#         df.to_sql("aliased", cxn, index=False, if_exists="replace")


# save_aliased()

## Look at facets

In [19]:
# def get_fingerprints():
#     facets = defaultdict(int)
#     errors = 0
#     remove = CONTROLS + PUNCTS + SYMBOLS + SPACES + DIGITS

#     with sqlite3.connect(BELS_DB) as cxn:
#         cxn.row_factory = sqlite3.Row

#         for loc in tqdm(cxn.execute("""select * from raw""")):
#             loc = loc["v_locality"]

#             try:
#                 # Replace HTML entities
#                 loc = html.unescape(loc)

#                 # Remove control characters & some punct
#                 loc = [" " if uni.category(c) in remove else c for c in loc]
#                 loc = "".join(loc)

#                 # Normalize chars to ASCII
#                 loc = uni.normalize("NFKD", loc)

#                 # Lower case the string
#                 loc = loc.lower()

#                 # Sort words
#                 loc = " ".join(sorted(set(loc.split())))

#                 # Too short
#                 if len(loc) == 0:
#                     raise ValueError

#                 # Add it
#                 facets[loc] += 1

#             except (ValueError, TypeError):
#                 errors += 1
#                 continue

#         batch = [{"facet": k, "hits": v} for k, v in facets.items()]
#         df = pd.DataFrame(batch)
#         df.to_sql("fingerprints", cxn, index=False, if_exists="replace")

#     return errors


# get_fingerprints()