# Hash Locations

## Basic Setup

In [1]:
import io
import os
import sqlite3
import string
import zipfile
from contextlib import closing
from hashlib import sha256
from pathlib import Path
from pprint import pp

import dask.dataframe as dd
import pandas as pd
import regex
from tqdm import tqdm

In [2]:
# Directory layout, relative to the current working directory.
DATA_DIR = Path('..', 'data')
INTERIM_DIR = DATA_DIR.joinpath('01_interim')

# IN_DB is read and updated in place by the cells below; OUT_DB receives
# the gazetteer table.
IN_DB = INTERIM_DIR.joinpath('gazetteer_02_idigbio_2020-03-30.db')
OUT_DB = INTERIM_DIR.joinpath('gazetteer_03_idigbio_2020-03-30.db')

# Number of rows buffered before each executemany/commit.
CHUNK = 1_000_000

## Database Setup

In [3]:
# Start from scratch: discard any output database left by an earlier run.
if OUT_DB.exists():
    OUT_DB.unlink()

In [4]:
def dict_factory(cursor, row):
    """Convert one result row into a dict keyed by column name.

    Has the (cursor, row) signature expected of a sqlite3 ``row_factory``.

    Args:
        cursor: cursor that produced the row; the first item of each entry
            in ``cursor.description`` is used as the column name.
        row: sequence of column values for a single result row.

    Returns:
        dict mapping each column name to the matching value in ``row``.
        If two columns share a name the later one wins.
    """
    names = (column[0] for column in cursor.description)
    return dict(zip(names, row))

In [5]:
# Drop the hash indexes (presumably so the bulk hash updates that follow do
# not have to maintain them) and make sure coreid, which those updates use
# in their WHERE clause, is indexed on both tables.
sql = """
    drop index if exists occ_l_hash;
    drop index if exists occ_v_hash;
    drop index if exists raw_l_hash;
    drop index if exists raw_v_hash;

    create index if not exists occ_coreid on occurrence (coreid);
    create index if not exists raw_coreid on occurrence_raw (coreid);
    """

# sqlite3's own connection context manager only ends the transaction; it
# never closes the connection. closing() releases the database handle.
with closing(sqlite3.connect(IN_DB)) as cxn:
    cxn.executescript(sql)

## Build Hashes

Get the columns that contribute to the hashes.

In [6]:
def get_columns(table, db=IN_DB):
    """Return the table's column names, minus the specially handled ones.

    The key column (coreid), the two hash columns, and the two locality
    text columns are excluded from the result.

    Args:
        table: name of the table to inspect. It is interpolated directly
            into the PRAGMA statement, so pass only trusted table names.
        db: path of the SQLite database to read.

    Returns:
        list of the remaining column names, in the order the PRAGMA
        reports them.
    """
    specials = {'coreid', 'l_hash', 'v_hash', 'locality', 'verbatimLocality'}

    sql = f'PRAGMA table_info({table});'

    # closing() makes sure the connection is released; the sqlite3 context
    # manager by itself would leave it open.
    with closing(sqlite3.connect(db)) as cxn:
        cxn.row_factory = sqlite3.Row
        names = [info['name'] for info in cxn.execute(sql)]

    return [name for name in names if name not in specials]

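These queries are used to build the hashes: one selects every row of the table, and the other writes the computed hashes back to the row with the matching `coreid`.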

In [7]:
def build_sql(table):
    """Build the SQL statements used to hash one table.

    Args:
        table: name of the table to read from and update.

    Returns:
        (select, update) tuple. ``select`` fetches every row of the table.
        ``update`` is a single-line statement with three placeholders, in
        the order l_hash, v_hash, coreid.
    """
    update_layout = f"""
        update {table}
           set l_hash = ?, v_hash = ?
         where coreid = ?;
        """
    # Collapse the multi-line layout into single-spaced text.
    update = ' '.join(update_layout.split())

    select = f"""select * from {table};"""

    return select, update

Build the hashes and write them to the database.

Python's built-in `hash()` uses a different random seed on every run, so its values are not stable between runs. We use SHA-256 instead, which always produces the same digest for the same input.

In [8]:
def _hash_fields(row, cols):
    """Return the SHA-256 hex digest of the named columns of a row.

    Each value is converted with str(), encoded, and the pieces are joined
    with b'|' in the order given by ``cols``.
    """
    joined = b'|'.join(str(row[c]).encode() for c in cols)
    return sha256(joined).hexdigest()


def write_hashes(select, update, l_cols, v_cols):
    """Compute the locality hashes for every selected row and store them.

    For each row returned by ``select``:
      * l_hash is the digest of ``l_cols`` when ``locality`` is truthy,
        otherwise None;
      * v_hash is the digest of ``v_cols`` when ``verbatimLocality`` is
        truthy and differs from ``locality``, otherwise None.

    Results are written back to IN_DB with ``update`` in batches of CHUNK
    rows, committing after each batch.

    Args:
        select: SQL returning the rows to hash; rows must expose
            ``coreid``, ``locality``, ``verbatimLocality`` and every column
            named in ``l_cols`` / ``v_cols``.
        update: SQL with three placeholders, bound in the order
            (l_hash, v_hash, coreid).
        l_cols: column names hashed into l_hash.
        v_cols: column names hashed into v_hash.
    """
    batch = []

    # closing() releases the database handle when the work is done; the
    # sqlite3 connection context manager only commits or rolls back.
    with closing(sqlite3.connect(IN_DB)) as cxn:
        cxn.row_factory = dict_factory

        def flush():
            """Write the buffered rows, commit, and empty the buffer."""
            cxn.executemany(update, batch)
            cxn.commit()
            batch.clear()

        # NOTE: rows are updated on the same connection that is still
        # iterating the select; only the hash columns are modified.
        for row in tqdm(cxn.execute(select)):
            locality = row['locality']
            verbatim = row['verbatimLocality']

            l_hash = _hash_fields(row, l_cols) if locality else None

            # No separate verbatim hash when the verbatim text is missing
            # or identical to the locality text.
            v_hash = None
            if verbatim and locality != verbatim:
                v_hash = _hash_fields(row, v_cols)

            batch.append([l_hash, v_hash, row['coreid']])

            if len(batch) >= CHUNK:
                flush()

        if batch:
            flush()

Wrap the entire hash building process in a function.

In [9]:
def build_hashes(table):
    """Compute and store l_hash and v_hash for every row of a table.

    Both hashes cover the same set of columns returned by get_columns();
    they differ only in the leading text column: ``locality`` for l_hash
    and ``verbatimLocality`` for v_hash.

    Args:
        table: name of the table to process.
    """
    select, update = build_sql(table)

    shared = get_columns(table)

    write_hashes(
        select,
        update,
        l_cols=['locality', *shared],
        v_cols=['verbatimLocality', *shared],
    )

Hashes for occurrence

In [10]:
# Fill in l_hash and v_hash for every row of the occurrence table.
build_hashes('occurrence')

60666995it [17:58, 56258.37it/s] 


Hashes for occurrence_raw

In [11]:
# Fill in l_hash and v_hash for every row of the occurrence_raw table.
build_hashes('occurrence_raw')

60708188it [25:11, 40151.20it/s]


## Create Indexes on Hashes

In [12]:
# Index the hash columns now that they are populated. "if not exists"
# keeps this cell re-runnable on its own instead of failing when the
# indexes are already there.
sql = """
    create index if not exists occ_l_hash on occurrence (l_hash);
    create index if not exists occ_v_hash on occurrence (v_hash);
    create index if not exists raw_l_hash on occurrence_raw (l_hash);
    create index if not exists raw_v_hash on occurrence_raw (v_hash);
    """

# closing() releases the database handle; the sqlite3 connection context
# manager by itself does not close the connection.
with closing(sqlite3.connect(IN_DB)) as cxn:
    cxn.executescript(sql)

## Pull One Representative from Every Group of Hashes

In [13]:
# Every distinct non-null hash found in either hash column of
# occurrence_raw, returned in a single column named "hash".
#
# The filters name the real columns (l_hash / v_hash) rather than the
# output alias: referring to an alias in WHERE is a non-standard SQLite
# extension, and a real column called "hash" would take precedence over it.
hashes = """
    with hashes as (
            select l_hash as hash
              from occurrence_raw
             where l_hash is not null
        union all
            select v_hash as hash
              from occurrence_raw
             where v_hash is not null
    )
    select distinct hash
      from hashes;
"""

# One arbitrary row carrying the given hash in either hash column.
# Takes the same hash value for both placeholders.
represent = """
    select *
      from occurrence_raw
     where l_hash = ?
        or v_hash = ?
     limit 1;
    """

In [14]:
# Columns of occurrence_raw other than the specially handled ones; reused
# below for the gazetteer table definition, the insert statement, and the
# per-row field extraction.
columns = get_columns('occurrence_raw')

In [15]:
# The gazetteer table holds four fixed columns followed by every column
# listed in `columns`.
create = f"""
    create table gazetteer (coreid,hash,source,locality,{','.join(columns)})
"""
# closing() releases the database handle; the inner `with cxn_out` keeps
# the original commit-on-success / rollback-on-error behaviour.
with closing(sqlite3.connect(OUT_DB)) as cxn_out:
    with cxn_out:
        cxn_out.execute(create)

In [16]:
# Insert statement for gazetteer: four fixed columns followed by every
# column in `columns`, with one "?" placeholder per column.
insert_columns = ','.join(columns)
placeholders = ','.join('?' for _ in range(len(columns) + 4))

insert = f"""
    insert into gazetteer (coreid,hash,source,locality,{insert_columns})
    values ({placeholders})
"""

In [17]:
# For every distinct hash, copy one representative occurrence_raw row into
# the gazetteer table, writing to OUT_DB in batches of CHUNK rows.
batch = []

# closing() releases both database handles when the loop finishes or
# fails; the sqlite3 connection context manager alone would leave them open.
with closing(sqlite3.connect(IN_DB)) as cxn_in, \
        closing(sqlite3.connect(OUT_DB)) as cxn_out:
    cxn_in.row_factory = dict_factory

    for hash_row in tqdm(cxn_in.execute(hashes)):
        hash_ = hash_row['hash']

        # `represent` is limited to a single row.
        row = cxn_in.execute(represent, (hash_, hash_)).fetchone()
        if row is None:
            continue

        # Record which text column the hash was built from.
        if row['l_hash'] == hash_:
            loc = 'locality'
        else:
            loc = 'verbatimLocality'

        # Store empty strings as NULL.
        fields = [row[c] if row[c] != '' else None for c in columns]

        batch.append([row['coreid'], hash_, loc, row[loc]] + fields)

        if len(batch) >= CHUNK:
            cxn_out.executemany(insert, batch)
            cxn_out.commit()
            batch = []

    # Write whatever is left over after the last full batch.
    if batch:
        cxn_out.executemany(insert, batch)
        cxn_out.commit()

16270042it [11:34, 23416.77it/s]
