In [1]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): no project-local import is visible in this notebook — confirm
# this is still needed.
sys.path.append('..')

In [2]:
import os
import sqlite3
from contextlib import closing
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
import regex

In [3]:
# Directory layout, relative to the notebook's working directory.
DATA = Path('..', 'data')
INPUT = DATA.joinpath('input')
PRUNED = DATA.joinpath('01_pruned')

# CSV read by dask at the start of the notebook.
RAW = PRUNED.joinpath('raw.csv')

# SQLite database that receives the `places` table.
DB = PRUNED.joinpath('gazetteer.db')

# Final de-duplicated CSV. dask writes to a directory of part files first
# (TEMP_DIR); the single part file (TEMP_FILE) is then renamed to OUTPUT.
OUTPUT = PRUNED.joinpath('unique.csv')
TEMP_DIR = PRUNED.joinpath('unique.csv.temp')
TEMP_FILE = TEMP_DIR.joinpath('0.part')

# Not referenced by any other cell visible in this notebook.
TARGET = INPUT.joinpath('mammals_no_geo.csv.gz')

In [4]:
# Read every column as text, then replace missing values with empty strings.
raw = dd.read_csv(RAW, dtype=str)
df = raw.fillna('')
df

Unnamed: 0_level_0,gbifid,coreid,locality,lat,lng,datum,uncert,country,state,county
npartitions=310,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [5]:
# Discard the columns that are not used below; locality, lat, lng, datum and
# uncert remain.
unneeded = ['gbifid', 'coreid', 'country', 'state', 'county']
df = df.drop(columns=unneeded)

## Normalize locality values

- Space normalize
- Lower case
- Remove punctuation, except where it touches a digit (this keeps decimal points such as the "." in "1.5")

In [6]:
# Punctuation characters to blank out. The look-behind / look-ahead keep any
# such character that is immediately preceded or followed by a digit, so
# decimal points like the "." in "1.5" survive.
remove = r'(?<!\d)[.,;/(){}"\'\[\]\-](?!\d)'

# regex=True must be explicit: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would silently remove nothing.
df['norm'] = df['locality'].str.replace(remove, ' ', regex=True)

# Lower-case, then split on whitespace and re-join with single spaces to
# collapse runs of whitespace and trim both ends.
df['norm'] = df['norm'].str.lower().str.split().str.join(' ')

## Drop duplicates AFTER the other manipulations, because dropping duplicates reduces the work to a single thread

In [7]:
# Remove rows that are exact duplicates across all remaining columns.
# The recorded output below shows the result has a single partition
# (npartitions=1).
df = df.drop_duplicates()
df

Unnamed: 0_level_0,locality,lat,lng,datum,uncert,norm
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,object,object,object,object,object,object
,...,...,...,...,...,...


In [8]:
# dask writes a directory of part files; keep the returned list of written
# paths and display it as the cell output.
written_parts = df.to_csv(TEMP_DIR, index=False)
written_parts

['/home/rafe/work/other/gazetteer/data/01_pruned/unique.csv.temp/0.part']

## Rename output file

In [9]:
# Move the single part file to the final CSV name, then remove the
# temporary directory, which must be empty at that point.
TEMP_FILE.rename(OUTPUT)
TEMP_DIR.rmdir()

## Update data types

In [10]:
# Reload the de-duplicated CSV with pandas, giving every column an explicit
# type. The key must be spelled 'datum' to match the column name: pandas
# silently ignores dtype keys that match no column, which leaves the column
# to type inference. 'norm' is listed as well so that all-digit normalized
# localities stay text.
df = pd.read_csv(
    str(OUTPUT),
    dtype={'locality': str,
           'lat': np.float32,
           'lng': np.float32,
           'datum': str,
           'uncert': np.int32,
           'norm': str,
           })

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
# Write the frame to the `places` table, replacing any existing table.
# sqlite3's own connection context manager only commits or rolls back; it does
# not close the connection, so `closing` is used to release the database file.
with closing(sqlite3.connect(DB)) as cxn:
    # Rows fetched through this connection are returned as sqlite3.Row.
    cxn.row_factory = sqlite3.Row
    # Inner `with` commits on success and rolls back if the write raises.
    with cxn:
        df.to_sql('places', cxn, if_exists='replace', index=False)

In [12]:
# Index the raw and normalized locality columns of the `places` table.
# IF NOT EXISTS lets this cell be re-run on its own without failing because
# the indexes already exist.
sql = """
    CREATE INDEX IF NOT EXISTS places_locality ON places (locality);
    CREATE INDEX IF NOT EXISTS places_norm ON places (norm);
    """

# `closing` guarantees the connection is closed; sqlite3's connection context
# manager alone would leave it open.
with closing(sqlite3.connect(DB)) as cxn:
    cxn.executescript(sql)