# Load iDigBio Data for Label Text Generation

In [1]:
import io
import os
import re
import sqlite3
import zipfile
from contextlib import closing
from pathlib import Path

import duckdb
import pandas as pd
from tqdm import tqdm

In [2]:
# Root of the project's data directory, relative to this notebook.
DATA_DIR = Path('..') / 'data'

# Base name shared by the extracted CSV, its split files, and the database.
NAME = 'occurrence_raw_idigbio_2021-02'

# The iDigBio snapshot archive and the CSV extracted from it.
ZIP = str(DATA_DIR / 'raw' / 'iDigBio_snapshot_2021-02.zip')
CSV = str(DATA_DIR / 'raw' / f'{NAME}.csv')

# SQLite database written by insert().
# NOTE(review): insert() referenced DB but nothing in the notebook defined it,
# so load_data() raised NameError on a fresh kernel. The file name used here is
# an assumption -- point it at the real database location if it differs.
DB = str(DATA_DIR / f'{NAME}.sqlite3.db')

# Names of the files produced when the CSV is split into pieces.
PREFIX = str(DATA_DIR / f'{NAME}_')
FIRST = f'{PREFIX}01.csv'
BACKUP = f'{PREFIX}01_header.csv'
GLOB = PREFIX + '*.csv'

# Rows per pandas chunk when streaming the CSV into the database.
CHUNK = 1_000_000
# Lines per file when splitting the CSV with the `split` command.
LINES = 5_000_000

In [3]:
def get_headers(zip_file):
    """Return the sorted column names from the header row of a CSV in ZIP.

    Args:
        zip_file: Name of the CSV member inside the archive at ZIP.

    Returns:
        A list of header strings. The first line is split on commas, the raw
        byte fields are sorted, and each is then decoded and stripped of
        surrounding whitespace.
    """
    with zipfile.ZipFile(ZIP) as archive, archive.open(zip_file) as member:
        first_line = member.readline()

    # Sort the raw byte fields first, then decode, so ordering is byte-wise.
    raw_names = first_line.split(b',')
    raw_names.sort()
    return [name.decode().strip() for name in raw_names]

In [4]:
# Column names that clash with SQL keywords; they get a trailing underscore.
bad_names = """ group order references """.split()


def _to_column_name(name):
    """Turn a camelCase header fragment into a database-safe snake_case name.

    Names listed in bad_names get a trailing underscore appended.
    """
    # Put an underscore before each capital that starts a run of capitals,
    # e.g. 'recordID' -> 'record_ID', then lower-case the result.
    name = re.sub(r'(?<![A-Z])([A-Z])', r'_\1', name).lower()
    # A leading capital leaves a leading underscore behind; drop it.
    name = re.sub(r'^_', '', name)
    if name in bad_names:
        name += '_'
    return name


def get_columns(headers):
    """Map each raw CSV header to a unique, database-friendly column name.

    The short name is the part of the header after the last ':' converted to
    snake_case. If that short name is already taken by an earlier header, the
    whole header (with ':' replaced by '_') is converted instead.

    The reserved-word suffix is applied *before* the uniqueness check.
    Otherwise two headers such as 'dwc:order' and 'idigbio:order' would both
    become 'order_' and produce duplicate column names.

    Args:
        headers: Iterable of raw header strings, e.g. 'dwc:scientificName'.

    Returns:
        A dict mapping each original header to its new column name.
    """
    columns = {}
    used = set()

    for head in headers:
        col = _to_column_name(head.split(':')[-1])
        if col in used:
            # Short name is taken: fall back to the namespace-qualified name.
            col = _to_column_name(head.replace(':', '_'))
        columns[head] = col
        used.add(col)
    return columns

In [5]:
def insert(zip_file, columns):
    """Stream a CSV member of the ZIP archive into a SQLite table.

    The table is named after the part of ``zip_file`` before the first '.'.
    The CSV is read in chunks of CHUNK rows. The first chunk replaces any
    existing table and later chunks are appended to it.

    Args:
        zip_file: Name of the CSV member inside the archive at ZIP.
        columns: Dict mapping original CSV headers to the column names to use
            in the database; passed to ``DataFrame.rename``.
    """
    table = zip_file.split('.')[0]

    # sqlite3's own context manager only commits or rolls back; it does not
    # close the connection. closing() makes sure the handle is released.
    with closing(sqlite3.connect(DB)) as cxn:
        with cxn, zipfile.ZipFile(ZIP) as zippy:
            with zippy.open(zip_file) as in_file:

                # Read every field as text and keep empty strings as-is
                # instead of converting them to NaN.
                reader = pd.read_csv(
                    in_file, dtype=str, keep_default_na=False, chunksize=CHUNK)

                if_exists = 'replace'

                for df in tqdm(reader):
                    df = df.rename(columns=columns)

                    df.to_sql(table, cxn, if_exists=if_exists, index=False)

                    # Only the first chunk may recreate the table.
                    if_exists = 'append'

In [6]:
def load_data(zip_file):
    """Load one CSV member of the snapshot archive into the database.

    Reads the member's header row, derives database-friendly column names
    from it, and then inserts the data under those names.

    Args:
        zip_file: Name of the CSV member inside the archive.
    """
    renames = get_columns(get_headers(zip_file))
    insert(zip_file, renames)

In [7]:
# load_data('occurrence_raw.csv')

### DuckDB doubles the size of the data, so we are not using it.

The basic plan is:

1. Extract occurrence_raw.csv from an iDigBio snapshot
1. Get the headers and rename the columns so they'll work in a DB
1. Split the CSV file into pieces small enough to fit in memory
1. Load the CSV splits into the database
1. Clean up the files

Make working with the database a bit easier.

Get the needed CSV file from the zip file.

In [8]:
# !unzip -p $ZIP occurrence_raw.csv > $CSV

Split the CSV file into manageable chunks that fit in memory

In [9]:
# !split --additional-suffix=.csv --numeric-suffixes=1 --lines=$LINES $CSV $PREFIX

Rename the first split file (the one containing the header row) to a backup name so the header can be stripped from it later.

In [10]:
# !mv $FIRST $BACKUP

Get the headers from the first file. We need to rename the column headers in the DB.

In [11]:
# with open(BACKUP) as in_file:
#     headers = in_file.readline()
# headers = [h.strip() for h in headers.split(',')]

Remove the old header from that first file.

In [12]:
# !tail -n +2 $BACKUP > $FIRST

Remove the backup file

In [13]:
# !rm $BACKUP

Rename the columns

In [14]:
# bad_names = """ group order references """.split()

# def get_columns(headers):
#     columns = {}
#     used = set()

#     for head in headers:
#         col = head.split(':')[-1]
#         col = re.sub(r'(?<![A-Z])([A-Z])', r'_\1', col).lower()
#         col = re.sub(r'^_', '', col)
#         if col in used:
#             col = head.replace(':', '_')
#             col = re.sub(r'(?<![A-Z])([A-Z])', r'_\1', col).lower()
#             col = re.sub(r'^_', '', col)
#         if col in bad_names:
#             col += '_'
#         columns[head] = col
#         used.add(col)
#     return columns


# columns = get_columns(headers)

Create a table with the new column names.

In [15]:
# columns = [f'{h} VARCHAR' for h in columns.values()]
# columns = ', '.join(columns)

# with duckdb_connect(DB) as cxn:
#     cxn.execute(f'CREATE TABLE raw_data ({columns});')

Copy CSV data into the database.

In [16]:
# paths = sorted(DATA_DIR.glob(GLOB))

# for path in tqdm(paths):
#     with duckdb_connect(DB) as cxn:
#         cxn.execute(f"COPY raw_data FROM '{str(path)}';")

Remove unneeded files.

In [17]:
# for path in paths:
#     path.unlink()

# Path(CSV).unlink()