In [11]:
import pandas as pd
import numpy as np

from dwca.read import DwCAReader
from os import getcwd, path, mkdir
from shutil import rmtree
from IPython.display import display, HTML
from xml.etree import ElementTree
from zipfile import ZipFile, ZIP_DEFLATED
from csv import QUOTE_ALL

In [2]:
# Archive to split, and the scratch directory handed to the archive reader.
DWCA_FILE = "scan-arthropods-dwca.zip"
TMP_DIR = path.join(getcwd(), ".tmp")

# Base name shared by every output directory / archive.
# splitext removes only the trailing extension. The previous str.strip(".zip")
# treated ".zip" as a *set of characters* and stripped them from both ends, so a
# name such as "pizza.zip" would have been reduced to "a".
FILE_NAME_PREFIX = path.splitext(DWCA_FILE)[0]
DIR_NAME = FILE_NAME_PREFIX

def get_idx_fmt(current_idx: int, total_idx: int):
    """Return ``current_idx`` as a zero-padded decimal string.

    The minimum width is the number of characters in ``str(total_idx)``;
    indices that are already wider are returned unpadded.

    Args:
        current_idx: The index to render.
        total_idx: The value whose printed length sets the pad width.

    Returns:
        The padded index as a ``str``.
    """
    width = len(str(total_idx))
    return format(current_idx, "0{}d".format(width))

In [4]:
# Open the source archive with TMP_DIR as the reader's working directory.
# NOTE(review): presumably the archive contents are extracted under TMP_DIR - confirm.
dwca = DwCAReader(
    DWCA_FILE,
    tmp_dir=TMP_DIR,
)

# Keep the archive-level metadata and the descriptor at hand; both are
# serialised back out for every chunk further down.
dwca_meta, dwca_descriptor = dwca.metadata, dwca.descriptor

In [5]:
# Look up the core data file's location in the archive descriptor and load it
# through the reader's pandas helper.
core_file = dwca_descriptor.core.file_location
# NOTE(review): parse_dates=True is presumably forwarded to pandas.read_csv - confirm.
core_df = dwca.pd_read(core_file, parse_dates=True)
# Uncomment to preview the first rows:
# core_df.head()

In [6]:
# Upper bound on the number of output archives.
NUM_FILES = 10
# Total number of core records to distribute.
DF_LEN = len(core_df)
# Rows per output archive.
# Ceiling division (rather than floor) keeps the chunk count at or below
# NUM_FILES when DF_LEN is not an exact multiple; floor division left the
# remainder rows in an extra (NUM_FILES + 1)-th chunk. The max(1, ...) guard
# keeps the size non-zero for an empty frame, so code dividing by CHUNK_SIZE
# never divides by zero.
CHUNK_SIZE = max(1, -(-DF_LEN // NUM_FILES))

In [None]:
def _write_delimited(frame, file_path, section):
    """Write ``frame`` to ``file_path`` using the dialect declared by ``section``.

    Args:
        frame: The DataFrame to serialise.
        file_path: Destination path of the delimited text file.
        section: A descriptor entry (the core or one extension) exposing
            ``fields_terminated_by``, ``fields_enclosed_by`` and
            ``lines_terminated_by``.

    Every field is quoted and the DataFrame index is not written.
    """
    frame.to_csv(
        file_path,
        index=False,
        sep=section.fields_terminated_by,
        quotechar=section.fields_enclosed_by,
        # NOTE(review): pandas 1.5 renamed this keyword to `lineterminator` and
        # pandas 2.0 dropped the old spelling; rename when the environment moves on.
        line_terminator=section.lines_terminated_by,
        quoting=QUOTE_ALL
    )


# Running total of core rows written, checked against DF_LEN at the end.
records_saved = 0

# Read every extension file once. The read does not depend on the chunk, so
# doing it inside the chunk loop re-parsed the same file for every output archive.
extension_frames = [
    (ext, dwca.pd_read(ext.file_location)) for ext in dwca_descriptor.extensions
]

# Consecutive runs of CHUNK_SIZE rows share one integer key, so each group is
# one contiguous slice of the core table.
for idx, chunk in core_df.groupby(np.arange(DF_LEN) // CHUNK_SIZE):
    dwca_dir = "{}_{}".format(FILE_NAME_PREFIX, get_idx_fmt(idx, NUM_FILES))
    dwca_files = list()

    # Start from a clean directory so files from an earlier run cannot linger.
    if path.exists(dwca_dir):
        rmtree(dwca_dir)

    print("Creating {}/...".format(dwca_dir))
    mkdir(dwca_dir)

    # Core records for this chunk.
    core_file_path = path.join(dwca_dir, dwca_descriptor.core.file_location)
    print("Writing {}....".format(core_file_path))
    _write_delimited(chunk, core_file_path, dwca_descriptor.core)
    dwca_files.append(core_file_path)

    records_saved += len(chunk)

    # Extension rows that point at a core record in this chunk. Each extension
    # is written with its OWN declared dialect; previously the core's delimiter,
    # quote character and line terminator were applied to every extension file.
    for ext, ext_df in extension_frames:
        ext_chunk = ext_df[ext_df["coreid"].isin(chunk["id"])]
        ext_file_path = path.join(dwca_dir, ext.file_location)

        print("Writing {}...".format(ext_file_path))
        _write_delimited(ext_chunk, ext_file_path, ext)
        dwca_files.append(ext_file_path)

    meta_file_path = path.join(dwca_dir, dwca_descriptor.metadata_filename)
    desc_file_path = path.join(dwca_dir, "meta.xml")

    dwca_files.append(meta_file_path)
    dwca_files.append(desc_file_path)

    # tostring(..., encoding="unicode") emits no XML declaration, so the files
    # must be written as UTF-8 (the XML default) instead of the platform's
    # locale encoding.
    print("Writing {}...".format(meta_file_path))
    with open(meta_file_path, 'w', encoding="utf-8") as f:
        f.write(ElementTree.tostring(dwca_meta, encoding="unicode"))

    print("Writing {}...".format(desc_file_path))
    with open(desc_file_path, 'w', encoding="utf-8") as f:
        f.write(ElementTree.tostring(dwca_descriptor.raw_element, encoding="unicode"))

    dwca_file = "{}.zip".format(dwca_dir)
    print("Compressing {}...".format(dwca_file))
    with ZipFile(dwca_file, 'w', compression=ZIP_DEFLATED) as z:
        for file_path in dwca_files:
            # Store members relative to the chunk directory. relpath removes
            # only the leading directory, whereas str.replace left a leading
            # separator and would also rewrite any later occurrence of the
            # directory name inside the path.
            z.write(file_path, arcname=path.relpath(file_path, dwca_dir))

    print()

# Every core row must have landed in exactly one chunk.
assert records_saved == DF_LEN

Creating scan-arthropods-dwca_00/...
Writing scan-arthropods-dwca_00/observations.csv....
Writing scan-arthropods-dwca_00/media.csv...
Writing scan-arthropods-dwca_00/metadata.eml.xml...
Writing scan-arthropods-dwca_00/meta.xml...
Compressing scan-arthropods-dwca_00.zip...

Creating scan-arthropods-dwca_01/...
Writing scan-arthropods-dwca_01/observations.csv....
Writing scan-arthropods-dwca_01/media.csv...
Writing scan-arthropods-dwca_01/metadata.eml.xml...
Writing scan-arthropods-dwca_01/meta.xml...
Compressing scan-arthropods-dwca_01.zip...

Creating scan-arthropods-dwca_02/...
Writing scan-arthropods-dwca_02/observations.csv....


In [None]:
# Finished with the source archive: close the reader that was opened earlier.
# NOTE(review): presumably this also removes what the reader placed under TMP_DIR - confirm against the python-dwca docs.
dwca.close()