In [1]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps
from zipfile import ZipFile, ZIP_DEFLATED
from time import time
from os import path

# Start pandarallel so that DataFrame.parallel_apply becomes available.
# use_memory_fs=True asks it to move data between the main process and the
# workers through a memory-backed file system (see the INFO lines it prints).
pandarallel.initialize(use_memory_fs=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Run date as YYYY-MM-DD, used as a prefix for every output file name.
# Spelled out explicitly because "%F" is a C-library extension that is not
# available on every platform.
TODAY = datetime.now().strftime("%Y-%m-%d")
# CSV holding the reference bee names (read from its "sciName" column).
BEE_LOOKUP_LIST = "2021-02-22_scan-bee-list.csv"
# Directory that holds the input file and receives all outputs.
DATA_DIR = "/data/disk/scratch"

INPUT_FILE = "{}/initial_combined_data.csv".format(DATA_DIR)
INPUT_FILE_BASE = path.basename(INPUT_FILE)

# path.splitext strips only the trailing extension; str.replace(".csv", "")
# would also remove ".csv" occurring anywhere else in the path.
OUTPUT_FILE = "{}/{}_{}_cleaned.csv".format(
    DATA_DIR,
    TODAY,
    path.splitext(INPUT_FILE_BASE)[0]
)

# Companion outputs share OUTPUT_FILE's stem.
OUTPUT_SCINAME_FILE = "{}_sci-names.csv".format(
    path.splitext(OUTPUT_FILE)[0]
)

OUTPUT_ZIP_FILE = "{}.zip".format(path.splitext(OUTPUT_FILE)[0])

print("Input file: {}".format(INPUT_FILE))
print("Output file (dataframe): {}".format(OUTPUT_FILE))
print("Output file (unique taxa): {}".format(OUTPUT_SCINAME_FILE))
print("Output file (archive)): {}".format(OUTPUT_ZIP_FILE))

Input file: /data/disk/scratch/initial_combined_data.csv
Output file (dataframe): /data/disk/scratch/2021-03-10_initial_combined_data_cleaned.csv
Output file (unique taxa): /data/disk/scratch/2021-03-10_initial_combined_data_cleaned_sci-names.csv
Output file (archive)): /data/disk/scratch/2021-03-10_initial_combined_data_cleaned.zip


In [3]:
# Load the reference names. Missing values are dropped before the string cast
# (otherwise they would turn into the literal name "nan"), and entries that
# are blank after stripping are discarded.
bee_names = pd.read_csv(BEE_LOOKUP_LIST)["sciName"].dropna().astype(str).str.strip()
# Sorted, de-duplicated array of reference names used for matching.
bee_lookup_lst = np.unique(bee_names[bee_names != ""].to_numpy(dtype=str))
# Cache of raw name -> resolved name, filled in by bee_lookup.
bee_lookup_memoized = dict()

def bee_lookup(row):
    """Replace a row's scientific name with its match from the lookup list.

    A name matches a lookup entry when, compared case-insensitively and split
    on single spaces, either their first two words or their last two words are
    equal. The first matching entry of ``bee_lookup_lst`` wins. Names that are
    missing, empty, shorter than two words, or unmatched are left unchanged.
    Results are cached in ``bee_lookup_memoized`` keyed by the original name.

    Parameters
    ----------
    row : mapping with a "scientificName" item (e.g. a DataFrame row)
        Modified in place when a match is found.

    Returns
    -------
    The same ``row`` object, with "scientificName" possibly replaced.
    """
    sci_name = row["scientificName"]

    if pd.isna(sci_name) or sci_name == "":
        return row

    if sci_name in bee_lookup_memoized:
        row["scientificName"] = bee_lookup_memoized[sci_name]
        return row

    sci_name_words = tuple(sci_name.lower().split(" "))
    # Falls back to the original name when nothing matches.
    resolved_name = sci_name

    if len(sci_name_words) >= 2:
        sci_head = sci_name_words[:2]
        sci_tail = sci_name_words[-2:]

        for bee_name in bee_lookup_lst:
            bee_name_words = tuple(bee_name.lower().split(" "))

            # A two-word comparison is impossible against a single-word lookup
            # entry; indexing its second / second-to-last word would raise
            # IndexError, so skip such entries.
            if len(bee_name_words) < 2:
                continue

            # Match on the first two or the last two words.
            if bee_name_words[:2] == sci_head or bee_name_words[-2:] == sci_tail:
                resolved_name = bee_name
                break

    bee_lookup_memoized[sci_name] = resolved_name
    row["scientificName"] = resolved_name
    return row

In [4]:
# Read the combined input in one pass (low_memory=False) and let pandas
# tighten any object columns it can.
input_df = pd.read_csv(INPUT_FILE, low_memory=False)
input_df = input_df.infer_objects()

# Scientific names: missing values become empty strings, everything is text.
filled_names = input_df["scientificName"].fillna("")
input_df["scientificName"] = filled_names.astype(np.dtype("unicode"))

input_df.head()

Unnamed: 0.1,Unnamed: 0,id,language,modified,references,institutionCode,collectionCode,ownerInstitutionCode,basisOfRecord,informationWithheld,...,phylum,class,order,family,genus,specificEpithet,infraspecificEpithet,taxonRank,taxonRemarks,Source
0,1451979,2268194000.0,,,,naturgucker,naturgucker,,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Apis,mellifera,,SPECIES,,GBIF
1,2172105,723620800.0,,,,USDA-ARS,BBSL,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,w-scripta,,SPECIES,,GBIF
2,360871,1703255000.0,,,,naturgucker,naturgucker,,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Apis,mellifera,,SPECIES,,GBIF
3,360868,2268199000.0,,,,naturgucker,naturgucker,,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,,,GENUS,,GBIF
4,2395866,2268199000.0,,,,naturgucker,naturgucker,,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,terrestris,,SPECIES,,GBIF


In [5]:
print("Unique scientific names before cleaning: {:,}".format(len(np.unique(input_df["scientificName"]))))

start_time = time()

# DataFrame.drop already returns a new frame, so input_df stays untouched
# without an extra full copy.
output_df = input_df.drop("Unnamed: 0", axis="columns")

# bee_lookup only reads and writes "scientificName", so only that column is
# handed to the workers instead of every (wide) row of the frame.
cleaned_names = output_df[["scientificName"]].parallel_apply(bee_lookup, axis="columns")
output_df["scientificName"] = cleaned_names["scientificName"]

# Sorted distinct names after cleaning; reused when writing the taxa file.
unique_sci_names = np.unique(output_df["scientificName"])
print("Unique scientific names after cleaning: {:,}".format(len(unique_sci_names)))

print("Cleaning took {} minutes".format(round((time() - start_time) / 60)))

Unique scientific names before cleaning: 8,606
Unique scientific names after cleaning: 7,870
Cleaning took 2 minutes


In [None]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of
# the file.
output_df.to_csv(OUTPUT_FILE, index=False)

In [None]:
# Write the distinct cleaned names as a one-column CSV: an unquoted header
# followed by one double-quoted name per line.
# - encoding is pinned to UTF-8 so the result does not depend on the platform
#   default encoding.
# - embedded double quotes are doubled, the CSV escape for a quote inside a
#   quoted field, so such names cannot break the file.
with open(OUTPUT_SCINAME_FILE, 'w', encoding="utf-8") as f:
    f.write("scientificName\n")
    for s in unique_sci_names:
        f.write('"{}"\n'.format(str(s).replace('"', '""')))

In [None]:
# Bundle both CSV outputs into one deflate-compressed archive, storing each
# under its bare file name rather than its full path.
with ZipFile(OUTPUT_ZIP_FILE, 'w', compression=ZIP_DEFLATED) as archive:
    for member in (OUTPUT_FILE, OUTPUT_SCINAME_FILE):
        archive.write(member, arcname=path.basename(member))

In [None]:
# Count how many records carry a scientific name that is absent from the
# lookup list, i.e. how many rows a strict filter would discard.
before_len = len(output_df)
known_mask = output_df["scientificName"].isin(bee_lookup_lst)
after_len = int(known_mask.sum())
print("Dropping unknown taxa would result in the loss of {:,} records".format(before_len - after_len))