In [None]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps

# Set up pandarallel so that DataFrame.parallel_apply (used for the cleaning
# pass further down) is available.
# NOTE(review): use_memory_fs=True presumably requires a shared-memory
# filesystem such as /dev/shm -- confirm before running on non-Linux hosts.
pandarallel.initialize(use_memory_fs=True)

In [None]:
# Input/output locations for this notebook.
INPUT_FILE = "initial_combined_data.csv"
BEE_LOOKUP_LIST = "2021-02-22_scan-bee-list.csv"

# Output name is "<YYYY-MM-DD>_<input name without .csv>_cleaned.csv".
# "%Y-%m-%d" is used instead of "%F" because "%F" is a C-library extension that
# is not available on every platform. Only a trailing ".csv" is removed, so a
# ".csv" occurring elsewhere in the name is left intact.
OUTPUT_FILE = "{}_{}_cleaned.csv".format(
    datetime.now().strftime("%Y-%m-%d"),
    re.sub(r"\.csv$", "", INPUT_FILE),
)

In [None]:
def collapse_whitespace(input_str):
    """Collapse every run of whitespace into one space and trim the ends.

    Any whitespace character counts (spaces, tabs, newlines, non-breaking
    spaces, ...), not only the ASCII space, so names that differ only in the
    kind of whitespace they contain normalise to the same string.

    Parameters
    ----------
    input_str : str
        Text to normalise.

    Returns
    -------
    str
        ``input_str`` with internal whitespace runs replaced by a single
        space and leading/trailing whitespace removed.
    """
    return re.sub(r"\s+", " ", input_str).strip()


def remove_parentheses(row):
    """Strip parenthesised segments from the row's ``scientificName``.

    Every ``( ... )`` group containing at least one character is deleted and
    the remaining text is passed through ``collapse_whitespace``. Rows whose
    name is missing (``pd.isna``) are returned untouched.

    Parameters
    ----------
    row : mapping with a ``"scientificName"`` entry (updated in place).

    Returns
    -------
    The same ``row`` object.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_groups = re.sub(r"\([^)]+\)", "", name)
        row["scientificName"] = collapse_whitespace(without_groups)

    return row


def remove_authorship(row):
    """Strip a trailing ``Author, YYYY`` citation from ``scientificName``.

    The pattern only recognises a single capitalised word made of ASCII
    letters, followed by a comma, whitespace and a four-digit year at the very
    end of the name. The result is passed through ``collapse_whitespace``.
    Rows whose name is missing (``pd.isna``) are returned untouched.

    Parameters
    ----------
    row : mapping with a ``"scientificName"`` entry (updated in place).

    Returns
    -------
    The same ``row`` object.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_author = re.sub(r" [A-Z][a-z]+,\s+\d{4}\s*$", "", name)
        row["scientificName"] = collapse_whitespace(without_author)

    return row


In [None]:
# Canonical bee names used by bee_lookup and by the final "unknown taxa" count.
# Missing cells are dropped BEFORE the string conversion: converting NaN to a
# unicode array yields the literal "nan", which would then act as a lookup
# entry and prefix-match real names. Entries are stripped and blanks removed so
# the list holds exactly the strings bee_lookup writes back.
bee_lookup_lst = (
    pd.read_csv(BEE_LOOKUP_LIST)["sciName"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda names: names != ""]
    .to_numpy(dtype=str)
)
bee_lookup_lst = np.unique(bee_lookup_lst)

def bee_lookup(row, lookup=None):
    """Replace ``scientificName`` with the canonical bee name it contains.

    The name is compared case-insensitively against every candidate. A
    candidate matches when the name starts or ends with it AND the match sits
    on a word boundary (the neighbouring character, if any, is not
    alphanumeric). When several candidates match, the longest one wins so that
    a more specific name (e.g. a trinomial) is not shadowed by a shorter
    prefix. Blank candidates are ignored because an empty string is a prefix
    of every name.

    Parameters
    ----------
    row : mapping with a ``"scientificName"`` entry (updated in place).
    lookup : iterable of str, optional
        Candidate canonical names. Defaults to the module-level
        ``bee_lookup_lst``.

    Returns
    -------
    The same ``row`` object; ``scientificName`` is overwritten with the
    stripped candidate when a match is found and left unchanged otherwise
    (including when it is missing or empty).
    """
    sciName = row["scientificName"]

    if pd.isna(sciName) or sciName == "":
        return row

    candidates = bee_lookup_lst if lookup is None else lookup
    sci_name_lower = sciName.lower()

    best_name = None
    best_len = 0

    for bee_name in candidates:
        bee_name = bee_name.strip()
        if not bee_name:
            # "".startswith / "".endswith are always true; skip blank entries.
            continue

        bee_name_lower = bee_name.lower()
        n = len(bee_name_lower)

        # The slice next to the match is "" when the match reaches the end of
        # the string, and "".isalnum() is False, so exact matches pass.
        prefix_match = (
            sci_name_lower.startswith(bee_name_lower)
            and not sci_name_lower[n:n + 1].isalnum()
        )
        suffix_match = (
            sci_name_lower.endswith(bee_name_lower)
            and not sci_name_lower[-n - 1:-n].isalnum()
        )

        # Strict ">" keeps the first candidate among equally long matches.
        if (prefix_match or suffix_match) and n > best_len:
            best_name = bee_name
            best_len = n

    if best_name is not None:
        row["scientificName"] = best_name

    return row

In [None]:
def do_cleaning(row):
    """Run the full name-cleaning pipeline on one row.

    The steps are applied in a fixed order, each receiving the row returned by
    the previous one: ``remove_parentheses``, ``remove_authorship``, then
    ``bee_lookup``.

    Parameters
    ----------
    row : mapping with a ``"scientificName"`` entry.

    Returns
    -------
    The row returned by the last step.
    """
    for step in (remove_parentheses, remove_authorship, bee_lookup):
        row = step(row)
    return row

In [None]:
# Load the raw records and make scientificName a plain string column, with
# missing names represented as "" rather than NaN.
input_df = (
    pd.read_csv(INPUT_FILE, low_memory=False)
    .infer_objects()
    .assign(
        scientificName=lambda df: df["scientificName"]
        .fillna("")
        .astype(np.dtype("unicode"))
    )
)
input_df.head()

In [None]:
# Run the cleaning pipeline over every record and report how many distinct
# scientific names exist before and after.
print("Unique scientific names before cleaning: {:,}".format(
    input_df["scientificName"].nunique(dropna=False)
))

# DataFrame.drop returns a new frame, so input_df is left untouched without an
# explicit copy. errors="ignore" keeps the cell working when the input CSV has
# no "Unnamed: 0" column (presumably a leftover saved index -- confirm).
output_df = input_df.drop(columns="Unnamed: 0", errors="ignore")
output_df = output_df.parallel_apply(do_cleaning, axis="columns")

print("Unique scientific names after cleaning: {:,}".format(
    output_df["scientificName"].nunique(dropna=False)
))

In [None]:
# Persist the cleaned records; the DataFrame index is not written to the file.
output_df.to_csv(path_or_buf=OUTPUT_FILE, index=False)

In [None]:
# Spot-check a slice of the distinct cleaned names (sorted by np.unique).
sci_names = np.unique(output_df["scientificName"])

# A plain loop is used instead of a list comprehension: the comprehension was
# the cell's last expression, so the notebook also displayed its result, a
# list of None values, underneath the printed names.
for s in sci_names[40:80]:
    print(s)

In [None]:
# Count the records whose cleaned name is not one of the known bee names.
known_bee_mask = output_df["scientificName"].isin(bee_lookup_lst)
before_len = output_df.shape[0]
after_len = int(known_bee_mask.sum())
print("Dropping unknown taxa would result in the loss of {:,} records".format(before_len - after_len))