In [None]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps

# Set up pandarallel so DataFrame.parallel_apply is available further down.
# NOTE(review): use_memory_fs=True presumably forces the shared-memory
# filesystem for data transfer and fails where none exists -- confirm against
# the pandarallel docs before running this on a non-Linux host.
pandarallel.initialize(use_memory_fs=True)

In [None]:
# Directory that holds every file this notebook reads or writes.
DATA_DIR = "/data/disk/jupyter-notebooks"

# Raw records whose "scientificName" column gets cleaned.
INPUT_FILE = "{}/bees.csv".format(DATA_DIR)
# Reference list; its "Name" column feeds orr_lookup.
ORR_LOOKUP_LIST = "{}/OrrList.csv".format(DATA_DIR)
# Date-stamped output path (YYYY-MM-DD_bees.csv). The date is spelled out as
# "%Y-%m-%d" because the "%F" shorthand is a platform-specific strftime
# extension and is rejected on some systems (e.g. Windows).
OUTPUT_FILE = "{}/{}_bees.csv".format(DATA_DIR, datetime.now().strftime("%Y-%m-%d"))

In [None]:
def strip_scientificName(row):
    """Normalise spacing in a row's ``scientificName``.

    Runs of space characters are collapsed to one space and leading/trailing
    whitespace is trimmed. A missing name (NaN/None) is left as is.

    The row is modified in place and the same object is returned, so the
    function can be used directly with ``DataFrame.apply(..., axis="columns")``.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        collapsed = re.sub(r" +", " ", name)
        row["scientificName"] = collapsed.strip()

    return row


def remove_parentheses(row):
    """Drop every non-empty parenthesised group from a row's ``scientificName``.

    After the groups are removed, runs of spaces are collapsed to one space
    and the result is trimmed. An empty pair ``()`` is not touched because the
    pattern requires at least one character between the brackets. A missing
    name (NaN/None) is left as is.

    The row is modified in place and the same object is returned.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_groups = re.sub(r"\([^)]+\)", "", name)
        # Removing a group from the middle of a name leaves a double space.
        row["scientificName"] = re.sub(r" +", " ", without_groups).strip()

    return row


def remove_authorship(row):
    """Strip a trailing ``Author, YYYY`` citation from a row's ``scientificName``.

    Only the form matched by the pattern is removed: a space, one capitalised
    ASCII word, a comma, whitespace and a four-digit year at the very end of
    the string. Anything else (no comma, several authors, non-ASCII letters in
    the author name) is left in place. Afterwards runs of spaces are collapsed
    and the result is trimmed. A missing name (NaN/None) is left as is.

    The row is modified in place and the same object is returned.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_author = re.sub(r" [A-Z][a-z]+,\s+\d{4}\s*$", "", name)
        row["scientificName"] = re.sub(r" +", " ", without_author).strip()

    return row


In [None]:
# Reference names that orr_lookup substitutes into "scientificName".
# Blank cells in the "Name" column are read as NaN; they are dropped before the
# unicode cast because the cast would otherwise turn each of them into the
# literal string "nan", which orr_lookup would then match inside any name that
# happens to contain those three letters.
orr_lst = pd.read_csv(ORR_LOOKUP_LIST)["Name"].dropna().to_numpy(np.dtype("unicode"))

def orr_lookup(row):
    """Replace a row's ``scientificName`` with the first matching reference name.

    Each entry of the module-level ``orr_lst`` is tested, in list order, as a
    case-insensitive substring of the current name. The first entry that
    matches replaces the name (in the entry's original casing); if none
    matches the name is kept. Missing (NaN/None) and empty names are skipped.

    The row is modified in place and the same object is returned.
    """
    current = row["scientificName"]

    if not pd.isna(current) and current != "":
        haystack = current.lower()
        # next() stops at the first hit, so earlier list entries take priority.
        match = next(
            (candidate for candidate in orr_lst if candidate.lower() in haystack),
            None,
        )
        if match is not None:
            row["scientificName"] = match

    return row

In [None]:
def do_cleaning(row):
    """Run the full name-cleaning pipeline on one row and return the result.

    The steps are applied in a fixed order, each receiving the previous step's
    output: parenthesised text is removed first (the authorship pattern only
    matches at the very end of the name), then the authorship citation,
    then spacing is normalised, and finally the name is mapped onto the
    reference list.
    """
    pipeline = (
        remove_parentheses,
        remove_authorship,
        strip_scientificName,
        orr_lookup,
    )
    for step in pipeline:
        row = step(row)
    return row

In [None]:
# Load the raw records and show the first rows as a sanity check.
# NOTE(review): the file is decoded as latin1, yet the last cell re-encodes the
# names as latin1 and decodes them as UTF-8 before printing. That suggests the
# file is really UTF-8, in which case non-ASCII names are mojibake here and are
# written to OUTPUT_FILE that way -- confirm the file's real encoding.
input_df = pd.read_csv(INPUT_FILE, encoding="latin1", low_memory=False)
input_df.head()

In [None]:
# Count distinct names with Series.nunique(): it ignores missing values, whereas
# np.unique has to sort the column and raises TypeError as soon as it contains
# both strings and NaN (which the cleaning functions explicitly allow for).
print("Unique scientific names before cleaning: {:,}".format(input_df["scientificName"].nunique()))

# drop() already returns a new frame, so input_df stays untouched without an
# extra copy(). errors="ignore" keeps the cell runnable when the CSV carries no
# "Unnamed: 0" column (presumably a saved index -- verify against the export).
output_df = input_df.drop("Unnamed: 0", axis="columns", errors="ignore")
# Row-wise apply: do_cleaning receives each record as a Series and returns it.
output_df = output_df.parallel_apply(do_cleaning, axis="columns")

print("Unique scientific names after cleaning: {:,}".format(output_df["scientificName"].nunique()))

In [None]:
# Write the cleaned frame without the row index. OUTPUT_FILE is date-stamped,
# so a second run on the same day overwrites the earlier file.
output_df.to_csv(OUTPUT_FILE, index=False)

In [None]:
# Preview the first 40 distinct cleaned names in sorted order.
# Missing values are dropped first: sorting a mix of strings and NaN raises
# TypeError. A plain loop is used so the cell does not echo a list of None.
for name in sorted(output_df["scientificName"].dropna().unique())[:40]:
    try:
        # Undo the latin1 decoding applied when the input file was read.
        print(bytes(name, "latin1").decode("utf-8"))
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Names substituted from the Orr list were read with pandas' default
        # encoding rather than latin1, so the round trip does not apply to
        # them; show those unchanged.
        print(name)