In [None]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps
from zipfile import ZipFile
from time import time

# Initialise pandarallel once, before the cleaning cell calls DataFrame.parallel_apply.
# NOTE(review): use_memory_fs=True presumably makes pandarallel exchange data with its
# workers through an in-memory filesystem (e.g. /dev/shm) — confirm that is available
# and large enough on the machine running this notebook.
pandarallel.initialize(use_memory_fs=True)

In [None]:
# Raw occurrence records to clean (CSV with a "scientificName" column).
INPUT_FILE = "/data/disk/scratch/evin_test.csv"
# Reference list of accepted bee names (CSV with a "sciName" column).
BEE_LOOKUP_LIST = "2021-02-22_scan-bee-list.csv"
# Output path: "<ISO date>_<input file name without .csv>_cleaned.csv".
# The date is formatted with "%Y-%m-%d" rather than "%F": "%F" is a C-library
# extension that is not available on every platform, while the explicit form
# produces the same YYYY-MM-DD text everywhere.
OUTPUT_FILE = "/data/disk/scratch/{}_{}_cleaned.csv".format(
    datetime.now().strftime("%Y-%m-%d"),
    INPUT_FILE.split("/")[-1].replace(".csv", "")
)

In [None]:
def collapse_whitespace(input_str):
    """Squeeze every run of space characters into a single space and trim the ends.

    Only the literal space character is squeezed; tabs and newlines inside the
    string are left alone, although ``str.strip`` still removes any kind of
    whitespace at the start and end.

    Args:
        input_str: The string to normalise.

    Returns:
        The normalised string.
    """
    squeezed = re.sub(r" +", " ", input_str)
    return squeezed.strip()


def remove_parentheses(row):
    """Delete parenthesised segments from ``row["scientificName"]``.

    Every ``(...)`` group that contains at least one character other than ``)``
    is removed, after which the spacing is normalised with
    ``collapse_whitespace``. A missing name (``pd.isna``) is left untouched.

    Args:
        row: Mapping-like record with a "scientificName" entry; it is modified
            in place.

    Returns:
        The same ``row`` object.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_groups = re.sub(r"\([^)]+\)", "", name)
        row["scientificName"] = collapse_whitespace(without_groups)

    return row


def remove_authorship(row):
    """Drop a trailing "Author, YYYY" citation from ``row["scientificName"]``.

    The citation must sit at the very end of the name and consist of a space,
    one capitalised ASCII word, a comma, whitespace and a four-digit year.
    The spacing of the result is normalised with ``collapse_whitespace``.
    A missing name (``pd.isna``) is left untouched.

    Args:
        row: Mapping-like record with a "scientificName" entry; it is modified
            in place.

    Returns:
        The same ``row`` object.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_author = re.sub(r" [A-Z][a-z]+,\s+\d{4}\s*$", "", name)
        row["scientificName"] = collapse_whitespace(without_author)

    return row


In [None]:
# Load the reference bee names. Missing cells are dropped *before* the string
# conversion (converting NaN to text would otherwise add the bogus name "nan"),
# every name is stripped, and names that are blank after stripping are discarded.
bee_names = pd.read_csv(BEE_LOOKUP_LIST)["sciName"].dropna().astype(str).str.strip()
# Sorted array of distinct names; bee_lookup scans it in this order.
bee_lookup_lst = np.unique(bee_names[bee_names != ""].to_numpy(dtype=str))
# Cache mapping a raw scientific name to the name bee_lookup resolved it to.
bee_lookup_memoized = dict()

def bee_lookup(row, debug_prefixes=("triepeolus scel", "protandrena poli")):
    """Replace ``row["scientificName"]`` with the matching name from the bee list.

    A lookup name matches when, compared case-insensitively and split on single
    spaces, either its first two words equal the first two words of the input
    name or its last two words equal the last two words of the input name.
    The first matching entry of the module-level ``bee_lookup_lst`` wins.
    Results (including "no match") are cached in the module-level
    ``bee_lookup_memoized`` dict, keyed by the raw input name.

    Names with fewer than two words never match, on either side. Lookup
    entries with fewer than two words are skipped explicitly; indexing their
    second / second-to-last word would otherwise raise ``IndexError``.

    Args:
        row: Mapping-like record with a "scientificName" entry; it is modified
            in place.
        debug_prefixes: Lower-case name prefixes for which the resolved mapping
            is printed. Pass an empty tuple to silence the trace.

    Returns:
        The same ``row`` object.
    """
    sci_name = row["scientificName"]

    if pd.isna(sci_name) or sci_name == "":
        return row

    if sci_name in bee_lookup_memoized:
        row["scientificName"] = bee_lookup_memoized[sci_name]
        return row

    sci_name_lower = sci_name.lower()
    sci_name_words = sci_name_lower.split(" ")

    # Falls back to the unchanged input name when nothing matches.
    resolved_name = sci_name

    if len(sci_name_words) >= 2:
        # Loop-invariant: the two word pairs every candidate is compared against.
        sci_head = sci_name_words[:2]
        sci_tail = sci_name_words[-2:]

        for bee_name in bee_lookup_lst:
            bee_name_words = bee_name.lower().split(" ")

            # A one-word (or blank) entry has no word pair to compare.
            if len(bee_name_words) < 2:
                continue

            if bee_name_words[:2] == sci_head or bee_name_words[-2:] == sci_tail:
                resolved_name = bee_name
                break

    bee_lookup_memoized[sci_name] = resolved_name
    row["scientificName"] = resolved_name

    # Optional trace for the taxa currently being investigated.
    if sci_name_lower.startswith(tuple(debug_prefixes)):
        print("{} --> {}".format(sci_name, row["scientificName"]))

    return row

In [None]:
def do_cleaning(row):
    """Run the cleaning pipeline on a single record and return it.

    Only the bee-list lookup is active at the moment; the parenthesis and
    authorship steps are kept below, commented out, as disabled stages.

    Args:
        row: Record with a "scientificName" entry.

    Returns:
        The record returned by ``bee_lookup``.
    """
    # Disabled stages:
    #row = remove_parentheses(row)
    #row = remove_authorship(row)
    return bee_lookup(row)

In [None]:
# Read the raw records in one pass (low_memory=False) and let pandas tighten
# the dtypes of object columns.
raw_records = pd.read_csv(INPUT_FILE, low_memory=False)
input_df = raw_records.infer_objects()

# Blank out missing scientific names and force the column to unicode strings.
sci_name_col = input_df["scientificName"].fillna("")
input_df["scientificName"] = sci_name_col.astype(np.dtype("unicode"))

input_df.head()

In [None]:
# Distinct names going in.
unique_before = len(np.unique(input_df["scientificName"]))
print("Unique scientific names before cleaning: {:,}\n".format(unique_before))

start_time = time()

# Work on a copy without the "Unnamed: 0" and "X" columns, then clean every
# row in parallel.
output_df = input_df.copy().drop(columns=["Unnamed: 0", "X"])
output_df = output_df.parallel_apply(do_cleaning, axis="columns")

# Distinct names coming out.
unique_after = len(np.unique(output_df["scientificName"]))
print("\nUnique scientific names after cleaning: {:,}\n".format(unique_after))

# Wall-clock duration, rounded to whole minutes.
elapsed_minutes = round((time() - start_time) / 60)
print("Cleaning took {} minutes".format(elapsed_minutes))

In [None]:
# Write the cleaned records to OUTPUT_FILE; index=False leaves the DataFrame index out of the CSV.
output_df.to_csv(OUTPUT_FILE, index=False)

In [None]:
# Flag the records whose cleaned name appears in the reference bee list.
is_known_taxon = output_df["scientificName"].isin(bee_lookup_lst)

before_len = len(output_df)
after_len = int(is_known_taxon.sum())

# Everything not flagged would be lost if unknown taxa were dropped.
lost_records = before_len - after_len
print("Dropping unknown taxa would result in the loss of {:,} records".format(lost_records))