In [1]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps

# Start pandarallel so that `parallel_apply` can be used on pandas objects later
# in the notebook. use_memory_fs=True asks it to move data between the main
# process and the workers through a memory-backed file system (this is what the
# INFO lines printed by this cell report).
pandarallel.initialize(use_memory_fs=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Raw occurrence records that this notebook cleans.
INPUT_FILE = "/data/disk/jupyter-notebooks/bees.csv"
# CSV whose "sciName" column holds the reference bee names. The path is
# relative, so it is resolved against the kernel's working directory.
BEE_LOOKUP_LIST = "2021-02-15_scan-bee-list.csv"
# Destination of the cleaned table, prefixed with today's date (YYYY-MM-DD).
# The date is spelled out as "%Y-%m-%d" instead of "%F": "%F" is a C-library
# extension that is not among Python's documented portable format codes, while
# the explicit form produces the same text on every platform.
OUTPUT_FILE = "/data/disk/jupyter-notebooks/{}_bees.csv".format(datetime.now().strftime("%Y-%m-%d"))

In [3]:
def strip_scientificName(row):
    """Normalise the spacing of ``row["scientificName"]``.

    Runs of space characters are collapsed to a single space and leading /
    trailing whitespace is trimmed. The row is updated in place and returned.
    A missing name (as judged by ``pd.isna``) leaves the row untouched.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        collapsed = re.sub(r" +", " ", name)
        row["scientificName"] = collapsed.strip()

    return row


def remove_parentheses(row):
    """Delete parenthesised fragments from ``row["scientificName"]``.

    Every ``(...)`` group holding at least one character is removed, after
    which repeated spaces are collapsed and the ends are trimmed. Empty
    ``()`` pairs do not match the pattern and are therefore kept. The row is
    updated in place and returned; a missing name leaves it untouched.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_groups = re.sub(r"\([^)]+\)", "", name)
        # Removing a group in the middle of a name leaves a double space behind.
        tidy = re.sub(r" +", " ", without_groups)
        row["scientificName"] = tidy.strip()

    return row


def remove_authorship(row):
    """Strip a trailing ``Author, YYYY`` citation from ``row["scientificName"]``.

    Only one shape is recognised: a space, a single word made of one
    uppercase ASCII letter followed by lowercase ASCII letters, a comma,
    whitespace and a four-digit year at the very end of the string (optional
    trailing whitespace allowed). Anything else is left as it is. Afterwards
    repeated spaces are collapsed and the ends trimmed. The row is updated in
    place and returned; a missing name leaves it untouched.
    """
    name = row["scientificName"]

    if not pd.isna(name):
        without_author = re.sub(r" [A-Z][a-z]+,\s+\d{4}\s*$", "", name)
        tidy = re.sub(r" +", " ", without_author)
        row["scientificName"] = tidy.strip()

    return row


In [5]:
# Reference bee names that bee_lookup matches against, as a NumPy unicode array.
#
# Unusable entries are removed before the array is built, because bee_lookup
# does a case-insensitive *substring* test:
#   * a blank cell is read as NaN and would be stringified to the literal
#     "nan", which is contained in real names (e.g. "... nana");
#   * a whitespace-only cell would be contained in every multi-word name.
# Either one would make bee_lookup overwrite valid names with junk. Surrounding
# whitespace is stripped so the names written back into the data are clean.
bee_lookup_lst = (
    pd.read_csv(BEE_LOOKUP_LIST)["sciName"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda names: names != ""]
    .to_numpy(dtype=str)  # same unicode string dtype as before
)

def bee_lookup(row):
    """Swap the row's scientific name for the lookup-list name it contains.

    The names in the module-level ``bee_lookup_lst`` are tried in list order;
    the first one that occurs (case-insensitively) as a substring of
    ``row["scientificName"]`` replaces it. If none matches, or the name is
    missing or empty, the row is left unchanged. The row is updated in place
    and returned.
    """
    name = row["scientificName"]

    # Nothing to match for missing or empty names.
    if pd.isna(name) or name == "":
        return row

    haystack = name.lower()

    # First hit wins, so the order of bee_lookup_lst decides between
    # candidates that are both contained in the name.
    match = next(
        (candidate for candidate in bee_lookup_lst if candidate.lower() in haystack),
        None,
    )
    if match is not None:
        row["scientificName"] = match

    return row

In [6]:
def do_cleaning(row):
    """Run the full scientific-name clean-up on one row and return it.

    The steps are applied in a fixed order: parenthesised fragments are
    removed, a trailing author citation is stripped, spacing is normalised,
    and finally the name is replaced by its lookup-list entry when one
    matches.
    """
    pipeline = (
        remove_parentheses,
        remove_authorship,
        strip_scientificName,
        bee_lookup,
    )
    for step in pipeline:
        row = step(row)
    return row

In [7]:
# Load the raw occurrence records. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-dtype columns.
# NOTE(review): latin1 can decode any byte sequence without raising, so a wrong
# encoding would go unnoticed here — confirm the file really is latin1.
input_df = pd.read_csv(INPUT_FILE, encoding="latin1", low_memory=False)
# Preview the first rows as the cell's output.
input_df.head()

Unnamed: 0.1,Unnamed: 0,id,language,modified,references,institutionCode,collectionCode,ownerInstitutionCode,basisOfRecord,informationWithheld,...,phylum,class,order,family,genus,specificEpithet,infraspecificEpithet,taxonRank,taxonRemarks,geodeticDatum
0,1,2310776000.0,,2018-12-07T00:00:00Z,http://www.ebi.ac.uk/ena/data/view/MH747946,,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,californicus,,SPECIES,,WGS84
1,2,1502550000.0,,,,KU,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,ternarius,,SPECIES,,WGS84
2,3,2562944000.0,,2018-05-10T13:34:45Z,https://scan-bugs.org:443/portal/collections/i...,UCD,BMEC,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,candida,,SPECIES,,WGS84
3,4,3027189000.0,en,,,BBSL,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,cyanura,,SPECIES,,WGS84
4,5,1270737000.0,en,,,Royal Ontario Museum,ROM,Royal Ontario Museum,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,griseocollis,,SPECIES,,WGS84


In [None]:
# Series.nunique() counts the distinct non-missing names. np.unique has to sort
# the column and raises when NaN floats are mixed in with the strings.
print("Unique scientific names before cleaning: {:,}".format(input_df["scientificName"].nunique()))

# do_cleaning only reads and writes row["scientificName"], so the result for a
# row depends on nothing but its name. Each distinct name is therefore cleaned
# once (a plain dict is enough as the "row") and the result is mapped back onto
# the column, instead of repeating the lookup-list scan for every record.
raw_names = input_df["scientificName"]
cleaned_by_raw_name = {
    raw_name: do_cleaning({"scientificName": raw_name})["scientificName"]
    for raw_name in raw_names.dropna().unique()
}

# drop() returns a new frame, so input_df stays untouched without an extra
# .copy(). Missing names have no entry in the mapping and remain missing.
output_df = (
    input_df
    .drop("Unnamed: 0", axis="columns")
    .assign(scientificName=raw_names.map(cleaned_by_raw_name))
)

print("Unique scientific names after cleaning: {:,}".format(output_df["scientificName"].nunique()))

Unique scientific names before cleaning: 6,859


In [None]:
# Save the cleaned table to the dated output path; index=False keeps the
# DataFrame's row index out of the file.
output_df.to_csv(OUTPUT_FILE, index=False)