In [5]:
import re
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from datetime import datetime
from json import dumps as json_dumps

pandarallel.initialize(use_memory_fs=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
INPUT_FILE = "/data/disk/jupyter-notebooks/bees.csv"
BEE_LOOKUP_LIST = "2021-02-15_scan-bee-list.csv"
OUTPUT_FILE = "/data/disk/jupyter-notebooks/{}_bees.csv".format(datetime.now().strftime("%F"))

In [7]:
def collapse_whitespace(input_str):
    return re.sub(r" +", " ", input_str).strip()


def remove_parentheses(row):
    sciName = row["scientificName"]

    if pd.isna(sciName):
        return row

    sciName = re.sub(r"\([^)]+\)", "", sciName)
    sciName = collapse_whitespace(sciName)
    
    row["scientificName"] = sciName
    return row


def remove_authorship(row):
    sciName = row["scientificName"]

    if pd.isna(sciName):
        return row

    sciName = re.sub(r" [A-Z][a-z]+,\s+\d{4}\s*$", "", sciName)
    sciName = collapse_whitespace(sciName)
    
    row["scientificName"] = sciName
    return row


In [22]:
bee_lookup_lst = pd.read_csv(BEE_LOOKUP_LIST)["sciName"].to_numpy(np.dtype("unicode"))
bee_lookup_lst = np.unique(bee_lookup_lst)

def bee_lookup(row):
    global bee_lookup_lst
    
    sciName = row["scientificName"]

    if pd.isna(sciName) or sciName == "":
        return row
    
    sci_name_lower = sciName.lower()
    
    for bee_name in bee_lookup_lst:
        bee_name = bee_name.strip()
        bee_name_lower = bee_name.lower()
        is_match = (
            sci_name_lower.startswith(bee_name_lower) or 
            sci_name_lower.endswith(bee_name_lower)
        )
        
        if is_match:
            row["scientificName"] = bee_name
            break
    
    return row

In [9]:
def do_cleaning(row):
    row = remove_parentheses(row)
    row = remove_authorship(row)
    row = bee_lookup(row)
    return row

In [12]:
input_df = pd.read_csv(INPUT_FILE, low_memory=False).infer_objects()
input_df["scientificName"] = input_df["scientificName"].fillna("").astype(np.dtype("unicode"))
input_df.head()

Unnamed: 0.1,Unnamed: 0,id,language,modified,references,institutionCode,collectionCode,ownerInstitutionCode,basisOfRecord,informationWithheld,...,phylum,class,order,family,genus,specificEpithet,infraspecificEpithet,taxonRank,taxonRemarks,geodeticDatum
0,1,2310776000.0,,2018-12-07T00:00:00Z,http://www.ebi.ac.uk/ena/data/view/MH747946,,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,californicus,,SPECIES,,WGS84
1,2,1502550000.0,,,,KU,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,ternarius,,SPECIES,,WGS84
2,3,2562944000.0,,2018-05-10T13:34:45Z,https://scan-bugs.org:443/portal/collections/i...,UCD,BMEC,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,candida,,SPECIES,,WGS84
3,4,3027189000.0,en,,,BBSL,,,PRESERVED_SPECIMEN,,...,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,cyanura,,SPECIES,,WGS84
4,5,1270737000.0,en,,,Royal Ontario Museum,ROM,Royal Ontario Museum,HUMAN_OBSERVATION,,...,Arthropoda,Insecta,Hymenoptera,Apidae,Bombus,griseocollis,,SPECIES,,WGS84


In [14]:
print("Unique scientific names before cleaning: {:,}".format(len(np.unique(input_df["scientificName"]))))

output_df = input_df.copy().drop("Unnamed: 0", axis="columns")
output_df = output_df.parallel_apply(do_cleaning, axis="columns")
      
print("Unique scientific names after cleaning: {:,}".format(len(np.unique(output_df["scientificName"]))))

Unique scientific names before cleaning: 6,859
Unique scientific names after cleaning: 6,138


In [15]:
output_df.to_csv(OUTPUT_FILE, index=False)

In [21]:
sci_names = np.unique(output_df["scientificName"])
[print(s) for s in sci_names[40:80]]

Ancylandrena larreae
Ancylandrena rozeni
Ancylandrena timberlakei
Ancyloscelis
Ancyloscelis apiformis
Ancyloscelis hertigi
Ancyloscelis melanostoma
Ancyloscelis sejunctus
Ancyloscelis wheeleri
Andrena
Andrena ablegata
Andrena accepta
Andrena acra-01
Andrena aculeata
Andrena aerifera
Andrena afimbriata
Andrena agilis
Andrena agoseridis
Andrena agricolarum Viereck &-28
Andrena alamonis
Andrena albohirta-29
Andrena albovirgata
Andrena alceae
Andrena algida
Andrena aliciae
Andrena aliciarum
Andrena alleghaniensis
Andrena amarilla
Andrena amphibola
Andrena anatolis
Andrena andrenoides
Andrena angelesia
Andrena angustella
Andrena angusticrus
Andrena angustifrons
Andrena angustior
Andrena angustitarsata
Andrena anisochlora
Andrena annectens
Andrena anograe


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [20]:
before_len = len(output_df)
after_len = len(output_df[output_df["scientificName"].isin(bee_lookup_lst)])
print("Dropping unknown taxa would result in the loss of {:,} records".format(before_len - after_len))

Dropping unknown taxa would result in the loss of 378,106 records
