In [3]:
import pandas as pd
from helper import *
import time

### This notebook processes raw OCR text output from post-1990 images and analyzes its accuracy

Load the raw data (OCR text and barcodes), the labeled data, and the NLP and regex models

In [4]:
# Load the inputs used by the rest of the notebook.

# Raw data: supplies the 'OCRText' and 'Barcode' columns read by later cells.
df = pd.read_excel('Post_Raw_Data.xlsx')

# Model object whose clean_text / find_* / predict_* methods are applied below.
# printed_model is not defined in this notebook; it arrives via the star import from helper.
mod = printed_model()

# Labeled data: source of the ground-truth ('Act_*') columns.
df_labels = pd.read_excel('specimens_post_1990.xlsx')

# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

Make predictions using the regex model

In [6]:
# Retrieve entities using regex
starttime = time.time()

# Every extractor below reads the cleaned text, so build that column first.
df['Clean_text'] = df['OCRText'].apply(mod.clean_text)

# Output column -> regex extractor; dict order fixes the order the columns are added.
regex_extractors = {
    'County': mod.find_county,
    'State': mod.find_state,
    'Species': mod.find_species,
    'Date': mod.find_date,
    'Collector': mod.find_collector,
}
for column_name, extractor in regex_extractors.items():
    df[column_name] = df['Clean_text'].apply(extractor)

print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 4.9 secs


Make predictions using the NLP model: County, State and Species are predicted with a custom NER model; Date and Collector are predicted with the general 'en' model

In [7]:
# Predict entities using nlp
starttime = time.time()

# Only rows that have cleaned text are sent to the model; on assignment the
# results align back on the index, so the skipped rows end up as NaN.
clean_rows = df['Clean_text'].dropna()

# Output column -> NLP predictor; dict order fixes the order the columns are added.
nlp_predictors = {
    'Pred_County': mod.predict_county,
    'Pred_State': mod.predict_state,
    'Pred_Species': mod.predict_species,
    'Pred_Date': mod.predict_date,
    'Pred_Collector': mod.predict_collector,
}
for column_name, predictor in nlp_predictors.items():
    df[column_name] = clean_rows.apply(predictor)

print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 255.3 secs


Load labels from labeled data file

In [8]:
# Get ground truth labels
starttime = time.time()

# Target column in df -> source column in df_labels.
# NOTE(review): the copy aligns on the row index, so this presumably relies on
# both spreadsheets listing the same rows in the same order - confirm.
label_sources = {
    'Act_Barcode': 'ColBarcode',
    'Act_County': 'RDECounty',
    'Act_State': 'RDEProvinceState',
    'Act_Date': 'RDEDateFrom',
    'Act_Collector1': 'NamBriefName',
    'Act_Collector2': 'RDECollectionTeam',
}
for target_column, source_column in label_sources.items():
    df[target_column] = df_labels[source_column]

# Drop the last whitespace-separated word of every non-null county label
# (presumably a trailing "County"/"Co." suffix - TODO confirm against the label file).
labelled_counties = df['Act_County'].dropna()
df['Act_County'] = labelled_counties.apply(lambda county: ' '.join(county.split()[:-1]))

print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 0.0 secs


Convert the date columns to datetime format

In [9]:
# Parse the regex, NLP and ground-truth dates into datetimes so they can be
# compared directly; errors='coerce' turns unparseable values into NaT.
for date_column in ('Date', 'Pred_Date', 'Act_Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

In [10]:
def calculate_accuracy(df):
    """Print accuracy summaries for the regex model, the NLP model and the barcodes.

    For State, County, Date and Barcode the reported figure is the percentage
    of rows in ``df`` whose extracted value is exactly equal to the
    ground-truth value. Missing values (NaN/NaT) never compare equal, so such
    rows count as misses. The two "Species predicted" figures are not
    accuracies: they are the percentage of rows with a non-null species value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns State, County, Date, Species, Pred_State,
        Pred_County, Pred_Date, Pred_Species, Barcode, Act_State, Act_County,
        Act_Date and Act_Barcode.

    Returns
    -------
    None
        The three summary lines are printed. For a frame with no rows a short
        notice is printed instead, because every percentage divides by the row
        count and would otherwise raise ZeroDivisionError.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # Nothing to score; bail out before any division by the row count.
        print('No rows to evaluate')
        return

    def match_rate(extracted_col, truth_col):
        # Percentage of rows where the two columns hold exactly equal values.
        return 100 * (df[extracted_col] == df[truth_col]).sum() / total_rows

    def filled_rate(column):
        # Percentage of rows with a non-null value in the column.
        return 100 * df[column].count() / total_rows

    reg_acc_state = match_rate('State', 'Act_State')
    reg_acc_county = match_rate('County', 'Act_County')
    reg_acc_date = match_rate('Date', 'Act_Date')
    reg_perc_species = filled_rate('Species')

    nlp_acc_state = match_rate('Pred_State', 'Act_State')
    nlp_acc_county = match_rate('Pred_County', 'Act_County')
    nlp_acc_date = match_rate('Pred_Date', 'Act_Date')
    nlp_perc_species = filled_rate('Pred_Species')

    acc_barcode = match_rate('Barcode', 'Act_Barcode')

    print('Regex Accuracy ======> State: {:.1f}%, County: {:.1f}%, Date: {:.1f}%, Species predicted: {:.1f}%'.format(
        reg_acc_state, reg_acc_county, reg_acc_date, reg_perc_species))
    print('NLP Accuracy ======> State: {:.1f}%, County: {:.1f}%, Date: {:.1f}%, Species predicted: {:.1f}%'.format(
        nlp_acc_state, nlp_acc_county, nlp_acc_date, nlp_perc_species))
    print('Barcode accuracy ======> {:.1f}%'.format(acc_barcode))

Calculate accuracy for both models

In [11]:
# Baseline: score both models and the barcodes on every row of df.
calculate_accuracy(df)



Let's flag rows where the labeled State or County does not appear in OCRText, so they can be left out of the accuracy calculation; most of these are manual entry errors

In [12]:
def find_errors(x):
    """Flag a row whose labeled county or state does not occur in its OCR text.

    The check is a case-insensitive substring test of ``x.Act_County`` and
    ``x.Act_State`` against ``x.OCRText``.

    Parameters
    ----------
    x : row-like object (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must expose ``OCRText``, ``Act_County`` and ``Act_State`` attributes.

    Returns
    -------
    'Error' or None
        'Error' when a usable label is absent from the OCR text, otherwise
        None. Fields that are missing (None/NaN), empty, or not strings are
        skipped rather than compared.
    """
    def _is_text(value):
        # NaN is truthy, so a bare truthiness test would let a missing value
        # through to .lower() and raise AttributeError; require a real,
        # non-empty string instead.
        return isinstance(value, str) and value != ''

    if not _is_text(x.OCRText):
        return None

    ocr_text = x.OCRText.lower()
    for label in (x.Act_County, x.Act_State):
        if _is_text(label) and label.lower() not in ocr_text:
            return 'Error'
    return None
    
# Only rows with OCR text and both labels are checked; every other row keeps
# NaN in 'Error' because the assignment aligns on the index.
checkable_rows = df.dropna(subset=['Act_State', 'Act_County', 'OCRText'])
df['Error'] = checkable_rows.apply(find_errors, axis=1)

# Rows whose labeled county or state was not found in the OCR text.
dfError = df.loc[df['Error'] == 'Error']
dfError

Unnamed: 0.1,Unnamed: 0,OCRText,Barcode,Clean_text,County,State,Species,Date,Collector,Pred_County,...,Pred_Species,Pred_Date,Pred_Collector,Act_Barcode,Act_County,Act_State,Act_Date,Act_Collector1,Act_Collector2,Error
104,105,"Say\nScrophulariaceae\nCOLORADO, U.S.A.\nVeron...",3339039,"Scrophulariaceae\nCOLORADO, U.S.A.\nVeronica w...",Clear Creek,Colorado,Veronica wormskjoldii,2004-08-05,Erin Foley,Clear Creek,...,Veronica wormskjoldii,NaT,Erin Foley,3339039,Grand,Colorado,2004-08-05,E. Foley,,Error
106,107,"aa\n4 i\nTo\n‚Äòeet\nvu\nAsteraceae\nCOLORADO,...",3234787,"‚Äòeet vu\nAsteraceae\nCOLORADO, U.S.A.\nTones...",Clear Creek,Colorado,Tonestus pygmaeus,2004-08-16,Erin Foley,Clear Creek,...,Tonestus pygmaeus,NaT,Erin Foley\n,3234787,Grand,Colorado,2004-08-16,E. Foley,,Error
198,199,PLANTS OF COLORADO\nASTERACEAE\nLactuca oblong...,2925441,PLANTS OF COLORADO\nASTERACEAE\nLactuca oblong...,Ei Paso,Colorado,Lactuca oblongifolia,1999-08-05,Robert Merrill King Robert M Garvey,Paso,...,Lactuca oblongifolia,1999-08-05,Robert Merrill King,2925441,El Paso,Colorado,1999-08-05,R. M. King,R. M. Garvey,Error
298,299,"R-369339\nSan Miguel Co., Colorado, USA\nGilia...",3261965,"R-369339\nSan Miguel Co., Colorado, USA\nGilia...",San Miguel,Colorado,Gilia pinnatifida,1996-08-14,Duane Atwood,San Miguel,...,Gilia pinnatifida,1996-08-14,Duane Atwood,3261965,Saguache,Colorado,1996-08-14,D. Atwood,,Error
343,344,We\nPLANTS OF NEW YORK\nCOLLECTED FOR THE HERB...,3134482,PLANTS OF NEW YORK\nCOLLECTED FOR THE HERBARIU...,,New York,Maianthemum canadens,NaT,Martin Bennett,,...,Maianthemum canadens@,NaT,Martin Bennett,3134482,Nassau,New York,2012-05-08,M. Bennett,Gleason & Cronquist,Error
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4780,4781,Plants of\nScorzonera laciniata L.\nLaramie Co...,3110153,Plants of Scorzonera laciniata L.\nLaramie Co....,Laramie,,,1992-06-11,R Dorn,Laramie,...,Scorzonera laciniata,1992-06-11,R. Dorn,3110153,Laramie,Wyoming,1992-06-11,R. D. Dorn,,Error
4806,4807,"County, Utah\nUSA\nScripus cespitosus L.\nT3N ...",2903864,"County, Utah\nScripus cespitosus L.\nT3N R6W S...",,Utah,,1993-07-27,F Bartlett Plunkett Johnson Gunnell,,...,Scripus cespitosus,1993-07-27,F. Bartlett,2903864,Duchesne,Utah,1993-07-27,S. Goodrich,"F Bartlett, C Plunkett, Johnson, Gunnell",Error
4915,4916,FLORA QF\nU.S.A.\nArenaria congesta Nutt.\nvar...,3184705,FLORA QF\nU.S.A.\nArenaria congesta Nutt. var....,Albany,,Arenaria congesta,1995-08-26,Jim Jean Jewell,Albany,...,Arenaria congesta,1995-08-26,Jean Jewell,3184705,Albany,Wyoming,1995-08-26,J. Jewell,J. Jewell,Error
4958,4959,The New York Botanical Garden\nINSTITUTE OF EC...,9999999,INSTITUTE OF ECONOMIC BOTANY\nMerck Collection...,I,New Mexico,Psilostrophe tagetina,1997-05-25,Jay B Walker,Celtis,...,Psilostrophe tagetina,1997-05-25,Seth Baker,87303,Eddy,New Mexico,1997-05-25,J. B. Walker,S. Baker,Error


Let's calculate accuracy again

In [13]:
# Re-score with the flagged rows left out; rows that were never checked
# (NaN in 'Error') are kept.
calculate_accuracy(df[df['Error'].ne('Error')])



Save files

In [14]:
# Write the full analysis frame and the flagged rows to CSV.
# The row index is written as the first column of each file (to_csv default).
for frame, csv_path in ((df, 'Post_90_Analysis.csv'), (dfError, 'Post_90_Label_Errors.csv')):
    frame.to_csv(csv_path)