In [20]:
import pandas as pd
from model.helper import *
from model.train_ner import *
import time

### This notebook processes raw OCR text output from post-1990 images and analyzes prediction accuracy

Load raw data with OCR text/barcodes, labeled data, nlp model and regex model

In [21]:
# Load raw data
# Raw OCR output to run the predictions on.
df = pd.read_excel('data/printed_raw_ocr_data.xlsx')
# Project model object; the cells below call its find_* (regex) and predict_* (NLP) methods.
mod = printed_model()
# Labeled data used as ground truth for the Act_* columns.
df_labels = pd.read_excel('data/specimens_post_1990.xlsx')
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.set_option('mode.chained_assignment', None)

Make predictions using the regex model

In [22]:
# Retrieve entities using regex
starttime = time.time()
# Clean the OCR text first; every regex extractor reads the cleaned column.
df['Clean_text'] = df['OCRText'].apply(mod.clean_text)
regex_extractors = {
    'County': mod.find_county,
    'State': mod.find_state,
    'Species': mod.find_species,
    'Date': mod.find_date,
    'Collector': mod.find_collector,
}
for target_col, extractor in regex_extractors.items():
    df[target_col] = df['Clean_text'].apply(extractor)
elapsed = time.time() - starttime
print('Total time: {:.1f} secs'.format(elapsed))

Total time: 5.5 secs


Make predictions using the nlp model: County, State and Species are predicted using a custom NER model, Date and Collector with general 'en' model

In [23]:
# Predict entities using nlp
starttime = time.time()
# Only rows that have cleaned text are scored; the skipped rows end up as NaN
# in the Pred_* columns because the assignment aligns on the index.
text_to_score = df['Clean_text'].dropna()
nlp_predictors = {
    'Pred_County': mod.predict_county,
    'Pred_State': mod.predict_state,
    'Pred_Species': mod.predict_species,
    'Pred_Date': mod.predict_date,
    'Pred_Collector': mod.predict_collector,
}
for target_col, predictor in nlp_predictors.items():
    df[target_col] = text_to_score.apply(predictor)
elapsed = time.time() - starttime
print('Total time: {:.1f} secs'.format(elapsed))

Total time: 239.6 secs


In [24]:
# Fall back to the regex State wherever the NLP model produced no State.
# fillna treats both None and NaN as missing, so rows that were skipped by the
# dropna() in the prediction cell (NaN after index alignment) are covered too,
# and the row-wise apply is no longer needed.
df['Pred_State'] = df['Pred_State'].fillna(df['State'])

Load labels from labeled data file

In [25]:
# Get ground truth labels
starttime = time.time()
# Column-to-column copy from the labeled file. Assignment aligns on the row
# index, so this assumes df and df_labels list the specimens in the same
# order -- TODO confirm.
label_sources = {
    'Act_Barcode': 'ColBarcode',
    'Act_County': 'RDECounty',
    'Act_State': 'RDEProvinceState',
    'Act_Date': 'RDEDateFrom',
    'Act_Collector1': 'NamBriefName',
    'Act_Collector2': 'RDECollectionTeam',
}
for actual_col, label_col in label_sources.items():
    df[actual_col] = df_labels[label_col]
# Drop the last whitespace-separated token of each county label
# (presumably a trailing "County"/"Co." suffix -- verify against the data).
labeled_county = df['Act_County'].dropna()
df['Act_County'] = labeled_county.apply(lambda x:' '.join(x.split()[:-1]))
elapsed = time.time() - starttime
print('Total time: {:.1f} secs'.format(elapsed))

Total time: 0.0 secs


Change dates to date formats

In [26]:
# Parse every date column to datetime; values that cannot be parsed become NaT.
for date_col in ('Date', 'Pred_Date', 'Act_Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


In [27]:
def calculate_accuracy(df):
    """Print accuracy of the regex and NLP predictions against the labeled values.

    Every percentage uses the total number of rows in ``df`` as its
    denominator. State, County, Date and Barcode count exact matches with the
    corresponding ``Act_*`` column. Species is not compared to a label; the
    share of rows with a non-null prediction is reported instead.

    The overall figure counts rows where the NLP State and County match, a
    Species was predicted, the Barcode matches and the regex Date matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns State, County, Date, Species, Pred_State,
        Pred_County, Pred_Date, Pred_Species, Barcode, Act_State, Act_County,
        Act_Date and Act_Barcode.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    total = df.shape[0]
    if total == 0:
        # Nothing to score; avoids a ZeroDivisionError on an empty frame.
        print('No rows to evaluate')
        return

    def pct(count):
        # Share of all rows, as a percentage.
        return 100 * count / total

    def match_pct(pred_col, act_col):
        # Percentage of rows where the prediction equals the label exactly.
        return pct(df[df[pred_col] == df[act_col]].shape[0])

    reg_acc_state = match_pct('State', 'Act_State')
    reg_acc_county = match_pct('County', 'Act_County')
    reg_acc_date = match_pct('Date', 'Act_Date')
    reg_perc_species = pct(df['Species'].count())

    nlp_acc_state = match_pct('Pred_State', 'Act_State')
    nlp_acc_county = match_pct('Pred_County', 'Act_County')
    nlp_acc_date = match_pct('Pred_Date', 'Act_Date')
    nlp_perc_species = pct(df['Pred_Species'].count())

    acc_barcode = match_pct('Barcode', 'Act_Barcode')

    # A row counts as fully correct only if every field used downstream is right.
    all_correct = ((df['Pred_State'] == df['Act_State']) &
                   (df['Pred_County'] == df['Act_County']) &
                   (df['Pred_Species'].notnull()) &
                   (df['Barcode'] == df['Act_Barcode']) &
                   (df['Date'] == df['Act_Date']))
    acc_overall = pct(df[all_correct].shape[0])

    print('Regex Accuracy ======> State: {:.1f}%, County: {:.1f}%, Date: {:.1f}%, Species predicted: {:.1f}%'.format(
        reg_acc_state,reg_acc_county, reg_acc_date, reg_perc_species))
    print('NLP Accuracy ======> State: {:.1f}%, County: {:.1f}%, Date: {:.1f}%, Species predicted: {:.1f}%'.format(
        nlp_acc_state, nlp_acc_county, nlp_acc_date, nlp_perc_species))
    print('Barcode accuracy ======> {:.1f}%'.format(acc_barcode))
    # Previously computed but never shown.
    print('Overall accuracy ======> {:.1f}%'.format(acc_overall))

Calculate accuracy for both models

In [28]:
# Baseline accuracy on all rows, before known label errors are removed further down.
calculate_accuracy(df)



Let's flag rows where the labeled State or County does not appear in OCRText; most of these are manual entry errors

In [29]:
def find_errors(x):
    """Flag a row whose labeled County or State is absent from its OCR text.

    ``x`` is a row-like object exposing ``OCRText``, ``Act_County`` and
    ``Act_State``. The match is a case-insensitive substring test and the
    County is checked before the State.

    Returns 'County Mislabeled', 'State Mislabeled', or None when nothing is
    flagged (including when the OCR text is empty).
    """
    if not x.OCRText:
        return None
    label_checks = (
        ('Act_County', 'County Mislabeled'),
        ('Act_State', 'State Mislabeled'),
    )
    for field, verdict in label_checks:
        labeled = getattr(x, field)
        # An empty label is skipped rather than reported.
        if labeled and labeled.lower() not in x.OCRText.lower():
            return verdict
    return None
    
# Only rows with both labels and OCR text are checked; all other rows get NaN in 'Error'.
df['Error'] = (
    df.dropna(subset=['Act_State', 'Act_County', 'OCRText'])
      .apply(find_errors, axis=1)
)
# Flagged rows, kept separately for manual review.
dfError = df.loc[df['Error'].notna()]

In [30]:
def find_mismatches(x):
    """Flag a row whose predicted County or State differs from its label.

    ``x`` is a row-like object exposing ``OCRText``, ``Act_County``,
    ``Pred_County``, ``Act_State`` and ``Pred_State``. The comparison is
    case-insensitive and the County is checked before the State. A field is
    compared only when both the label and the prediction are non-empty
    strings; a missing prediction (None or NaN) is skipped rather than
    reported.

    Returns 'County Mismatched', 'State Mismatched', or None.
    """
    if not x.OCRText:
        return None
    for field, verdict in (('County', 'County Mismatched'),
                           ('State', 'State Mismatched')):
        actual = getattr(x, 'Act_' + field)
        predicted = getattr(x, 'Pred_' + field)
        # NaN is truthy, so a plain truthiness test would let a missing
        # prediction through and crash on .lower(); require real strings.
        if not (isinstance(actual, str) and isinstance(predicted, str)):
            continue
        if actual and predicted and predicted.lower() != actual.lower():
            return verdict
    return None
    
# Only rows with both labels and OCR text are compared; all other rows get NaN in 'Mismatch'.
df['Mismatch'] = (
    df.dropna(subset=['Act_State', 'Act_County', 'OCRText'])
      .apply(find_mismatches, axis=1)
)
# Rows where a prediction disagrees with its label, kept separately for manual review.
dfMismatch = df.loc[df['Mismatch'].notna()]

The errors/mismatches were manually evaluated for State and County. Those that originate from the specimen label itself or from label entry errors were saved in a file; we'll remove them from our evaluation metrics.

In [31]:
# Manually reviewed rows whose State/County problem comes from the label data, not the model.
df_data_errors = pd.read_csv('data/printed_label_errors.csv')
# Preview the first rows.
df_data_errors.head()

Unnamed: 0,Barcode,County,State,Date,Collector,Collector 2,Error,Error Description,Pred Made?,Pred Right?,Close Match?,Category
0,3339039,Grand,Colorado,8/5/04,E. Foley,,County Mislabeled,Label wrong (should be Clear Creek),Yes,Yes,No,Label Entry Error
1,3234787,Grand,Colorado,8/16/04,E. Foley,,County Mislabeled,Label wrong (should be Clear Creek),Yes,Yes,No,Label Entry Error
2,3261965,Saguache,Colorado,8/14/96,D. Atwood,,County Mislabeled,Label wrong (should be San Miguel),Yes,Yes,No,Label Entry Error
3,3134482,Nassau,New York,5/8/12,M. Bennett,Gleason & Cronquist,County Mislabeled,Label wrong (no county listed),No,Yes,No,Specimen Label Error
4,3134483,Nassau,New York,5/8/12,M. Bennett,Gleason & Cronquist,County Mislabeled,Label wrong (no county listed),No,Yes,No,Specimen Label Error


Let's remove these from the dataframe and calculate accuracy again

In [32]:
# Exclude specimens whose labeled barcode appears in the reviewed error file, then re-score.
is_known_label_error = df.Act_Barcode.isin(df_data_errors.Barcode)
df = df[~is_known_label_error]
calculate_accuracy(df)



Save files

In [33]:
# Persist predictions and labels. The row index is written too (to_csv default),
# so it comes back as an extra unnamed column when the file is read again.
df.to_csv('data/printed_prediction_analysis.csv')

We'll use the NLP model for State, County and Species, and the regex model for Date

In [34]:
def calc_metrics(df, pred_col,act_col):
    """Compute precision, recall and F1 for one predicted column against its label.

    A true positive is a row where the prediction equals the label exactly.
    Precision divides by the number of non-null predictions, recall by the
    number of non-null labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    pred_col : str
        Name of the prediction column.
    act_col : str
        Name of the label column.

    Returns
    -------
    list
        ``[pred_col, precision, recall, f1]``. A ratio whose denominator is
        zero is reported as 0.0 instead of NaN.
    """
    true_pos = int((df[pred_col] == df[act_col]).sum())
    n_predicted = int(df[pred_col].notnull().sum())
    n_labeled = int(df[act_col].notnull().sum())

    # Guard each ratio: no predictions, no labels, or no matches would
    # otherwise produce NaN together with a RuntimeWarning.
    precision = true_pos / n_predicted if n_predicted else 0.0
    recall = true_pos / n_labeled if n_labeled else 0.0
    pr_sum = precision + recall
    f1_score = (2 * precision * recall) / pr_sum if pr_sum else 0.0
    return [pred_col, precision, recall, f1_score]

Let's calculate precision, recall and f1-score

In [35]:
# (prediction column, label column) pairs to score: NLP for State/County,
# the OCR barcode, and the regex Date.
metric_pairs = [
    ('Pred_State', 'Act_State'),
    ('Pred_County', 'Act_County'),
    ('Barcode', 'Act_Barcode'),
    ('Date', 'Act_Date'),
]
# Build the table in one step: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is unnecessary here.
dfeval = pd.DataFrame(
    [calc_metrics(df, pred_col, act_col) for pred_col, act_col in metric_pairs],
    columns=['Entity', 'Precision', 'Recall', 'F1Score'],
)
dfeval.round(3)

Unnamed: 0,Entity,Precision,Recall,F1Score
0,Pred_State,0.991,0.99,0.991
1,Pred_County,0.977,0.972,0.974
2,Barcode,0.992,0.99,0.991
3,Date,0.99,0.948,0.968


Let's also calculate scores using spacy scorer

In [36]:
# Reload the saved analysis. Values come back as read_csv parses them
# (date columns are not parsed back to datetime by default).
df = pd.read_csv('data/printed_prediction_analysis.csv')

In [37]:
# Inputs are the raw OCR text; targets are the NLP predictions, renamed to the
# entity labels State / County / Species before being handed to gen_data.
X = df['OCRText']
entity_names = {'Pred_State': 'State', 'Pred_County': 'County',
                'Pred_Species': 'Species'}
Y = df[['Pred_State','Pred_County','Pred_Species']].rename(columns=entity_names)
test_data, missing_data, overlap_data = gen_data(X,Y)

Found errors in data, see data_errors.txt


In [38]:
# Number of examples gen_data returned for evaluation.
len(test_data)

4917

In [39]:
# Fresh model instance; score its custom NER pipeline (mod.nlp_cus) on the generated test data.
mod = printed_model()
score = evaluate_model(mod.nlp_cus,test_data)

In [40]:
# Per-entity-type scores: 'p' / 'r' / 'f' keys for each of Species, State and County.
print(score['ents_per_type'])

{'Species': {'p': 96.73257023933402, 'r': 99.55022488755623, 'f': 98.12117373865316}, 'State': {'p': 97.82202862476665, 'r': 97.45815251084935, 'f': 97.63975155279503}, 'County': {'p': 98.53151131960024, 'r': 99.6493399339934, 'f': 99.08727310019485}}


The scores on states look slightly worse here than in the calculations above; this makes sense because we applied the regex model after the predicted model to increase the prediction rate

For our evaluation, we'll use the scores that we calculated above