In [1]:
import pandas as pd
from helper import *
import time
import ocr

### This notebook processes raw OCR text output from pre-1900 specimen images and analyzes its accuracy

Load raw data with OCR text/barcodes, labeled data, nlp model and regex model

In [2]:
# Silence pandas' chained-assignment warning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Spreadsheet inputs: the OCR output to analyse and the hand-labeled specimen records.
df = pd.read_excel('Pre_Raw_Data.xlsx')
df_labels = pd.read_excel('specimens_pre_1900.xlsx')

# Entity-extraction model (presumably provided by the `helper` star-import).
mod = handwritten_model()

Make predictions using the model

In [4]:
# Extract entities from the OCR text: state and species via the NLP model,
# year and barcode via the regex-based finders.
starttime = time.time()

# Rows without OCR text are skipped here, so their predictions stay NaN.
ocr_text = df['OCRText'].dropna()

# Output column -> function applied to each OCR text string.
extractors = {
    'State': mod.predict_state,
    'Species': mod.predict_species,
    'Year': mod.find_year,
    'Barcode': ocr.find_barcode,
}
for column, extract in extractors.items():
    df[column] = ocr_text.apply(extract)

print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 39.2 secs


Load labels from labeled data file

In [5]:
# Copy the ground-truth columns from the labeled file into df under Act_* names.
# Column assignment aligns on the row index, so both frames must list the
# specimens under the same index for the labels to line up.
starttime = time.time()
for target, source in (
    ('Act_Barcode', 'Barcode'),
    ('Act_State', 'ProvinceState'),
    ('Act_Date', 'DateFrom'),
):
    df[target] = df_labels[source]
print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 0.0 secs


In [7]:
# Inspect the frame: predicted columns next to the Act_* label columns.
df

Unnamed: 0.1,Unnamed: 0,OCRText,State,Species,Year,Barcode,Act_Barcode,Act_County,Act_State,Act_Date,Act_Collector
0,1,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,Eriogonum corymbosum,1966.0,3103066.0,3103066,Montezuma,Colorado,1898-06-29,C. S. Crandall
1,2,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,A. globosa,1962.0,2739619.0,2739619,,Colorado,1894-07-09,C. S. Crandall
2,3,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,,1965.0,3097297.0,3097297,Chaffee,Colorado,1897-08-19,C. S. Crandall
3,4,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,Pulialiller ludoviciana,,2740422.0,2740422,Larimer,Colorado,1893-05-12,C. S. Crandall
4,5,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,,1945.0,2742023.0,2742023,,Colorado,1895-08-01,C. S. Crandall
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,,1898.0,1894858.0,1894858,Clear Creek,Colorado,1895-08-27,C. L. Shear
4996,4997,CARDEN THE NEW YORK BOTANICAL GARDEN copyright...,,mermeria exicana,1896.0,1900752.0,1900752,,Colorado,1896-07-20,C. L. Shear
4997,4998,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,Chrysothamnus parryi,1926.0,2040491.0,2040491,Lake,Colorado,1896-08-20,C. L. Shear
4998,4999,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,,1896.0,2883277.0,2883277,,Colorado,1896-08-24,C. L. Shear


Convert the labeled dates to datetime values and extract the year

In [8]:
# Parse the labeled dates; values that cannot be parsed become NaT instead of raising.
df['Act_Date'] = pd.to_datetime(df['Act_Date'], errors='coerce')

# Take the calendar year of every valid date. Rows dropped here have no value
# to align back onto df, so they end up as NaN in Act_Year.
valid_dates = df['Act_Date'].dropna()
df['Act_Year'] = valid_dates.dt.year.astype('int64')

In [9]:

# Display the frame again to check the parsed Act_Date and derived Act_Year columns.
df

Unnamed: 0.1,Unnamed: 0,OCRText,State,Species,Year,Barcode,Act_Barcode,Act_County,Act_State,Act_Date,Act_Collector,Act_Year
0,1,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,Eriogonum corymbosum,1966.0,3103066.0,3103066,Montezuma,Colorado,1898-06-29,C. S. Crandall,1898.0
1,2,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,A. globosa,1962.0,2739619.0,2739619,,Colorado,1894-07-09,C. S. Crandall,1894.0
2,3,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,,1965.0,3097297.0,3097297,Chaffee,Colorado,1897-08-19,C. S. Crandall,1897.0
3,4,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,Pulialiller ludoviciana,,2740422.0,2740422,Larimer,Colorado,1893-05-12,C. S. Crandall,1893.0
4,5,THE NEW YORK BOTANICAL GARDEN copyright reserv...,Colorado,,1945.0,2742023.0,2742023,,Colorado,1895-08-01,C. S. Crandall,1895.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,,1898.0,1894858.0,1894858,Clear Creek,Colorado,1895-08-27,C. L. Shear,1895.0
4996,4997,CARDEN THE NEW YORK BOTANICAL GARDEN copyright...,,mermeria exicana,1896.0,1900752.0,1900752,,Colorado,1896-07-20,C. L. Shear,1896.0
4997,4998,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,Chrysothamnus parryi,1926.0,2040491.0,2040491,Lake,Colorado,1896-08-20,C. L. Shear,1896.0
4998,4999,THE NEW YORK BOTANICAL GARDEN copyright reserv...,,,1896.0,2883277.0,2883277,,Colorado,1896-08-24,C. L. Shear,1896.0


In [14]:
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or NaN when ``whole`` is 0."""
    if whole == 0:
        # No rows to score: the rate is undefined. Returning NaN explicitly
        # avoids a numpy divide-by-zero RuntimeWarning and an ``inf`` result.
        return float('nan')
    return 100 * part / whole


def calculate_accuracy(df):
    """Print how well the predicted columns agree with the labeled columns.

    Every rate is a percentage of the rows that have OCR text (non-null
    ``OCRText``):

    * State   -- rows where ``State`` equals ``Act_State``
    * Year    -- rows where ``Year`` equals ``Act_Year``
    * Barcode -- rows where ``Barcode`` equals ``Act_Barcode``
    * Species -- rows with any non-null ``Species`` prediction (coverage,
      not correctness; no label column is consulted)

    A missing prediction never equals its label, so it counts as a miss.
    When no row has OCR text, every rate is reported as ``nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``OCRText``, ``State``, ``Act_State``,
        ``Year``, ``Act_Year``, ``Barcode``, ``Act_Barcode`` and ``Species``.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # All four rates share this denominator, so compute it once.
    n_with_text = df['OCRText'].count()

    # Summing the boolean comparison counts the matching rows directly; it is
    # the same number as filtering the frame and reading its row count, but
    # without materialising a filtered copy of the frame each time.
    acc_state = _percentage((df['State'] == df['Act_State']).sum(), n_with_text)
    acc_date = _percentage((df['Year'] == df['Act_Year']).sum(), n_with_text)
    acc_barcode = _percentage((df['Barcode'] == df['Act_Barcode']).sum(), n_with_text)

    perc_species = _percentage(df['Species'].count(), n_with_text)

    print('Accuracy ======> State: {:.1f}%, Year: {:.1f}%, Species predicted: {:.1f}%'.format(
        acc_state, acc_date, perc_species))
    print('Barcode accuracy ======> {:.1f}%'.format(acc_barcode))

Calculate prediction accuracy

In [15]:
# Print the accuracy summary for the whole frame.
calculate_accuracy(df)



Save files

In [17]:
# Persist the frame (OCR text, predictions and labels) for later analysis.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default — confirm that is intended (pass index=False to omit it).
df.to_csv('Pre_90_Analysis.csv')