In [1]:
import pandas as pd
from model.helper import *
import time
from model.ocr import *

### This notebook processes raw OCR text output from pre-1900 images and analyzes the accuracy of the extracted fields

Load the raw data (OCR text and barcodes), the labeled data, and the NLP and regex models

In [2]:
# Inputs: the raw OCR export and the labeled pre-1900 specimen sheet.
df = pd.read_excel('data/handwritten_raw_ocr_data.xlsx')
df_labels = pd.read_excel('data/specimens_pre_1900.xlsx')

# Entity extractor used below for state, species and year.
mod = handwritten_model()

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

Make predictions using the model

In [3]:
# Extract entities from the OCR text: NLP for state and species, regex for year and barcode.
starttime = time.time()

# Rows without OCR text are skipped; they are left as NaN in the new columns.
ocr_text = df['OCRText'].dropna()
for target_column, extract in (
    ('State', mod.predict_state),
    ('Species', mod.predict_species),
    ('Year', mod.find_year),
    ('Barcode', find_barcode),
):
    df[target_column] = ocr_text.apply(extract)

print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 38.8 secs


Load labels from labeled data file

In [4]:
# Attach the ground-truth labels next to the predictions.
# Column assignment aligns on the index; both frames were read with the default
# integer index, so labels are paired with OCR rows by row position.
# NOTE(review): this presumes both spreadsheets list specimens in the same order -- confirm.
starttime = time.time()
for actual_column, label_column in (
    ('Act_Barcode', 'Barcode'),
    ('Act_State', 'ProvinceState'),
    ('Act_Date', 'DateFrom'),
):
    df[actual_column] = df_labels[label_column]
print('Total time: {:.1f} secs'.format(time.time()-starttime))

Total time: 0.0 secs


Convert the label dates to datetime values and extract the year

In [5]:
# Parse the label dates; values that cannot be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(df['Act_Date'], errors='coerce')
df['Act_Date'] = parsed_dates

# Take the calendar year of every parsed date. NaT rows are dropped first, so
# they end up as NaN in Act_Year once the result is aligned back onto df.
df['Act_Year'] = (
    parsed_dates
    .dropna()
    .apply(lambda stamp: stamp.year)
    .astype('int64')
)

In [6]:
def calculate_accuracy(df):
    """Print accuracy figures for the extracted fields.

    Every percentage uses the number of rows with non-null ``OCRText`` as
    its denominator.

    Args:
        df: DataFrame holding ``OCRText``, the predicted columns ``State``,
            ``Year``, ``Barcode`` and ``Species``, and the label columns
            ``Act_State``, ``Act_Year`` and ``Act_Barcode``.

    Returns:
        None. Two summary lines are printed to stdout.
    """
    n_with_text = df['OCRText'].count()

    def match_rate(predicted, actual):
        # Percentage of rows where the prediction equals the label.
        n_hits = int((df[predicted] == df[actual]).sum())
        return 100 * n_hits / n_with_text

    state_pct = match_rate('State', 'Act_State')
    year_pct = match_rate('Year', 'Act_Year')
    barcode_pct = match_rate('Barcode', 'Act_Barcode')

    # No species label is compared here; only the share of rows with a
    # predicted species is reported.
    species_pct = 100 * df['Species'].count() / n_with_text

    summary = 'Accuracy ======> State: {:.1f}%, Year: {:.1f}%, Species predicted: {:.1f}%'
    print(summary.format(state_pct, year_pct, species_pct))
    print('Barcode accuracy ======> {:.1f}%'.format(barcode_pct))

Calculate accuracy

In [7]:
# Print the accuracy summary for the full prediction frame.
calculate_accuracy(df)



Save the predictions and labels to a CSV file

In [10]:
# Write predictions and labels to disk (to_csv also writes the index as the first column by default).
df.to_csv('data/handwritten_prediction_analysis.csv')