In [18]:
from datasets import load_dataset
import numpy as np
import cv2
import math
import pytesseract
import re
import time

In [19]:
ds = load_dataset("lansinuote/ocr_id_card")

In [20]:
# okay time for some actual functionality: let's make an accuracy checker
def accuracyChecker(words, testData):
    name = testData[0]['word']
    accuracy = 0
    yearFound = False
    monthFound = False
    dayFound = False
    IDFound = False

    for i in range(len(words)):
        if words[i] == name:
            accuracy += 1
        if re.fullmatch(r"\d{4}", words[i]) and not yearFound:
            if 2025 > int(words[i]) >= 1900:
                accuracy += 1
                yearFound = True
        if re.fullmatch(r"\d{1,2}", words[i]) and not monthFound:
            if 12 > int(words[i]) >= 1:
                accuracy += 1
                monthFound = True
        if re.fullmatch(r"\d{1,2}", words[i]) and not dayFound:
            if 31 > int(words[i]) >= 1:
                accuracy += 1
                dayFound = True
        if re.fullmatch(r"\d{18}", words[i]) and not IDFound:
            accuracy += 1
            IDFound = True
    return accuracy / 5


In [21]:
# okay gonna run all the data through the whole pipeline, this should make it possible to change a pipeline and compute average accuracy of that pipeline

accuracy = []
latency = []
numImages = len(ds['train'])
numImages = 100
for imgNum in range(numImages):
    # collect an image from the dataset and turn it into an array for edge detection, use greyscale
    currImg = ds['train'][imgNum]['image']
    testingImageArray = np.array(currImg)
    edgeDetectionArray = cv2.cvtColor(testingImageArray, cv2.COLOR_RGB2GRAY)

    # apply canny edge detection
    edges = cv2.Canny(edgeDetectionArray, 100, 200)

    # do a hough lines transform
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 75, None, 50, 10)
    linesTest = np.zeros(edges.shape) + 250
    for i in range(0, len(lines)):
        l = lines[i][0]
        cv2.line(linesTest, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)

    # find the angles at which each line lies
    theta = np.zeros(len(lines))
    for i in range(0, len(lines)):
        l = lines[i][0]
        if abs(l[3] - l[1]) == 0:
            # horizontal line
            theta[i] = 0
        else:
            theta[i] = math.atan(abs(l[2] - l[0])/abs(l[3] - l[1]))

    # find smallest angle greater than pi/4
    optTheta = 100
    optIndex = 0
    for k in range(len(theta)):
        if theta[k] > np.pi/4:
            if theta[k] < optTheta:
                optTheta = theta[k]
                optIndex = k
    mostHorz = np.float32(lines[optIndex][0])
    # this actually ends up being the least horizontal of the still pretty horizontal lines, deals with noise from extra lines created from the text

    # find some points to warp the image, use the midpoint of the lines offset a bit, and the new destination of the endpoints of the least horizontal but still horizontal line
    mdpnt = [(mostHorz[0] + mostHorz[2])/2,(mostHorz[1] + mostHorz[3])/2]
    adjustment = 100
    if mdpnt[1] > 250:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] - adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] - adjustment]])
    else:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] + adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] + adjustment]])

    # warp it
    M = cv2.getAffineTransform(initialPts, finalPts)
    warpedImg = cv2.warpAffine(testingImageArray, M, (testingImageArray.shape[1], testingImageArray.shape[0]))

    # red it
    redshiftImg = warpedImg
    for m in range(warpedImg.shape[0]):
        for n in range(warpedImg.shape[1]):
            redshiftImg[m][n][1] = 0
            redshiftImg[m][n][2] = 0

    # tesseract it, also find latency
    start = time.time()
    wordsRaw = pytesseract.image_to_string(redshiftImg, lang='chi_sim')
    stop = time.time()
    latency.append(stop - start)

    # clean and tokenize the data from OCR
    words = wordsRaw.replace(" ","").splitlines()
    testingData = ds['train'][imgNum]['ocr']

    pattern = r'\d+|[\u4e00-\u9fff]+|[A-Za-z]+|\s|[^\w\s]'
    tokens = []
    for i in range(len(words)):
        tokens.append(re.findall(pattern, words[i]))

    # flatten it out
    wordsData = []
    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            wordsData.append(tokens[i][j])

    # run the accuracy checker on the final data
    accuracy.append(accuracyChecker(wordsData, testingData))

print("Average accuracy: ", np.mean(accuracy))
print("Average latency: ", np.mean(latency))



Average accuracy:  0.5619999999999999
Average latency:  0.9740526795387268


In [22]:
# okay try another pipeline, this will be greyscale instead of redshift

accuracy = []
latency = []
numImages = len(ds['train'])
numImages = 100
for imgNum in range(numImages):
    # collect an image from the dataset and turn it into an array for edge detection, use greyscale
    currImg = ds['train'][imgNum]['image']
    testingImageArray = np.array(currImg)
    edgeDetectionArray = cv2.cvtColor(testingImageArray, cv2.COLOR_RGB2GRAY)

    # apply canny edge detection
    edges = cv2.Canny(edgeDetectionArray, 100, 200)

    # do a hough lines transform
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 75, None, 50, 10)
    linesTest = np.zeros(edges.shape) + 250
    for i in range(0, len(lines)):
        l = lines[i][0]
        cv2.line(linesTest, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)

    # find the angles at which each line lies
    theta = np.zeros(len(lines))
    for i in range(0, len(lines)):
        l = lines[i][0]
        if abs(l[3] - l[1]) == 0:
            # horizontal line
            theta[i] = 0
        else:
            theta[i] = math.atan(abs(l[2] - l[0])/abs(l[3] - l[1]))

    # find smallest angle greater than pi/4
    optTheta = 100
    optIndex = 0
    for k in range(len(theta)):
        if theta[k] > np.pi/4:
            if theta[k] < optTheta:
                optTheta = theta[k]
                optIndex = k
    mostHorz = np.float32(lines[optIndex][0])
    # this actually ends up being the least horizontal of the still pretty horizontal lines, deals with noise from extra lines created from the text

    # find some points to warp the image, use the midpoint of the lines offset a bit, and the new destination of the endpoints of the least horizontal but still horizontal line
    mdpnt = [(mostHorz[0] + mostHorz[2])/2,(mostHorz[1] + mostHorz[3])/2]
    adjustment = 100
    if mdpnt[1] > 250:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] - adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] - adjustment]])
    else:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] + adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] + adjustment]])

    # warp it
    M = cv2.getAffineTransform(initialPts, finalPts)
    warpedImg = cv2.warpAffine(testingImageArray, M, (testingImageArray.shape[1], testingImageArray.shape[0]))

    # grey it
    greyshiftImg = cv2.cvtColor(warpedImg, cv2.COLOR_RGB2GRAY)

    # tesseract it, also find latency
    start = time.time()
    wordsRaw = pytesseract.image_to_string(greyshiftImg, lang='chi_sim')
    stop = time.time()
    latency.append(stop - start)

    # clean and tokenize the data from OCR
    words = wordsRaw.replace(" ","").splitlines()
    testingData = ds['train'][imgNum]['ocr']

    pattern = r'\d+|[\u4e00-\u9fff]+|[A-Za-z]+|\s|[^\w\s]'
    tokens = []
    for i in range(len(words)):
        tokens.append(re.findall(pattern, words[i]))

    # flatten it out
    wordsData = []
    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            wordsData.append(tokens[i][j])

    # run the accuracy checker on the final data
    accuracy.append(accuracyChecker(wordsData, testingData))

print("Average accuracy: ", np.mean(accuracy))
print("Average latency: ", np.mean(latency))



Average accuracy:  0.452
Average latency:  0.9122025275230408


In [23]:
# okay this time I'm gonna try the redshift image with some cleaning up of the noise

accuracy = []
latency = []
numImages = len(ds['train'])
numImages = 100
for imgNum in range(numImages):
    # collect an image from the dataset and turn it into an array for edge detection, use greyscale
    currImg = ds['train'][imgNum]['image']
    testingImageArray = np.array(currImg)
    edgeDetectionArray = cv2.cvtColor(testingImageArray, cv2.COLOR_RGB2GRAY)

    # apply canny edge detection
    edges = cv2.Canny(edgeDetectionArray, 100, 200)

    # do a hough lines transform
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 75, None, 50, 10)
    linesTest = np.zeros(edges.shape) + 250
    for i in range(0, len(lines)):
        l = lines[i][0]
        cv2.line(linesTest, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)

    # find the angles at which each line lies
    theta = np.zeros(len(lines))
    for i in range(0, len(lines)):
        l = lines[i][0]
        if abs(l[3] - l[1]) == 0:
            # horizontal line
            theta[i] = 0
        else:
            theta[i] = math.atan(abs(l[2] - l[0])/abs(l[3] - l[1]))

    # find smallest angle greater than pi/4
    optTheta = 100
    optIndex = 0
    for k in range(len(theta)):
        if theta[k] > np.pi/4:
            if theta[k] < optTheta:
                optTheta = theta[k]
                optIndex = k
    mostHorz = np.float32(lines[optIndex][0])
    # this actually ends up being the least horizontal of the still pretty horizontal lines, deals with noise from extra lines created from the text

    # find some points to warp the image, use the midpoint of the lines offset a bit, and the new destination of the endpoints of the least horizontal but still horizontal line
    mdpnt = [(mostHorz[0] + mostHorz[2])/2,(mostHorz[1] + mostHorz[3])/2]
    adjustment = 100
    if mdpnt[1] > 250:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] - adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] - adjustment]])
    else:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] + adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] + adjustment]])

    # warp it
    M = cv2.getAffineTransform(initialPts, finalPts)
    warpedImg = cv2.warpAffine(testingImageArray, M, (testingImageArray.shape[1], testingImageArray.shape[0]))

    # red it
    redshiftImg = warpedImg
    for m in range(warpedImg.shape[0]):
        for n in range(warpedImg.shape[1]):
            redshiftImg[m][n][1] = 0
            redshiftImg[m][n][2] = 0

    # denoising
    denoiseImg = cv2.fastNlMeansDenoising(redshiftImg, h=10)

    # tesseract it, also find latency
    start = time.time()
    wordsRaw = pytesseract.image_to_string(denoiseImg, lang='chi_sim')
    stop = time.time()
    latency.append(stop - start)

    # clean and tokenize the data from OCR
    words = wordsRaw.replace(" ","").splitlines()
    testingData = ds['train'][imgNum]['ocr']

    pattern = r'\d+|[\u4e00-\u9fff]+|[A-Za-z]+|\s|[^\w\s]'
    tokens = []
    for i in range(len(words)):
        tokens.append(re.findall(pattern, words[i]))

    # flatten it out
    wordsData = []
    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            wordsData.append(tokens[i][j])

    # run the accuracy checker on the final data
    accuracy.append(accuracyChecker(wordsData, testingData))

print("Average accuracy: ", np.mean(accuracy))
print("Average latency: ", np.mean(latency))

# okay that was terrible, try denoising before warping


Average accuracy:  0.41400000000000003
Average latency:  1.123834764957428


In [24]:
# denoise before warping

accuracy = []
latency = []
numImages = len(ds['train'])
numImages = 100
for imgNum in range(numImages):
    # collect an image from the dataset and turn it into an array for edge detection, use greyscale
    currImg = ds['train'][imgNum]['image']
    testingImageArray = np.array(currImg)
    edgeDetectionArray = cv2.cvtColor(testingImageArray, cv2.COLOR_RGB2GRAY)

    # apply canny edge detection
    edges = cv2.Canny(edgeDetectionArray, 100, 200)

    # do a hough lines transform
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 75, None, 50, 10)
    linesTest = np.zeros(edges.shape) + 250
    for i in range(0, len(lines)):
        l = lines[i][0]
        cv2.line(linesTest, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)

    # find the angles at which each line lies
    theta = np.zeros(len(lines))
    for i in range(0, len(lines)):
        l = lines[i][0]
        if abs(l[3] - l[1]) == 0:
            # horizontal line
            theta[i] = 0
        else:
            theta[i] = math.atan(abs(l[2] - l[0])/abs(l[3] - l[1]))

    # find smallest angle greater than pi/4
    optTheta = 100
    optIndex = 0
    for k in range(len(theta)):
        if theta[k] > np.pi/4:
            if theta[k] < optTheta:
                optTheta = theta[k]
                optIndex = k
    mostHorz = np.float32(lines[optIndex][0])
    # this actually ends up being the least horizontal of the still pretty horizontal lines, deals with noise from extra lines created from the text

    # find some points to warp the image, use the midpoint of the lines offset a bit, and the new destination of the endpoints of the least horizontal but still horizontal line
    mdpnt = [(mostHorz[0] + mostHorz[2])/2,(mostHorz[1] + mostHorz[3])/2]
    adjustment = 100
    if mdpnt[1] > 250:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] - adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] - adjustment]])
    else:
        initialPts = np.float32([[mostHorz[0], mostHorz[1]],
                                 [mostHorz[2], mostHorz[3]],
                                 [mdpnt[0], mdpnt[1] + adjustment]])
        finalPts = np.float32([[mostHorz[0], mdpnt[1]],
                               [mostHorz[2], mdpnt[1]],
                               [mdpnt[0], mdpnt[1] + adjustment]])

    # denoise it
    denoiseImg = cv2.fastNlMeansDenoising(testingImageArray, h=10)

    # warp it
    M = cv2.getAffineTransform(initialPts, finalPts)
    warpedImg = cv2.warpAffine(denoiseImg, M, (denoiseImg.shape[1], denoiseImg.shape[0]))

    # red it
    redshiftImg = warpedImg
    for m in range(warpedImg.shape[0]):
        for n in range(warpedImg.shape[1]):
            redshiftImg[m][n][1] = 0
            redshiftImg[m][n][2] = 0

    # tesseract it, also find latency
    start = time.time()
    wordsRaw = pytesseract.image_to_string(redshiftImg, lang='chi_sim')
    stop = time.time()
    latency.append(stop - start)

    # clean and tokenize the data from OCR
    words = wordsRaw.replace(" ","").splitlines()
    testingData = ds['train'][imgNum]['ocr']

    pattern = r'\d+|[\u4e00-\u9fff]+|[A-Za-z]+|\s|[^\w\s]'
    tokens = []
    for i in range(len(words)):
        tokens.append(re.findall(pattern, words[i]))

    # flatten it out
    wordsData = []
    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            wordsData.append(tokens[i][j])

    # run the accuracy checker on the final data
    accuracy.append(accuracyChecker(wordsData, testingData))

print("Average accuracy: ", np.mean(accuracy))
print("Average latency: ", np.mean(latency))

# still not great

Average accuracy:  0.456
Average latency:  0.9744262456893921
