In [1]:
import pytesseract
import cv2
from io import BytesIO
import numpy
import re
import PyPDF2
import os, os.path
try:
    import Image
except ImportError:
    from PIL import Image

In [7]:
def formatSSN(ssn):
    """Normalize raw SSN matches to the ``XXX-XX-XXXX`` layout.

    Each item of ``ssn`` is expected to be an 11-character match of the form
    ``DDD?DD?DDDD`` where ``?`` is an arbitrary separator character. The
    separators at index 3 and 6 are discarded and replaced with dashes.

    Returns a new list; the input iterable is not modified.
    """
    #Keep the three digit groups, swap whatever separated them for '-'
    return ['-'.join((match[:3], match[4:6], match[7:11])) for match in ssn]


def checkDict(ssnDict,outList,outFilename):
    """Record that every SSN in ``outList`` was seen in ``outFilename``.

    ``ssnDict`` maps an SSN string to the list of filenames it appeared in.
    It is updated in place (a new list is started for unseen SSNs) and the
    same dict object is returned for convenience.
    """
    for found in outList:
        #setdefault creates the filename list the first time an SSN shows up
        ssnDict.setdefault(found, []).append(outFilename)
    return ssnDict

In [3]:
def passFolder(path):
    """Scan the files directly inside ``path`` for SSN-like strings and print the result.

    Files ending in ``.jpg``/``.png`` (case-insensitive) are handed to
    ``doJPGPNG``; files ending in ``.pdf`` are handed to ``doPDF``. Every other
    directory entry is skipped. The findings are merged with ``checkDict`` into
    a dict mapping each formatted SSN to the list of filenames it was found in,
    which is printed at the end. Nothing is returned.

    The pattern accepts three digits, any single character, two digits, any
    single character, four digits (``.`` matches anything because of DOTALL).
    """
    #Initialize SSN Regex Query
    ssnRE = re.compile('[0-9][0-9][0-9].[0-9][0-9].[0-9][0-9][0-9][0-9]', re.MULTILINE|re.DOTALL)
    ssnDict = {}

    #os.listdir order is arbitrary; sort so the per-SSN filename lists are deterministic
    for f in sorted(os.listdir(path)):
        ext = os.path.splitext(f)[1].lower()
        fullPath = os.path.join(path, f)
        if ext in ('.jpg', '.png'):
            #Func outputs list of found SSNs and the str(filename)
            #Compare by value: `is` checks object identity, which made this flag always False
            outList, outFilename = doJPGPNG(fullPath, ext == '.png', ssnRE)
        elif ext == '.pdf':
            outList, outFilename = doPDF(fullPath, ssnRE)
        else:
            continue
        ssnDict = checkDict(ssnDict, outList, outFilename)

    print(ssnDict)

In [4]:
def doJPGPNG(file,isPNG,ssnRE):
    """OCR an image file and return the SSN-like strings found in it.

    Parameters
    ----------
    file : path of the image to read.
    isPNG : when false, the resized image is first re-encoded as an in-memory
        PNG before being converted to RGB; when true it is converted directly.
    ssnRE : compiled regex whose ``findall`` is run on the OCR text after all
        spaces have been removed.

    Returns
    -------
    tuple ``(ssns, filename)`` where ``ssns`` is the output of ``formatSSN``
    on the regex matches and ``filename`` is the basename of ``file``.
    """
    targetWidth = 1500

    #Context manager releases the source file handle once the resized copy exists
    with Image.open(file) as ima:
        width, height = ima.size
        #Scale to a fixed width, keeping the aspect ratio
        nHeight = int(height/width*targetWidth)
        image = ima.resize((targetWidth, nHeight), Image.LANCZOS)

    if not isPNG:
        with BytesIO() as buf:
            image.save(buf, format='PNG')
            buf.seek(0)
            #convert() loads the pixel data, so the buffer may be closed afterwards
            image = Image.open(buf).convert('RGB')
    else:
        #Mode must be 'RGB'; the former 'RBG' is not a valid mode and raised ValueError
        image = image.convert('RGB')

    #OpenCV Preprocessing to make img grayscale, make bg black and foreground/text white
    open_cv_image = numpy.array(image)
    #Reverse the channel axis (RGB -> BGR) to match the BGR2GRAY conversion below
    open_cv_image = open_cv_image[:, :, ::-1].copy()
    gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    #Uncomment if you want to see the image
    #cv2.imshow('image',thresh)
    #cv2.waitKey(0)
    #cv2.destroyAllWindows()

    #Tesseract find text from img
    #NOTE(review): machine-specific install location, overwritten on every call - confirm before running elsewhere
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'
    ocrText = pytesseract.image_to_string(thresh).replace(' ', '')
    final = formatSSN(ssnRE.findall(ocrText))

    return (final,os.path.basename(file))

In [5]:
def doPDF(file,ssnRE):
    """Extract the text of every page of a PDF and return the SSN-like strings in it.

    Parameters
    ----------
    file : path of the PDF to read.
    ssnRE : compiled regex whose ``findall`` is run on the concatenated page
        text after all ``\\n`` and ``\\r`` characters have been removed.

    Returns
    -------
    tuple ``(ssns, filename)`` where ``ssns`` is the output of ``formatSSN``
    on the regex matches and ``filename`` is the basename of ``file``.
    """
    #Read all pages while the file is open; `with` guarantees the handle is closed
    with open(file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        allTxt = "".join(
            pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)
        )

    #Line breaks are stripped so a match is not split across extracted lines
    final = ssnRE.findall(allTxt.replace('\n', '').replace('\r', ''))
    final = formatSSN(final)

    return (final,os.path.basename(file))

In [8]:
# Run the scan on the project folder; passFolder prints the SSN -> filenames mapping.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
passFolder('C:\\Users\\scasella\\Desktop\\SSN Project')

{'400-00-4060': ['four.jpg'], '444-44-4444': ['one.jpg'], '155-44-1234': ['TestDoc SSN.pdf'], '123-43-6543': ['TestDoc SSN.pdf'], '123-45-6789': ['TestDoc SSN.pdf']}
