In [1]:
#from pyxpdf import Document, Config
from matplotlib import pyplot as plt
import numpy as np
#import pdfplumber
import pandas as pd
import re
import pytesseract
import cv2
from pdf2image import convert_from_path

In [2]:
def display(image):
    plt.rcParams["figure.figsize"] = (30,30)
    plt.imshow(image,cmap='gray')
    plt.show()

In [3]:
def extract_text(image):
    
    img_bin = 255 - image
    thresh1,img_bin_otsu = cv2.threshold(img_bin,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))

    # Vertical erosion and dilation (for columns)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, np.array(image).shape[1]//100))
    eroded_image = cv2.erode(img_bin_otsu, vertical_kernel, iterations=12)
    vertical_lines = cv2.dilate(eroded_image, vertical_kernel, iterations=12)

    # Horizontal erosion and dilation (for rows)
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (np.array(image).shape[1]//100, 1))
    horizontal_lines = cv2.erode(img_bin, hor_kernel, iterations=15)
    horizontal_lines = cv2.dilate(horizontal_lines, hor_kernel, iterations=15)
    
    # Combining the vertical and horizontal lines
    vertical_horizontal_lines = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    vertical_horizontal_lines = cv2.erode(~vertical_horizontal_lines, kernel, iterations=3)
    thresh, vertical_horizontal_lines = cv2.threshold(vertical_horizontal_lines,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    bitxor = cv2.bitwise_xor(image,vertical_horizontal_lines)
    bitnot = cv2.bitwise_not(bitxor)
    
    contours, hierarchy = cv2.findContours(vertical_horizontal_lines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    boundingBoxes = [cv2.boundingRect(contour) for contour in contours]
    (contours, boundingBoxes) = zip(*sorted(zip(contours, boundingBoxes),key=lambda x:x[1][1]))    
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if (w<1000 and h<500):
            image = cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2)
            boxes.append([x,y,w,h])
    #display(image)
    
    # Getting the rows and columns
    rows=[]
    temp=[]
    heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))] #Bounding box is a list having x,y,w,h
    mean = np.mean(heights)
    print(len(boxes))
    temp.append(boxes[0])
    previous=boxes[0]
    for i in range(1,len(boxes)):
        if(boxes[i][1]<=previous[1]+mean/2):
            temp.append(boxes[i])
            previous=boxes[i]
            if(i==len(boxes)-1):
                rows.append(temp)
        else:
            rows.append(temp)
            temp=[]
            previous = boxes[i]
            temp.append(boxes[i])
    total_rows = len(rows)
    #print("Total rows = ",total_rows)
    
    # The total columns 
    total_columns=0
    for i in range(total_rows):
        if len(rows[i]) > total_columns:
            total_columns = len(rows[i])
    #print("Total columns = ",total_columns)
    
    # Getting centres of cells and sorting them
    center = [int(rows[i][j][0]+rows[i][j][3]/2) for j in range(len(rows[i])) if rows[0]]
    center=np.array(center)
    center.sort()
    #print(center)
    
    # List of coordinates of the boxes
    boxes_list = []
    for i in range(len(rows)):
        l=[]
        for k in range(total_columns):
            l.append([])
        for j in range(len(rows[i])):
            diff = abs(center-(rows[i][j][0]+rows[i][j][2]/total_rows))
            minimum = min(diff)
            indexing = list(diff).index(minimum)
            l[indexing].append(rows[i][j])
        boxes_list.append(l)
       
    # Extracting the text using PyTesseract
    final=[]

    for i in range(len(boxes_list)):
        for j in range(len(boxes_list[i])):
            s=''
            if(len(boxes_list[i][j])==0):
                final.append(' ')
            else:
                for k in range(len(boxes_list[i][j])):
                    y,x,w,h = boxes_list[i][j][k][0],boxes_list[i][j][k][1], boxes_list[i][j][k][2],boxes_list[i][j][k][3]
                    roi = bitnot[x:x+h, y:y+w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,1))
                    border = cv2.copyMakeBorder(roi,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel,iterations=1)
                    erosion = cv2.erode(dilation, kernel,iterations=2)                
                    out = pytesseract.image_to_string(erosion)
                    s = s +" "+ out
                final.append(s)
    
    arr = np.array(final)
    arr = arr.reshape(total_rows, total_columns)
    print(len(boxes))

    return arr,total_rows,total_columns

In [None]:
def get_table(path):
    text = []
    index = 0
    images = convert_from_path(path)
    header = ['Value Date','Particulars','Tran Type','Cheque Details','Withdrawals','Deposits','Balance','Dr/Cr']
    df = pd.DataFrame(columns=header)
    for i in range(len(images)):
        print('Page',i+1,'---------Extracting---------')
        open_cv_image = np.array(images[i]) 
        # Convert RGB to BGR 
        open_cv_image = open_cv_image[:, :, ::-1].copy() 
        open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        text,rows,cols = extract_text(open_cv_image)

        for r in range(rows):
            res = []
            for w in text[r]:
                res.append(w.replace("\x0c", ""))
            df.loc[index] = res
            index += 1
        
    #df = df.style.set_properties(align="left")
    df = df.replace(r'\n',' ', regex=True)
    return df
    
data = get_table('1-4-19 to 30-12-19.pdf')

Page 1 ---------Extracting---------
56
56
Page 2 ---------Extracting---------
120
120
Page 3 ---------Extracting---------
120
120
Page 4 ---------Extracting---------
120
120
Page 5 ---------Extracting---------
120
120
Page 6 ---------Extracting---------
120
120
Page 7 ---------Extracting---------
120
120
Page 8 ---------Extracting---------
120
120
Page 9 ---------Extracting---------
120
120
Page 10 ---------Extracting---------
120
120
Page 11 ---------Extracting---------
120


In [None]:
data

In [None]:
type(data)

In [None]:
data.loc[:,['Value Date', 'Tran Type', 'Cheque Details', 'Balance']]

In [None]:
#data.to_dict()

In [None]:
#data.to_dict('dict')

In [None]:
#data.to_dict('list')

In [None]:
#data.to_dict('series')

In [None]:
#data.to_dict('split')

In [None]:
data.to_dict('records') ##

In [None]:
#data.to_dict('index')

In [None]:
def get_text(path):
    text_data = []
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()
        text_data.append(text)
    return text_data

text1 = get_text('1-4-19 to 30-12-19.pdf')

In [None]:
type(text1)

In [None]:
print(text1)

In [None]:
text1[0].split(':')

In [None]:
doc = Document('1-4-19 to 30-12-19.pdf')
t = doc[0].text()
Config.load_file("")
t = doc[0].text()

In [None]:
print(t)

In [None]:
t1 = t.strip()

In [None]:
print(t1)

In [None]:
import re
re.split('\n\n, :',t)

In [None]:
t = t.replace(r'\n\n', '')
print(t)

In [None]:
t1.split(':')

In [None]:
pip install slate3k

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

In [None]:
file = open('1-4-19 to 30-12-19.pdf', 'rb') #load file
parser = PDFParser(file) #open pasring object
document = PDFDocument(parser) #store parsing object structure and check for password

if not document.is_extractable:
    print(PDFTextExtractionNotAllowed)

rsrcmgr = PDFResourceManager()
device = PDFDevice(rsrcmgr)
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [None]:
def parse_obj(lt_objs):
    list1 = []

    
    for obj in lt_objs:
        
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            #print("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '')))
            list1.append(obj.get_text().replace('\n', ''))
        
        
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs)
    return list1


main_list=[]
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    x=parse_obj(layout._objs)
    main_list.append(x)
print(main_list[0])
##(33, 728); (418,727); (33,470); (418,470)

In [None]:
type(main_list)

In [None]:
dict1 = main_list[0][:46]
dict1

In [None]:
def Convert(lst):
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct


lst = dict1
data_dict = Convert(lst)
print(data_dict)

In [None]:
data_dict = { x.translate({32:None}) : y 
                for x, y in data_dict.items()}

In [None]:
print(data_dict)