In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The image and CSV paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#installing required modules
!pip install pillow 
!pip install pytesseract
!pip install easyocr
!apt install tesseract-ocr

In [None]:
#importing required libraries
from pytesseract import pytesseract, Output
import cv2
import os
import csv
from csv import writer
import easyocr

In [None]:
def create_image_file_name(number, file_prefix="PDF-A-Image-"):
    """Build an image file name from a prefix, a zero-padded number and ".png".

    Args:
        number: Page number. It is converted with ``str`` and left-padded with
            zeros to at least four characters; longer numbers are kept as-is.
        file_prefix: Text placed in front of the padded number.

    Returns:
        The file name, e.g. ``"PDF-A-Image-0007.png"`` for ``number=7``.

    Raises:
        FileExistsError: If a path with the generated name already exists
            (a relative name is checked against the current working directory).
    """
    file_ext = ".png"
    padded_number = str(number).zfill(4)  # pad the number with zeros to a length of 4
    file_name = f"{file_prefix}{padded_number}{file_ext}"
    # NOTE(review): the callers in this notebook use the name to *read* an image,
    # so refusing existing files looks unintended -- confirm before relying on it.
    if os.path.exists(file_name):
        raise FileExistsError("File already exists.")
    return file_name

def slice_differences(image, slice_indices, slice_axis):
    """Return the size of every slice that starts at the given indices.

    Each entry is the distance from one index to the next one; the final
    entry is the distance from the last index to the end of ``image`` along
    ``slice_axis`` (``image.shape[slice_axis]``).

    Args:
        image: Object exposing a ``shape`` sequence (e.g. an image array).
        slice_indices: Start positions of the slices along ``slice_axis``.
        slice_axis: Index into ``image.shape`` used for the final slice.

    Returns:
        A list with one size per start index; empty when no indices are given.
    """
    if len(slice_indices) == 0:
        return []

    # Pair every start with the start of its successor; the last start is
    # paired with the image extent along the requested axis.
    axis_extent = image.shape[slice_axis]
    slice_ends = list(slice_indices[1:]) + [axis_extent]
    return [end - begin for begin, end in zip(slice_indices, slice_ends)]

# PRE-PROCESSING
def convert_to_binary(image):
    """Turn a BGR image into an inverted black/white image.

    The image is converted to grayscale and then thresholded with
    ``THRESH_BINARY_INV`` combined with ``THRESH_OTSU``.

    Args:
        image: Image array in BGR channel order, as expected by
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The thresholded single-channel image.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    threshold_flags = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    return cv2.threshold(grayscale, 0, 255, threshold_flags)[1]

def convert_to_abs_scale(image):
    """Pass ``image`` through ``cv2.convertScaleAbs`` with its default arguments.

    Args:
        image: Image array to convert.

    Returns:
        The array returned by ``cv2.convertScaleAbs``.
    """
    return cv2.convertScaleAbs(image)

# Create one EasyOCR reader for English at module level; the extraction
# functions below call reader.readtext() on every table cell they cut out.
reader = easyocr.Reader(['en'])

In [None]:
def _ocr_strip_cells(strip_image, start_markers, cell_widths):
    """OCR every column cell of one horizontal table strip.

    Args:
        strip_image: Image array holding a single table row.
        start_markers: Left x-coordinate of each column inside ``strip_image``.
        cell_widths: Width of each column, parallel to ``start_markers``.

    Returns:
        One string per column; ``"null"`` where no text was recognised.
    """
    row_cells = []
    for left, width in zip(start_markers, cell_widths):
        # Keep every pixel row of the strip; only the column range varies.
        cell = strip_image[:, left:left + width]
        if cell.size == 0:
            # Nothing to recognise (marker outside the strip or empty row strip).
            row_cells.append("null")
            continue
        # detail=0 -> only the text, without bounding boxes and confidences
        cell_text = " ".join(reader.readtext(cell, detail=0)).strip()
        row_cells.append(cell_text or "null")
    return row_cells


def subset_1(file_name, file_number,
             image_dir="/content/drive/MyDrive/1Extraction/PDF A - Sample Images"):
    """Extract the table rows spread over one odd/even page pair.

    Tesseract is run on the project-number column of the odd page; the top of
    every recognised token longer than two characters is taken as the start
    of a table row. Each row is then cut into column cells on both pages and
    every cell is read with EasyOCR.

    Args:
        file_name: File name of the odd page image inside ``image_dir``.
        file_number: Page number of the odd page. The even page is
            ``file_number + 1``; its file name is built with
            ``create_image_file_name``.
        image_dir: Directory that contains the page images.
            NOTE(review): the sample files in this folder appear to be named
            "PDF A-Image-NNNN.png" while ``create_image_file_name`` defaults
            to the prefix "PDF-A-Image-" -- confirm the on-disk naming.

    Returns:
        A list of rows. Each row holds the 8 odd-page cells, 5 ``"null"``
        placeholders, the 10 even-page cells and the page index
        ``"<n>-<n+1>"`` (24 values in total).

    Raises:
        FileNotFoundError: If one of the two page images cannot be read.
    """
    # Resolve both pages from the arguments rather than from fixed sample files,
    # so that every call processes the page pair it was asked for.
    odd_image_path = os.path.join(image_dir, file_name)
    even_image_path = os.path.join(image_dir, create_image_file_name(file_number + 1))

    odd_image = cv2.imread(odd_image_path)  # reference: https://www.geeksforgeeks.org/python-opencv-cv2-imread-method/
    even_image = cv2.imread(even_image_path, 0)  # flag 0 -> read as grayscale
    # cv2.imread signals failure by returning None instead of raising.
    for path, page in ((odd_image_path, odd_image), (even_image_path, even_image)):
        if page is None:
            raise FileNotFoundError(f"Could not read page image: {path}")

    # Regions of interest on the odd page (pixel coordinates).
    roi_top, roi_bottom = 180, 1600
    proj_bboxes = odd_image[roi_top:roi_bottom, 30:150]   # project-number column only
    image_roi = odd_image[roi_top:roi_bottom, 30:1200]    # full table area

    # Left edge of every column, per page type.
    odd_start_markers = [10, 170, 324, 485, 695, 820, 925, 1090]
    even_start_markers = [51, 195, 395, 515, 690, 870, 1000, 1195, 1330, 1480]

    # Locate the rows via the project numbers.
    # reference: https://stackoverflow.com/questions/20831612/getting-the-bounding-box-of-the-recognized-words-using-python-tesseract
    data = pytesseract.image_to_data(proj_bboxes, output_type=Output.DICT)
    row_tops = [top for text, top in zip(data["text"], data["top"]) if len(text) > 2]

    # Row heights come from the odd page and are reused for the even page.
    height_slices = slice_differences(proj_bboxes, row_tops, 0)
    width_slices_odd = slice_differences(image_roi, odd_start_markers, 1)
    width_slices_even = slice_differences(even_image, even_start_markers, 1)

    page_index = f"{file_number}-{file_number+1}"
    rows = []
    for top, height in zip(row_tops, height_slices):
        # Odd page: start 10 px above the detected text; clamp at 0 so that a
        # negative start cannot wrap around to the bottom of the image.
        odd_top = top - 10
        odd_strip = image_roi[max(odd_top, 0):max(odd_top + height, 0)]
        odd_cells = _ocr_strip_cells(odd_strip, odd_start_markers, width_slices_odd)

        # Even page: tops are relative to the odd-page ROI, the even page is
        # indexed in full-page coordinates, hence the fixed +140 offset.
        even_top = top + 140
        even_strip = even_image[even_top:even_top + height]
        even_cells = _ocr_strip_cells(even_strip, even_start_markers, width_slices_even)

        # 8 odd cells + 5 placeholders + 10 even cells + page index = 24 columns.
        rows.append(odd_cells + ["null"] * 5 + even_cells + [page_index])

    return rows

# Writing the data into a CSV file
start = 1
extracted_data = "/content/drive/MyDrive/1Extraction/Output files in .csv/output 1-PDF A.csv"
page_headers = ["ProjectNumber", "Lead", "RecipientName", "RecipientType", "Indigenous",
                "VulnerableGroups", "Town_City_Community", "Province_Territory", "PostalCode", "ContactName",
                "ContactPhone", "ContactEmail", "Status", "TotalRequested", "ApprovedFunding_NonAAFC",
                "ApprovedFunding_AAFC", "TotalApproved_Funding", "DataApproved_Rejected", "Date_Paid",
                "Type_of_Investment", "Description", "NumberPeopleServed", "Notes", "PageIndex"]

# The output file is opened once: "w" discards the result of a previous run and
# the header row is written a single time. utf-8 is set explicitly so OCR text
# with non-ASCII characters is written the same way on every platform.
with open(extracted_data, "w", newline='', encoding="utf-8") as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(page_headers)  # writing the header row

    # Pages are processed in pairs: (1, 2), (3, 4), ..., (1543, 1544).
    for i in range(start, 1544, 2):
        page_data = subset_1(create_image_file_name(i), i)
        csv_writer.writerows(page_data)  # writing the extracted data
        file.flush()  # keep finished pairs on disk if the long OCR run is interrupted
        print("Processing data for pages {} and {} is done.".format(i, i+1))

In [None]:
def _ocr_page_rows(page_image, row_tops, row_heights, column_starts, row_offset=140):
    """OCR every (row, column) cell of one page.

    Args:
        page_image: Full page image array.
        row_tops: Top y-coordinate of each row, before ``row_offset`` is added.
        row_heights: Height of each row, parallel to ``row_tops``.
        column_starts: Left x-coordinate of each column on this page.
        row_offset: Pixels added to every row top to get the strip start.

    Returns:
        One list of cell strings per row; ``"null"`` marks a cell without text.
    """
    # The last column extends to the right edge of the page.
    column_widths = slice_differences(page_image, column_starts, 1)

    page_rows = []
    for top, height in zip(row_tops, row_heights):
        strip_start = top + row_offset
        strip = page_image[strip_start:strip_start + height]
        row_cells = []
        for left, width in zip(column_starts, column_widths):
            # Keep every pixel row of the strip; only the column range varies.
            cell = strip[:, left:left + width]
            if cell.size == 0:
                # Nothing to recognise (marker outside the page or empty strip).
                row_cells.append("null")
                continue
            # detail=0 -> only the text, without bounding boxes and confidences
            cell_text = " ".join(reader.readtext(cell, detail=0)).strip()
            row_cells.append(cell_text or "null")
        page_rows.append(row_cells)
    return page_rows


def subset_2(file_name, file_number,
             image_dir="/content/drive/MyDrive/1Extraction/PDF A - Sample Images"):
    """Extract the table rows spread over three consecutive pages.

    Tesseract is run on the project-number column of the first page; the top
    of every recognised token longer than two characters is taken as the
    start of a table row. The same row positions are then used to cut column
    cells out of all three pages, and every cell is read with EasyOCR.

    Args:
        file_name: File name of the first page image inside ``image_dir``.
        file_number: Page number of the first page. The following pages are
            ``file_number + 1`` and ``file_number + 2``; their file names are
            built with ``create_image_file_name``.
        image_dir: Directory that contains the page images.
            NOTE(review): the sample files in this folder appear to be named
            "PDF A-Image-NNNN.png" while ``create_image_file_name`` defaults
            to the prefix "PDF-A-Image-" -- confirm the on-disk naming.

    Returns:
        A list of rows. Each row holds the 8 + 8 + 7 cells of the three pages
        followed by the page index ``"<n>-<n+1>-<n+2>"`` (24 values in total).

    Raises:
        FileNotFoundError: If one of the three page images cannot be read.
    """
    # Resolve all pages from the arguments rather than from fixed sample files,
    # so that every call processes the page triple it was asked for.
    page_paths = [os.path.join(image_dir, file_name)]
    page_paths += [
        os.path.join(image_dir, create_image_file_name(file_number + step))
        for step in (1, 2)
    ]

    pages = []
    for path in page_paths:
        page = cv2.imread(path, 0)  # flag 0 -> read as grayscale
        # cv2.imread signals failure by returning None instead of raising.
        if page is None:
            raise FileNotFoundError(f"Could not read page image: {path}")
        pages.append(page)

    # Project-number column of the first page, used to locate the rows.
    proj_bboxes = pages[0][150:1600, 50:300]

    # OCR data extraction
    data = pytesseract.image_to_data(proj_bboxes, output_type=Output.DICT)
    row_tops = [top for text, top in zip(data["text"], data["top"]) if len(text) > 2]

    # Row heights are measured on the first page and reused for all pages.
    height_slices = slice_differences(proj_bboxes, row_tops, 0)

    # Left edge of every column, one set per page of the triple.
    column_sets = (
        [50, 320, 535, 850, 1150, 1360, 1650, 1900],
        [60, 268, 509, 806, 1116, 1300, 1481, 1765],
        [150, 425, 630, 750, 1080, 1300, 1580],
    )

    # Append the cells of each page to the matching row, page by page.
    rows = [[] for _ in row_tops]
    for page, column_starts in zip(pages, column_sets):
        page_rows = _ocr_page_rows(page, row_tops, height_slices, column_starts)
        for row, page_cells in zip(rows, page_rows):
            row.extend(page_cells)

    page_index = f"{file_number}-{file_number+1}-{file_number+2}"
    # A list is returned, matching subset_1; callers that only iterate are unaffected.
    return [row + [page_index] for row in rows]

# Writing the data into a CSV file
# The three-page layout starts on page 1545: page 1544 is already consumed as
# the even page of the last two-page pair (1543, 1544), and pages 1545..2354
# split into exactly 270 triples (the sample triple 1605-1607 is one of them).
# NOTE(review): the loop previously started at 1544 -- confirm against the PDF.
start = 1545
extracted_data = "/content/drive/MyDrive/1Extraction/Output files in .csv/output 2-PDF A.csv"
page_headers = ["ProjectNumber", "Lead", "RecipientName", "RecipientType", "Indigenous",
                "VulnerableGroups", "Town_City_Community", "Province_Territory", "PostalCode", "ContactName",
                "ContactPhone", "ContactEmail", "Status", "TotalRequested", "ApprovedFunding_NonAAFC",
                "ApprovedFunding_AAFC", "TotalApproved_Funding", "DataApproved_Rejected", "Date_Paid",
                "Type_of_Investment", "Description", "NumberPeopleServed", "Notes", "PageIndex"]

# The output file is opened once: "w" discards the result of a previous run and
# the header row is written a single time. utf-8 is set explicitly so OCR text
# with non-ASCII characters is written the same way on every platform.
with open(extracted_data, "w", newline='', encoding="utf-8") as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(page_headers)  # writing the header row

    # Last value of i is 2352, i.e. the final triple is (2352, 2353, 2354).
    for i in range(start, 2354, 3):
        page_data = subset_2(create_image_file_name(i), i)
        csv_writer.writerows(page_data)  # writing the extracted data
        file.flush()  # keep finished triples on disk if the long OCR run is interrupted
        print("Processing data for pages {}, {} and {} is done.".format(i, i+1, i+2))