In [None]:
# Mount Google Drive at /content/drive (Colab only). The page images read by
# `subset` and the CSV written at the end of the notebook both live under
# /content/drive/MyDrive, so this cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the OCR dependencies into the runtime: Pillow, pytesseract and EasyOCR
# from pip, plus the system-level Tesseract engine (apt) that pytesseract calls.
# NOTE(review): none of the versions are pinned, so a fresh runtime may resolve
# to different releases than the ones this notebook was developed against.
!pip install pillow 
!pip install pytesseract
!pip install easyocr
!apt install tesseract-ocr

In [None]:
#importing required libraries
from pytesseract import pytesseract, Output
import cv2
import os
import csv
from csv import writer
import easyocr

In [None]:
# creates a file name with a padded number, specified prefix and extension
def create_image_file_name(number, file_prefix="PDF-B-Image-", file_ext=".png"):
    """Build an image file name such as ``PDF-B-Image-046.png``.

    Args:
        number: Page number; zero-padded to at least three digits (longer
            numbers are kept unchanged).
        file_prefix: Text placed in front of the padded number.
        file_ext: Extension appended after the number, including the dot.

    Returns:
        The assembled file name (prefix + padded number + extension).

    Raises:
        FileExistsError: If a file with the assembled name already exists.
            The name is checked as given, i.e. relative to the current
            working directory unless ``file_prefix`` contains a directory.
    """
    padded_number = str(number).zfill(3)  # Pad the number with zeros to a length of 3
    file_name = f"{file_prefix}{padded_number}{file_ext}"
    if os.path.exists(file_name):
        raise FileExistsError("File already exists.")
    return file_name

# measures the gap between each slice index and the one that follows it along an axis
def slice_differences(image, slice_indices, slice_axis):
    """Return the size of every slice that starts at one of ``slice_indices``.

    Each entry is the distance to the next index; the final entry is the
    distance from the last index to the end of ``image`` along ``slice_axis``
    (``image.shape[slice_axis]``). An empty ``slice_indices`` yields ``[]``.
    """
    final_pos = len(slice_indices) - 1
    return [
        (image.shape[slice_axis] if pos == final_pos else slice_indices[pos + 1]) - start
        for pos, start in enumerate(slice_indices)
    ]

# PRE-PROCESSING
# turn a BGR image into a binary one via grayscale conversion and OpenCV thresholding
def convert_to_binary(image):
    """Return the binarised version of a BGR ``image``.

    The image is converted to grayscale and thresholded with the combined
    ``THRESH_BINARY_INV | THRESH_OTSU`` flags (max value 255); only the
    thresholded image is returned, the threshold value itself is discarded.
    """
    threshold_mode = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.threshold(grayscale, 0, 255, threshold_mode)[1]

# pass an image through OpenCV's convertScaleAbs (called with its default arguments)
def convert_to_abs_scale(image):
    """Return ``cv2.convertScaleAbs(image)``; the input array is not modified here."""
    return cv2.convertScaleAbs(image)

# create an EasyOCR reader object for English language
# Built once at module level and reused by every `reader.readtext` call in
# `subset`, so the reader is not re-created for each table cell.
reader = easyocr.Reader(['en'])

In [None]:
# Folder on the mounted Drive that holds the page images.
# NOTE(review): this is the "Sample Images" folder the original code pointed at;
# confirm it contains every page pair the driver loop asks for.
PAGE_IMAGE_DIR = "/content/drive/MyDrive/1Extraction/PDF B - Sample Images"

# Crop (x1, x2, y1, y2) of the odd page that is scanned for project numbers.
PROJECT_COLUMN_BOX = (30, 150, 100, 1600)

# x position (pixels) at which each table column starts on odd / even pages.
ODD_START_MARKERS = [30, 155, 350, 720, 965, 1065, 1175, 1260, 1365, 1510, 1700]
EVEN_START_MARKERS = [35, 133, 236, 335, 495, 800, 1490, 1590]

# Added to a Tesseract `top` value (measured inside the project-number crop,
# which starts at y1 = 100) to get the row's first pixel line on the full page.
# 90 therefore starts an odd-page row 10 px above the detected text.
# NOTE(review): even pages use 140 - presumably their table sits lower; confirm.
ODD_ROW_OFFSET = 90
EVEN_ROW_OFFSET = 140


def _load_page_image(image_path, grayscale=False):
    """Read a page image with OpenCV, raising if it cannot be read."""
    mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR
    page = cv2.imread(image_path, mode)
    if page is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read page image: {image_path}")
    return page


def _extract_rows(page_image, row_tops, row_heights, start_markers, row_offset):
    """OCR every table cell of one page; returns one list of cell strings per row."""
    column_widths = slice_differences(page_image, start_markers, 1)
    rows = []
    for top, height in zip(row_tops, row_heights):
        start_pos = top + row_offset
        strip_image = page_image[start_pos:start_pos + height, :]
        row_data = []
        for left, width in zip(start_markers, column_widths):
            # All rows of the strip, one column's worth of pixels.
            cell = strip_image[:, left:left + width]
            if cell.size == 0:
                # Empty crop (zero-height row or marker beyond the image edge):
                # there is nothing to read, so record the same placeholder
                # used for cells where OCR finds no text.
                row_data.append("null")
                continue
            # detail=0 returns only the recognised text, without boxes/confidence
            cell_data_array = reader.readtext(cell, detail=0)
            row_data.append(" ".join(cell_data_array).strip() or "null")
        rows.append(row_data)
    return rows


def subset(file_name, file_number, image_dir=PAGE_IMAGE_DIR):
    """Extract the table rows that span one odd/even pair of page images.

    Rows are located on the odd page: Tesseract is run on the project-number
    crop and every recognised token longer than two characters marks the top
    of a row. The same row positions are then used to cut both pages into
    cells, and each cell is read with the module-level EasyOCR ``reader``.

    Args:
        file_name: File name of the odd page image inside ``image_dir``.
        file_number: Page number of the odd page. The even page is expected
            to be named ``create_image_file_name(file_number + 1)``.
        image_dir: Folder containing both page images.

    Returns:
        A list with one entry per detected row: the odd-page cells, then the
        even-page cells, then a ``"<n>-<n+1>"`` page-index string.

    Raises:
        FileNotFoundError: If either page image cannot be read.
    """
    odd_path = os.path.join(image_dir, file_name)
    even_path = os.path.join(image_dir, create_image_file_name(file_number + 1))

    # Only the odd page is binarised; the even page is read as plain grayscale.
    image = convert_to_binary(_load_page_image(odd_path))
    even_image = _load_page_image(even_path, grayscale=True)

    # Region of interest for the project-number bounding boxes
    x1, x2, y1, y2 = PROJECT_COLUMN_BOX
    proj_bboxes = image[y1:y2, x1:x2]

    # OCR data extraction: keep the top coordinate of every token longer than
    # two characters (this also drops the empty strings Tesseract emits).
    data = pytesseract.image_to_data(proj_bboxes, output_type=Output.DICT)
    row_tops = [top for text, top in zip(data["text"], data["top"]) if len(text) > 2]

    # Height of each row: distance to the next row top, or to the end of the crop.
    height_slices = slice_differences(proj_bboxes, row_tops, 0)

    odd_rows = _extract_rows(image, row_tops, height_slices,
                             ODD_START_MARKERS, ODD_ROW_OFFSET)
    even_rows = _extract_rows(even_image, row_tops, height_slices,
                              EVEN_START_MARKERS, EVEN_ROW_OFFSET)

    # Combine odd and even page data, tagging each row with its page pair.
    page_index = f"{file_number}-{file_number+1}"
    return [odd_row + even_row + [page_index]
            for odd_row, even_row in zip(odd_rows, even_rows)]

# Writing the data into a CSV file
# Pages are processed in odd/even pairs, so the loop advances two pages at a time.
start = 46
stop = 669  # exclusive bound: the last pair processed is pages 668 and 669

extracted_data = "/content/drive/MyDrive/1Extraction/Output files in .csv/output -PDF B.csv"
# NOTE(review): "Additonal_Funding_Needed" is misspelled; it is kept as-is because
# the header row is part of the output file's schema - rename it only together
# with whatever consumes this CSV.
page_headers = ["ProjectNumber", "Lead", "RecipientName", "RecipientType", "Indigenous",
                "Town_City_Community", "Province_Territory", "PostalCode", "ContactName",
                "ContactPhone", "ContactEmail", "Funding_Status", "Requested", "Approved",
                "Investment_Type", "Investment_Description", "Investment_Output", "Describe_Remaining_Need",
                "Additonal_Funding_Needed", "PageIndex"]

for i in range(start, stop, 2):
    page_data = subset(create_image_file_name(i), i)

    # The first pair (re)creates the file and writes the header; later pairs
    # append. Writing after every pair keeps finished pages on disk even if a
    # later page fails.
    is_first_pair = i == start
    with open(extracted_data, "w" if is_first_pair else "a", newline='', encoding="utf-8") as file:
        csv_writer = csv.writer(file)
        if is_first_pair:
            csv_writer.writerow(page_headers)  # writing the header row
        csv_writer.writerows(page_data)  # writing the extracted data

    print("Processing data for pages {} and {} is done.".format(i, i+1))