# import

# In [1]:
import cv2
import numpy as np
from collections import Counter

import cv2


from skimage import io 

import pytesseract

from PIL import Image, ImageEnhance 
import json

# find vertical line

# In [9]:

def return_index_of_first_vertical_line_in_image(image_path) -> int:
    """Return the column index of the first vertical line near the left edge.

    Only columns 20-79 are searched. The primary strategy thresholds the
    image, applies a morphological opening with a one-pixel-wide kernel as
    tall as the image, and returns the first column of the top row that is
    still set afterwards. If no such column exists, the function falls back
    to the column (within the same window) that most often holds a pure
    black pixel in the first 40 rows.

    Args:
        image_path: Path to an image file (``str`` or path-like), or an
            already-loaded image array.

    Returns:
        The zero-based column index as a plain ``int``.

    Raises:
        IndexError: If neither strategy finds a candidate column.
    """
    search_start, search_stop = 20, 80  # skipping the first 20 columns is to crop out a number, maybe
    fallback_rows = 40

    # Decide by type instead of looking for the substring "jpeg", so that
    # ".jpg"/".png" paths are loaded too and arrays never hit a string test.
    if isinstance(image_path, str) or hasattr(image_path, "__fspath__"):
        image = io.imread(image_path)
    else:
        image = image_path

    # adaptiveThreshold only accepts 8-bit single-channel input.
    # NOTE(review): assumes RGB(A) channel order, as skimage produces —
    # confirm for arrays supplied by callers.
    if image.ndim == 3:
        conversion = cv2.COLOR_RGBA2GRAY if image.shape[2] == 4 else cv2.COLOR_RGB2GRAY
        image = cv2.cvtColor(image, conversion)

    # Signature is (src, maxValue, adaptiveMethod, thresholdType, blockSize, C).
    # The two enum arguments used to be swapped; both constants have the
    # value 1, so the result is unchanged but the call is now correct as read.
    image_thr = cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 51, 0
    )

    # Apply morphological opening with a full-height vertical line kernel
    kernel = np.ones((image.shape[0], 1), dtype=np.uint8) * 255
    image_mop = cv2.morphologyEx(image_thr, cv2.MORPH_OPEN, kernel)

    line_index = np.where(image_mop[0][search_start:search_stop] == 255)[0]
    if len(line_index) > 0:
        return int(line_index[0]) + search_start

    # Fallback: most frequent pure-black column in the top rows. np.where
    # walks the window row by row, so ties are broken exactly as they would
    # be when collecting the columns one row at a time.
    _, black_columns = np.where(
        image[:fallback_rows, search_start:search_stop] == 0
    )
    if black_columns.size == 0:
        raise IndexError(
            "no vertical line found in columns "
            f"{search_start}-{search_stop - 1} of the image"
        )
    column_counts = Counter((black_columns + search_start).tolist())
    return column_counts.most_common(1)[0][0]
    

# get rows from image

# In [10]:

def get_six_rows_from_image(image_path) -> list:
    """Split an image into six equal-height horizontal strips.

    ex: get_six_rows_from_image("./path/testimage1.jpeg")
    use "./testimage1.jpeg" if file is in same directory as script
    ***include file extension***

    Each strip starts at the column reported by
    ``return_index_of_first_vertical_line_in_image`` and extends to the right
    edge of the image. The strip height is the image height divided by six,
    truncated to an integer, so up to five pixel rows at the bottom may be
    left out.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A list of six cropped ``PIL.Image`` regions, ordered top to bottom.
    """
    row_count = 6

    # crop() loads the pixel data, so the returned regions stay valid after
    # the file is closed on leaving the ``with`` block.
    with Image.open(image_path) as im:
        # Use the real image dimensions. getbbox() is the bounding box of the
        # non-zero pixels: it shrinks when an edge is pure black and is None
        # for an all-black image.
        image_right, image_height = im.size
        height_of_rows = int(image_height / row_count)
        image_left = return_index_of_first_vertical_line_in_image(image_path)

        return [
            im.crop(
                (
                    image_left,
                    row_number * height_of_rows,
                    image_right,
                    (row_number + 1) * height_of_rows,
                )
            )
            for row_number in range(row_count)
        ]


# split rows into columns

# In [11]:

def get_columns_from_row(image_data_row, num_of_columns) -> list:
    """Cut one row image into 19 cells using a fixed column layout.

    The base cell width is the row width divided by ``num_of_columns``.
    Cells are taken left to right, each spanning the full row height:

    * 8 cells of one base width,
    * 1 cell of two base widths plus ``int(0.25 * base width)``,
    * 4 cells of one base width,
    * 1 cell of two base widths,
    * 1 cell of 1.7 base widths,
    * 4 cells of one base width.

    NOTE(review): the layout spans about 21.7 base widths, so with
    ``num_of_columns=20`` the last cells reach past the right edge of the
    row — confirm this is intended.

    Args:
        image_data_row: Row image exposing ``size`` and ``crop(box)`` (a
            ``PIL.Image``).
        num_of_columns: Divisor used to derive the base cell width.

    Returns:
        List of 19 cropped cells in left-to-right order.
    """
    row = image_data_row
    # Use the real dimensions: getbbox() only bounds the non-zero pixels and
    # is None for an all-black row.
    row_width, row_height = row.size
    cell_width = row_width / num_of_columns

    row_squares = []
    left, right = 0, cell_width

    def take_single_width_cells(count):
        """Crop ``count`` adjacent cells, sliding the window one cell width each time."""
        nonlocal left, right
        for _ in range(count):
            row_squares.append(row.crop((left, 0, right, row_height)))
            left, right = left + cell_width, right + cell_width

    # cells 1-8: single width
    take_single_width_cells(8)

    # cell 9: starts where cell 8 ended, two widths plus a quarter-width margin
    right = right + cell_width + int(cell_width * .25)
    row_squares.append(row.crop((left, 0, right, row_height)))

    # cells 10-13: single width
    left, right = right, right + cell_width
    take_single_width_cells(4)

    # cell 14: double width
    right = right + cell_width
    row_squares.append(row.crop((left, 0, right, row_height)))

    # cell 15: 1.7 widths
    left, right = right, right + cell_width * 1.7
    row_squares.append(row.crop((left, 0, right, row_height)))

    # cells 16-19: single width
    left, right = right, right + cell_width
    take_single_width_cells(4)

    return row_squares


def get_text_from_square(image_square, num_only=False) -> str:
    """Run OCR on a single cell image and return the recognised text.

    The cell is sharpened (factor 5.0) before being handed to Tesseract as
    a NumPy array.

    Args:
        image_square: Cell image accepted by ``ImageEnhance.Sharpness``.
        num_only: When truthy, Tesseract is run with the
            ``"outputbase digits"`` config; otherwise with an empty config.

    Returns:
        The raw string produced by ``pytesseract.image_to_string``.
    """
    sharpened = ImageEnhance.Sharpness(image_square).enhance(5.0)
    tesseract_config = "outputbase digits" if num_only else ""
    # pass preprocessed image to tesseract
    return pytesseract.image_to_string(np.asarray(sharpened), config=tesseract_config)


# In [12]:
def process_images_return_dict(image_folder_path,num_images_to_process):
    """
    OCR a numbered series of table images and collect the cell text.

    ex: process_images_return_dict('./Images',5)

    Reads testimage1.jpeg .. testimage<num_images_to_process>.jpeg from
    image_folder_path. Each image is cut into six rows, each row into cells
    (using 20 as the column divisor), and each cell is passed through OCR.
    Cells at positions 7-16 are read as free text; every other cell is read
    in digits-only mode.

    Returns a dict mapping 'testimage<N>' to a list of rows, each row being
    a list of cleaned cell strings.
    """
    unwanted_fragments = ('\x0c', '\n', '\f', '\u201c')

    results = {}
    for image_number in range(1, num_images_to_process + 1):
        file_name = f'{image_folder_path}/testimage{image_number}.jpeg'
        key = f'testimage{image_number}'

        # Split every row into cells before running any OCR.
        cells_per_row = []
        for row_image in get_six_rows_from_image(file_name):
            cells_per_row.append(get_columns_from_row(row_image, 20))

        table_text = []
        for cells in cells_per_row:
            row_text = []
            for position, cell in enumerate(cells):
                digits_only = position <= 6 or position >= 17
                row_text.append(get_text_from_square(cell, num_only=digits_only))

            # Strip form feeds, newlines and left curly quotes from the OCR output.
            cleaned_row = []
            for text in row_text:
                for fragment in unwanted_fragments:
                    text = text.replace(fragment, '')
                cleaned_row.append(text)
            table_text.append(cleaned_row)

        results[key] = table_text

    return results


# export to json

# In [13]:
# OCR the first five test images found in ./Images.
export_data = process_images_return_dict(
    image_folder_path='./Images',
    num_images_to_process=5,
)

# In [14]:
# Serialise the collected OCR results as a JSON document
json_data = json.dumps(export_data)

# Save the JSON text to the results file
with open('mckee_robert_results.json', 'w') as f:
    f.write(json_data)