<a href="https://colab.research.google.com/github/sandhyaparna/CV-Training/blob/main/notebooks/OCR/01_vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Apple Vision Framework

### Install packages

In [None]:
# !pip install opencv-python pillow pyobjc-framework-Vision pyobjc jaro_winkler

### Import Packages

In [None]:
import json
import os
import string
import time
import jaro
import cv2
import Quartz
import Vision
from Cocoa import NSURL
from IPython.display import Image, display
from PIL import Image, ImageDraw, ImageFont
from typing import List
from apple_vision_utils import get_ocr_observations, extract_recognized_text, remove_after_special_char, split_text, draw_text_on_image

### Perform OCR on a single image

##### Run Vision framework on one of the images
* The raw output contains the confidence score and the bounding-box coordinates for each piece of predicted text

In [None]:
# Relative path of the sample image to run OCR on.
img_path = "../x.jpeg"
# get_ocr_observations is imported from the project-local apple_vision_utils
# module; its result is kept in `observations` for the cells that follow.
observations = get_ocr_observations(img_path)
# Dump the helper's return value unmodified for inspection.
print("RAW OBSERVATIONS: \n", observations)

In [None]:
# Pass the observations computed earlier through extract_recognized_text
# (apple_vision_utils) and print whatever it returns.
print("Recognized text in the image:", extract_recognized_text(observations))

#### Strings after the predicted text is split into subphrases:

In [None]:
# Flatten step: each recognized string is handed to split_text, and every
# piece it yields is collected into a single flat list.
split_strings_list = [
    piece
    for recognized in extract_recognized_text(observations)
    for piece in split_text(recognized)
]
print(split_strings_list)

### Display an image with bounding box and text for OCR

In [None]:
# Collect, for every OCR observation, its top candidate string and its
# bounding box, then hand the (text, box) pairs to the drawing helper.
recognized_text = []
bounding_boxes = []

for observation in observations:
    # Request at most one candidate. Skip observations that return none
    # instead of failing with an IndexError on the empty result; skipping the
    # whole observation keeps the two lists aligned.
    candidates = observation.topCandidates_(1)
    if not candidates:
        continue
    recognized_text.append(candidates[0].string())
    # Bounding box reported by the observation for the recognized text.
    bounding_boxes.append(observation.boundingBox())

# draw_text_on_image comes from apple_vision_utils; it receives the image path
# and an iterable of (text, bounding_box) pairs.
draw_text_on_image(img_path, zip(recognized_text, bounding_boxes))

In [None]:
# Switch to a second sample image, located under ../data/.
img_name = "TJ.jpeg"
img_path = f"../data/{img_name}"

# Re-run the OCR helper; this rebinds `observations`, so every later cell
# works on this image rather than the previous one.
observations = get_ocr_observations(img_path)
print("RAW OBSERVATIONS: \n", observations)

In [None]:
# apple_vision_framework_ocr_text(img_path)

In [None]:
# Print the result of extract_recognized_text for the current `observations`.
print("Recognized text in the image:", extract_recognized_text(observations))

In [None]:
# Break every recognized string into sub-phrases with split_text and gather
# all of them into one flat list.
split_strings_list = [
    piece
    for recognized in extract_recognized_text(observations)
    for piece in split_text(recognized)
]
print(split_strings_list)

In [None]:
# Pre-process the OCR sub-phrases: lowercase each one, then run it through the
# remove_after_special_char helper. Building a set drops duplicates.
processed_results = {
    remove_after_special_char(phrase.lower())
    for phrase in split_strings_list
}
# Last expression of the cell, so the notebook displays the set.
processed_results

## Check if one of the n labels is present in the text
Final logic to detect the label of an image:
* OCR is performed on the whole image
* If the click coordinates fall inside the bounding box of any OCR result, only that particular text is pre-processed and evaluated using the criteria below to determine the label:
    * If the pre-processed substring is part of any of the labels
    * else - evaluate the Jaro-Winkler similarity of the pre-processed text against all the labels, and output the label that has the highest Jaro-Winkler similarity

## This code uses two label sets

In [None]:
# Label matching with two label collections, tried in three steps:
#   1. exact overlap between `labels_set1` and the pre-processed OCR text,
#   2. a label combination from `labels_list2` with at least two words present,
#   3. fuzzy matching against `labels_set1` using Jaro-Winkler similarity.
# `processed_results` is the set of pre-processed OCR strings built earlier in
# the notebook.
labels_set1 = {"x", "y", "z"}
labels_list2 = [
    {"a", "b"},
    {"c", "d"},
    {"e", "f"},
]

# Step 1: labels that appear verbatim in the processed OCR text.
direct_match = labels_set1.intersection(processed_results)

if direct_match:
    print(direct_match)
else:
    # Step 2: first combination (each entry of `labels_list2` is a set of
    # words) for which at least two of its words occur in the OCR text.
    secondary_match = next(
        (
            label_combo
            for label_combo in labels_list2
            if sum(word in processed_results for word in label_combo) >= 2
        ),
        None,  # no combination qualifies
    )

    if secondary_match:
        print(secondary_match)
    else:
        # Step 3: fuzzy fallback for near misses (typos, OCR noise).
        # Score every (result, label) pair and keep the single best one.
        # Taking the maximum, rather than the first pair above the threshold,
        # makes the outcome independent of set iteration order; ties on score
        # are broken by the label string itself.
        scored_labels = [
            (jaro.jaro_winkler_metric(result, label), label)
            for result in processed_results
            for label in labels_set1
        ]
        best_score, best_label = max(scored_labels, default=(0.0, None))

        # Only accept the best candidate if it clears the 0.8 threshold.
        if best_score > 0.8:
            print(best_label)
        else:
            print("no confident match found")


## This code uses label dict instead of two label sets

In [None]:
# Label matching with a single dict: keys are the strings to look for in the
# OCR text, values are the standardized labels they map to.
# `processed_results` is the set of pre-processed OCR strings built earlier in
# the notebook.
labels_dict = {
    "x": "A",
    "y": "B",
    "z": "C",
}

# Standardized labels whose key appears verbatim in the processed OCR text.
direct_match = {
    labels_dict[result] for result in processed_results if result in labels_dict
}

if len(direct_match) == 1:
    # Exactly one label found -> success.
    print("direct_match", direct_match)
elif direct_match:
    # Two or more different labels matched -> ambiguous result.
    print("more than one label found, retake image")
else:
    # No exact hit: fall back to fuzzy matching. Collect every standardized
    # label whose key is similar enough to any OCR string.
    similarity_match = set()
    for result in processed_results:
        for text, label in labels_dict.items():
            # Jaro-Winkler similarity between the OCR string and the key.
            jaro_winkler_score = jaro.jaro_winkler_metric(result, text)
            # Scores above 0.8 count as a match.
            if jaro_winkler_score > 0.8:
                similarity_match.add(label)

    if len(similarity_match) == 1:
        # Exactly one fuzzy label -> success.
        print("fuzzy_match", similarity_match)
    elif similarity_match:
        # Several different fuzzy labels -> ambiguous.
        print("more than one label found, retake image")
    else:
        # Neither direct nor fuzzy matching produced a label.
        print("no confident match found")


In [None]:
# Report the direct match only when it is non-empty and holds exactly one label.
has_single_direct_match = bool(direct_match) and len(direct_match) == 1
if has_single_direct_match:
    print("direct_match", direct_match)