In [None]:
# Installing the easyocr library used to extract text from images
!pip install easyocr

import pandas as pd
import numpy as np
from PIL import Image
import easyocr
import re
import requests
from io import BytesIO
import time 

# OCR engine shared by every image in the run; configured with the English
# ('en') language list only.
reader = easyocr.Reader(['en'])

# Input CSV. The processing loop below reads the 'index', 'image_link' and
# 'entity_name' columns from it. Adjust the path to where the dataset lives.
TEST_FILE = '../dataset/test.csv'
test_df = pd.read_csv(TEST_FILE)
# First dataset index of the slice being processed; the progress counters
# further down are offset by this value.
start = 0

# Optional: restrict the run to an inclusive range of dataset indices.
# Uncomment the three lines below and set start/end to process only that range.
# start = 0
# end = 10
# test_df = test_df[(test_df['index'] >= start) & (test_df['index'] <= end)]

# Unit vocabulary accepted for each entity type.
# Entities that measure the same physical quantity share one vocabulary, so it
# is spelled out once and expanded per entity; the set display is re-evaluated
# on every iteration, which gives each entity its own independent set object.
entity_unit_map = {
    # Linear dimensions
    **{
        dimension: {
            'centimetre', 'centimeters', 'foot', 'feet', 'inch', 'inches',
            'metre', 'meters', 'millimetre', 'millimeters', 'yard', 'yards',
            'cm', 'mm', 'm',
        }
        for dimension in ('width', 'depth', 'height')
    },
    # Weights
    **{
        weight_entity: {
            'gram', 'grams', 'kilogram', 'kilograms', 'microgram', 'micrograms',
            'milligram', 'milligrams', 'ounce', 'ounces', 'pound', 'pounds',
            'ton', 'tons', 'g', 'kg', 'mg', 'lb', 'oz',
        }
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    # Electrical
    'voltage': {
        'kilovolt', 'kilovolts', 'millivolt', 'millivolts', 'volt', 'volts',
        'kv', 'mv', 'v',
    },
    'wattage': {'kilowatt', 'kilowatts', 'watt', 'watts', 'kw', 'w'},
    # Volumes
    'item_volume': {
        'centilitre', 'centilitres', 'cubic foot', 'cubic feet',
        'cubic inch', 'cubic inches', 'cup', 'cups',
        'decilitre', 'decilitres', 'fluid ounce', 'fluid ounces',
        'gallon', 'gallons', 'imperial gallon', 'imperial gallons',
        'litre', 'litres', 'microlitre', 'microlitres',
        'millilitre', 'millilitres', 'pint', 'pints', 'quart', 'quarts',
        'cl', 'ml', 'l',
    },
}

# Abbreviation -> spelled-out unit name, used to expand a short form read from
# an image into its full form in the prediction.
unit_full_form_map = dict([
    # weight
    ('g', 'gram'),
    ('kg', 'kilogram'),
    ('mg', 'milligram'),
    ('lb', 'pound'),
    ('oz', 'ounce'),
    # length
    ('cm', 'centimetre'),
    ('mm', 'millimetre'),
    ('m', 'metre'),
    # voltage
    ('kv', 'kilovolt'),
    ('mv', 'millivolt'),
    ('v', 'volt'),
    # wattage
    ('w', 'watt'),
    ('kw', 'kilowatt'),
    # volume
    ('ml', 'millilitre'),
    ('l', 'litre'),
    ('cl', 'centilitre'),
    ('fl oz', 'fluid ounce'),
])

# Every unit spelling known to any entity, merged into one flat set
allowed_units = set().union(*entity_unit_map.values())

# Function to detect measurement units in extracted text and find corresponding values
def find_units_and_values(text, allowed_units):
    """Find every "<number><unit>" occurrence in OCR text.

    Args:
        text: Raw text to search.
        allowed_units: Iterable of unit spellings to look for (matched
            case-insensitively). Multi-word units such as 'cubic foot' are
            matched with any run of whitespace between the words.

    Returns:
        A list of ``(matched_text, decimal_part)`` tuples in the order the
        matches appear in the text. ``matched_text`` is the number together
        with its unit (e.g. ``'10.5 kg'``) and ``decimal_part`` is the
        fractional part including the dot (``'.5'``) or ``''`` if absent.
        Returns an empty list when nothing matches or no units are given.
    """
    # Longest units first: regex alternation takes the first alternative that
    # matches, so this makes '5 mm' resolve to 'mm' rather than to its prefix
    # 'm'. The alphabetical tie-break keeps the result independent of set
    # iteration order, which changes between interpreter runs.
    ordered_units = sorted(
        (unit for unit in allowed_units if unit.strip()),
        key=lambda unit: (-len(unit), unit),
    )
    if not ordered_units:
        # An empty alternation would match every bare number.
        return []

    # Removing all characters other than letters, digits, whitespace and dots
    text = re.sub(r'[^a-zA-Z0-9\s\.]', '', text)

    # Escape each word so a unit can never be interpreted as regex syntax.
    unit_alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in unit.split())
        for unit in ordered_units
    )

    # Two capturing groups on purpose: findall then yields
    # (matched_text, decimal_part) tuples, the shape callers index with [0].
    pattern = rf"(\d+(\.\d+)?\s*(?:{unit_alternatives}))"
    return re.findall(pattern, text, flags=re.IGNORECASE)

# Function to select the most appropriate unit and value pair based on the entity name
def get_relevant_value(found_units_and_values, entity_name):
    """Return the first detected measurement whose unit is valid for an entity.

    Args:
        found_units_and_values: Sequence of tuples whose first element is the
            matched text, e.g. ``('10.5 kg', '.5')``, as produced by
            ``find_units_and_values``.
        entity_name: Key into ``entity_unit_map`` naming the quantity wanted.

    Returns:
        ``"<value> <unit>"`` with abbreviations expanded through
        ``unit_full_form_map``, or ``None`` when no pair carries a unit that
        is valid for the entity (including when the entity is unknown).
    """
    relevant_units = entity_unit_map.get(entity_name, set())

    # Spellings that are reported as 'fluid ounce'. 'fl oz' is included
    # because it is not listed in entity_unit_map under its abbreviated form.
    fluid_ounce_spellings = {'fl oz', 'fluid ounces'}

    for unit_value_pair in found_units_and_values:
        matched_value = unit_value_pair[0]

        # Capture the whole unit phrase, not just its first word, so that
        # multi-word units ('cubic foot', 'imperial gallon') stay intact.
        match = re.match(
            r"(\d+(?:\.\d+)?)\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)", matched_value
        )
        if not match:
            continue

        value = match.group(1)
        unit_words = match.group(2).lower().split()

        # Try the longest phrase first and drop trailing words, so text that
        # follows the unit does not prevent the unit from being recognised.
        for word_count in range(len(unit_words), 0, -1):
            unit = " ".join(unit_words[:word_count])
            if unit in fluid_ounce_spellings:
                unit = 'fluid ounce'

            # Exact membership: a substring test would let 'm' accept 'gram'
            # for a length entity, or 'g' accept 'gallon' for a weight entity.
            if unit in relevant_units:
                # Replaced unit with full form
                full_unit = unit_full_form_map.get(unit, unit)
                return f"{value} {full_unit}"

    return None

# Progress counters. They are expressed as dataset indices rather than plain
# counts: with start = 0 the first image is reported as 0 and the last as
# len(test_df) - 1. The final summary converts back to a plain count.
x = start
processed_images_count = 0 + x - 1
total_images = len(test_df) + x - 1

# Upper bound in seconds for each image download, so one unresponsive host
# cannot stall the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Process the current images
predictions = []
for _, row in test_df.iterrows():
    image_url = row['image_link']
    entity_name = row['entity_name']

    try:
        # Getting the image from the URL presented in the dataset
        response = requests.get(image_url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Reject HTTP error pages instead of trying to decode them as images.
        response.raise_for_status()
        # Normalise to 3-channel RGB so palette, CMYK or 1-bit images are not
        # handed to the OCR engine as raw index/ink arrays.
        img = Image.open(BytesIO(response.content)).convert('RGB')
    except (requests.RequestException, OSError) as exc:
        # A single unreachable or undecodable image must not discard the
        # predictions already collected; record an empty prediction instead.
        print(f"Skipping index {row['index']}: could not load image ({exc})")
        relevant_value = None
    else:
        # Using Easy-OCR to extract text from the image
        result = reader.readtext(np.array(img))

        # Concatenate all OCR text fragments; commas become periods so that
        # decimal commas ("1,5") are read as decimal points.
        extracted_text = " ".join(
            fragment[1].replace(',', '.') for fragment in result
        )

        # Detecting measurement units and corresponding values in the extracted text
        found_units_and_values = find_units_and_values(extracted_text, allowed_units)

        # Finding the relevant value and unit for the entity requested in this row
        relevant_value = get_relevant_value(found_units_and_values, entity_name)

    # Add the result to the output rows; a single space marks "no prediction"
    predictions.append({
        'index': row['index'],
        'prediction': relevant_value if relevant_value else " "
    })

    # Increment the processed images counter
    processed_images_count += 1

    # Printing the latest image processed
    print(f"Processed {processed_images_count}/{total_images} images")

# Convert the output to a DataFrame and save it as per the name given
predictions_df = pd.DataFrame(predictions)
# Set the path properly of output file
predictions_df.to_csv('../output/test_out.csv', index=False)

print(f"Total number of images processed: {processed_images_count - start + 1}")