In [None]:
!pip install easyocr
!pip install opencv-python

In [None]:
import easyocr
import requests
import cv2
import re
import requests
from io import BytesIO
from PIL import Image
import numpy as np
import pandas as pd
'''
  easyocr -> Used to extract text from images
  requests -> HTTP library to fetch images from the URLs
  cv2 ->  for grayscale conversion(pre-processing of image)
  re -> used for string matching to extract specific information from the extracted text.
  BytesIO -> to handle image data fetched from urls
  PIL -> used for image processing
  numpy -> for array operations on image data
  pandas -> to handle input data from excel files and to export the output
'''

In [None]:
'''
    step of image preprocessing
    i. converts input image to grayscale
    ii. reduces the color depth by dividing pixel values by a factor

    This function helps in pre-processing the image so that text extraction is efficient
'''
def convert_to_grayscale_and_reduce_color(image, div_factor=128):
    """Return a grayscale, reduced-depth version of ``image`` as ``uint8``.

    A 2-D array is taken to be grayscale already and is used unchanged; any
    other array is run through ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.
    Every pixel is then divided by ``div_factor`` and cast to ``np.uint8``,
    so with the default of 128 an 8-bit input ends up holding only 0 and 1.

    NOTE(review): the colour conversion assumes BGR channel order — confirm
    that callers really pass BGR rather than RGB data.
    """
    already_gray = len(image.shape) == 2
    grayscale = image if already_gray else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return (grayscale / div_factor).astype(np.uint8)



The functions extract_weights, extract_volts, extract_volume, extract_dimensions, extract_watts and extract_max_weights extract the required entities from the text that OCR recovers from the given images.
Each of them uses regular expressions that match a number followed by one of the units of the required entity.

In [None]:
def extract_weights(text):
    """Extract the first recognisable item weight from OCR text.

    The text is lower-cased and the unit patterns below are tried in priority
    order (pounds, lbs, kg, ounce, grams, g). The first pattern that actually
    matches a ``<number><optional whitespace><unit>`` pair wins.

    Unlike a keyword-presence check, a unit word that appears without a number
    in front of it (e.g. "weight in pounds") no longer blocks the later units
    from being tried.

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<float value> <unit>"`` such as ``"2.0 kilogram"``, or ``None`` when
        no pattern matches.
    """
    lowered = text.lower()
    # Order matters: longer / more specific spellings come before the bare "g",
    # which would otherwise also match the start of "grams".
    unit_patterns = (
        (r'(\d+\.?\d*)\s*pounds', "pound"),
        (r'(\d+\.?\d*)\s*lbs', "pound"),
        (r'(\d+\.?\d*)\s*kg', "kilogram"),
        (r'(\d+\.?\d*)\s*ounce', "ounce"),
        (r'(\d+\.?\d*)\s*grams', "gram"),
        (r'(\d+\.?\d*)\s*g', "gram"),
    )
    for pattern, unit in unit_patterns:
        match = re.search(pattern, lowered)
        if match:
            return "{value} {unit}".format(value=float(match.group(1)), unit=unit)
    return None

In [None]:
def extract_volts(text):
    """Extract the first voltage mentioned in ``text``.

    Matches a number, optional whitespace and a volt unit (``V``/``v``,
    ``volt(s)`` in lower, capitalised or upper case, or ``kV`` in any common
    casing). Allowing whitespace means "220 V" is recognised as well as "220V".

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<value> kilovolt"`` or ``"<value> volt"`` with the number exactly as
        written in the text, or an empty string when nothing matches.
    """
    pattern = r'\b(\d+(?:\.\d+)?)\s*(kV|KV|kv|V|v|[Vv]olts?|VOLTS?)\b'
    match = re.search(pattern, text)
    if match is None:
        return ''
    value, unit = match.groups()
    if unit.lower() == 'kv':
        return value + ' kilovolt'
    return value + ' volt'

In [None]:
def extract_watts(text):
    """Extract the first wattage mentioned in ``text``.

    Matches a number, optional whitespace and a watt unit (``W``, ``watt(s)``
    in lower, capitalised or upper case, or ``kW`` in any common casing).
    Allowing whitespace means "60 W" is recognised as well as "60W".

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<value> kilowatt"`` or ``"<value> watt"`` with the number exactly as
        written in the text, or an empty string when nothing matches.
    """
    pattern = r'\b(\d+(?:\.\d+)?)\s*(kW|KW|kw|W|[Ww]atts?|WATTS?)\b'
    match = re.search(pattern, text)
    if match is None:
        return ''
    value, unit = match.groups()
    if unit.lower() == 'kw':
        return value + ' kilowatt'
    return value + ' watt'

In [None]:
def extract_dimensions(text):
    """Extract the first length measurement mentioned in ``text``.

    A measurement is a number, optional whitespace and either a unit word
    (matched case-insensitively, British or American spelling, singular or
    plural) or a quote mark: ``"`` is read as inches and ``'`` as feet.

    The word boundary is applied to unit words only. A quote mark is itself a
    non-word character, so requiring a boundary after it would reject a quote
    at the end of the text or in front of a space or punctuation.

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<value> <unit>"`` where unit is one of centimetre, millimetre,
        metre, inch, feet or yard, or ``None`` when nothing matches.
    """
    pattern = (
        r'\b(\d+(?:\.\d+)?)\s*'
        r'(?:(centimetres?|centimeters?|cm|millimetres?|millimeters?|mm|'
        r'metres?|meters?|m|inches|inch|feet|foot|fts?|yards?)\b'
        r'|(["\']))'
    )
    match = re.search(pattern, text, flags=re.IGNORECASE)
    if match is None:
        return None

    value, word_unit, quote_unit = match.groups()
    if quote_unit is not None:
        canonical = 'inch' if quote_unit == '"' else 'feet'
    else:
        unit = word_unit.lower()
        # The leading letters are enough to tell the accepted spellings apart;
        # "mm"/"mi..." must be tested before the plain "m..." metre case.
        if unit.startswith('c'):
            canonical = 'centimetre'
        elif unit == 'mm' or unit.startswith('mi'):
            canonical = 'millimetre'
        elif unit.startswith('m'):
            canonical = 'metre'
        elif unit.startswith('i'):
            canonical = 'inch'
        elif unit.startswith('f'):
            canonical = 'feet'
        else:
            canonical = 'yard'
    return f'{value} {canonical}'

In [None]:
import re
def extract_volume(text):
    """Extract the first volume measurement mentioned in ``text``.

    The text is lower-cased and searched for a number, optional whitespace and
    one of the unit spellings listed below, ending on a word boundary. The
    matched spelling is looked up exactly, so short units such as "cl", "dl",
    "gal" and "fl oz" are not mistaken for litres merely because they contain
    the letter "l".

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<value> <canonical unit>"`` (e.g. ``"33 centilitre"``) with the
        number exactly as written, or ``None`` when nothing matches.
    """
    unit_spellings = {
        'microlitre': ('microlitres', 'microlitre', 'µl'),
        'millilitre': ('millilitres', 'milliliters', 'millilitre', 'milliliter', 'ml'),
        'litre': ('litres', 'liters', 'litre', 'liter', 'l'),
        'centilitre': ('centilitres', 'centiliters', 'centilitre', 'centiliter', 'cl'),
        'decilitre': ('decilitres', 'deciliters', 'decilitre', 'deciliter', 'dl'),
        'cup': ('cups', 'cup'),
        'fluid ounce': ('fluid ounces', 'fluid ounce', 'fl oz'),
        'pint': ('pints', 'pint'),
        'quart': ('quarts', 'quart', 'qt'),
        'gallon': ('imperial gallons', 'imperial gallon', 'gallons', 'gallon', 'gal'),
        'cubic feet': ('cubic foot', 'cubic feet', 'cu ft', 'cu.ft'),
        'cubic inch': ('cubic inches', 'cubic inch', 'inches³', 'inches3',
                       'inch³', 'inch3', 'in³'),
    }
    canonical_by_spelling = {
        spelling: canonical
        for canonical, spellings in unit_spellings.items()
        for spelling in spellings
    }
    # Longest spellings first so the alternation prefers the most specific one.
    alternation = '|'.join(
        re.escape(spelling)
        for spelling in sorted(canonical_by_spelling, key=len, reverse=True)
    )
    match = re.search(r'\b(\d+(?:\.\d+)?)\s*(' + alternation + r')\b', text.lower())
    if match is None:
        return None
    value, unit = match.groups()
    return f'{value} {canonical_by_spelling[unit]}'

In [None]:
'''
    The function standardizes all the units to grams for easier comparison and returns the maximum weight found in its original units.
'''
# Grams in one unit of each canonical weight name.
CONVERSION_FACTORS = dict(
    milligram=1e-3,
    microgram=1e-6,
    gram=1,
    kilogram=1e3,
    ounce=28.3495,
    ton=1e6,
    pound=453.592,
)

def clean_text(text):
    """Strip punctuation from ``text`` while keeping decimal points in numbers.

    Every character that is neither a word character nor whitespace is
    removed. The one exception is a "." with a digit on both sides, which is
    kept so that "2.5 kg" does not collapse into "25 kg". Thousands separators
    such as the comma in "1,000" are still removed.

    Args:
        text: Raw text.

    Returns:
        The cleaned text.
    """
    # Alternatives: a dot not preceded by a digit, a dot not followed by a
    # digit, or any other punctuation character.
    return re.sub(r"(?<!\d)\.|\.(?!\d)|[^\w\s.]", "", text)

def extract_max_weights(text):
    """Return the heaviest weight mentioned in ``text``.

    The text is passed through ``clean_text`` and lower-cased, then scanned for
    ``<number><optional single whitespace><unit>`` pairs. Each hit is mapped to
    one of the keys of ``CONVERSION_FACTORS`` and converted to grams so that
    weights in different units can be compared.

    A unit only counts when it is not immediately followed by another letter.
    Without that guard the short spellings match the start of unrelated words:
    "72 inches" would be read as 72 pounds (via the bare "i") and "5 gallons"
    as 5 grams. A following digit is still allowed, because punctuation
    stripping can glue two measurements together ("150kg/330lbs").

    NOTE(review): the bare "i"/"l"/"ib" spellings are kept as pound aliases,
    presumably to tolerate OCR misreads of "lb" — confirm they are wanted.

    Args:
        text: Text recovered from the image.

    Returns:
        ``"<float value> <canonical unit>"`` (e.g. ``"300.0 pound"``) for the
        heaviest match, expressed in the unit it was written in, or ``None``
        when no weight is found.
    """
    unit_aliases = {
        "milligram": ("milligrams", "milligram", "mgs", "mg"),
        "microgram": ("micrograms", "microgram"),
        "gram": ("grammes", "gramme", "grams", "gram", "gms", "gm", "gr", "g"),
        "kilogram": ("kilograms", "kilogram", "kgs", "kg"),
        "ounce": ("ounces", "ounce", "ozs", "oz"),
        "ton": ("tonnes", "tonne", "tons", "ton"),
        "pound": ("pounds", "pound", "lbs", "lb", "ibs", "ib", "l", "i"),
    }
    canonical_by_alias = {
        alias: canonical
        for canonical, aliases in unit_aliases.items()
        for alias in aliases
    }
    # Longest aliases first so the alternation prefers the most specific one.
    alternation = "|".join(sorted(canonical_by_alias, key=len, reverse=True))
    pattern = r'(\d+(?:\.\d+)?)\s?(' + alternation + r')(?![a-z])'

    weights_in_grams = {}
    for match in re.finditer(pattern, clean_text(text).lower()):
        amount = float(match.group(1))
        unit = canonical_by_alias[match.group(2)]
        weights_in_grams[f"{amount} {unit}"] = amount * CONVERSION_FACTORS[unit]

    # On a tie, max() keeps the entry that was inserted first.
    return max(weights_in_grams, key=weights_in_grams.get, default=None)


In [None]:
'''
  This function helps in retrieving the image from the link provided in the dataset.
  The image is then converted to numpy array.
'''
def get_image(image_link, timeout=30):
    """Download the image at ``image_link`` and return it as a NumPy array.

    Args:
        image_link: URL of the image.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the whole run indefinitely.

    Returns:
        The decoded image as a NumPy array. Images in mode ``L``, ``RGB`` or
        ``RGBA`` keep their mode; any other Pillow mode is converted to
        ``RGB`` first.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(image_link, timeout=timeout)
    # Fail on the HTTP error itself rather than on an attempt to decode an
    # error page as an image.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    # Palette ("P"), CMYK, "LA" and similar modes do not become plain
    # grayscale/RGB/RGBA pixel arrays, so normalise them to RGB.
    if image.mode not in ("L", "RGB", "RGBA"):
        image = image.convert("RGB")
    return np.array(image)

In [None]:
'''
  Pre-processing of image is done by converting to grayscale,reducing color and sharpening.
  Performs optical character recognition using the EasyOCR library.
  Calls the appropriate function to extract the required entities.
'''

def extract(typee, image_link):
    """Run OCR on the image at ``image_link`` and extract the ``typee`` entity.

    The image is downloaded with ``get_image``; a 4-channel array is converted
    with ``cv2.COLOR_RGBA2RGB``. The module-level ``reader`` then performs OCR
    (``paragraph=True``) on that image, the recognised text fragments are
    concatenated, and the extractor matching ``typee`` is applied to the text.

    The grayscale / sharpen / threshold results that used to be computed here
    were never handed to the OCR call, so that unused work has been dropped;
    the image given to ``reader.readtext`` is the same as before.

    Args:
        typee: Entity name: "wattage", "item_weight", "voltage",
            "maximum_weight_recommendation", "depth", "width", "height" or
            "item_volume".
        image_link: URL of the image to read.

    Returns:
        Whatever the selected extractor returns, or ``None`` for an
        unrecognised ``typee``.
    """
    image_np = get_image(image_link)
    if image_np.ndim == 3 and image_np.shape[2] == 4:
        image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2RGB)

    result = reader.readtext(image_np, paragraph=True)
    # Element 1 of each result entry is the recognised text; every fragment is
    # followed by a single space, including the last one.
    ocr_text = "".join(f"{entry[1]} " for entry in result)

    extractors = {
        "wattage": extract_watts,
        "item_weight": extract_weights,
        "voltage": extract_volts,
        "maximum_weight_recommendation": extract_max_weights,
        "depth": extract_dimensions,
        "width": extract_dimensions,
        "height": extract_dimensions,
        "item_volume": extract_volume,
    }
    extractor = extractors.get(typee)
    return extractor(ocr_text) if extractor is not None else None


In [None]:
'''
  test.csv is loaded into a dataframe using pandas
'''
import pandas as pd
# The prediction loop addresses this frame by position: column 0 is written
# out as the row index, column 1 is passed to extract() as the image link and
# column 3 as the entity type.
df = pd.read_csv('/content/test.csv')

In [None]:
# OCR engine created once with the 'en' language list; extract() looks up
# `reader` as a global, so this cell must run before extract() is called.
reader = easyocr.Reader(['en'])



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [None]:
'''
  from the dataframe the entity_type, index are queried and the function "extract" is called
'''
from tqdm import tqdm

# One {'index', 'prediction'} record per row of df. Column 3 supplies the
# entity type, column 1 the image link and column 0 the row's index value.
# A missing prediction (None) is stored as an empty string.
rows = []
for i in tqdm(range(len(df))):
    typee = df.iloc[i, 3]
    value = extract(typee, df.iloc[i, 1])
    idx = df.iloc[i, 0]
    rows.append({
        'index': idx,
        'prediction': '' if value is None else value,
    })

In [None]:
# One row per collected record; the dict keys 'index' and 'prediction' become the columns.
new_df = pd.DataFrame(rows)

In [None]:
# Mount Google Drive at /drive/ so that the results CSV written to
# '/drive/My Drive/...' is stored on the drive.
from google.colab import drive
drive.mount('/drive/')

Mounted at /drive/


In [None]:
# index=False leaves the DataFrame's own row labels out of the file; the
# 'index' data column is still written.
new_df.to_csv('/drive/My Drive/output.csv', index=False)