## **Importing** **Libraries**

In [12]:
import os
import re
import cv2
import time
import urllib
import requests
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm
from io import BytesIO
from pathlib import Path
from functools import partial
from google.colab import drive
from time import time as timer
from collections import Counter
from PIL import Image, ImageEnhance


!pip install pillow
!pip install pytesseract
!pip install opencv-python
!pip install paddlepaddle paddleocr

import pytesseract
from paddleocr import PaddleOCR



## **Reading Dataset**

In [13]:
# Mount Google Drive, then load the three CSV files from the shared dataset folder.
drive.mount('/content/drive')
dataset_dir = Path('/content/drive/My Drive/dataset')
train = pd.read_csv(dataset_dir / 'train.csv')
test = pd.read_csv(dataset_dir / 'test.csv')
output = pd.read_csv(dataset_dir / 'output_sanity.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Summarise how many distinct group ids the test split contains.
group_ids = test['group_id']
unique_value = group_ids.unique()
unique_values_count = group_ids.nunique()

print(f'Number of unique values: {unique_values_count}')
unique_value

Number of unique values: 924


array([156839, 792578, 478357, 953313, 276611, 648011, 279307, 569206,
       348551, 442321, 751532, 498074, 178958, 752266, 965518, 957050,
       486636, 329793, 113134, 601746, 244283, 249345, 955292, 916768,
       297918, 709627, 997176, 558806, 801829, 630390, 488883, 933453,
       411423, 469317, 860821, 311997, 932012, 120219, 961822, 318770,
       524117, 628971, 219211, 267482, 849273, 308856, 982714, 365637,
       494500, 931856, 150913, 594224, 658003, 913156, 452717, 449805,
       704724, 825954, 386873, 695925, 640408, 521308, 312608, 507988,
       519155, 308671, 861555, 658892, 594191, 900671, 783898, 276075,
       801837, 926134, 149708, 939129, 966823, 825239, 635528, 625310,
       462731, 243137, 823083, 254449, 171418, 781426, 433914, 292475,
       893692, 120569, 369753, 483370, 152057, 983323, 302672, 810266,
       375816, 436746, 639090, 858439, 276700, 952353, 922709, 178778,
       501250, 963595, 387046, 347404, 764417, 745326, 245959, 525429,
      

## **Entity Unit Mapping**

In [15]:
# Unit vocabularies shared by several entities. Each entity below receives its
# own copy so the per-entity sets remain independent objects.
_LENGTH_UNITS = (
    'centimetre', 'centimeter', 'cm', 'foot', 'ft', 'inch', 'in', 'metre',
    'meter', 'm', 'millimetre', 'millimeter', 'mm', 'yard', 'yd',
)
_WEIGHT_UNITS = (
    'gram', 'g', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg',
    'ounce', 'oz', 'pound', 'lb', 'ton',
)

# Maps each entity name to the set of unit spellings accepted for it.
entity_unit_map = {
    **{entity: set(_LENGTH_UNITS) for entity in ('width', 'depth', 'height')},
    **{entity: set(_WEIGHT_UNITS) for entity in ('item_weight', 'maximum_weight_recommendation')},
    'voltage': {'kilovolt', 'kv', 'millivolt', 'mv', 'volt', 'v'},
    'wattage': {'kilowatt', 'kw', 'watt', 'w'},
    'item_volume': {
        'centilitre', 'cl', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup', 'decilitre', 'dl',
        'fluid ounce', 'fl oz', 'gallon', 'gal', 'imperial gallon', 'litre', 'liter', 'l', 'microlitre', 'µl',
        'millilitre', 'ml', 'pint', 'pt', 'quart', 'qt'
    },
}

# Every unit spelling accepted by at least one entity.
allowed_units = set().union(*entity_unit_map.values())

In [16]:
# Maps an abbreviation or alternative spelling to the unit name written into
# predictions. Spellings absent from this map pass through
# convert_unit_to_full_form unchanged.
unit_full_form_map = {
    'cm': 'centimeter', 'centimetre': 'centimeter',
    # 'metre' / 'millimetre' are normalised the same way as 'centimetre', so a
    # single unit is never reported under two spellings (which would also split
    # the most-common-unit vote in get_dimensions).
    'mm': 'millimeter', 'millimetre': 'millimeter',
    'm': 'meter', 'metre': 'meter',
    'ft': 'foot', 'in': 'inch', 'yd': 'yard',
    'g': 'gram', 'kg': 'kilogram', 'mg': 'milligram', 'µg': 'microgram', 'oz': 'ounce', 'lb': 'pound', 'ton': 'ton',
    'kv': 'kilovolt', 'mv': 'millivolt', 'v': 'volt',
    'w': 'watt', 'kw': 'kilowatt',
    # NOTE(review): volume names mix spellings ('liter' vs 'millilitre',
    # 'centilitre', 'microlitre'); confirm which form the expected output uses.
    'cl': 'centilitre', 'ft³': 'cubic foot', 'in³': 'cubic inch', 'l': 'liter', 'ml': 'millilitre', 'µl': 'microlitre',
    'pt': 'pint', 'qt': 'quart', 'fl oz': 'fluid ounce', 'gal': 'gallon'
}

# Extend the vocabulary instead of overwriting it: every spelling accepted by
# entity_unit_map stays allowed, plus every key that can be normalised here.
allowed_units = {unit for units in entity_unit_map.values() for unit in units} | set(unit_full_form_map)

## **Extracting dimensions from image**

In [17]:
# Work on an explicit copy: pd.DataFrame(test) does not copy the underlying
# data, and this cell adds columns in place.
df = test.copy()
# Flag rows whose group_id differs from the previous row (1 = change, 0 = repeat).
# The first row always counts as a change because shift() yields NaN there.
df['group_change'] = (df['group_id'] != df['group_id'].shift()).astype(int)
df_filtered = df[df['group_change'] != 0]
# Existing predictions are aligned to df by index label.
df['prediction'] = output['prediction']
print(len(df_filtered))

89879


In [18]:
# Helper function to parse values and convert units to their full form
def extract_values(ocr_text):
    """Find "<number> <length unit>" readings in OCR text.

    Args:
        ocr_text: Text to scan. Matching is case-sensitive and the unit
            alternatives are lower case, so callers should lower-case the
            text first.

    Returns:
        A list of ``(value, unit)`` tuples in order of appearance, where
        ``value`` is a float and ``unit`` has been passed through
        ``convert_unit_to_full_form``.
    """
    length_units = {
        'cm', 'inch', 'centimetre', 'centimeter', 'foot', 'ft', 'in',
        'metre', 'meter', 'm', 'millimetre', 'millimeter', 'mm',
        'yard', 'yd', 'km', 'kilometer', 'kilometre',
    }
    # Regex alternation takes the first alternative that matches, not the
    # longest. Listing 'm' ahead of 'mm' / 'millimetre' turned "5 mm" into
    # 5 meter, so try longer spellings first (ties broken alphabetically to
    # keep the pattern deterministic).
    alternatives = sorted(length_units, key=lambda unit: (-len(unit), unit))
    # Regex to find values with units like 45.8 cm, 0.8 inch
    pattern = re.compile(r"(\d*\.?\d+)\s*(" + "|".join(alternatives) + r")")
    matches = pattern.findall(ocr_text)
    # Convert the unit to full form in lowercase
    return [(float(value), convert_unit_to_full_form(unit)) for value, unit in matches]

# Main function to determine height, width, and depth
def get_dimensions(ocr_text):
    """Derive height, width and depth strings from OCR text.

    Only readings expressed in the most frequent unit are considered. They are
    ranked from largest to smallest and assigned to height, width and depth in
    that order; any slot without a reading is returned as an empty string.

    Returns:
        A ``(height, width, depth)`` tuple of strings formatted as
        ``"<value> <unit>"`` or ``""``.
    """
    readings = extract_values(ocr_text)
    if not readings:
        return "", "", ""

    # The unit that appears most often wins; ties go to the first one seen.
    dominant_unit, _ = Counter(unit for _, unit in readings).most_common(1)[0]

    # Largest value first, keeping only readings in the dominant unit.
    magnitudes = sorted(
        (value for value, unit in readings if unit == dominant_unit),
        reverse=True,
    )

    # Format up to three readings, then pad the missing slots with "".
    labels = [f"{magnitude} {dominant_unit}" for magnitude in magnitudes[:3]]
    labels.extend([""] * (3 - len(labels)))

    height, width, depth = labels
    return height, width, depth

## **OCR Model Predictions**

In [19]:
# Initialize the PaddleOCR model
# Built once at module level; extract_ocr_text_from_image uses this shared
# instance whenever it is called with use_tesseract=False.
ocr_model = PaddleOCR(use_angle_cls=True, lang='en')

# Function to handle common mistakes in units
def common_mistake(unit):
    """Return the first spelling variant of ``unit`` found in ``allowed_units``.

    Variants are tried in this order: the unit as given, the unit with 'ter'
    replaced by 'tre', and the unit with 'feet' replaced by 'foot'. ``None``
    is passed through, and a unit with no allowed variant is returned
    unchanged.
    """
    if unit is None:
        return None

    variants = (
        unit,
        unit.replace('ter', 'tre'),
        unit.replace('feet', 'foot'),
    )
    for candidate in variants:
        if candidate in allowed_units:
            return candidate
    return unit

def add_space_between_alpha_and_number(text):
    """Insert a space wherever an ASCII letter is immediately followed by a digit.

    Only letter-then-digit boundaries are split ("abc123" -> "abc 123");
    digit-then-letter boundaries ("12cm") are left untouched.
    """
    # Zero-width match at each position between a letter and a digit.
    return re.sub(r"(?<=[a-zA-Z])(?=\d)", " ", text)

# Function to extract OCR text from an image URL using PaddleOCR or Tesseract as a fallback
def extract_ocr_text_from_image(image_link, use_tesseract=False, timeout=30):
    """Download an image and return the text recognised in it.

    Args:
        image_link: URL of the image to download.
        use_tesseract: When True, recognise text with pytesseract; otherwise
            use the module-level PaddleOCR model ``ocr_model``.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled download cannot block the whole run.

    Returns:
        The recognised text, or ``""`` when nothing was recognised or any step
        failed. Failures are reported with ``print`` and never raised
        (best-effort behaviour).
    """
    try:
        # Download the image
        response = requests.get(image_link, timeout=timeout)
        # Surface HTTP errors explicitly instead of trying to decode an error page as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")

        if use_tesseract:
            ocr_text = pytesseract.image_to_string(img)
        else:
            ocr_results = ocr_model.ocr(np.array(img))
            # NOTE(review): PaddleOCR appears to yield None for a page with no
            # detections (confirm for the installed version); skip such
            # entries rather than relying on the except clause below.
            ocr_text = ' '.join(
                item[1][0]
                for line in (ocr_results or [])
                if line
                for item in line
            )

        if not ocr_text.strip():
            print(f"No text detected in image: {image_link}")
            return ""

        return ocr_text
    except Exception as e:
        # Deliberately broad: one bad image must not abort a long batch run.
        print(f"Error extracting OCR text: {e}")
        return ""

def convert_unit_to_full_form(unit):
    """Lower-case ``unit`` and expand it through ``unit_full_form_map``.

    Units without an entry in the map are returned lower-cased but otherwise
    unchanged.
    """
    key = unit.lower()
    if key in unit_full_form_map:
        return unit_full_form_map[key]
    return key

def extract_numeric_values_and_units(text, entity_units):
    """Find "<number> <unit>" pairs in ``text`` for the given unit spellings.

    Args:
        text: Text to scan. Matching is case-sensitive.
        entity_units: Iterable of unit spellings to look for.

    Returns:
        A list of ``(value, unit)`` tuples in order of appearance, where
        ``value`` is a float and ``unit`` is the spelling exactly as matched.
        Empty when ``entity_units`` is empty.
    """
    # Longest spellings first: regex alternation stops at the first alternative
    # that matches, and iterating a set has no fixed order, so a short unit
    # such as 'm' could shadow 'mm' depending on hash order. Sorting makes the
    # result deterministic.
    units = sorted(set(entity_units), key=lambda unit: (-len(unit), unit))
    if not units:
        # An empty alternation would match every number with an empty unit.
        return []

    pattern = r"(\d*\.\d+|\d+)\s*({})".format('|'.join(map(re.escape, units)))

    cleaned_matches = []
    for num, unit in re.findall(pattern, text):
        # A leading dot is dropped, i.e. treated as punctuation glued to the
        # number ("wt.500 g" -> 500) rather than as a decimal point.
        cleaned_num = num[1:] if num.startswith(".") else num
        cleaned_matches.append((float(cleaned_num), unit))

    return cleaned_matches

# Function to match numeric values to the entity based on extracted values and units
def match_value_to_entity(numeric_values_with_units, entity_name):
    """Pick the first ``(value, unit)`` pair whose unit is valid for the entity.

    The entity name is lower-cased before it is looked up in
    ``entity_unit_map``; an unknown entity has no valid units.

    Returns:
        The first matching ``(value, unit)`` tuple, or ``(None, None)`` when
        no pair qualifies.
    """
    accepted_units = entity_unit_map.get(entity_name.lower(), [])
    candidates = (
        (value, unit)
        for value, unit in numeric_values_with_units
        if unit in accepted_units
    )
    return next(candidates, (None, None))

# Updated process_dataset to use the new matching logic
def process_dataset(df, output):
    """Fill ``output['prediction']`` from OCR text for the rows of ``df``.

    A row is processed when its entity is a dimension (height, width, depth)
    or when it has no prediction yet; every other row is left untouched.

    Args:
        df: Frame with ``image_link``, ``entity_name`` and ``prediction``
            columns. Its index labels select the rows of ``output`` to update.
        output: Frame whose ``prediction`` column is updated in place via
            ``output.loc[index, 'prediction']``.

    Returns:
        The same ``output`` object that was passed in.
    """
    dimension_entities = ('height', 'width', 'depth')
    # OCR text depends on the image, so cache by image link. Re-using text
    # because group_id repeats would apply one image's text to another image
    # whenever rows of a group point at different links, and would also leak
    # stale text across rows that were skipped.
    ocr_cache = {}
    image_no = 1

    for index, row in df.iterrows():
        image_link = row['image_link']
        entity_name = row['entity_name']
        result = row['prediction']

        is_dimension = entity_name in dimension_entities
        # Empty CSV cells are loaded by pandas as NaN, never as '', so both
        # representations must count as "no prediction yet".
        needs_prediction = pd.isna(result) or result == ''
        if not (needs_prediction or is_dimension):
            continue

        print(f"\nIndex Image: {image_no}\n")
        image_no += 1

        if image_link in ocr_cache:
            print(f"Reusing cached OCR text for index {index} as image_link is repeated.")
            ocr_text = ocr_cache[image_link]
        else:
            # Try Tesseract first and fall back to PaddleOCR when it finds nothing.
            ocr_text = extract_ocr_text_from_image(image_link, use_tesseract=True)
            if not ocr_text:
                print(f"Tesseract failed, trying PaddleOCR for Image: {image_link}")
                ocr_text = extract_ocr_text_from_image(image_link)

            if ocr_text:
                # Normalise once: split letter/digit runs, then lower-case so
                # the lower-case unit vocabularies can match.
                ocr_text = add_space_between_alpha_and_number(ocr_text).lower()
            ocr_cache[image_link] = ocr_text

        if not ocr_text:
            # If OCR text extraction fails, keep the prediction empty
            output.loc[index, 'prediction'] = ""
            print(f"Failed to extract OCR text for Image: {image_link}")
            continue

        if is_dimension:
            height, width, depth = get_dimensions(ocr_text)
            by_entity = {'height': height, 'width': width, 'depth': depth}
            output.loc[index, 'prediction'] = by_entity[entity_name]
            continue

        entity_units = entity_unit_map.get(entity_name, allowed_units)

        # Extract numeric values and their corresponding units
        numeric_values_with_units = extract_numeric_values_and_units(ocr_text, entity_units)

        # Match the numeric values to the entity
        value, unit = match_value_to_entity(numeric_values_with_units, entity_name)
        unit = common_mistake(unit)

        # Compare against None explicitly: a reading of 0 is falsy but still a match.
        if value is not None and unit:
            # Convert the unit to its full form before storing
            full_unit = convert_unit_to_full_form(unit)
            output.loc[index, 'prediction'] = f"{value} {full_unit}"
        else:
            # If no match is found, keep the prediction empty
            output.loc[index, 'prediction'] = ""
            print(f"No match found for Entity: {entity_name} in Image: {image_link}")

    return output

# Positional window of rows to process in this run (stop is exclusive).
start_index = 1
stop_index = 10

# Recompute the group boundaries inside the window so that its first row
# always counts as a change, then copy the flags back by index label.
df_selected = df[start_index:stop_index].copy()
df_selected['group_change'] = (df_selected['group_id'] != df_selected['group_id'].shift()).astype(int)
df.loc[df_selected.index, 'group_change'] = df_selected['group_change']

# Process the dataset
# Pass the full `output` frame: process_dataset writes by index label and only
# for the rows of the df window. Writing into a slice of `output` is chained
# assignment, which pandas may apply to a temporary copy instead.
process_dataset(df[start_index:stop_index], output)

# Display the final output
# NOTE(review): the last two columns of `output` are dropped here; confirm
# which columns those are in output_sanity.csv.
output_cleaned = output.iloc[:, :-2]
output_cleaned[start_index:stop_index]

[2024/10/17 18:32:54] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

Unnamed: 0,index,prediction
1,1,42.0 centimeter
2,2,200.0 centimeter
3,3,
4,4,
5,5,200.0 centimeter
6,6,42.0 centimeter
7,7,4.0 inch
8,8,30.0 centimeter
9,9,40.0 centimeter


In [21]:
# Persist the processed window of predictions, without the index column.
outputs = output_cleaned.iloc[start_index:stop_index]
outputs.to_csv('Output_Sanity.csv', index=False)