In [None]:
!pip install paddlepaddle paddleocr
!pip install easyocr
!pip install "python-doctr[tf]"
!pip install "python-doctr[torch]"
!pip install "python-doctr[torch,viz,html,contib]"
!pip install roboflow torch torchvision easyocr opencv-python pandas ultralytics
!pip install transformers==4.40.0 Pillow==10.1.0 torch==2.1.2 torchvision==0.16.2 sentencepiece==0.1.99 accelerate==0.30.1 bitsandbytes==0.43.1
!pip install fuzzywuzzy

In [None]:
import requests
import os
import cv2
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image, ImageFont, ImageDraw
import matplotlib.pyplot as plt
from io import BytesIO
import numpy as np
import pandas as pd
from tqdm import tqdm
import traceback
import easyocr
from concurrent.futures import ThreadPoolExecutor
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.models import ocr_predictor
from re import L
from ultralytics import YOLO
from transformers import AutoModel, AutoTokenizer
import csv
import re
import ast

In [None]:
def save_paddle_ocr(img_url, out_path, result, font_path=None):
    """Draw PaddleOCR detections on the image at `img_url`, then save and show it.

    Args:
        img_url: URL of the image the OCR result belongs to.
        out_path: Destination path for the annotated image.
        result: Return value of `PaddleOCR.ocr(...)` for that image;
            `result[0]` is read as a list of `[box, (text, score)]` entries.
        font_path: Optional TrueType font used to render the recognised text.
            When it is missing, only the boxes are drawn.

    Raises:
        requests.HTTPError: if the image cannot be downloaded.
    """
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()

    # Stay in RGB: the array goes back through Image.fromarray, which expects
    # RGB, so the former RGB->BGR conversion swapped the colour channels.
    # convert('RGB') also normalises RGBA / palette / greyscale inputs.
    image = Image.open(BytesIO(response.content)).convert('RGB')

    # PaddleOCR reports "no text found" as [None]; treat that as no detections.
    detections = result[0] if result and result[0] else []
    boxes = [res[0] for res in detections]
    txts = [res[1][0] for res in detections]
    scores = [res[1][1] for res in detections]

    if font_path and os.path.isfile(font_path):
        im_show = draw_ocr(image, boxes, txts, scores, font_path=font_path)
    else:
        if font_path:
            print("Font file not found. Drawing boxes only.")
        # Without a usable font the text panel cannot be rendered.
        im_show = draw_ocr(image, boxes)

    # np.asarray is a no-op for arrays and covers the case where the image is
    # handed back unchanged because there were no boxes to draw.
    im_show = Image.fromarray(np.asarray(im_show))

    im_show.save(out_path)

    plt.imshow(im_show)
    plt.axis('off')
    plt.show()

In [3]:
def paddle_ocr(dataset, start=0, end=None, out_path='paddleocr.csv'):
  """Run PaddleOCR over the images of `dataset` and write the results to CSV.

  Args:
      dataset: DataFrame with the columns 'index', 'image_link', 'group_id',
          'entity_name' and 'entity_value'.
      start: Position of the first row to process (default: first row).
      end: Position one past the last row to process (default: all rows).
      out_path: CSV file that receives the processed rows plus an
          'extracted_text' column holding a list of (text, confidence) tuples.

  Rows whose image cannot be processed get an empty list, so the output always
  has exactly one row per processed input row.
  """
  ocr = PaddleOCR(use_angle_cls=True, lang='en')

  # Slice once and reuse the slice for the output, so every output column
  # refers to the same rows as the OCR results.
  rows = dataset.iloc[start:end]

  final_extract = []
  for row_label, img_url in tqdm(rows['image_link'].items(), total=len(rows)):
      try:
          result = ocr.ocr(img_url)
          # PaddleOCR returns [None] when it finds no text in the image.
          if result and result[0]:
              text_confidence_list = [(res[1][0], res[1][1]) for res in result[0]]
          else:
              text_confidence_list = []

      except SystemExit:
          # Deliberately swallowed so a SystemExit raised during OCR does not
          # stop the whole loop.
          print(f"Skipping image at index {row_label} due to SystemExit error.")
          text_confidence_list = []

      except Exception as e:
          print(f"Error processing image at index {row_label}: {str(e)}")
          print(traceback.format_exc())
          text_confidence_list = []

      final_extract.append(text_confidence_list)

      # Remove the temporary 'tmp.jpg' file if it exists
      tmp_file = 'tmp.jpg'
      if os.path.exists(tmp_file):
          os.remove(tmp_file)

  df = rows[['index', 'image_link', 'group_id', 'entity_name', 'entity_value']].copy()
  df['extracted_text'] = final_extract
  df.to_csv(out_path, index=False)

In [None]:
def extract_text_from_image_easy(url):
    """Download the image at `url` and OCR it with the module-level EasyOCR `reader`.

    Returns a list of (text, confidence) tuples on success. Any failure
    (non-200 response, undecodable image, or an exception) is logged and
    reported as an empty string so the caller can keep going.
    """
    try:
        print(f"Processing image from URL: {url}")

        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            print(f"Error: Received response code {resp.status_code} for URL {url}")
            return ""

        raw_bytes = np.array(bytearray(resp.content), dtype=np.uint8)
        decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
        if decoded is None:
            print(f"Error: Failed to decode image from URL {url}")
            return ""

        # Each detection is indexed as [1] -> text and [2] -> confidence.
        text_conf_pairs = []
        for detection in reader.readtext(decoded, detail=1):
            text_conf_pairs.append((detection[1], detection[2]))

        print(f"Extracted text: {text_conf_pairs}")
        return text_conf_pairs
    except Exception as exc:
        print(f"Error processing image from URL {url}: {exc}")
        return ""

In [None]:
# Shared English EasyOCR reader, referenced as a global by the OCR helpers in
# this notebook. Pass gpu=False on machines without a GPU.
reader = easyocr.Reader(['en'], gpu=True)

In [None]:
def process_row_easy(row):
    """Attach the EasyOCR output for `row['image_link']` as `row['Extracted_Text']`.

    The row is modified in place and returned. If extraction raises, the error
    is logged and 'Extracted_Text' is set to an empty string.
    """
    try:
        row['Extracted_Text'] = extract_text_from_image_easy(row['image_link'])
        return row
    except Exception as err:
        print(f"Error processing row {row.name}: {err}")

    # Only reached after a failure above.
    row['Extracted_Text'] = ""
    return row

In [None]:
def process_csv_range_easy(csv_path, start_row=None, end_row=None, out_path='easyocr.csv'):
    """Run EasyOCR over a range of rows and save the result as CSV.

    Args:
        csv_path: Path of the input CSV, or an already loaded DataFrame.
        start_row: Position of the first row to process (None = from the start).
        end_row: Position one past the last row to process (None = to the end).
        out_path: Where the processed rows are written.

    Returns:
        DataFrame with the selected rows plus an 'Extracted_Text' column.
    """
    # Accept a DataFrame as well as a path: other runners in this notebook are
    # called with the loaded frame, and pd.read_csv rejects a DataFrame.
    df = csv_path if isinstance(csv_path, pd.DataFrame) else pd.read_csv(csv_path)

    # iloc treats None bounds as open-ended, covering all four start/end cases.
    df = df.iloc[start_row:end_row]

    processed_rows = [
        process_row_easy(row)
        for _, row in tqdm(df.iterrows(), total=len(df))
    ]

    if processed_rows:
        df_processed = pd.DataFrame(processed_rows)
    else:
        # Keep the column layout even when the selected range is empty.
        df_processed = df.assign(Extracted_Text=pd.Series(dtype=object))

    df_processed.to_csv(out_path, index=False)
    return df_processed

In [None]:
def easy_ocr(dataset):
  """Run the EasyOCR pipeline over every row of `dataset`.

  `dataset` is forwarded unchanged to process_csv_range_easy as its `csv_path`
  argument. No upper row bound is passed, so inputs of any length are processed
  in full instead of being cut off at a hard-coded row count.
  """
  process_csv_range_easy(dataset, start_row=0, end_row=None)

In [None]:
def doc_tr(dataset, start=87458, end=100000, out_path='doctr.csv'):
  """Run docTR OCR over a slice of `dataset` and write the results to CSV.

  Args:
      dataset: DataFrame with the columns 'index', 'image_link', 'group_id',
          'entity_name' and 'entity_value'.
      start: Position of the first row to process.
      end: Position one past the last row to process (clamped to the data).
      out_path: CSV file that receives the processed rows plus an
          'extracted_text' column holding [word, confidence] pairs.

  Images that cannot be fetched or processed yield an empty list, so the
  output has exactly one row per processed input row.
  """
  model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

  # Slice once and reuse the slice for the output, so the 'index' column lines
  # up with the OCR results.
  rows = dataset.iloc[start:end]

  final_extract = []
  for row_label, img_url in tqdm(rows['image_link'].items(), total=len(rows)):
      words = []
      try:
          response = requests.get(img_url, timeout=30)
          if response.status_code == 200:
              # Feed the downloaded bytes straight to docTR. This avoids the
              # temporary JPEG, whose save failed for images with alpha.
              doc = DocumentFile.from_images(response.content)
              result = model(doc)
              words = [
                  [word.value, word.confidence]
                  for page in result.pages
                  for block in page.blocks
                  for line in block.lines
                  for word in line.words
              ]
          else:
              print(f"Failed to fetch image: {img_url}")

      except SystemExit:
          # Deliberately swallowed so a SystemExit raised during OCR does not
          # stop the whole loop.
          print(f"Skipping image at index {row_label} due to SystemExit error.")
          words = []

      except Exception as e:
          print(f"Error processing image at index {row_label}: {str(e)}")
          print(traceback.format_exc())
          words = []

      final_extract.append(words)

  df = rows[['index', 'image_link', 'group_id', 'entity_name', 'entity_value']].copy()
  df['extracted_text'] = final_extract
  df.to_csv(out_path, index=False)

In [None]:
# Load the test split into a DataFrame; the following cells use `dataset`.
dataset = pd.read_csv('dataset/test.csv')
dataset

In [None]:
# paddle_ocr and doc_tr index into the loaded DataFrame, while easy_ocr hands
# its argument on to be read as a CSV file, so it is given the CSV path.
paddle_ocr(dataset)
easy_ocr('dataset/test.csv')
doc_tr(dataset)

In [None]:
def object_detection_util(dataset):
  """Write the height/width/depth rows of the CSV at `dataset` to a new CSV.

  Reads `dataset` with pandas, keeps the rows whose 'entity_name' is one of
  'height', 'width' or 'depth', and saves them to
  'height_width_depth_rows.csv' in the working directory.
  """
  wanted_entities = ['height', 'width', 'depth']
  source_df = pd.read_csv(dataset)

  keep_mask = source_df['entity_name'].isin(wanted_entities)
  source_df[keep_mask].to_csv('height_width_depth_rows.csv', index=False)

In [None]:
# Loaded YOLO models keyed by weights path, so each weights file is read from
# disk once instead of once per image.
_YOLO_MODEL_CACHE = {}


def process_image_from_url(image_url, model_path):
    """Detect regions with YOLO and OCR each one with the module-level `reader`.

    Args:
        image_url: URL of the image to process.
        model_path: Path to the YOLO weights to use for detection.

    Returns:
        The OCR text of all detected boxes joined by single spaces, or '' when
        nothing is detected or any step fails (the failure is printed).
    """
    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Error processing {image_url}: could not decode image")
            return ''

        model = _YOLO_MODEL_CACHE.get(model_path)
        if model is None:
            model = YOLO(model_path)
            _YOLO_MODEL_CACHE[model_path] = model

        results = model(img)
        predictions = results[0].boxes

        ocr_results = []
        for box in predictions:
            x_min, y_min, x_max, y_max = map(int, box.xyxy[0])
            cropped_img = img[y_min:y_max, x_min:x_max]
            # A degenerate box gives an empty crop; skip it rather than let
            # one bad box discard the text of the whole image.
            if cropped_img.size == 0:
                continue

            ocr_result = reader.readtext(cropped_img)
            # Element [1] of each EasyOCR detection is the recognised text.
            ocr_results.append(' '.join(text[1] for text in ocr_result))

        return ' '.join(ocr_results)
    except Exception as e:
        print(f"Error processing {image_url}: {e}")
        return ''


In [None]:
def object_detection(dataset):
  """OCR the YOLO-detected regions for every height/width/depth row.

  Args:
      dataset: Path of the input CSV; it is filtered by object_detection_util
          into 'height_width_depth_rows.csv' first.

  Writes 'detection.csv': the filtered rows plus an 'ocr_output' column.
  """
  object_detection_util(dataset)

  # No EasyOCR reader is created here: process_image_from_url uses the
  # module-level `reader`, so a function-local one was loaded and never used.

  csv_file = 'height_width_depth_rows.csv'
  df = pd.read_csv(csv_file)

  df['ocr_output'] = ''

  # YOLO weights to use for each entity type
  model_paths = {
      'height': 'height-depthweights/best_h.pt',
      'width': 'height-depthweights/best.pt',
      'depth': 'height-depthweights/best_d.pt'
  }

  for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    image_url = row['image_link']
    entity_name = row['entity_name']

    model_path = model_paths.get(entity_name, None)

    if model_path:
        df.at[index, 'ocr_output'] = process_image_from_url(image_url, model_path)
    else:
        print(f"Model for entity '{entity_name}' not found. Skipping.")
        df.at[index, 'ocr_output'] = ''

  output_csv_file = 'detection.csv'
  df.to_csv(output_csv_file, index=False)

  print("OCR processing complete. Results saved to:", output_csv_file)


In [None]:
def mini_cpm(dataset):
  """Ask MiniCPM-V for the height/width/depth shown in each image.

  Args:
      dataset: Path of the input CSV. Column 1 is read as the image URL and
          column 3 as the entity name.

  Writes 'llm_output.csv': the input header and rows with one extra column,
  'model response'. Rows for other entities, and rows whose image could not be
  fetched or processed, get "NA".
  """
  model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
  tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
  model.eval()

  answered = 0
  # The file name and header match what update_ocr_output reads back:
  # 'llm_output.csv' with 'index' and 'model response' columns.
  with open(dataset, 'r', newline='') as f, open("llm_output.csv", mode='w', newline='') as of:
      csv_reader = csv.reader(f)
      writer = csv.writer(of)

      header = next(csv_reader, None)
      if header is None:
          return
      writer.writerow(header + ['model response'])

      for r in csv_reader:
          entity = r[3]
          print(entity)
          res = "NA"
          if entity not in ('height', 'width', 'depth'):
              r.append(res)
              writer.writerow(r)
              continue

          url = r[1]
          print(url)
          try:
              response = requests.get(url, timeout=30)
              if response.status_code == 200:
                  # Decode from memory. Going through a shared file on disk
                  # let a failed download silently reuse the previous image.
                  img = Image.open(BytesIO(response.content)).convert('RGB')
                  question = f'What is the {entity} as visible in the image ? Return along with proper units as visible in image. Return output in this format- {entity}=value units'
                  msgs = [{'role': 'user', 'content': question}]
                  res = model.chat(
                    image=img,
                    msgs=msgs,
                    tokenizer=tokenizer,
                    sampling=False, # if sampling=False, beam_search will be used by default
                    temperature=0.3,
                    num_beams=4
                  )
                  answered += 1
              else:
                  print(f"Failed to fetch image: {url}")
          except Exception as e:
              # Keep the row with "NA" rather than dropping it, so the output
              # stays aligned with the input.
              print(f"Error processing {url}: {e}")
              res = "NA"

          print(res)
          r.append(res)
          writer.writerow(r)
          print(r)
          print(f"====================================================== {answered} ==================================================")

In [None]:
# mini_cpm opens its argument as a CSV file, so it needs the path rather than
# the DataFrame held in `dataset`.
mini_cpm('dataset/test.csv')

In [None]:
def merge_ocr_output(test_csv, object_detect_csv, output_csv):
    """Left-join the 'ocr_output' column of one CSV onto another by 'index'.

    Every row of `test_csv` is kept; rows without a match in
    `object_detect_csv` get a missing 'ocr_output'. The result is written to
    `output_csv` without the DataFrame index.
    """
    base_df = pd.read_csv(test_csv)
    detections = pd.read_csv(object_detect_csv)

    # Only the join key and the OCR text are taken from the detection file.
    ocr_columns = detections[['index', 'ocr_output']]
    base_df.merge(ocr_columns, on='index', how='left').to_csv(output_csv, index=False)

# Attach the YOLO+OCR text from 'detection.csv' to the test rows.
# NOTE(review): the dataset is loaded from 'dataset/test.csv' in an earlier
# cell, while this reads 'test.csv' from the working directory; confirm which
# path is intended.
test_csv = 'test.csv'
object_detect_csv = 'detection.csv'
output_csv = 'updated_test.csv'

merge_ocr_output(test_csv, object_detect_csv, output_csv)

print(f"Updated test.csv with ocr_output saved as {output_csv}")

In [None]:
def update_ocr_output(updated_test_csv, minicpm_combined_csv, output_csv):
    # Load both CSV files
    updated_test_df = pd.read_csv(updated_test_csv)
    minicpm_combined_df = pd.read_csv(minicpm_combined_csv)

    # Merge both dataframes on 'index' column
    merged_df = pd.merge(updated_test_df, minicpm_combined_df[['index', 'model response']], on='index', how='left')

    # Replace 'ocr_output' with 'model response' wherever 'model response' is not NA
    merged_df['ocr_output'] = merged_df.apply(
        lambda row: row['model response'] if pd.notna(row['model response']) else row['ocr_output'], axis=1
    )

    # Drop the 'model response' column after updating
    merged_df = merged_df.drop(columns=['model response'])

    # Save the updated dataframe to a new CSV file
    merged_df.to_csv(output_csv, index=False)

# Prefer the MiniCPM answer over the YOLO+OCR text where the model gave one.
# Note that `output_csv` is reassigned here and now points at 'dimensions.csv'.
updated_test_csv = 'updated_test.csv'
minicpm_combined_csv = 'llm_output.csv'
output_csv = 'dimensions.csv'

update_ocr_output(updated_test_csv, minicpm_combined_csv, output_csv)

print(f"Updated test.csv with model response saved as {output_csv}")

In [None]:
# Unit spellings accepted for each kind of measurement.
_LENGTH_UNITS = (
    'centimetre', 'cm', 'centimetres',
    'foot', 'ft', 'feet', "'",
    'inch', 'in', 'inches', '"',
    'metre', 'm', 'metres',
    'millimetre', 'mm', 'millimetres',
    'yard', 'yd', 'yards',
)

# A comma was missing after 'gs', so Python joined it with the next literal
# into 'gskilogram' and neither 'gs' nor 'kilogram' was in the set.
_WEIGHT_UNITS = (
    'gram', 'g', 'grams', 'gs',
    'kilogram', 'kg', 'kilograms', 'kgs',
    'microgram', 'mcg', 'micrograms', 'mcgs',
    'milligram', 'mg', 'milligrams', 'mgs',
    'ounce', 'oz', 'ounces',
    'pound', 'lb', 'pounds', 'lbs',
    # 'ton', 't', 'tons',
)

# Maps each entity name to the set of unit strings accepted for it. Every
# entity gets its own set object, so editing one does not affect the others.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'kv', 'kilovolts', 'millivolt', 'millivolts', 'mv', 'volt', 'volts', 'v'},
    'wattage': {'kilowatt', 'kw', 'kilowatts', 'watt', 'w', 'watts'},
    'item_volume': {'centilitre', 'cl', 'centilitres',
        'cubic foot', 'cu ft', 'cubic feet', 'cuft', 'cu. ft.', 'cu.ft.',
        'cubic inch', 'cu in', 'cubic inches', 'cuin', 'cu. in.', 'cu.in.',
        'cup', 'cups',
        'decilitre', 'dl', 'decilitres',
        'fluid ounce', 'fl oz', 'fluid ounces', 'floz', 'fl. oz.', 'fl.oz.',
        'ounce', 'oz', 'ounces',
        'gallon', 'gal', 'gallons',
        'imperial gallon', 'imp gal', 'imperial gallons', 'impgal', 'imp. gal.', 'imp.gal.',
        'litre', 'l', 'litres',
        'microlitre', 'mcl', 'microlitres',
        'millilitre', 'ml', 'millilitres',
        'pint', 'pt', 'pints',
        'quart', 'qt', 'quarts'}
}

# Every unit string accepted for at least one entity.
allowed_units = set().union(*entity_unit_map.values())

In [None]:

# Load the three OCR result files and the dimension overrides.
easy = pd.read_csv('easyocr.csv')
paddle = pd.read_csv('paddleocr.csv')
# 'doctr.csv' is a CSV file, so it has to be read with read_csv;
# pd.read_excel cannot parse it.
doctr = pd.read_csv('doctr.csv')
replacements = pd.read_csv('dimensions.csv')

# Single-row walk-through of the extraction pipeline, kept for debugging.
# NOTE: this cell calls standardize_unit, which is defined in a later cell of
# this notebook, so that cell has to be executed first.
i = 15
row1e = easy['easy OCR'][i]
row1p = paddle['paddle OCR'][i]
row1d = doctr['doct OCR'][i]
row1r = replacements['ocr_output'][i]
entity_name = easy['entity_name'][i]

# pd.isna covers float NaN and None without needing the math module.
if not pd.isna(row1r) and row1r != 'nan':
  print('hehe')
  # The override may have been parsed as a number; the regexes need text.
  txt = str(row1r)
else:
  print('hi')
  # EasyOCR column: "text (confidence)" pairs stored as one string.
  if isinstance(row1e, str):
    matches = re.findall(r"([^\(\)]+)\s*\((\d+\.\d+)\)", row1e)
    row1e = [(text.strip(), float(confidence)) for text, confidence in matches]
  else:
    row1e = []  # Handle cases where row1e is None or not a string

  # Paddle / docTR columns: Python literals, possibly missing.
  row1p = [] if pd.isna(row1p) else ast.literal_eval(row1p)
  row1d = [] if pd.isna(row1d) else ast.literal_eval(row1d)

  Firstrow1p = [item[0].lower() for item in row1p]
  Firstrow1e = [item[0].lower() for item in row1e]
  Firstrow1d = [item[0].lower() for item in row1d]
  Firstrow1p.extend(Firstrow1e)
  Firstrow1p.extend(Firstrow1d)
  txt = ' '.join(set(Firstrow1p))

if entity_name != 'item_volume':
  # "1400mg", '16.5"', "5'" and the same with whitespace before the unit
  patterns = [
      r'\d+\.?\d*["\'a-zA-Z]+',
      r'\d+\.?\d*\s+["\'a-zA-Z]+',
  ]
  unit_regex = r'["\'a-zA-Z]+'
else:
  # Volume units may consist of two words ("14 cu ft"), so the unit is
  # everything from the first letter to the end of the extract.
  patterns = [
      r'\d+\.?\d*[a-zA-Z]+',
      r'\d+\.?\d*[a-zA-Z]+\s+[a-zA-Z]+',
      r'\d+\.?\d*\s+[a-zA-Z]+\s+[a-zA-Z]+',
      r'\d+\.?\d*\s+[a-zA-Z]+',
  ]
  unit_regex = r'[a-zA-Z]+.*'

final_extract = []
for pattern in patterns:
  for extract in re.findall(pattern, txt):
    match = re.search(unit_regex, extract)
    if match and match.group(0) in entity_unit_map[entity_name]:
      final_extract.append(extract)

print('Final')
print(list(set(final_extract)))

hehe = []
for extract in final_extract:
  match = re.search(r'["\'a-zA-Z]+', extract)
  num = extract[:match.start()]
  # Pass the whole unit (it can be two words) and the entity of row `i`. The
  # loop previously reused `i` as its counter and looked up the wrong rows.
  final_unit = standardize_unit(extract[match.start():].strip(), entity_name)
  if final_unit is None:
    continue
  hehe.append(num.strip() + ' ' + final_unit.strip())

In [None]:
from fuzzywuzzy import fuzz

# Units that correct_ocr_errors (below) treats as valid fuzzy-match targets.
valid_units = ['mg', 'ml', 'g', 'l', 'kg', 'oz', 'cm', 'mm']

def correct_ocr_errors(misread_item, units=None):
    """Repair a '<number><unit>' token in which OCR read a zero as the letter 'o'.

    Args:
        misread_item: Token such as '140omg' or '15oz'.
        units: Accepted unit spellings; defaults to the module-level
            `valid_units`.

    Returns:
        The corrected token, e.g. '140omg' -> '1400mg'. Tokens that do not
        start with a number followed by letters are returned unchanged.
    """
    candidates = list(valid_units if units is None else units)

    match = re.match(r'(\d+\.?\d*)([a-zA-Z]+)', misread_item)
    if not match:
        return misread_item

    number_part, unit_part = match.groups()

    # Leading 'o'/'O' characters of the unit may really be zeros of the number.
    # Move as few as possible across, and only when what remains is exactly a
    # known unit, so that '15oz' is left alone.
    lowered = [unit.lower() for unit in candidates]
    leading_os = len(unit_part) - len(unit_part.lstrip('oO'))
    for shift in range(leading_os + 1):
        remainder = unit_part[shift:].lower()
        if remainder in lowered:
            return number_part + '0' * shift + candidates[lowered.index(remainder)]

    # No exact unit found: fall back to the closest unit by fuzzy ratio.
    best_match = None
    best_ratio = 0
    for unit in candidates:
        ratio = fuzz.ratio(unit_part.lower(), unit.lower())
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = unit

    # Accept the fuzzy match only above 70% similarity.
    if best_match and best_ratio > 70:
        return number_part + best_match
    return number_part + unit_part

# Quick demonstration on tokens where the letter 'o' stands in for a zero,
# alongside tokens that are already well formed.
examples = ['140omg', '22oml', '500mg', '1L', '15oz', '14.5g']

for item in examples:
    corrected = correct_ocr_errors(item)
    print(f"Original: {item}, Corrected: {corrected}")


In [None]:
def correct_ocr_errors(item, entity_name):
  """Diagnostic helper: locate known units inside the text part of `item`.

  NOTE: this reuses the name of the fuzzy-matching correct_ocr_errors defined
  earlier in the notebook and replaces it once this cell runs.

  Returns `item` unchanged when it contains no letters, or when everything
  from its first letter onwards is a valid unit for `entity_name`. Otherwise
  it only prints where each known unit occurs and returns None.
  """
  match = re.search(r'[a-zA-Z]+.*', item) #Detects where letters begin
  if match is None:
    # No unit text at all; there is nothing to analyse.
    return item

  tail = match.group(0)
  if tail in entity_unit_map[entity_name]:
    return item
  else:
    word_index = match.start() #Index of where letters begin
    print('Word Index', word_index)

    for unit in entity_unit_map[entity_name]: #To check for every unit in the dictionary
      unit_index = tail.find(unit)
      if unit_index!=-1:
        print('Unit', unit)
        print('Unit Index', unit_index)
        subtext = tail[:unit_index]
        print(subtext)

In [None]:
def standardize_unit(extracted_unit, entity_name):
  """Map a unit spelling to its canonical name for the given entity.

  Returns the canonical unit name (e.g. 'kg' -> 'kilogram' for weights), or
  None when the entity is unknown or the unit is not valid for it.
  """
  weight_units = (
    ('gram', ('gram', 'g', 'grams', 'gs')),
    ('kilogram', ('kilogram', 'kg', 'kilograms', 'kgs')),
    ('microgram', ('microgram', 'mcg', 'micrograms', 'mcgs')),
    ('milligram', ('milligram', 'mg', 'milligrams', 'mgs')),
    ('ounce', ('ounce', 'oz', 'ounces')),
    ('pound', ('pound', 'lb', 'pounds', 'lbs')),
    ('ton', ('ton', 't', 'tons')),
  )
  length_units = (
    ('centimetre', ('centimetre', 'cm', 'centimetres')),
    ('foot', ('foot', 'ft', 'feet', "'")),
    ('inch', ('inch', 'in', 'inches', '"')),
    ('metre', ('metre', 'm', 'metres')),
    ('millimetre', ('millimetre', 'mm', 'millimetres')),
    ('yard', ('yard', 'yd', 'yards')),
  )
  voltage_units = (
    ('kilovolt', ('kilovolt', 'kv', 'kilovolts')),
    ('millivolt', ('millivolt', 'millivolts', 'mv')),
    ('volt', ('volt', 'volts', 'v')),
  )
  wattage_units = (
    ('kilowatt', ('kilowatt', 'kw', 'kilowatts')),
    ('watt', ('watt', 'w', 'watts')),
  )
  volume_units = (
    ('centilitre', ('centilitre', 'cl', 'centilitres')),
    ('cubic foot', ('cubic foot', 'cu ft', 'cubic feet', 'cuft', 'cu. ft.', 'cu.ft.')),
    ('cubic inch', ('cubic inch', 'cu in', 'cubic inches', 'cuin', 'cu. in.', 'cu.in.')),
    ('cup', ('cup', 'cups')),
    ('decilitre', ('decilitre', 'dl', 'decilitres')),
    # For volumes a plain "ounce" is read as a fluid ounce.
    ('fluid ounce', ('fluid ounce', 'fl oz', 'fluid ounces', 'floz', 'fl. oz.', 'fl.oz.', 'ounce', 'oz', 'ounces')),
    ('gallon', ('gallon', 'gal', 'gallons')),
    ('imperial gallon', ('imperial gallon', 'imp gal', 'imperial gallons', 'impgal', 'imp. gal.', 'imp.gal.')),
    ('litre', ('litre', 'l', 'litres')),
    ('microlitre', ('microlitre', 'mcl', 'microlitres')),
    ('millilitre', ('millilitre', 'ml', 'millilitres')),
    ('pint', ('pint', 'pt', 'pints')),
    ('quart', ('quart', 'qt', 'quarts')),
  )

  families = {
    'item_weight': weight_units,
    'maximum_weight_recommendation': weight_units,
    'width': length_units,
    'depth': length_units,
    'height': length_units,
    'voltage': voltage_units,
    'wattage': wattage_units,
    'item_volume': volume_units,
  }

  for canonical, spellings in families.get(entity_name, ()):
    if extracted_unit in spellings:
      return canonical
  return None

In [None]:
def weight_convertor(value):
  """Convert a '<number> <unit>' weight string to grams.

  The unit is matched case-insensitively against canonical unit names.
  Returns None when the unit is not recognised.
  """
  tokens = value.split()
  amount = float(tokens[0])
  unit_name = tokens[1].lower()

  grams_per_unit = {
    'gram': 1,
    'kilogram': 1000,
    'microgram': 1e-6,
    'milligram': 1e-3,
    'ounce': 28.3495,
    'pound': 453.592,
    'ton': 1e6,
  }
  factor = grams_per_unit.get(unit_name)
  return None if factor is None else amount * factor

def length_convertor(value):
  """Convert a '<number> <unit>' length string to millimetres.

  The unit is matched case-insensitively against canonical unit names.
  Returns None when the unit is not recognised.
  """
  tokens = value.split()
  amount = float(tokens[0])
  unit_name = tokens[1].lower()

  mm_per_unit = {
    'centimetre': 10,
    'foot': 304.8,
    'inch': 25.4,
    'metre': 1000,
    'millimetre': 1,
    'yard': 914.4,
  }
  factor = mm_per_unit.get(unit_name)
  return None if factor is None else amount * factor

def voltage_convertor(value):
  """Convert a '<number> <unit>' voltage string to millivolts.

  The unit is matched case-insensitively against canonical unit names.
  Returns None when the unit is not recognised.
  """
  tokens = value.split()
  amount = float(tokens[0])
  unit_name = tokens[1].lower()

  millivolts_per_unit = {
    'volt': 1000,
    'millivolt': 1,
    'kilovolt': 1000000,
  }
  factor = millivolts_per_unit.get(unit_name)
  return None if factor is None else amount * factor

def wattage_convertor(value):
  """Convert a '<number> <unit>' power string to watts.

  The unit is matched case-insensitively against canonical unit names.
  Returns None when the unit is not recognised.
  """
  tokens = value.split()
  amount = float(tokens[0])
  unit_name = tokens[1].lower()

  watts_per_unit = {'watt': 1, 'kilowatt': 1000}
  factor = watts_per_unit.get(unit_name)
  return None if factor is None else amount * factor

def volume_convertor(value):
  """Convert a '<number> <unit>' volume string to millilitres.

  Everything after the number is taken as the unit, so two-word units such as
  'fluid ounce' work. The unit is matched case-insensitively. Returns None
  when the unit is not recognised.
  """
  tokens = value.split()
  amount = float(tokens[0])
  unit_name = ' '.join(tokens[1:]).lower()

  ml_per_unit = {
    'centilitre': 10,
    'cubic foot': 28316.8466,
    'cubic inch': 16.3871,
    'cup': 236.588,
    'decilitre': 100,
    'fluid ounce': 29.5735,
    'ounce': 29.5735,
    'gallon': 3785.41,
    'imperial gallon': 4546.09,
    'litre': 1000,
    'microlitre': 0.001,
    'millilitre': 1,
    'pint': 473.176,
    'quart': 946.353,
  }
  factor = ml_per_unit.get(unit_name)
  return None if factor is None else amount * factor

def optimum_op(extract_list, entity_name):
  """Pick one candidate from `extract_list` by comparing converted magnitudes.

  Each '<number> <unit>' candidate is converted to a common unit with the
  convertor for `entity_name`. The smallest candidate is returned for 'width'
  and the largest for every other entity. Entities without a dedicated branch
  are treated as volumes.
  """
  if entity_name in ('item_weight', 'maximum_weight_recommendation'):
    convert, pick_index = weight_convertor, np.argmax
  elif entity_name in ('depth', 'height'):
    convert, pick_index = length_convertor, np.argmax
  elif entity_name == 'width':
    convert, pick_index = length_convertor, np.argmin
  elif entity_name == 'voltage':
    convert, pick_index = voltage_convertor, np.argmax
  elif entity_name == 'wattage':
    convert, pick_index = wattage_convertor, np.argmax
  else:
    convert, pick_index = volume_convertor, np.argmax

  magnitudes = [convert(candidate) for candidate in extract_list]
  return extract_list[pick_index(magnitudes)]

In [None]:
def regex_op(txt, entity_name):
  """Collect the '<number><unit>' substrings of `txt` valid for `entity_name`.

  A candidate is kept when its unit part is in `entity_unit_map[entity_name]`.
  Results are ordered pattern by pattern, and a substring matched by several
  patterns appears once per pattern.
  """
  if entity_name != 'item_volume':
    # The unit is the first run of letters or quote marks: 1400mg, 16.5", 5 '
    unit_pattern = r'["\'a-zA-Z]+'
    candidate_patterns = (
      r'\d+\.?\d*["\'a-zA-Z]+',
      r'\d+\.?\d*\s+["\'a-zA-Z]+',
    )
  else:
    # Volume units can span two words, so the unit is everything from the
    # first letter to the end of the candidate: 14cuft, 14cu ft, 14 cu ft
    unit_pattern = r'[a-zA-Z]+.*'
    candidate_patterns = (
      r'\d+\.?\d*[a-zA-Z]+',
      r'\d+\.?\d*[a-zA-Z]+\s+[a-zA-Z]+',
      r'\d+\.?\d*\s+[a-zA-Z]+\s+[a-zA-Z]+',
      r'\d+\.?\d*\s+[a-zA-Z]+',
    )

  kept = []
  for pattern in candidate_patterns:
    for candidate in re.findall(pattern, txt):
      unit_match = re.search(unit_pattern, candidate)
      if unit_match and unit_match.group(0) in entity_unit_map[entity_name]:
        kept.append(candidate)
  return kept

In [None]:
# Build one prediction per row: take the dimension override when present,
# otherwise pool the text of all three OCR engines, then extract and
# normalise '<number> <unit>' candidates.
end_list = []
for i in tqdm(range(len(easy))):
  entity = doctr['entity_name'][i]
  row1e = easy['easy OCR'][i]
  row1p = paddle['paddle OCR'][i]
  row1d = doctr['doct OCR'][i]
  row1r = replacements['ocr_output'][i]

  # pd.isna covers float NaN and None without needing the math module.
  if not pd.isna(row1r) and row1r != 'nan':
    # The override may have been parsed as a number; the regexes need text.
    txt = str(row1r)
  else:
    # EasyOCR column: "text (confidence)" pairs stored as one string.
    if isinstance(row1e, str):
      matches = re.findall(r"([^\(\)]+)\s*\((\d+\.\d+)\)", row1e)
      row1e = [(text.strip(), float(confidence)) for text, confidence in matches]
    else:
      row1e = []  # Handle cases where row1e is None or not a string

    # Paddle / docTR columns: Python literals, possibly missing or malformed.
    try:
      row1p = [] if pd.isna(row1p) else ast.literal_eval(row1p)
      row1d = [] if pd.isna(row1d) else ast.literal_eval(row1d)
    except (ValueError, SyntaxError) as e:
      print(f"Error parsing row1p or row1d on index {i}: {e}")
      row1p = []
      row1d = []

    Firstrow1p = [item[0].lower() for item in row1p]
    Firstrow1e = [item[0].lower() for item in row1e]
    Firstrow1d = [item[0].lower() for item in row1d]
    Firstrow1p.extend(Firstrow1e)
    Firstrow1p.extend(Firstrow1d)
    txt = ' '.join(set(Firstrow1p))

  final_l = []
  for element in regex_op(txt, entity):
    match = re.search(r'["\'a-zA-Z]+', element)
    num = element[:match.start()]
    unit = element[match.start():]
    final_unit = standardize_unit(unit.strip(), entity)
    if final_unit is None:
      # Unit has no canonical form; skip it instead of failing on None.
      continue
    final_l.append(num.strip() + ' ' + final_unit.strip())

  if len(final_l) == 1:
    end_list.append(final_l[0])
  elif len(final_l) == 0:
    end_list.append('')
  else:
    end_list.append(optimum_op(final_l, entity))

In [None]:
# Pair each prediction with the dataset's own 'index' value and save the
# two-column result. Only as many index values as there are predictions are used.
n_predictions = len(end_list)
df = pd.DataFrame({
    'index': easy['index'].iloc[:n_predictions],
    'prediction': end_list,
})
df.to_csv('prediction.csv', index=False)

In [None]:
# Same predictions, kept next to the identifying columns of each row.
n_predictions = len(end_list)
df1 = easy[['index', 'image_link', 'group_id', 'entity_name']].iloc[:n_predictions].copy()
df1['prediction'] = end_list
df1.to_csv('Long_predictions.csv', index = False)