### Basic library imports

In [2]:
import os
import pandas as pd

### Read Dataset

In [28]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [None]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

In [None]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

### Download images

In [29]:
from utils import download_images
download_images(train['image_link'], '../train_images')

  2%|▏         | 6387/263859 [04:25<2:58:04, 24.10it/s] 


KeyboardInterrupt: 

In [None]:
from utils import process_images_with_easyocr

# Process images and extract quantitative data
ocr_results = process_images_with_easyocr('../del_images')

# Print extracted quantitative data
for image_name, data in ocr_results.items():
    print(f"Quantitative data extracted from {image_name}: {data}")

In [None]:
# Shivansh pytesseract
from utils import process_images_with_ocr

ocr_results = process_images_with_ocr(sample_test['image_link'], '../del_images')

for image_name, text in ocr_results.items():
    print(f"Text extracted from {image_name}: {text}")

In [None]:
assert len(os.listdir('../images')) > 0

In [15]:
rm -rf ../images

In [None]:
import easyocr
import re
import os
import cv2
from PIL import Image

# Load EasyOCR model
reader = easyocr.Reader(['en'])

# Map short forms to full forms
unit_abbreviation_map = {
    'cm': 'centimetre',
    'mm': 'millimetre',
    'm': 'metre',
    'kg': 'kilogram',
    'g': 'gram',
    'mg': 'milligram',
    'lb': 'pound',
    'oz': 'ounce',
    'kv': 'kilovolt',
    'v': 'volt',
    'w': 'watt',
    'kw': 'kilowatt',
    'ml': 'millilitre',
    'l': 'litre',
    'ft': 'foot',
    'in': 'inch',
    'yd': 'yard',
    '"' : 'inch',
    "'" : 'foot'
}

# Function to normalize short forms to full units
def normalize_unit(text):
    for short_form, full_form in unit_abbreviation_map.items():
        # Replace short form with full form in the text (case-insensitive)
        text = re.sub(r'\b' + re.escape(short_form) + r'\b', full_form, text, flags=re.IGNORECASE)
    return text

# Function to extract the prediction from OCR and process it
def predictor(image_link):
    # Perform OCR to extract text from the image
    ocr_result = reader.readtext(image_link, detail=0)
    
    # Normalize each detected text in the OCR result
    normalized_result = [normalize_unit(text) for text in ocr_result]
    
    return normalized_result

# Processing images from a folder
image_folder = '../images'  # Replace with your folder path
image_paths = [os.path.join(image_folder, fname) for fname in os.listdir(image_folder) if fname.endswith(('.jpg', '.png', '.jpeg'))]

for idx, image_path in enumerate(image_paths, 1):
    # Load the image
    image = cv2.imread(image_path)
    
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # Save the processed image (optional)
    processed_image_path = os.path.join(image_folder, f"processed_image_{idx}.jpg")
    cv2.imwrite(processed_image_path, thresh)

    # Pass the processed image to EasyOCR
    pil_image = Image.open(processed_image_path)
    
    # Get normalized OCR output
    ocr_output = predictor(processed_image_path)
    
    print(f"OCR Output for image {idx}: {ocr_output}")


### Working Code (Not for all but for more than 1)

In [None]:
import easyocr
import re
import cv2
from PIL import Image
import numpy as np
import os

# Load EasyOCR model
reader = easyocr.Reader(['en'])

# Updated entity-unit map
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Function to extract the prediction from OCR and process it
def predictor(image_path):
    # Open the image and apply preprocessing
    image = Image.open(image_path)
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # Perform OCR to extract text from the processed image
    ocr_result = reader.readtext(thresh, detail=0)
    
    # Join OCR results into a single string for easier searching
    ocr_text = " ".join(ocr_result).lower()

    # Results to store predictions
    results = []

    # Loop through the entity_unit_map and try to find any matches
    for entity_name, allowed_units in entity_unit_map.items():
        # Build regex pattern for matching the entity and allowed units
        pattern = r'(\d+\.?\d*)\s?(' + '|'.join(allowed_units) + ')'
        
        # Search for the pattern in the OCR text
        match = re.search(pattern, ocr_text)
        
        if match:
            value = match.group(1)  # Extract the value
            unit = match.group(2)   # Extract the unit
            results.append(f"{entity_name}: {value} {unit}")
    
    # If results are found, return them, otherwise return empty string
    if results:
        return " ".join(results)
    else:
        return ""


# Automatically get all image paths from the 'images' folder
image_folder = '../images'  # Replace with your folder path
image_paths = [os.path.join(image_folder, fname) for fname in os.listdir(image_folder) if fname.endswith(('.jpg', '.png', '.jpeg'))]

# Process all images and print results
for idx, image_path in enumerate(image_paths, 1):
    result = predictor(image_path)
    if result:
        print(f"{idx}. {result}")
    else:
        print(f"{idx}. No entities detected")


In [None]:
import easyocr
import re
import pandas as pd

# Load EasyOCR model
reader = easyocr.Reader(['en'])

# Define allowed units based on entity type
units_dict = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

def extract_text_from_image(image_link):
    result = reader.readtext(image_link, detail=0)
    return " ".join(result).lower()

def find_entity_value(text, entity_name):
    allowed_units = units_dict.get(entity_name, [])
    pattern = r'(\d+\.?\d*)\s?(' + '|'.join(allowed_units) + ')'
    match = re.search(pattern, text)
    if match:
        value = match.group(1)
        unit = match.group(2)
        return f"{value} {unit}"
    return ""

def process_csv(csv_file):
    df = pd.read_csv(csv_file)
    results = []
    for _, row in df.iterrows():
        image_link = row['image_link']
        entity_name = row['entity_name']
        text = extract_text_from_image(image_link)
        value = find_entity_value(text, entity_name)
        results.append((row['index'], value))
    return results

# Example usage
csv_file = '../dataset/sample_test.csv'
output = process_csv(csv_file)
for image_number, value in output:
    print(f"Index: {image_number}, Extracted Value: {value}")


In [43]:
import cv2
import easyocr
import pandas as pd
import requests
from tempfile import NamedTemporaryFile
import re
import os

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'], gpu=False)

# Entity-unit map and allowed abbreviations
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", 
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Map for allowed abbreviations
unit_abbreviation_map = {
    "cm": "centimetre", "mm": "millimetre", "m": "metre", "in": "inch", "\"": "inch", "'": "foot",
    "ft": "foot", "yd": "yard", "mg": "milligram", "kg": "kilogram", "g": "gram", "oz": "ounce",
    "ton": "ton", "lb": "pound", "v": "volt", "kv": "kilovolt", "mv": "millivolt", "kw": "kilowatt",
    "w": "watt", "ml": "millilitre", "l": "litre", "gal": "gallon", "qt": "quart", "pt": "pint", "fl oz": "fluid ounce", "cup": "cup"
}

# Preprocessing: Apply various thresholding techniques
def preprocess_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Image at path {image_path} could not be loaded.")

    # Binary Thresholding
    _, binary_thresh = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY)
    
    # Adaptive Thresholding
    adaptive_thresh = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                            cv2.THRESH_BINARY, 11, 2)
    
    # Otsu's Thresholding
    _, otsu_thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return {
        'binary': binary_thresh,
        'adaptive': adaptive_thresh,
        'otsu': otsu_thresh
    }

# Function to extract relevant entity from text based on entity_name
def extract_entity(ocr_result, entity_name):
    pattern = re.compile(r'(\d+(\.\d+)?)\s*([a-zA-Z\'\"]+)')  # Regex to match numbers and their units
    matches = pattern.findall(ocr_result)
    
    for match in matches:
        value = match[0]
        unit = match[2].lower()

        # Convert abbreviation to full form if necessary
        if unit in unit_abbreviation_map:
            unit = unit_abbreviation_map[unit]
        
        # Check if the unit belongs to the entity
        if unit in entity_unit_map[entity_name]:
            return f"{value} {unit}"
        
    # Special case for "maximum_weight_recommendation"
    if entity_name == "maximum_weight_recommendation" and re.search(r'load\s*bearing', ocr_result, re.IGNORECASE):
        return "load bearing"
    
    return None

# Function to download image from URL
def download_image(url):
    response = requests.get(url)
    if response.status_code == 200:
        # Create a temporary file to save the image
        temp_file = NamedTemporaryFile(delete=False, suffix='.jpg')
        with open(temp_file.name, 'wb') as f:
            f.write(response.content)
        return temp_file.name
    else:
        raise ValueError(f"Failed to download image from URL {url}")

# Function to evaluate OCR result from different thresholding techniques
def evaluate_thresholding(image_path, entity_name):
    thresholds = preprocess_image(image_path)
    best_result = None
    best_match = None

    for method, thresh_img in thresholds.items():
        # Perform OCR
        bgr_image = cv2.cvtColor(thresh_img, cv2.COLOR_GRAY2BGR)
        ocr_result = reader.readtext(bgr_image, detail=0)
        ocr_text = ' '.join(ocr_result)
        
        # Extract entity
        entity_value = extract_entity(ocr_text, entity_name)
        
        # Compare with predefined value (if available)
        if best_result is None or (entity_value and best_match is None):
            best_result = entity_value
            best_match = method
    
    return best_result, best_match

# Main function to process a batch of images and generate output CSV
def process_images(input_csv_path, output_csv_path, image_folder, batch_size=100):
    df = pd.read_csv(input_csv_path)
    output_data = []
    image_index = 0

    # Process only the first batch_size rows
    df = df.head(batch_size)

    for _, row in df.iterrows():
        image_index += 1
        image_link = row['image_link']
        entity_name = row['entity_name']
        predefined_value = row['entity_value']

        # Handle local file paths and URLs
        if image_link.startswith('http'):
            try:
                image_path = download_image(image_link)
            except Exception as e:
                print(f"Error downloading image {image_link}: {e}")
                output_data.append({"image_index": image_index, "entity_value": None})
                continue
        else:
            image_path = os.path.join(image_folder, image_link.replace('\\', '/'))

        try:
            # Evaluate OCR results from different thresholding techniques
            entity_value, best_method = evaluate_thresholding(image_path, entity_name)
            
            if entity_value == predefined_value:
                output_data.append({"image_index": image_index, "entity_value": entity_value})
            else:
                output_data.append({"image_index": image_index, "entity_value": None})
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            output_data.append({"image_index": image_index, "entity_value": None})
        finally:
            # Clean up temporary files
            if image_path.startswith('/tmp/'):
                os.remove(image_path)

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_csv_path, index=False)
    print(f"Results saved to {output_csv_path}")

# Path configurations
input_csv_path = "../dataset/test1.csv"  # Input CSV with image_link, entity_name, and entity_value
output_csv_path = "../output/extracted_entity.csv"  # Output CSV where the results will be saved
image_folder = "../train_images"  # Folder where images are stored

# Run the main function to process a batch of images and extract entities
process_images(input_csv_path, output_csv_path, image_folder, batch_size=100)


Using CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


KeyError: 'entity_value'

In [46]:
import csv
import re
import os
import cv2
import easyocr
import pandas as pd
import requests
from tempfile import NamedTemporaryFile

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en'], gpu=False)

# Entity-unit map and allowed abbreviations
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", 
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Map for allowed abbreviations
unit_abbreviation_map = {
    "cm": "centimetre", "mm": "millimetre", "m": "metre", "in": "inch", "\"": "inch", "'": "foot",
    "ft": "foot", "yd": "yard", "mg": "milligram", "kg": "kilogram", "g": "gram", "oz": "ounce",
    "ton": "ton", "lb": "pound", "v": "volt", "kv": "kilovolt", "mv": "millivolt", "kw": "kilowatt",
    "w": "watt", "ml": "millilitre", "l": "litre", "gal": "gallon", "qt": "quart", "pt": "pint", "fl oz": "fluid ounce", "cup": "cup"
}

# Preprocessing: Apply thresholding for better OCR results
def preprocess_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Image at path {image_path} could not be loaded.")
    _, thresholded_image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return thresholded_image

# Function to extract relevant entity from text based on entity_name
def extract_entity(ocr_result, entity_name):
    pattern = re.compile(r'(\d+(\.\d+)?)\s*([a-zA-Z\'\"]+)')  # Regex to match numbers and their units
    matches = pattern.findall(ocr_result)
    
    for match in matches:
        value = match[0]
        unit = match[2].lower()

        # Convert abbreviation to full form if necessary
        if unit in unit_abbreviation_map:
            unit = unit_abbreviation_map[unit]
        
        # Check if the unit belongs to the entity
        if unit in entity_unit_map[entity_name]:
            return f"{value} {unit}"

    return None

# Function to download image from URL
def download_image(url):
    response = requests.get(url)
    if response.status_code == 200:
        # Create a temporary file to save the image
        temp_file = NamedTemporaryFile(delete=False, suffix='.jpg')
        with open(temp_file.name, 'wb') as f:
            f.write(response.content)
        return temp_file.name
    else:
        raise ValueError(f"Failed to download image from URL {url}")

# Main function to process all images and generate output CSV incrementally
def process_images(input_csv_path, output_csv_path, image_folder):
    df = pd.read_csv(input_csv_path)

    # Open the output CSV file for appending
    with open(output_csv_path, mode='a', newline='') as csvfile:
        fieldnames = ['index', 'entity_value']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write the header only if the file is empty
        if os.stat(output_csv_path).st_size == 0:
            writer.writeheader()

        for _, row in df.iterrows():
            row_index = row['index']
            image_link = row['image_link']
            entity_name = row['entity_name']

            # Handle local file paths and URLs
            if image_link.startswith('http'):
                try:
                    image_path = download_image(image_link)
                except Exception as e:
                    print(f"Error downloading image {image_link}: {e}")
                    writer.writerow({"index": row_index, "entity_value": None})
                    continue
            else:
                image_path = os.path.join(image_folder, image_link.replace('\\', '/'))

            try:
                preprocessed_image = preprocess_image(image_path)
                bgr_image = cv2.cvtColor(preprocessed_image, cv2.COLOR_GRAY2BGR)
                ocr_result = reader.readtext(bgr_image, detail=0)
                ocr_text = ' '.join(ocr_result)
                entity_value = extract_entity(ocr_text, entity_name)
                writer.writerow({"index": row_index, "entity_value": entity_value})
            except Exception as e:
                print(f"Error processing image {image_path}: {e}")
                writer.writerow({"index": row_index, "entity_value": None})
            finally:
                # Clean up temporary files
                if image_path.startswith('/tmp/'):
                    os.remove(image_path)

    print(f"Results saved to {output_csv_path}")

# Path configurations
input_csv_path = "../dataset/test.csv"  # Input CSV with row_index, image_link, entity_name
output_csv_path = "../output/extracted_entity.csv"  # Output CSV where the results will be saved
image_folder = "../train_images"  # Folder where images are stored

# Run the main function to process images and extract entities
process_images(input_csv_path, output_csv_path, image_folder)


Using CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


Error processing image C:\Users\aryan\AppData\Local\Temp\tmpu9cda82_.jpg: OpenCV(4.10.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'

Error processing image C:\Users\aryan\AppData\Local\Temp\tmp6ds1yxy9.jpg: OpenCV(4.10.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'

Error processing image C:\Users\aryan\AppData\Local\Temp\tmplvt2tc9_.jpg: OpenCV(4.10.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'

Error processing image C:\Users\aryan\AppData\Local\Temp\tmpoarbknn9.jpg: OpenCV(4.10.0) :-1: error: (-5:Bad argument) in function 'cvtColor'
> Overload resolution failed:
>  - src is not a numpy 