# In [6]:
import pandas as pd
import re

def _length_units():
    """Build the length-unit table: canonical unit -> OCR spellings.

    A brand-new dict (holding brand-new lists) is produced on every call so
    the entities that share this vocabulary never alias each other's data.
    """
    return {
        'centimetre': ['cm', 'centimetres', 'centemeter', 'centimeter', 'centemtre', 'centimitre'],
        'foot': ['ft', 'feets', 'fot', 'fooot'],
        'inch': ['in', 'inches', 'inchs', 'ich', 'inche'],
        'metre': ['m', 'meter', 'metre', 'metter', 'mitre'],
        'millimetre': ['mm', 'milimetre', 'millimeter', 'milimeter', 'mllmtr'],
        'yard': ['yd', 'yards', 'yrd', 'yad'],
    }


def _weight_units():
    """Build the weight-unit table: canonical unit -> OCR spellings.

    Like ``_length_units``, each call returns independent objects.
    """
    return {
        'gram': ['g', 'grams', 'grm', 'grammes', 'gm'],
        'kilogram': ['kg', 'kilograms', 'kilogrammes', 'kilogrm', 'kgrm'],
        'microgram': ['µg', 'mcg', 'microg', 'micrograms', 'ugrm'],
        'milligram': ['mg', 'milligrams', 'milligms', 'miligram', 'millig'],
        'ounce': ['oz', 'onz', 'ounces', 'ounz', 'ouce'],
        'pound': ['lb', 'lbs', 'pund', 'poud', 'pounds'],
        'ton': ['t', 'tonnes', 'tonn', 'tun', 'tn'],
    }


# Entity name -> {canonical unit -> abbreviations and misspellings that may
# show up in OCR text}. The three dimension entities share one vocabulary,
# as do the two weight entities.
ocr_unit_variations = {
    'width': _length_units(),
    'depth': _length_units(),
    'height': _length_units(),
    'item_weight': _weight_units(),
    'maximum_weight_recommendation': _weight_units(),
    'voltage': {
        'kilovolt': ['kV', 'kilovolts', 'klv', 'kilovlt', 'kvlt'],
        'millivolt': ['mV', 'millivolts', 'millivlt', 'mvlt', 'millvolt'],
        'volt': ['V', 'volts', 'vlt', 'voltz', 'vt'],
    },
    'wattage': {
        'kilowatt': ['kW', 'kilowatts', 'klw', 'kwt', 'kilowtt'],
        'watt': ['W', 'watts', 'wat', 'wt', 'wattz'],
    },
    'item_volume': {
        'centilitre': ['cl', 'centilitres', 'centiliter', 'centiletre', 'centl'],
        'cubic foot': ['ft³', 'cubic feet', 'cuft', 'cbft', 'cubfoot'],
        'cubic inch': ['in³', 'cubic inches', 'cuin', 'cbinch', 'cubicinch'],
        'cup': ['cups', 'cupp', 'ccp', 'cupz'],
        'decilitre': ['dl', 'decilitres', 'deciliter', 'deciletre', 'decl'],
        'fluid ounce': ['fl oz', 'fluid ounces', 'fl.oz', 'fldoz', 'fldounce'],
        'gallon': ['gal', 'gallons', 'galn', 'glln', 'gallon'],
        'imperial gallon': ['imp gal', 'imperial gallons', 'imp.gal', 'impgl', 'igallon'],
        'litre': ['l', 'litres', 'liter', 'ltr', 'lit.'],
        'microlitre': ['µl', 'microlitres', 'microliter', 'micrl', 'micrltr'],
        'millilitre': ['ml', 'millilitres', 'milliliter', 'mililitre', 'mll'],
        'pint': ['pt', 'pints', 'pintz', 'pinte'],
        'quart': ['qt', 'quarts', 'quartz', 'quarte', 'qrts'],
    },
}

# In [8]:
import pandas as pd
import re
from fuzzywuzzy import process

# Provided unit variations: entity -> canonical unit -> abbreviations and
# common OCR misspellings. Only the 'width' entity is listed in this table.
ocr_unit_variations = {
    'width': {
        'centimetre': ['cm', 'centimetres', 'centemeter', 'centimeter', 'centemtre', 'centimitre'],
        'foot': ['ft', 'feets', 'fot', 'fooot'],
        'inch': ['in', 'inches', 'inchs', 'ich', 'inche'],
        'metre': ['m', 'meter', 'metre', 'metter', 'mitre'],
        'millimetre': ['mm', 'milimetre', 'millimeter', 'milimeter', 'mllmtr'],
        'yard': ['yd', 'yards', 'yrd', 'yad']
    },
    # Other units are omitted for brevity
}


def _flatten_variations(variation_table):
    """Collapse ``{entity: {unit: [spellings]}}`` into ``{unit: [spellings]}``.

    Spellings of a unit that appears under several entities are merged,
    keeping first-seen order and dropping duplicates.
    """
    flattened = {}
    for unit_table in variation_table.values():
        for unit, variations in unit_table.items():
            spellings = flattened.setdefault(unit, [])
            for variation in variations:
                if variation not in spellings:
                    spellings.append(variation)
    return flattened


# Flatten the variations for fuzzy matching.
# Each canonical unit maps to the *list* of its spellings. The previous dict
# comprehension bound ``unit: variation`` and therefore kept only the last
# spelling of each unit, as a bare string rather than a list of choices.
all_variations = _flatten_variations(ocr_unit_variations)

def extract_numbers_with_units(input_string, context_chars=5):
    """Find numbers in OCR text that have a unit-like token next to them.

    Every integer or decimal number in ``input_string`` is located, and the
    ``context_chars`` characters immediately before and after it (whitespace
    trimmed) are handed to ``contains_unit``. A number is kept when either
    side is recognised as a unit.

    Args:
        input_string: The OCR text to scan. Anything that is not a ``str``
            (for example ``None`` or the NaN that pandas uses for an empty
            CSV cell) is treated as "no text".
        context_chars: How many characters on each side of a number are
            inspected for a unit. Defaults to 5.

    Returns:
        A list of ``[number, after, before]`` triples, where ``number`` is
        the matched text and ``after`` / ``before`` are the trimmed context
        snippets. Empty when nothing qualifies.
    """
    # Empty CSV cells arrive as float NaN; re.finditer would raise on them.
    if not isinstance(input_string, str):
        return []

    results = []
    # The fractional part is optional but, when present, needs at least one
    # digit, so a sentence-ending full stop ("... 12.") is not swallowed.
    for match in re.finditer(r'\d+(?:\.\d+)?', input_string):
        before = input_string[max(0, match.start() - context_chars):match.start()].strip()
        after = input_string[match.end():match.end() + context_chars].strip()

        # An empty snippet can never name a unit, so skip the fuzzy matcher
        # for it instead of asking it to score an empty query.
        if (before and contains_unit(before)) or (after and contains_unit(after)):
            results.append([match.group(), after, before])

    return results

def contains_unit(text, threshold=80):
    """Report whether ``text`` fuzzily matches any known unit spelling.

    ``text`` is lower-cased and trimmed, then compared with the spellings
    stored for each entry of the module-level ``all_variations`` table using
    ``process.extractOne``.

    Args:
        text: Short snippet of OCR text taken from next to a number.
        threshold: Minimum match score (as returned by ``extractOne``) for a
            spelling to count as a hit. Defaults to 80.

    Returns:
        ``True`` as soon as one spelling scores at least ``threshold``,
        otherwise ``False``. Blank text is never a unit.
    """
    # Normalize the text for fuzzy matching
    text = text.lower().strip()
    if not text:
        return False

    for variations in all_variations.values():
        # extractOne iterates its choices; a bare string would be compared
        # character by character, so wrap it into a one-element list.
        choices = [variations] if isinstance(variations, str) else list(variations)
        if not choices:
            continue

        best = process.extractOne(text, choices)
        # Guard against a None result before unpacking the score.
        if best is not None and best[1] >= threshold:
            return True

    return False


# In [9]:
def process_csv(input_csv_path, output_csv_path, column='pytesseract_output'):
    """Annotate a CSV of OCR text with the numbers that carry a unit.

    The file at ``input_csv_path`` is loaded, ``extract_numbers_with_units``
    is applied to every text cell of ``column``, the results are stored in a
    new ``output_numbers`` column, and the frame is written to
    ``output_csv_path`` without the index.

    Args:
        input_csv_path: Path of the CSV file to read.
        output_csv_path: Path of the CSV file to write.
        column: Name of the column holding the OCR text. Defaults to
            ``'pytesseract_output'``.

    Raises:
        KeyError: If ``column`` is not present in the input file.
    """
    # Read the input CSV
    df = pd.read_csv(input_csv_path)

    def _extract_if_text(cell):
        # Empty cells are loaded as NaN (a float), not as a string; there is
        # nothing to extract from them.
        if not isinstance(cell, str):
            return []
        return extract_numbers_with_units(cell)

    # Apply the extraction to each row of the OCR text column
    df['output_numbers'] = df[column].apply(_extract_if_text)

    # Write the modified DataFrame to a new CSV file
    df.to_csv(output_csv_path, index=False)

# Example usage
input_csv_path = 'tesseract_test_ocr.csv'  # Replace with your input file path
output_csv_path = 'output.csv'  # The output file path

# Run the conversion only when this file is executed directly (a script run
# or a notebook cell, where __name__ is "__main__"), so that importing it
# from other code does not read or write any files.
if __name__ == "__main__":
    process_csv(input_csv_path, output_csv_path)