In [1]:
import re
import pandas as pd

# Accepted spellings for every unit of mass. Both 'item_weight' and
# 'maximum_weight_recommendation' are assembled from this table.
_MASS_SPELLINGS = {
    'gram': ('g', 'gm', 'G', 'GM', 'gram'),
    'milligram': ('mg', 'mG', 'Milligram', 'milligram'),
    'kilogram': ('kg', 'KG', 'Kilogram', 'kilogram'),
    'ounce': ('oz', 'OZ', 'Ounce', 'ounce'),
    'pound': ('lb', 'LB', 'Pound', 'pound'),
    'ton': ('t', 'T', 'Ton', 'ton'),
    'microgram': ('µg', 'mcg', 'Microgram', 'microgram'),
}

# Accepted spellings for every unit of length, shared by 'height', 'depth'
# and 'width'.
_LENGTH_SPELLINGS = {
    'centimetre': ('cm', 'CM', 'Centimetre', 'centimetre'),
    'millimetre': ('mm', 'MM', 'Millimetre', 'millimetre'),
    'inch': ('in', 'IN', 'Inch', 'inch'),
    'metre': ('m', 'M', 'Metre', 'metre'),
    'foot': ('ft', 'FT', 'Foot', 'foot'),
}


def _select(table, *unit_names):
    """Return a new ``{unit: [spellings]}`` dict for *unit_names*, in that order.

    Each spelling list is a fresh ``list`` so categories never share
    mutable state with one another.
    """
    return {name: list(table[name]) for name in unit_names}


# Category -> canonical unit name -> list of spellings recognised for it.
unit_abbreviations = {
    'item_weight': _select(
        _MASS_SPELLINGS,
        'gram', 'milligram', 'kilogram', 'ounce', 'pound', 'ton', 'microgram',
    ),
    'item_volume': {
        'cup': ['cup', 'Cup'],
        'gallon': ['gal', 'Gallon', 'gallon'],
        'ounce': ['oz', 'OZ', 'Ounce', 'ounce'],
        'millilitre': ['ml', 'mL', 'Millilitre', 'millilitre'],
        'cubic': ['cubic', 'Cubic'],
        'fluid': ['fluid', 'Fluid'],
        'decilitre': ['dl', 'dL', 'Decilitre', 'decilitre'],
        'litre': ['l', 'L', 'Litre', 'litre'],
        'quart': ['qt', 'Quart', 'quart'],
        'pint': ['pt', 'Pint', 'pint'],
        'centilitre': ['cl', 'cL', 'Centilitre', 'centilitre'],
    },
    'voltage': {
        'volt': ['v', 'V', 'Volt', 'volt'],
    },
    'wattage': {
        'watt': ['w', 'W', 'Watt', 'watt'],
        'horsepower': ['hp', 'HP', 'Horsepower', 'horsepower'],
        'kilowatt': ['kw', 'KW', 'Kilowatt', 'kilowatt'],
        'milliampere': ['mA', 'MA', 'Milliampere', 'milliampere'],
    },
    'maximum_weight_recommendation': _select(
        _MASS_SPELLINGS,
        'kilogram', 'pound', 'ounce', 'ton', 'milligram', 'gram', 'microgram',
    ),
    'height': _select(
        _LENGTH_SPELLINGS,
        'centimetre', 'millimetre', 'inch', 'metre', 'foot',
    ),
    'depth': _select(
        _LENGTH_SPELLINGS,
        'centimetre', 'millimetre', 'inch', 'metre', 'foot',
    ),
    'width': _select(
        _LENGTH_SPELLINGS,
        'millimetre', 'centimetre', 'inch', 'foot', 'metre',
    ),
}

# Invert the table: each spelling, lower-cased, points at its canonical unit
# name. Categories are merged into one flat lookup; a spelling that appears in
# several categories simply keeps the value written last.
unit_map = {}
for units_in_category in unit_abbreviations.values():
    for canonical_name, spellings in units_in_category.items():
        unit_map.update(
            (spelling.lower(), canonical_name) for spelling in spellings
        )

# Matches a quantity followed by a candidate unit word:
#   group 1 - the number: digits with an optional fractional part ("500",
#             "2.5"), or a bare leading-dot decimal (".5");
#   then an optional stray full stop and optional whitespace, so "500. g",
#   "500g" and "500 g" all match without the dot leaking into the number;
#   group 2 - a run of letters (ASCII, micro sign U+00B5 or Greek mu U+03BC)
#             that is looked up as the unit.
pattern = re.compile(
    r'(\d+(?:\.\d+)?|\.\d+)\.?\s*([a-zA-Z\u00b5\u03bc]+)',
    re.IGNORECASE,
)


def _canonical_unit(word):
    """Return the canonical unit name for *word*, or ``None`` if unrecognised.

    The lookup is case-insensitive. If the word itself is not a key of
    ``unit_map``, a plain plural is tried ("lbs" -> "lb", "inches" -> "inch");
    the stem must keep at least two characters so that short words such as
    "vs" or "ms" are not mistaken for "v" or "m".
    """
    lowered = word.lower()
    candidates = [lowered]
    if lowered.endswith('s') and len(lowered) > 2:
        candidates.append(lowered[:-1])
    if lowered.endswith('es') and len(lowered) > 3:
        candidates.append(lowered[:-2])

    for candidate in candidates:
        # The "micro" prefix may be written with either the micro sign or the
        # Greek small letter mu; try the spelling as found, then with the mu
        # replaced by the micro sign.
        for key in (candidate, candidate.replace('\u03bc', '\u00b5')):
            if key in unit_map:
                return unit_map[key]
    return None


def extract_units(text):
    """Extract every "<number> <unit>" quantity found in *text*.

    Args:
        text: Free text (e.g. OCR output) to scan. Anything that is not a
            ``str`` is treated as containing no quantities.

    Returns:
        A single string of ``"<number> <canonical unit>"`` items joined by
        spaces, in the order they appear in *text*; ``''`` when nothing is
        recognised. Words following a number that are not known units are
        skipped.
    """
    if not isinstance(text, str):
        return ''

    found = []
    for number, word in pattern.findall(text):
        full_unit = _canonical_unit(word)
        if full_unit is None:
            continue
        # Normalise ".5" to "0.5" so the decimal point is never lost.
        if number.startswith('.'):
            number = '0' + number
        found.append(f"{number} {full_unit}")
    return ' '.join(found)

# Load the sample rows. read_csv already returns a DataFrame, so no re-wrap is
# needed; work on an explicit copy so `data` stays exactly as loaded.
data = pd.read_csv('updated_sample_test_df.csv')
df = data.copy()

# Coerce every OCR entry to text. Missing values become '' instead of the
# literal string "nan" that a plain astype(str) would produce.
df['ocr_text'] = df['ocr_text'].map(lambda value: '' if pd.isna(value) else str(value))

# Apply the function to create a new column 'extracted_units'
df['extracted_units'] = df['ocr_text'].apply(extract_units)

# Add 'ocr_text' to the column list below to inspect the raw OCR text as well.
print(df[['image_link', 'entity_name', 'entity_value', 'extracted_units']])

                                            image_link  entity_name  \
0    https://m.media-amazon.com/images/I/61I9XdN6OF...  item_weight   
1    https://m.media-amazon.com/images/I/71gSRbyXmo...  item_volume   
2    https://m.media-amazon.com/images/I/61BZ4zrjZX...  item_weight   
3    https://m.media-amazon.com/images/I/612mrlqiI4...  item_weight   
4    https://m.media-amazon.com/images/I/617Tl40LOX...  item_weight   
..                                                 ...          ...   
495  https://m.media-amazon.com/images/I/7179JK26-1...  item_weight   
496  https://m.media-amazon.com/images/I/719skH5g3D...  item_weight   
497  https://m.media-amazon.com/images/I/81L5nXqWeL...  item_weight   
498  https://m.media-amazon.com/images/I/61m-jQu+Za...  item_weight   
499  https://m.media-amazon.com/images/I/71yqw2dcvC...  item_weight   

           entity_value                                    extracted_units  
0            500.0 gram                                     0 ton 500 