In [12]:
import pandas as pd

In [13]:
import re

# Canonical (fully spelled-out) unit names, keyed by entity name.
valid_dict = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Every unit spelling per entity: the canonical names above plus their
# abbreviations (all lower-case).
entity_unit_dict = {
    'width': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'depth': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'height': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'item_weight': {'gram', 'gm', 'g', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton', 't'},
    'maximum_weight_recommendation': {'gram', 'g', 'gm', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton', 't'},
    'voltage': {'kilovolt', 'kv', 'millivolt', 'mv', 'volt', 'v'},
    'wattage': {'kilowatt', 'kw', 'watt', 'w'},
    'item_volume': {'centilitre', 'cl', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup', 'decilitre', 'dl', 'fluid ounce', 'fl oz', 'gallon', 'gal', 'imperial gallon', 'imp gal', 'litre', 'l', 'microlitre', 'µl', 'millilitre', 'ml', 'pint', 'pt', 'quart', 'qt'}
}

# Abbreviation -> canonical unit name. Every abbreviation that appears in
# entity_unit_dict needs an entry here, otherwise it is left un-normalised.
correct_units = {
    'cm': 'centimetre',
    'm': 'metre',
    'mm': 'millimetre',
    'ft': 'foot',
    'in': 'inch',
    'yd': 'yard',
    'g': 'gram',
    # 'gm' is listed as a weight spelling in entity_unit_dict, so it must map
    # to its canonical name as well.
    'gm': 'gram',
    'kg': 'kilogram',
    'lb': 'pound',
    'oz': 'ounce',
    'mg': 'milligram',
    'µg': 'microgram',
    't': 'ton',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'v': 'volt',
    'w': 'watt',
    'kw': 'kilowatt',
    'l': 'litre',
    'ml': 'millilitre',
    'µl': 'microlitre',
    'cl': 'centilitre',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'pt': 'pint',
    'qt': 'quart',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'ft³': 'cubic foot',
    'in³': 'cubic inch'
}

def extract_numeric_value_with_unit(sentence, entity_name):
    """Extract the first "<number> <unit>" measurement for an entity from text.

    The text is searched (case-insensitively) for a decimal number immediately
    followed, after optional whitespace, by one of the unit spellings listed
    for ``entity_name`` in ``entity_unit_dict``. The matched unit is
    lower-cased and mapped to its canonical name through ``correct_units``.

    Args:
        sentence: Text to search.
        entity_name: Key into ``entity_unit_dict`` selecting which unit
            spellings to look for.

    Returns:
        ``"<number> <canonical unit>"`` for the first match, or ``''`` when
        ``entity_name`` has no known units or nothing in the text matches.
    """
    units = entity_unit_dict.get(entity_name, set())
    if not units:
        # With no units the capture group would be empty, the pattern would
        # match any bare number, and the result would be a unit-less
        # "<number> " string.
        return ''

    # Regex alternation takes the first alternative that matches and set
    # iteration order is arbitrary, so try longer spellings first; otherwise
    # "5 mm" can be read as "5 m". The secondary sort key keeps the pattern
    # identical from run to run.
    ordered_units = sorted(units, key=lambda unit: (-len(unit), unit))
    units_pattern = '|'.join(re.escape(unit) for unit in ordered_units)
    regex_pattern = rf'(\d+(?:\.\d+)?)\s*({units_pattern})'
    match = re.search(regex_pattern, sentence, re.IGNORECASE)
    if not match:
        return ''

    numeric_value = match.group(1)
    unit = match.group(2).lower()
    # Abbreviations become canonical names; canonical names pass through.
    unit = correct_units.get(unit, unit)
    return numeric_value + " " + unit

In [14]:
# Load the test CSV into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer='/content/test.csv')

In [15]:
# Display the (rows, columns) size of the loaded frame.
df.shape

(131187, 6)

In [16]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,index,image_link,group_id,entity_name,filename,paddleocr
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,110EibNyclL.jpg,2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,11gHj8dhhrL.jpg,"Size Width Length One Size 10.50cm/4.13"" 90cm/..."


In [17]:
def _predict_for_row(row):
    """Run the unit extractor on one row, passing '' for missing cells."""
    ocr_text = row['paddleocr']
    sentence = '' if pd.isnull(ocr_text) else str(ocr_text)
    entity = row['entity_name']
    entity_name = '' if pd.isnull(entity) else str(entity)
    return extract_numeric_value_with_unit(sentence, entity_name)


# One prediction string per row, stored alongside the inputs.
df['prediction'] = df.apply(_predict_for_row, axis=1)

In [18]:
# Keep only the row identifier and its prediction, then write them to CSV
# without the pandas row index.
output_columns = ['index', 'prediction']
df = df.loc[:, output_columns]
df.to_csv(path_or_buf='test_prediction.csv', index=False)