In [3]:
import os
import pandas as pd

In [4]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))

In [5]:
def findUnits(text):
    import re
    pattern = r"(\d+(?:\.\d+)?)\s*([A-Za-z]+)"
    matches = re.findall(pattern, text)
    return matches

In [23]:
def extractedEntityValue(entity_name, unit_list):
    # Unit normalization for more robust handling
    def normalize_unit(entity, unit):
        unit = unit.lower()
        
        if entity in ['voltage']:
            if unit in ['v', 'volts', 'volt']:
                return 'volt'
            if unit in ['kv', 'kilovolt']:
                return 'kilovolt'
            if unit in ['mv', 'millivolt']:
                return 'millivolt'
        
        elif entity in ['wattage']:
            if unit in ['w', 'watt', 'watts']:
                return 'watt'
            if unit in ['kw', 'kilowatt']:
                return 'kilowatt'
        
        elif entity in ['item_volume']:
            if unit in ['ml', 'millilitre', 'milliliters']:
                return 'ml'
            if unit in ['floz', 'fluid ounce', 'oz', 'ounce']:
                return 'floz'
            if unit in ['cut', 'cubic foot']:
                return 'cut'
        
        elif entity in ['depth', 'width', 'height']:
            if unit in ['mm', 'millimetre', 'millimeters']:
                return 'mm'
            if unit in ['cm', 'centimetre', 'centimeters']:
                return 'cm'
            if unit in ['m', 'metre', 'meters']:
                return 'm'
            if unit in ['in', 'inch', 'inches']:
                return 'in'
        
        elif entity in ['item_weight', 'maximum_weight_recommendation']:
            if unit in ['g', 'gm', 'gms', 'grams', 'gn', 'gsm']:
                return 'g'
            if unit in ['kg', 'kgs', 'kilogram', 'kilograms']:
                return 'kg'
            if unit in ['mg', 'milligram', 'milligrams']:
                return 'mg'
            if unit in ['oz', 'ounce', 'ounces']:
                return 'oz'
            if unit in ['lbs', 'pound', 'pounds']:
                return 'pound'
        
        return unit

    # Extract voltage
    if entity_name == 'voltage':
        volt_list = [float(x[0]) for x in unit_list if normalize_unit('voltage', x[1]) == 'volt']
        return f"{max(volt_list)} volt" if volt_list else ""

    # Extract wattage
    elif entity_name == 'wattage':
        watt_list = [float(x[0]) for x in unit_list if normalize_unit('wattage', x[1]) == 'watt']
        return f"{max(watt_list)} watt" if watt_list else ""

    # Extract item volume
    elif entity_name == 'item_volume':
        volume_list = [(float(x[0]), normalize_unit('item_volume', x[1])) for x in unit_list]
        if volume_list:
            max_value, unit = max(volume_list)
            unit_name = {'ml': 'millilitre', 'floz': 'fluid ounce', 'cut': 'cubic foot'}.get(unit, 'ounce')
            return f"{max_value} {unit_name}"
        return ""

    # Extract dimensions (depth, width, height)
    elif entity_name in ['depth', 'width', 'height']:
        import random
        dimension_list = [(float(x[0]), normalize_unit(entity_name, x[1])) for x in unit_list]
        if dimension_list:
            idx = random.randint(0, len(dimension_list) - 1)
            max_value, unit = dimension_list[idx]
            unit_name = {'mm': 'millimeter', 'cm': 'centimeter', 'm': 'meter', 'in': 'inch'}.get(unit)
            return f"{max_value} {unit_name}"
        return ""

    # Extract item weight or maximum weight recommendation
    elif entity_name in ['item_weight', 'maximum_weight_recommendation']:
        conversion_factors = {
            "mg": 1 / 1000,  # Convert mg to grams
            "g": 1,          # grams as base unit
            "kg": 1000,      # Convert kg to grams
            "oz": 28.34,     # Convert oz to grams
            "pound": 453.62  # Convert pound to grams
        }

        weight_list = [(float(x[0]) * conversion_factors[normalize_unit('item_weight', x[1])], 
                        float(x[0]), normalize_unit('item_weight', x[1]))
                       for x in unit_list if normalize_unit('item_weight', x[1]) in conversion_factors]

        if weight_list:
            _, max_value, unit = max(weight_list)
            unit_name = {"mg": "milligram", "g": "gram", "kg": "kilogram", "oz": "ounce", "pound": "pound"}[unit]
            return f"{max_value} {unit_name}"
        return ""

    return ""


In [7]:
import pandas as pd
import glob

# List all CSV files in a directory
csv_files = glob.glob('../dataset/merged/*.csv')

# Read and concatenate all CSV files
df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Print the combined DataFrame
print(df)

            image_file                                           ocr_text
0      41CZmN2nxnS.jpg  5g Fiber Jo6ns 60 190Calories p.p.s.open here ...
1      41q0Vddrg7S.jpg                        3.3 in/8.5 cm 2.5 in/6.5 cm
2      41IvgHo9iUL.jpg                             2.3cm/0.9" 22.5cm/8.9"
3      41cgZ+EJcsL.jpg                            91.44cm 96.52cm 60.96cm
4      41cnxkr8VrL.jpg     13.39inch/340mm 10.63inch/270mm 4.72inch/120mm
...                ...                                                ...
90661  519XvU8UsgL.jpg                                    5cm 22cm 18.5cm
90662  513janpTTEL.jpg                                   45cm 15cm 30.5cm
90663  515SIsjMkEL.jpg                                                 8"
90664  519F04JTChL.jpg              Product Size 8.6CM SANGINE 8.6CM 4pcs
90665  517UxJ+xN7L.jpg  10G PROTEIN 10GPRO 106PROTEW OG PROTEN ZERO OG...

[90666 rows x 2 columns]


In [8]:
df.sample()

Unnamed: 0,image_file,ocr_text
80791,81nFVVw7uUS.jpg,akyga AKNB12A008346 Professional Power Supply ...


In [9]:
df['image_file'].apply(lambda x: f"'https://m.media-amazon.com/images/I/{x}.jpg'")

0        'https://m.media-amazon.com/images/I/41CZmN2nx...
1        'https://m.media-amazon.com/images/I/41q0Vddrg...
2        'https://m.media-amazon.com/images/I/41IvgHo9i...
3        'https://m.media-amazon.com/images/I/41cgZ+EJc...
4        'https://m.media-amazon.com/images/I/41cnxkr8V...
                               ...                        
90661    'https://m.media-amazon.com/images/I/519XvU8Us...
90662    'https://m.media-amazon.com/images/I/513janpTT...
90663    'https://m.media-amazon.com/images/I/515SIsjMk...
90664    'https://m.media-amazon.com/images/I/519F04JTC...
90665    'https://m.media-amazon.com/images/I/517UxJ+xN...
Name: image_file, Length: 90666, dtype: object

In [10]:
test.iloc[0]['image_link']

'https://m.media-amazon.com/images/I/110EibNyclL.jpg'

In [13]:
# Step 1: Extract the image filename from the image_link in test
test['image_file'] = test['image_link'].apply(lambda x: x.split('/')[-1])

# Step 2: Merge the df and test DataFrames on the image_file
merged_df = test.merge(df[['image_file', 'ocr_text']], on='image_file', how='left')

# Step 3: Display the shape and columns of the merged DataFrame
print(merged_df.shape)
print(merged_df.columns)

# # Step 4: Save the merged DataFrame to a CSV file (optional)
# merged_df.to_csv("merged_ocr_test_results.csv", index=False)


(131187, 6)
Index(['index', 'image_link', 'group_id', 'entity_name', 'image_file',
       'ocr_text'],
      dtype='object')


In [15]:
merged_df

Unnamed: 0,index,image_link,group_id,entity_name,image_file,ocr_text
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,110EibNyclL.jpg,2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,11TU2clswzL.jpg,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,11gHj8dhhrL.jpg,"Size Width Length One Size 10.50cm/4.13"" 90cm/..."
...,...,...,...,...,...,...
131182,131283,https://m.media-amazon.com/images/I/A1rVsIzEtk...,721522,maximum_weight_recommendation,A1rVsIzEtkL.jpg,"FOREMAN 1,500 LB weight capacity"
131183,131284,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,item_weight,A1rdvZ5zDdL.jpg,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...
131184,131285,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,maximum_weight_recommendation,A1rdvZ5zDdL.jpg,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...
131185,131286,https://m.media-amazon.com/images/I/A1tnTUPyr7...,853009,item_weight,A1tnTUPyr7L.jpg,DOGNAM JOSSIE HUMAN NAMI VIeToR 955123-1/567


In [16]:
# Step 1: Rename the final unit column to prediction (if applicable)
merged_df = merged_df.rename(columns={"final_unit": "prediction"})

# Step 2: Select only the index and prediction columns
# Note: If your index is already a DataFrame column, include it directly; if not, reset the index
# output_df = merged_df[["index", "prediction"]]

merged_df['detected_unit'] = merged_df.ocr_text.apply(lambda x:findUnits(str(x)))
merged_df['final_unit']=merged_df.apply(lambda x: extractedEntityValue(x['entity_name'],x['detected_unit']),axis=1)


In [25]:
merged_df['final_unit']=merged_df.apply(lambda x: extractedEntityValue(x['entity_name'],x['detected_unit']),axis=1)

In [27]:
merged_df = merged_df.rename(columns={"final_unit": "prediction"})
output_df = merged_df[["index", "prediction"]]

In [28]:
output_df.to_csv("text_out.csv", index=False)