In [1]:
import pandas as pd
# Load the caption CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
test_df = pd.read_csv("../dataset/test_captions.csv")

In [2]:
# Show the loaded frame as this cell's output.
test_df

Unnamed: 0,index,image_link,group_id,entity_name,generated_caption
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,2.65cm
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,The extracted values and units from the text f...
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,21.87cm
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,"The extracted value and unit for the entity ""d..."
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,10.5 centimetres
...,...,...,...,...,...
4995,5005,https://m.media-amazon.com/images/I/41BbYD0xsq...,478357,height,75cm
4996,5006,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,width,140 centimetre
4997,5007,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,depth,140 centimetre
4998,5008,https://m.media-amazon.com/images/I/41BcdB4bYk...,142748,width,"The extracted value and unit for the entity ""w..."


In [3]:
import re

# Canonical unit names accepted for each entity. Entities that share a set
# of units are built from one template list; every entity still gets its
# own list object.
_LENGTH_UNITS = ["centimetre", "foot", "inch", "metre", "millimetre", "yard"]
_WEIGHT_UNITS = ["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"]

entity_unit_map = {
    **{name: list(_LENGTH_UNITS) for name in ("width", "depth", "height")},
    **{name: list(_WEIGHT_UNITS) for name in ("item_weight", "maximum_weight_recommendation")},
    "voltage": ["millivolt", "kilovolt", "volt"],
    "wattage": ["kilowatt", "watt"],
    "item_volume": [
        "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
        "imperial gallon", "pint", "decilitre", "litre", "millilitre",
        "quart", "cubic inch", "gallon",
    ],
}

# Abbreviations and alternate spellings, per entity, mapped to the canonical
# unit name. Insertion order is kept exactly as before because
# extract_value_and_unit builds its regex alternation from these keys.
_LENGTH_ABBREVIATIONS = {
    "cm": "centimetre",
    "mm": "millimetre",
    "m": "metre",
    "feet": "foot",
    "ft": "foot",
    "in": "inch",
    "yd": "yard",
}
_WEIGHT_ABBREVIATIONS = {
    "mg": "milligram",
    "kg": "kilogram",
    "g": "gram",
    "lb": "pound",
    "oz": "ounce",
    "t": "ton",
}

unit_variations_by_entity = {
    "width": dict(_LENGTH_ABBREVIATIONS),
    "depth": dict(_LENGTH_ABBREVIATIONS),
    "height": dict(_LENGTH_ABBREVIATIONS),
    "item_weight": dict(_WEIGHT_ABBREVIATIONS),
    "maximum_weight_recommendation": dict(_WEIGHT_ABBREVIATIONS),
    "voltage": {"mv": "millivolt", "kv": "kilovolt", "v": "volt", "voltage": "volt"},
    "wattage": {"kw": "kilowatt", "w": "watt", "wattage": "watt"},
    "item_volume": {
        "ml": "millilitre",
        "l": "litre",
        "cl": "centilitre",
        "dl": "decilitre",
        "oz": "fluid ounce",
        "pt": "pint",
        "qt": "quart",
        "gal": "gallon",
        "cup": "cup",
        "cu ft": "cubic foot",
        "cu in": "cubic inch",
    },
}

def normalize_unit(entity_name, unit):
    """Map a unit spelling to the canonical unit name for ``entity_name``.

    The spelling is trimmed and lower-cased, then looked up as written and
    in singular form (an "es" ending after ch/sh/s/x/z, or trailing "s").
    The first form that is a key of ``unit_variations_by_entity[entity_name]``
    or an entry of ``entity_unit_map[entity_name]`` wins.

    Args:
        entity_name: Entity whose abbreviation table and unit list are used.
        unit: Unit text, e.g. ``"CM"``, ``"centimetres"`` or ``"inches"``.

    Returns:
        The canonical unit name when one is recognised; otherwise the
        lower-cased text with trailing "s" characters removed.
    """
    variations = unit_variations_by_entity.get(entity_name, {})
    allowed_units = entity_unit_map.get(entity_name, [])
    cleaned = unit.strip().lower()

    # Candidate spellings, most literal first.
    candidates = [cleaned]
    if cleaned.endswith("es") and cleaned[:-2].endswith(("ch", "sh", "s", "x", "z")):
        # "inches" -> "inch"; stripping only the "s" would leave "inche".
        candidates.append(cleaned[:-2])
    candidates.append(cleaned.rstrip("s"))

    for candidate in candidates:
        if candidate in variations:
            return variations[candidate]
        if candidate in allowed_units:
            return candidate

    # Nothing recognised: hand back the de-pluralised text so the caller can
    # reject it against its own list of allowed units.
    return cleaned.rstrip("s")

def extract_value_and_unit(entity_name, generated_caption):
    """Return the first ``"<number> <unit>"`` found in a caption for an entity.

    Args:
        entity_name: Key into ``entity_unit_map`` and
            ``unit_variations_by_entity``.
        generated_caption: Text to search. Anything that is not a ``str``
            (for example the NaN pandas stores for an empty CSV cell) is
            treated as containing no value.

    Returns:
        ``"<value> <canonical unit>"`` for the first number that is directly
        followed by a known unit spelling, or ``""`` when nothing matches or
        the normalised unit is not allowed for the entity.
    """
    if not isinstance(generated_caption, str):
        return ""

    allowed_units = entity_unit_map.get(entity_name, [])
    abbreviations = unit_variations_by_entity.get(entity_name, {})

    # Regex alternation takes the first alternative that matches, so try the
    # longest spellings first; otherwise a short token hides any longer token
    # it is a prefix of. dict.fromkeys drops duplicates such as "cup".
    tokens = sorted(dict.fromkeys([*allowed_units, *abbreviations]), key=len, reverse=True)
    if not tokens:
        # Unknown entity: an empty alternation would match every number.
        return ""

    unit_pattern = "|".join(re.escape(token) for token in tokens)
    # NOTE(review): nothing requires a word boundary after the unit, so a
    # one-letter abbreviation can match the start of an ordinary word
    # (e.g. "2 to 3 kg" yields "2 ton" for weight entities).
    pattern = fr"([-+]?\d*\.?\d+)\s*({unit_pattern})"

    match = re.search(pattern, generated_caption, re.IGNORECASE)
    if match is None:
        return ""

    value = match.group(1)
    unit = normalize_unit(entity_name, match.group(2))
    if unit in allowed_units:
        return f"{value} {unit}"
    return ""

def add_entity_value(df):
    """Add an ``entity_value`` column parsed from each row's caption.

    Every row's ``entity_name`` and ``generated_caption`` are passed to
    ``extract_value_and_unit`` and the result is stored in ``entity_value``.

    Args:
        df: DataFrame with ``entity_name`` and ``generated_caption`` columns.

    Returns:
        The same ``df`` object. The column is assigned on the frame that was
        passed in; no copy is made.
    """
    # Iterate the two columns directly: this avoids the per-row Series that
    # apply(axis=1) builds and still assigns a (zero-length) column when the
    # frame has no rows, where a row-wise apply does not return a Series.
    df['entity_value'] = [
        extract_value_and_unit(entity_name, caption)
        for entity_name, caption in zip(df['entity_name'], df['generated_caption'])
    ]
    return df


In [4]:
import pandas as pd
# Small hand-written frame with the same column layout the extraction
# helpers read; one tuple per row.
df = pd.DataFrame(
    [
        (0, 'link1', 'group1', 'height', "2.65cm"),
        (1, 'link2', 'group1', 'width',
         'The extracted values and units from the text for the entity "width" are: - 2300 feet'),
        (2, 'link3', 'group2', 'height', "21.87cm"),
        (3, 'link4', 'group2', 'depth',
         'The extracted value and unit for the entity "depth" are "0.1526 metre".'),
        (4, 'link5', 'group3', 'item_volume', "5 oz of liquid"),
        (5, 'link6', 'group3', 'item_weight', "idk the amount"),
    ],
    columns=['index', 'image_link', 'group_id', 'entity_name', 'generated_caption'],
)

# NOTE(review): the sample frame `df` built above is not used here; the call
# annotates `test_df`, which also gains the 'entity_value' column in place.
df_with_entity_value = add_entity_value(test_df)

In [8]:
# Show the annotated frame (captions plus the extracted 'entity_value').
df_with_entity_value

Unnamed: 0,index,image_link,group_id,entity_name,generated_caption,entity_value
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,2.65cm,2.65 centimetre
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,The extracted values and units from the text f...,2300 foot
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,21.87cm,21.87 centimetre
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,"The extracted value and unit for the entity ""d...",0.1526 metre
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,10.5 centimetres,10.5 centimetre
...,...,...,...,...,...,...
4995,5005,https://m.media-amazon.com/images/I/41BbYD0xsq...,478357,height,75cm,75 centimetre
4996,5006,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,width,140 centimetre,140 centimetre
4997,5007,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,depth,140 centimetre,140 centimetre
4998,5008,https://m.media-amazon.com/images/I/41BcdB4bYk...,142748,width,"The extracted value and unit for the entity ""w...",300 centimetre


In [None]:
# Inspect test_df. add_entity_value assigns 'entity_value' on the frame it is
# given, so the column shows up here once test_df has been passed through it.
test_df

In [6]:
# Write the annotated captions to disk. test_df only carries 'entity_value'
# because add_entity_value assigned the column on the frame it received, so
# this cell has to run after that call.
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra leading column — confirm that is intended.
test_df.to_csv('../dataset/cleaned_test_captions.csv')

In [11]:
import re
import pandas as pd
# Re-read the caption CSV, rebinding test_df to a fresh frame (any
# 'entity_value' column assigned to the previous object is not carried over).
test_df = pd.read_csv("../dataset/test_captions.csv")

# Entity to unit mapping: the canonical unit names accepted for each entity.
entity_unit_map = dict(
    width=["centimetre", "foot", "inch", "metre", "millimetre", "yard"],
    depth=["centimetre", "foot", "inch", "metre", "millimetre", "yard"],
    height=["centimetre", "foot", "inch", "metre", "millimetre", "yard"],
    item_weight=["milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"],
    maximum_weight_recommendation=[
        "milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound",
    ],
    voltage=["millivolt", "kilovolt", "volt"],
    wattage=["kilowatt", "watt"],
    item_volume=[
        "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
        "imperial gallon", "pint", "decilitre", "litre", "millilitre",
        "quart", "cubic inch", "gallon",
    ],
)

# Mapping of common variations and abbreviations for each entity.
#
# Key order is significant for code that turns these keys into a regex
# alternation in insertion order: the first alternative that matches wins, so
# the two-character inch mark "''" is listed before the one-character foot
# mark "'"; the other way round, "5''" would be read as feet.
_LENGTH_VARIATIONS = {
    "cm": "centimetre",
    "mm": "millimetre",
    "m": "metre",
    "feet": "foot",
    "ft": "foot",
    "in": "inch",
    "''": "inch",
    "'": "foot",
    "yd": "yard",
}
_WEIGHT_VARIATIONS = {
    "mg": "milligram",
    "kg": "kilogram",
    "g": "gram",
    "lb": "pound",
    "oz": "ounce",
    "t": "ton",
}

unit_variations_by_entity = {
    "width": dict(_LENGTH_VARIATIONS),
    "depth": dict(_LENGTH_VARIATIONS),
    "height": dict(_LENGTH_VARIATIONS),
    "item_weight": dict(_WEIGHT_VARIATIONS),
    "maximum_weight_recommendation": dict(_WEIGHT_VARIATIONS),
    "voltage": {"mv": "millivolt", "kv": "kilovolt", "v": "volt", "voltage": "volt"},
    "wattage": {"kw": "kilowatt", "w": "watt", "wattage": "watt"},
    "item_volume": {"ml": "millilitre", "l": "litre", "cl": "centilitre", "dl": "decilitre", "oz": "fluid ounce",
                    "pt": "pint", "qt": "quart", "gal": "gallon", "cup": "cup", "cu ft": "cubic foot",
                    "cu in": "cubic inch"},
}

def normalize_unit(entity_name, unit):
    """Map a unit spelling to the canonical unit name for ``entity_name``.

    The spelling is trimmed and lower-cased, then looked up as written and
    in singular form (an "es" ending after ch/sh/s/x/z, or trailing "s").
    The first form that is a key of ``unit_variations_by_entity[entity_name]``
    or an entry of ``entity_unit_map[entity_name]`` wins.

    Args:
        entity_name: Entity whose abbreviation table and unit list are used.
        unit: Unit text, e.g. ``"CM"``, ``"centimetres"`` or ``"inches"``.

    Returns:
        The canonical unit name when one is recognised; otherwise the
        lower-cased text with trailing "s" characters removed.
    """
    variations = unit_variations_by_entity.get(entity_name, {})
    allowed_units = entity_unit_map.get(entity_name, [])
    cleaned = unit.strip().lower()

    # Candidate spellings, most literal first.
    candidates = [cleaned]
    if cleaned.endswith("es") and cleaned[:-2].endswith(("ch", "sh", "s", "x", "z")):
        # "inches" -> "inch"; stripping only the "s" would leave "inche".
        candidates.append(cleaned[:-2])
    candidates.append(cleaned.rstrip("s"))  # Remove plural 's'

    for candidate in candidates:
        if candidate in variations:
            return variations[candidate]  # Map based on entity
        if candidate in allowed_units:
            return candidate

    # Nothing recognised: hand back the de-pluralised text so the caller can
    # reject it against its own list of allowed units.
    return cleaned.rstrip("s")

def extract_value_and_unit(entity_name, generated_caption):
    """Return the first ``"<number> <unit>"`` found in a caption for an entity.

    Args:
        entity_name: Key into ``entity_unit_map`` and
            ``unit_variations_by_entity``.
        generated_caption: Text to search. Anything that is not a ``str``
            (for example the NaN pandas stores for an empty CSV cell) is
            treated as containing no value.

    Returns:
        ``"<value> <canonical unit>"`` for the first number that is directly
        followed by a known unit spelling, or ``""`` when nothing matches or
        the normalised unit is not allowed for the entity.
    """
    if not isinstance(generated_caption, str):
        return ""

    allowed_units = entity_unit_map.get(entity_name, [])
    abbreviations = unit_variations_by_entity.get(entity_name, {})

    # Regex alternation takes the first alternative that matches. Trying the
    # longest spellings first keeps a short token from hiding a longer one it
    # is a prefix of — in particular "'" (foot) must not pre-empt "''" (inch).
    # dict.fromkeys drops duplicates such as "cup".
    tokens = sorted(dict.fromkeys([*allowed_units, *abbreviations]), key=len, reverse=True)
    if not tokens:
        # Unknown entity: an empty alternation would match every number.
        return ""

    unit_pattern = "|".join(re.escape(token) for token in tokens)
    # NOTE(review): nothing requires a word boundary after the unit, so a
    # one-letter abbreviation can match the start of an ordinary word
    # (e.g. "2 to 3 kg" yields "2 ton" for weight entities).
    pattern = fr"([-+]?\d*\.?\d+)\s*({unit_pattern})"

    match = re.search(pattern, generated_caption, re.IGNORECASE)
    if match is None:
        return ""

    value = match.group(1)
    unit = normalize_unit(entity_name, match.group(2))
    if unit in allowed_units:  # Ensure the normalized unit is valid for the entity
        return f"{value} {unit}"
    return ""

def add_entity_value(df):
    """Add an ``entity_value`` column parsed from each row's caption.

    Every row's ``entity_name`` and ``generated_caption`` are passed to
    ``extract_value_and_unit`` and the result is stored in ``entity_value``.

    Args:
        df: DataFrame with ``entity_name`` and ``generated_caption`` columns.

    Returns:
        The same ``df`` object. The column is assigned on the frame that was
        passed in; no copy is made.
    """
    names = df['entity_name']
    captions = df['generated_caption']
    # Walking the two columns in step avoids building a Series per row (as
    # apply(axis=1) does) and still yields a zero-length column for a frame
    # with no rows, where a row-wise apply does not return a Series.
    df['entity_value'] = [
        extract_value_and_unit(name, caption) for name, caption in zip(names, captions)
    ]
    return df


# Annotate the freshly loaded captions; add_entity_value assigns the column on
# test_df and returns that same frame, so both names refer to one object.
df_with_entity_value = add_entity_value(test_df)
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra leading column — confirm that is intended.
df_with_entity_value.to_csv('../dataset/cleaned_test_captions.csv')
# Last expression of the cell: shown as the cell output.
df_with_entity_value

Unnamed: 0,index,image_link,group_id,entity_name,generated_caption,entity_value
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,2.65cm,2.65 centimetre
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,The extracted values and units from the text f...,2300 foot
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,21.87cm,21.87 centimetre
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,"The extracted value and unit for the entity ""d...",0.1526 metre
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,10.5 centimetres,10.5 centimetre
...,...,...,...,...,...,...
4995,5005,https://m.media-amazon.com/images/I/41BbYD0xsq...,478357,height,75cm,75 centimetre
4996,5006,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,width,140 centimetre,140 centimetre
4997,5007,https://m.media-amazon.com/images/I/41Bc7Zl3tI...,312608,depth,140 centimetre,140 centimetre
4998,5008,https://m.media-amazon.com/images/I/41BcdB4bYk...,142748,width,"The extracted value and unit for the entity ""w...",300 centimetre


In [13]:
import pandas as pd
import os

def concatenate_csv_files(directory, file_order, output_file='concatenated_output.csv'):
    """Stack several CSV files, in the given order, into one CSV file.

    Args:
        directory: Folder that contains the input files.
        file_order: File names (relative to ``directory``) in the order their
            rows should appear in the output.
        output_file: Path of the CSV to write. Defaults to
            ``'concatenated_output.csv'`` relative to the current working
            directory.

    Returns:
        None. Progress, warnings and errors are reported with ``print``.
        Names that do not resolve to a regular file are skipped with a
        warning; if none can be read, nothing is written.
    """
    frames = []
    for filename in file_order:
        file_path = os.path.join(directory, filename)
        # isfile rather than exists: a directory that happens to carry the
        # expected name cannot be parsed as CSV.
        if os.path.isfile(file_path):
            frames.append(pd.read_csv(file_path))
        else:
            print(f"Warning: File {filename} not found in the directory.")

    if not frames:
        print("Error: No valid CSV files were found to concatenate.")
        return

    # ignore_index renumbers the rows 0..n-1 across all inputs; the index is
    # then left out of the written file.
    result = pd.concat(frames, ignore_index=True)
    result.to_csv(output_file, index=False)
    print(f"Concatenation complete. Output saved as '{output_file}'")

# Folder holding the per-range caption files.
directory = '../dataset/'

# Caption files cover consecutive 15000-row ranges from 30000 up to 90000;
# list them in ascending order so the concatenated rows keep that order.
file_order = [
    f'test_captions_{start}_{start + 15000}.csv'
    for start in range(30000, 90000, 15000)
]

concatenate_csv_files(directory, file_order)

Concatenation complete. Output saved as 'concatenated_output.csv'
