In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the merged training table into the notebook-wide DataFrame `df`.
# NOTE(review): hard-coded absolute path (looks like a Colab location) —
# adjust it when running this notebook elsewhere.
df = pd.read_csv("/content/train_merged.csv")

In [None]:
# ---------------------------
# Complete Data Processing Script with F1 Score Calculation
# ---------------------------
import re
import pandas as pd
from sklearn.metrics import f1_score

# ---------------------------
# Part 1: Regex Extraction
# ---------------------------

# Function to build the pattern for a list of units
def build_pattern(units):
    """Return regex source text that matches a number followed by a unit.

    The units are ordered longest-first so a short unit is never preferred
    over a longer one, and each unit is passed through ``re.escape`` so it is
    matched literally.  The resulting pattern has two capture groups: the
    number (digits with an optional decimal part) and the unit.  A trailing
    word boundary follows the unit alternation.
    """
    longest_first = sorted(units, key=len, reverse=True)
    alternation = '|'.join(map(re.escape, longest_first))
    return r"(\d+\.?\d*)\s*(" + alternation + r")\b"

# Define units per entity
# Accepted spellings of weight units, grouped by the unit they denote.
weight_units = [
    spelling
    for spellings in (
        ('milligram', 'mg', 'miligram', 'milligrams', 'miligrammes'),
        ('gram', 'g', 'grams'),
        ('kilogram', 'kg'),
        ('microgram', 'µg'),
        ('ounce', 'oz'),
        ('pound', 'lb'),
        ('ton',),
    )
    for spelling in spellings
]

# Accepted spellings of length units.
length_units = [
    'millimetre', 'mm', 'millimeter',
    'centimetre', 'cm', 'centimeter',
    # Plain 'm' on purpose: build_pattern() runs every unit through
    # re.escape, so regex syntax written here (such as a lookahead) would be
    # matched as literal text and a bare "m" would never be found.  'mm' is
    # still not mistaken for 'm' because longer units are tried first and the
    # pattern requires a word boundary right after the unit.
    'metre', 'meter', 'm',
    'foot', 'ft',
    'inch', 'in',
    'yard', 'yd'
]

# Accepted spellings of voltage units, largest prefix first.
voltage_units = (
    ['kilovolt', 'kV', 'kv']
    + ['millivolt', 'mV', 'mv']
    + ['volt', 'V', 'v']
)

# Accepted spellings of power units.
wattage_units = ['kilowatt', 'kW', 'kw'] + ['watt', 'W', 'w']

# Accepted spellings of volume units, grouped by the unit they denote.
volume_units = [
    spelling
    for spellings in (
        ('centilitre', 'cl'),
        ('cubic foot', 'ft³'),
        ('cubic inch', 'in³'),
        ('cup',),
        ('decilitre', 'dl'),
        ('fluid ounce', 'fl oz'),
        ('gallon', 'imperial gallon'),
        ('litre', 'liter'),
        ('millilitre', 'ml', 'milliliter'),
        ('microlitre', 'microliter'),
        ('pint', 'quart'),
    )
    for spelling in spellings
]

# Define the extraction patterns for each entity type
# Each entity name is paired with the unit list that applies to it; the
# regex for every entity is then built from that list.
_entity_unit_lists = (
    ('item_weight', weight_units),
    ('depth', length_units),
    ('width', length_units),
    ('height', length_units),
    ('voltage', voltage_units),
    ('wattage', wattage_units),
    ('item_volume', volume_units),
    ('maximum_weight_recommendation', weight_units),
)

patterns = {
    entity: build_pattern(unit_list)
    for entity, unit_list in _entity_unit_lists
}

# Normalize unit names (ensure all keys are in lowercase)
# Canonical unit name -> every lowercase spelling that should map to it.
_canonical_aliases = {
    # Weight units
    'milligram': ('mg', 'milligram', 'miligram', 'milligrams', 'miligrammes'),
    'gram': ('g', 'gram', 'grams'),
    'kilogram': ('kg', 'kilogram'),
    'microgram': ('µg', 'microgram'),
    'ounce': ('oz', 'ounce'),
    'pound': ('lb', 'pound'),
    'ton': ('ton',),

    # Length units
    'centimetre': ('cm', 'centimetre', 'centimeter'),
    'millimetre': ('mm', 'millimetre', 'millimeter'),
    'metre': ('m', 'metre', 'meter'),
    'foot': ('foot', 'ft'),
    'inch': ('inch', 'in'),
    'yard': ('yard', 'yd'),

    # Voltage units
    'kilovolt': ('kv', 'kilovolt'),
    'millivolt': ('mv', 'millivolt'),
    'volt': ('v', 'volt'),

    # Wattage units
    'kilowatt': ('kw', 'kilowatt'),
    'watt': ('w', 'watt'),

    # Volume units
    'centilitre': ('cl', 'centilitre'),
    'decilitre': ('dl', 'decilitre'),
    'cubic foot': ('ft³', 'cubic foot'),
    'cubic inch': ('in³', 'cubic inch'),
    'cup': ('cup',),
    'fluid ounce': ('fluid ounce', 'fl oz'),
    'gallon': ('gallon',),
    'imperial gallon': ('imperial gallon',),
    'litre': ('litre', 'liter'),
    'millilitre': ('ml', 'millilitre', 'milliliter'),
    'microlitre': ('microlitre', 'microliter'),
    'pint': ('pint',),
    'quart': ('quart',),
}

# Lookup used to normalise unit spellings: lowercase spelling -> canonical name.
unit_mappings = {
    alias: canonical
    for canonical, aliases in _canonical_aliases.items()
    for alias in aliases
}

# Function to extract and normalize quantity and unit from text
def extract_quantities(row):
    """Extract every "<value> <unit>" mention relevant to the row's entity.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``'raw_text'`` and ``'entity_name'``.

    Returns:
        A list of strings such as ``"12.0 inch"`` in order of appearance in
        the text.  The value is formatted as a float and the unit is replaced
        by its canonical name from ``unit_mappings`` when one is known.  The
        list is empty when the text is not a string or when ``patterns`` has
        no entry for the entity.
    """
    text = row['raw_text']
    entity_name = row['entity_name']

    # Only real strings can be searched.  Testing the type first also covers
    # None/NaN and avoids calling pd.isna on list-like values, whose
    # element-wise result has no single truth value.
    if not isinstance(text, str):
        return []

    pattern = patterns.get(entity_name)
    if not pattern:
        return []

    extracted_values = []
    for value, unit in re.findall(pattern, text, re.IGNORECASE):
        try:
            value = float(value)
        except ValueError:
            # Skip anything that is not a valid float
            continue
        unit_key = unit.lower()
        normalized_unit = unit_mappings.get(unit_key, unit_key)
        extracted_values.append(f"{value} {normalized_unit}")
    return extracted_values

# ---------------------------
# Part 2: Highest Value Prediction
# ---------------------------

# Define conversion factors to base units
# Base units:
# - Weight: gram
# - Length: millimeter
# - Voltage: volt
# - Wattage: watt
# - Volume: millilitre

# Multipliers that turn one unit of each canonical unit into its base unit.
_weight_to_gram = {
    'milligram': 0.001,
    'gram': 1,
    'kilogram': 1000,
    'microgram': 0.000001,
    'ounce': 28.3495,
    'pound': 453.592,
    'ton': 1_000_000,
}

_length_to_millimetre = {
    'millimetre': 1,
    'centimetre': 10,
    'metre': 1000,
    'foot': 304.8,
    'inch': 25.4,
    'yard': 914.4,
}

_voltage_to_volt = {'millivolt': 0.001, 'volt': 1, 'kilovolt': 1000}

_wattage_to_watt = {'watt': 1, 'kilowatt': 1000}

_volume_to_millilitre = {
    'millilitre': 1,
    'centilitre': 10,
    'decilitre': 100,
    'litre': 1000,
    'microlitre': 0.001,
    'cubic inch': 16.3871,
    'cubic foot': 28316.8,
    'fluid ounce': 29.5735,
    'cup': 236.588,
    'pint': 473.176,
    'quart': 946.353,
    'gallon': 3785.41,
    'imperial gallon': 4546.09,
}

# Entity name -> {canonical unit: factor}.  Every entity gets its own copy of
# the table for its unit family.
conversion_factors = {
    'item_weight': dict(_weight_to_gram),
    'maximum_weight_recommendation': dict(_weight_to_gram),
    'depth': dict(_length_to_millimetre),
    'width': dict(_length_to_millimetre),
    'height': dict(_length_to_millimetre),
    'voltage': dict(_voltage_to_volt),
    'wattage': dict(_wattage_to_watt),
    'item_volume': dict(_volume_to_millilitre),
}

# Function to find the highest value-unit pair from the extracted_units
def find_highest_pair(row):
    """Return the extracted "value unit" string with the largest magnitude.

    Each item of ``row['extracted_units']`` is converted to the base unit of
    ``row['entity_name']`` using ``conversion_factors`` and the item with the
    greatest converted value is returned unchanged.  On ties the earliest
    item wins.  Returns ``None`` when there is nothing to compare, when the
    entity has no conversion table, or when no item could be converted.
    Items that cannot be processed are reported with ``print`` and skipped.
    """
    candidates = row['extracted_units']
    entity_name = row['entity_name']

    if not candidates:
        return None

    factors = conversion_factors.get(entity_name)
    if not factors:
        print(f"Warning: No conversion mapping for entity '{entity_name}'.")
        return None

    best_item = None
    best_base_value = float('-inf')

    for item in candidates:
        try:
            # Split once only, so a multi-word unit stays in one piece.
            number_text, unit = item.split(maxsplit=1)
            base_value_factor = factors.get(unit) if float(number_text) is not None else None
            amount = float(number_text)
            if base_value_factor is None:
                print(f"Warning: No conversion factor for unit '{unit}' in entity '{entity_name}'. Skipping.")
                continue
            base_value = amount * base_value_factor
            if base_value > best_base_value:
                best_base_value, best_item = base_value, item
        except Exception as e:
            print(f"Error processing item '{item}': {e}")
            continue

    return best_item

# Function to check if extracted text matches the entity value
def _normalize_value_unit(text):
    """Turn "<number> <unit>" into "<float> <canonical unit>", or None."""
    if not isinstance(text, str):
        return None
    # Split once only so multi-word units ("fluid ounce", "cubic foot",
    # "imperial gallon") are kept whole instead of being rejected.
    parts = text.split(maxsplit=1)
    if len(parts) != 2:
        return None
    try:
        val = float(parts[0])
    except ValueError:
        return None
    # Lowercase and collapse inner whitespace before the lookup.
    unit = ' '.join(parts[1].lower().split())
    return f"{val} {unit_mappings.get(unit, unit)}"


def check_match(row):
    """Report whether the labelled entity value is among the extracted ones.

    Args:
        row: Mapping-like record providing ``'extracted_units'`` (a
            collection of normalised "value unit" strings) and
            ``'entity_value'`` (one "value unit" string, or a list of them).

    Returns:
        For a string label: ``True`` when its normalised form is in
        ``extracted_units``.  For a list label: ``True`` when at least one
        entry can be normalised and every normalisable entry is present.
        ``False`` for anything else, including labels that cannot be parsed.
    """
    extracted = row['extracted_units']
    entity_value = row['entity_value']

    if isinstance(entity_value, str):
        target = _normalize_value_unit(entity_value)
        return target is not None and target in extracted

    if isinstance(entity_value, list):
        candidates = (_normalize_value_unit(ev) for ev in entity_value)
        targets = [target for target in candidates if target is not None]
        # An empty target list must not count as a match (all() of nothing
        # is True).
        return bool(targets) and all(target in extracted for target in targets)

    return False

# ---------------------------
# Rename Columns and Apply Extraction
# ---------------------------

# Rename 'extracted_text' to 'raw_text' to preserve original data
df = df.rename(columns={'extracted_text': 'raw_text'})

# ---------------------------
# Derive the result columns row by row
# ---------------------------
# Order matters: 'extracted_units' has to exist before the match check and
# the highest-value prediction, because both of them read it.
for column, row_function in (
    ('extracted_units', extract_quantities),
    ('match', check_match),
    ('predicted', find_highest_pair),
):
    df[column] = df.apply(row_function, axis=1)

# ---------------------------
# Part 4: Normalizing `entity_value` for F1 Score
# ---------------------------

# Function to normalize entity_value similar to extract_quantities
def normalize_entity_value(row):
    """Return the row's label as "<float> <canonical unit>", or ``None``.

    Args:
        row: Mapping-like record providing ``'entity_value'``.

    Returns:
        The label with its number formatted as a float and its unit replaced
        by the canonical name from ``unit_mappings`` (unknown units are kept
        in lowercase).  ``None`` when the label is not a string, has no unit
        part, or does not start with a number.
    """
    entity_value = row['entity_value']

    if not isinstance(entity_value, str):
        return None

    # Split once only so multi-word units ("fluid ounce", "cubic foot",
    # "imperial gallon") are kept whole instead of being rejected.
    parts = entity_value.split(maxsplit=1)
    if len(parts) != 2:
        return None

    number_text, unit_text = parts
    try:
        val = float(number_text)
    except ValueError:
        return None

    # Lowercase and collapse inner whitespace before the lookup.
    unit = ' '.join(unit_text.lower().split())
    normalized_unit = unit_mappings.get(unit, unit)
    return f"{val} {normalized_unit}"

# Apply normalization to create 'normalized_entity_value'
df['normalized_entity_value'] = df.apply(normalize_entity_value, axis=1)

# ---------------------------
# Part 5: Calculating F1 Score
# ---------------------------

# Only rows whose label could not be normalized are excluded.  Rows without a
# prediction are kept and scored as misses (empty-string prediction):
# dropping them would evaluate the extractor only where it found something
# and overstate the score.
df_valid = df.dropna(subset=['normalized_entity_value'])
predicted_labels = df_valid['predicted'].fillna('')

# Calculate the F1 Score
# Every distinct "value unit" string is treated as its own class, hence the
# 'macro' average.
# NOTE(review): confirm that macro-F1 over string labels is the metric the
# task actually calls for.
f1 = f1_score(df_valid['normalized_entity_value'], predicted_labels, average='macro')

print("\n=== Final DataFrame ===")
print(df[['raw_text', 'extracted_units', 'match', 'predicted']])

print(f"\n=== F1 Score ===\n{f1:.4f}")

In [None]:
# Display the rows at positions 100-149 as the cell's output, for manual inspection.
df.iloc[100:150]

Unnamed: 0.1,Unnamed: 0,image_link,group_id,entity_name,entity_value,raw_text,extracted_units,match,predicted,normalized_entity_value
100,100,51OEPkMaQZL.jpg,784538,depth,24.0 inch,"12x24 20x24"" 20""x30"" 24x36 9x12 12x16 10x20 20...",[],False,,24.0 inch
101,101,614x0gUMaLL.jpg,611510,height,32.0 inch,SIZE Due todifferentmesqure method thedctudlpr...,[],False,,32.0 inch
102,102,51tNTgGggvL.jpg,123238,width,40.0 centimetre,60cm/23.6in 40cm/15.7in,"[60.0 centimetre, 23.6 inch, 40.0 centimetre, ...",True,60.0 centimetre,40.0 centimetre
103,103,61zTgcURPWL.jpg,318770,wattage,100.0 watt,Thunderbolt 3 Cable Multifunctional One-line C...,[100.0 watt],True,100.0 watt,100.0 watt
104,104,41+m1VoKcFL.jpg,730429,item_weight,12.0 ounce,12 ooe Tvo Three Four Five,[],False,,12.0 ounce
105,105,51Dg4rUBLML.jpg,843434,height,6.0 inch,"1.6' 4"" 2.4"" 4"" 6"" 2"" 4.8' 6"" 4.8""",[],False,,6.0 inch
106,106,41cedd9XpUL.jpg,961155,width,72.0 millimetre,72mm 10mm 30US 118mm,"[72.0 millimetre, 10.0 millimetre, 118.0 milli...",True,118.0 millimetre,72.0 millimetre
107,107,41+X1orhoEL.jpg,825239,item_weight,9.8 kilogram,"190cm 180 cm 210 455 260 240 5,9m2 210 235 ARP...",[8.0 kilogram],False,8.0 kilogram,9.8 kilogram
108,108,81P4aH4QxyL.jpg,625842,item_volume,0.75 litre,Ohne Zucker & Zusatzstoffe mit 100% Volvic nat...,[],False,,0.75 litre
109,109,81o8RGZuekL.jpg,120569,item_weight,8.333333333333334 gram,Geprufte Qualitat Ing. Christian Fuczik Chemis...,[3.0 gram],False,3.0 gram,8.333333333333334 gram


In [None]:
# ---------------------------
# Part 2: Highest Value Prediction
# ---------------------------

# Define conversion factors to base units
# Base units:
# - Weight: gram
# - Length: millimeter
# - Voltage: volt
# - Wattage: watt
# - Volume: millilitre

# Multipliers that turn one unit of each canonical unit into its base unit.
_weight_to_gram = {
    'milligram': 0.001,
    'gram': 1,
    'kilogram': 1000,
    'microgram': 0.000001,
    'ounce': 28.3495,
    'pound': 453.592,
    'ton': 1_000_000,
}

_length_to_millimetre = {
    'millimetre': 1,
    'centimetre': 10,
    'metre': 1000,
    'foot': 304.8,
    'inch': 25.4,
    'yard': 914.4,
}

_voltage_to_volt = {'millivolt': 0.001, 'volt': 1, 'kilovolt': 1000}

_wattage_to_watt = {'watt': 1, 'kilowatt': 1000}

_volume_to_millilitre = {
    'millilitre': 1,
    'centilitre': 10,
    'decilitre': 100,
    'litre': 1000,
    'microlitre': 0.001,
    'cubic inch': 16.3871,
    'cubic foot': 28316.8,
    'fluid ounce': 29.5735,
    'cup': 236.588,
    'pint': 473.176,
    'quart': 946.353,
    'gallon': 3785.41,
    'imperial gallon': 4546.09,
}

# Entity name -> {canonical unit: factor}.  Every entity gets its own copy of
# the table for its unit family.
conversion_factors = {
    'item_weight': dict(_weight_to_gram),
    'maximum_weight_recommendation': dict(_weight_to_gram),
    'depth': dict(_length_to_millimetre),
    'width': dict(_length_to_millimetre),
    'height': dict(_length_to_millimetre),
    'voltage': dict(_voltage_to_volt),
    'wattage': dict(_wattage_to_watt),
    'item_volume': dict(_volume_to_millilitre),
}

# Function to find the highest value-unit pair from the extracted_units
def find_highest_pair(row):
    """Return the extracted "value unit" string with the largest magnitude.

    Args:
        row: Mapping-like record providing ``'extracted_units'`` (a list of
            "value unit" strings) and ``'entity_name'``.

    Returns:
        The item whose value is greatest after conversion to the entity's
        base unit via ``conversion_factors``; on ties the earliest item wins.
        ``None`` when the list is empty, the entity has no conversion table,
        or no item could be converted.  Unusable items are reported with
        ``print`` and skipped.
    """
    extracted_list = row['extracted_units']
    entity_name = row['entity_name']

    if not extracted_list:
        return None

    # Get the appropriate conversion mapping
    conversion_map = conversion_factors.get(entity_name)
    if not conversion_map:
        print(f"Warning: No conversion mapping for entity '{entity_name}'.")
        return None  # No conversion mapping defined

    max_value = -float('inf')
    max_pair = None

    for item in extracted_list:
        try:
            # maxsplit=1 keeps multi-word units such as "fluid ounce" or
            # "cubic foot" in one piece; a plain split() yields three tokens
            # for them and the two-name unpacking fails.
            value_str, unit = item.split(maxsplit=1)
            value = float(value_str)
        except (AttributeError, ValueError) as e:
            # Not a string, no unit part, or a non-numeric value.
            print(f"Error processing item '{item}': {e}")
            continue

        # Convert to base unit
        factor = conversion_map.get(unit)
        if factor is None:
            print(f"Warning: No conversion factor for unit '{unit}' in entity '{entity_name}'. Skipping.")
            continue

        base_value = value * factor
        if base_value > max_value:
            max_value = base_value
            max_pair = item

    return max_pair

# Function to check if extracted text matches the entity value
def _normalize_value_unit(text):
    """Turn "<number> <unit>" into "<float> <canonical unit>", or None."""
    if not isinstance(text, str):
        return None
    # Split once only so multi-word units ("fluid ounce", "cubic foot",
    # "imperial gallon") are kept whole instead of being rejected.
    parts = text.split(maxsplit=1)
    if len(parts) != 2:
        return None
    try:
        val = float(parts[0])
    except ValueError:
        return None
    # Lowercase and collapse inner whitespace before the lookup.
    unit = ' '.join(parts[1].lower().split())
    return f"{val} {unit_mappings.get(unit, unit)}"


def check_match(row):
    """Report whether the labelled entity value is among the extracted ones.

    Args:
        row: Mapping-like record providing ``'extracted_units'`` (a
            collection of normalised "value unit" strings) and
            ``'entity_value'`` (one "value unit" string, or a list of them).

    Returns:
        For a string label: ``True`` when its normalised form is in
        ``extracted_units``.  For a list label: ``True`` when at least one
        entry can be normalised and every normalisable entry is present.
        ``False`` for anything else, including labels that cannot be parsed.
    """
    extracted = row['extracted_units']
    entity_value = row['entity_value']

    if isinstance(entity_value, str):
        target = _normalize_value_unit(entity_value)
        return target is not None and target in extracted

    if isinstance(entity_value, list):
        candidates = (_normalize_value_unit(ev) for ev in entity_value)
        targets = [target for target in candidates if target is not None]
        # An empty target list must not count as a match (all() of nothing
        # is True).
        return bool(targets) and all(target in extracted for target in targets)

    return False

# ---------------------------
# Rename Columns and Apply Extraction
# ---------------------------

# Rename 'extracted_text' to 'raw_text' to preserve original data
df = df.rename(columns={'extracted_text': 'raw_text'})

# ---------------------------
# Derive the result columns row by row
# ---------------------------
# Order matters: 'extracted_units' has to exist before the match check and
# the highest-value prediction, because both of them read it.
for column, row_function in (
    ('extracted_units', extract_quantities),
    ('match', check_match),
    ('predicted', find_highest_pair),
):
    df[column] = df.apply(row_function, axis=1)

# ---------------------------
# Display the Final DataFrame
# ---------------------------

print("\n=== Final DataFrame ===")
print(df[['raw_text', 'extracted_units', 'match', 'predicted']])

KeyError: 'extracted_text'