In [None]:
!pip install paddlepaddle paddleocr

In [None]:
import pandas as pd
import numpy as np

#### PaddleOCR

In [None]:
import os
import cv2
import time
import requests
from io import BytesIO
from PIL import Image
from joblib import Parallel, delayed
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd

# Input CSV listing the images to OCR, and the CSV that will receive the OCR text
dataset_path, output_csv = '/content/train.csv', '/content/train_with_text.csv'

# Running tallies: images that yielded text vs. images that failed
success_count = error_count = 0

# Wall-clock reference used to report the total run time at the end of the cell
start_time = time.time()

# Function to upscale the image
def upscale_image(image, scale_factor=2):
    """
    Resize an image by ``scale_factor`` using bicubic interpolation.

    Args:
        image: Array-like image exposing ``.shape`` as (height, width, ...).
        scale_factor: Multiplier applied to both width and height; the scaled
            sizes are truncated to integers.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    src_height, src_width = image.shape[0], image.shape[1]

    # cv2.resize takes the target size as (width, height)
    target_size = (int(src_width * scale_factor), int(src_height * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

# Function to download image from URL and convert it to an OpenCV image
def download_image_from_url(url, timeout=30):
    """
    Download an image and return it as an RGB NumPy array.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole OCR loop.

    Returns:
        The decoded image as a NumPy array in RGB channel order, or ``None``
        if the download or decoding failed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for failed requests
        # Decode from memory and normalise to 3-channel RGB
        with Image.open(BytesIO(response.content)) as pil_image:
            return np.array(pil_image.convert('RGB'))
    except Exception as e:
        # Best-effort: report and let the caller count this image as an error
        print(f"Error downloading image {url}: {str(e)}")
        return None

# Define a function for text extraction using PaddleOCR
def _get_ocr_engine():
    """Return a shared PaddleOCR instance, creating it on first use."""
    engine = getattr(_get_ocr_engine, "_engine", None)
    if engine is None:
        # Created lazily inside the calling process rather than at module level.
        # NOTE(review): if this is dispatched through joblib, avoid calling
        # extract_text in the parent process first, otherwise the cached
        # engine may be pickled together with the function.
        engine = PaddleOCR(use_angle_cls=True, lang='en')
        _get_ocr_engine._engine = engine
    return engine


# Define a function for text extraction using PaddleOCR
def extract_text(image_path, scale_factor=2, ocr=None):
    """
    Run OCR on one image and return the recognised text.

    Args:
        image_path: Local file path, or a URL (anything starting with 'http').
        scale_factor: Upscaling factor applied before OCR.
        ocr: Optional object with an ``ocr(image)`` method. When omitted, a
            PaddleOCR engine is created once and reused across calls instead
            of reloading the models for every image.

    Returns:
        The recognised lines joined by newlines, ``''`` when the image was
        processed but no text was found, or ``None`` when processing failed
        (the error is printed).
    """
    try:
        if ocr is None:
            ocr = _get_ocr_engine()

        # Check if image_path is a URL (starts with 'http' or 'https')
        if image_path.startswith('http'):
            image_cv = download_image_from_url(image_path)
        else:
            image_cv = cv2.imread(image_path)

        if image_cv is None:
            raise ValueError("Image could not be loaded, possibly corrupted or invalid format.")

        # Upscale the image to enhance text extraction
        image_upscaled = upscale_image(image_cv, scale_factor=scale_factor)

        # Swap the first and third channels.
        # NOTE(review): downloaded images arrive as RGB while cv2.imread yields BGR,
        # so after this swap the two sources end up in different channel orders —
        # confirm which order the OCR models expect.
        image_rgb = cv2.cvtColor(image_upscaled, cv2.COLOR_BGR2RGB)

        # Extract text using PaddleOCR
        results = ocr.ocr(image_rgb)  # Returns a list of text boxes and text

        # When nothing is detected the per-image entry can be None or empty;
        # that is a valid "no text" outcome, not a processing error.
        detections = results[0] if results else None
        if not detections:
            return ''

        # Each detection is (box, (text, confidence)); keep the text part
        return '\n'.join(res[1][0] for res in detections)

    except Exception as e:
        # Handle exceptions (e.g., if the image is corrupted)
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Read the dataset into a pandas DataFrame
df = pd.read_csv(dataset_path)

# Ensure that there is an 'image_link' column in the dataset
if 'image_link' not in df.columns:
    raise ValueError("The dataset must contain an 'image_link' column with image paths or URLs.")

# Only process the first MAX_ROWS rows (raise the limit for a full run).
# .copy() gives the subset its own data, so adding a column below does not
# write into a slice of the full frame (which can trigger SettingWithCopyWarning).
MAX_ROWS = 30
df = df.head(MAX_ROWS).copy()

# Initialize a new column for extracted text
df['extracted_text'] = None

# Iterate over the dataset and extract text for each image
for idx, row in df.iterrows():
    image_path = row['image_link']
    extracted_text = extract_text(image_path)  # None signals a failure

    if extracted_text is not None:
        df.at[idx, 'extracted_text'] = extracted_text
        success_count += 1
    else:
        error_count += 1

# Save the updated dataset with the extracted text to a new CSV file
df.to_csv(output_csv, index=False)

# Calculate the total execution time in minutes
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60  # Convert to minutes

# Print the total time taken in minutes
print(f"Total time taken: {total_time_minutes:.2f} minutes")

# Print the count of successful extractions and errors
print(f"Images successfully extracted: {success_count}")
print(f"Images with errors: {error_count}")

# Print the path to the output CSV file
print(f"Updated dataset saved to: {output_csv}")

[2024/09/14 12:00:42] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

### Regex - Single value-unit pair


In [None]:
df = pd.read_csv("/content/test_merged.csv")
df

Unnamed: 0.1,Unnamed: 0,index,image_link,group_id,entity_name,extracted_text
0,0,0,110EibNyclL.jpg,156839,height,2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in
1,1,1,11TU2clswzL.jpg,792578,width,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
2,2,2,11TU2clswzL.jpg,792578,height,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
3,3,3,11TU2clswzL.jpg,792578,depth,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
4,4,4,11gHj8dhhrL.jpg,792578,depth,"Size Width Length One Size 10.50cm/4.13"" 90cm/..."
...,...,...,...,...,...,...
131182,131182,131283,A1rVsIzEtkL.jpg,721522,maximum_weight_recommendation,"FOREMAN 1,500 LB weight capacity"
131183,131183,131284,A1rdvZ5zDdL.jpg,603688,item_weight,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...
131184,131184,131285,A1rdvZ5zDdL.jpg,603688,maximum_weight_recommendation,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...
131185,131185,131286,A1tnTUPyr7L.jpg,853009,item_weight,DOGNAM JOSSIE HUMAN NAMI VIeToR 955123-1/567


In [None]:
import re
import pandas as pd

# Function to build the pattern for a list of units
def build_pattern(units):
    """
    Build a regex that captures ``(number, unit)`` for the given unit spellings.

    Group 1 is the number; group 2 is the unit exactly as listed (without any
    plural suffix) or a double quote, which callers treat as inches.

    Args:
        units: Iterable of literal unit spellings (they are regex-escaped).

    Returns:
        The pattern string, intended to be used with ``re.IGNORECASE``.
    """
    # Sort units by length descending to match longer units first
    units_sorted = sorted(units, key=lambda x: -len(x))

    # Escape units that have special regex characters
    multi_char = [re.escape(u) for u in units_sorted if len(u) > 1]
    single_char = [re.escape(u) for u in units_sorted if len(u) == 1]

    alternatives = []
    if multi_char:
        # Allow an optional plural suffix ("inches", "lbs") without capturing it,
        # so the captured unit still matches the normalisation table.
        alternatives.append('(?:' + '|'.join(multi_char) + r')(?=(?:es|s)?\b)')
    if single_char:
        # One-letter units get a strict boundary: a plural form would be too ambiguous
        alternatives.append('(?:' + '|'.join(single_char) + r')\b')

    # A double quote after a number means inches. It must not be followed by \b:
    # '"' is not a word character, so a trailing \b would only match when a letter
    # or digit comes right after the quote and never before a space or end of text.
    alternatives.append('"')

    # Build the final pattern
    return r"(\d+\.?\d*)\s?(" + '|'.join(alternatives) + ")"

# Define units per entity.
# Every entry is a literal spelling: build_pattern() passes each one through
# re.escape, so regex syntax placed here would be matched literally.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams',
                'kilogram', 'kg',
                'microgram', 'µg',
                'ounce', 'oz',
                'pound', 'lb',
                'ton']

length_units = ['millimetre', 'mm', 'millimeter',
                'centimetre', 'cm', 'centimeter',
                # Plain 'm' is enough: build_pattern tries longer units first and
                # requires a word boundary after the unit, so '5mm' still matches 'mm'.
                'metre', 'meter', 'm',
                'foot', 'ft',
                'inch', 'in',
                'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv',
                 'millivolt', 'mV', 'mv',
                 'volt', 'V', 'v']

wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']

volume_units = ['centilitre', 'cl',
                'cubic foot', 'ft³',
                'cubic inch', 'in³',
                'cup',
                'decilitre', 'dl',
                'fluid ounce', 'fl oz',
                'gallon', 'imperial gallon',
                'litre', 'liter',
                'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter',
                'pint', 'quart']

# Map every entity name to the regex built from the unit family it is measured in
patterns = {
    entity: build_pattern(unit_family)
    for entity, unit_family in (
        ('item_weight', weight_units),
        ('depth', length_units),
        ('width', length_units),
        ('height', length_units),
        ('voltage', voltage_units),
        ('wattage', wattage_units),
        ('item_volume', volume_units),
        ('maximum_weight_recommendation', weight_units),
    )
}

# Lookup table from each accepted (lower-case) spelling to its canonical unit name,
# generated from (canonical name, spellings) groups
unit_mappings = {
    spelling: canonical
    for canonical, spellings in (
        # weight units
        ('milligram', ('mg', 'milligram', 'miligram', 'milligrams', 'miligrammes')),
        ('gram', ('g', 'gram', 'grams')),
        ('kilogram', ('kg', 'kilogram')),
        ('microgram', ('µg', 'microgram')),
        ('ounce', ('oz', 'ounce')),
        ('pound', ('lb', 'pound')),
        ('ton', ('ton',)),
        # length units
        ('centimetre', ('cm', 'centimetre', 'centimeter')),
        ('millimetre', ('mm', 'millimetre', 'millimeter')),
        ('metre', ('m', 'metre', 'meter')),
        ('foot', ('foot', 'ft')),
        ('inch', ('inch', 'in')),
        ('yard', ('yard', 'yd')),
        ('inch', ('"',)),  # a double quote after a number stands for inches
        # voltage units
        ('kilovolt', ('kv', 'kilovolt')),
        ('millivolt', ('mv', 'millivolt')),
        ('volt', ('v', 'volt')),
        # wattage units
        ('kilowatt', ('kw', 'kilowatt')),
        ('watt', ('w', 'watt')),
        # volume units
        ('centilitre', ('cl', 'centilitre')),
        ('decilitre', ('dl', 'decilitre')),
        ('cubic foot', ('ft³', 'cubic foot')),
        ('cubic inch', ('in³', 'cubic inch')),
        ('cup', ('cup',)),
        ('fluid ounce', ('fluid ounce', 'fl oz')),
        ('gallon', ('gallon',)),
        ('imperial gallon', ('imperial gallon',)),
        ('litre', ('litre', 'liter')),
        ('millilitre', ('ml', 'millilitre', 'milliliter')),
        ('microlitre', ('microlitre', 'microliter')),
        ('pint', ('pint',)),
        ('quart', ('quart',)),
    )
    for spelling in spellings
}

# Function to extract and normalize quantity and unit from text
def extract_quantity(row):
    """
    Parse the first "<number> <unit>" mention for the row's entity from its OCR text.

    Args:
        row: Mapping / Series with 'extracted_text' (OCR text, may be missing)
            and 'entity_name' (key into the module-level ``patterns``).

    Returns:
        ``"<float value> <canonical unit>"`` for the first match, or ``''`` when
        the text is missing, the entity has no pattern, or nothing matches.
    """
    text = row['extracted_text']
    entity_name = row['entity_name']

    # Skip if text is missing (None / NaN)
    if pd.isna(text):
        return ''

    pattern = patterns.get(entity_name)
    if not pattern:
        return ''

    text = str(text)
    match = re.search(pattern, text, re.IGNORECASE)
    if not match:
        return ''

    value_str, unit = match.groups()

    # The pattern's number group does not accept thousands separators, so
    # "1,500 LB" is captured as "500". When the captured number starts with
    # exactly three digits and is directly preceded by comma-grouped digits,
    # re-attach those groups ("1," + "500" -> "1500").
    leading = re.search(r'(?<![\d.,])\d{1,3}(?:,\d{3})*,$', text[:match.start(1)])
    if leading and re.match(r'\d{3}(?!\d)', value_str):
        value_str = leading.group().replace(',', '') + value_str

    value = float(value_str)
    # Fall back to the lower-cased raw unit if the spelling is not in the table
    normalized_unit = unit_mappings.get(unit.lower(), unit.lower())
    return f"{value} {normalized_unit}"

# Apply the function to the DataFrame, one row at a time.
# NOTE: this replaces the raw OCR text in 'extracted_text' with the parsed
# "<value> <unit>" string (or '' when nothing was parsed).
df['extracted_text'] = df.apply(extract_quantity, axis=1)


In [None]:
df[df['extracted_text'] == ''].count()

Unnamed: 0,0
Unnamed: 0,37961
index,37961
image_link,37961
group_id,37961
entity_name,37961
extracted_text,37961


In [None]:
# f1_score is otherwise only imported inside commented-out cells of this notebook,
# so import it here to keep the cell runnable on a fresh kernel.
from sklearn.metrics import f1_score

# Calculate the F1 score by treating each distinct "<value> <unit>" string as a class label.
# NOTE(review): this needs a frame that has ground-truth 'entity_value' next to the
# predictions in 'extracted_text' — confirm `df` is such a frame when this cell runs.
f1 = f1_score(df['entity_value'], df['extracted_text'], average='weighted')

print(f"F1 Score: {f1}")

F1 Score: 0.37978304844789823


In [None]:
# errors="ignore" keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=["image_link", "group_id", "entity_name"], errors="ignore")

In [None]:
df

Unnamed: 0.1,Unnamed: 0,index,image_link,group_id,entity_name,extracted_text
0,0,0,110EibNyclL.jpg,156839,height,2.63 inch
1,1,1,11TU2clswzL.jpg,792578,width,42.0 centimetre
2,2,2,11TU2clswzL.jpg,792578,height,42.0 centimetre
3,3,3,11TU2clswzL.jpg,792578,depth,42.0 centimetre
4,4,4,11gHj8dhhrL.jpg,792578,depth,10.5 centimetre
...,...,...,...,...,...,...
131182,131182,131283,A1rVsIzEtkL.jpg,721522,maximum_weight_recommendation,500.0 pound
131183,131183,131284,A1rdvZ5zDdL.jpg,603688,item_weight,
131184,131184,131285,A1rdvZ5zDdL.jpg,603688,maximum_weight_recommendation,
131185,131185,131286,A1tnTUPyr7L.jpg,853009,item_weight,


In [None]:
# Row identifiers and the parsed quantity strings that make up the submission
index, y_pred = df['index'], df['extracted_text']

# Two-column frame: 'index' serves as the row ID, 'prediction' holds the parsed quantity
submission = pd.DataFrame({'index': index, 'prediction': y_pred})

# Write without the pandas row index so the file contains exactly the two columns
submission.to_csv('submission_1.csv', index=False)

### Model Training

In [None]:
# # this won't be able to predict on test data
# data = df
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import f1_score
# import numpy as np
# import re

# # Function to extract the numeric value from entity_value
# def extract_numeric_value(entity_value):
#     match = re.search(r'[\d.]+', entity_value)
#     if match:
#         return float(match.group())
#     return np.nan  # If no numeric value is found, return NaN

# # Apply the extraction function to the entity_value column
# data['entity_value_numeric'] = data['entity_value'].apply(extract_numeric_value)

# # Drop rows where entity_value_numeric could not be extracted
# data.dropna(subset=['entity_value_numeric'], inplace=True)

# # Ensure 'extracted_text' is in string format, join lists if necessary
# data['extracted_text'] = data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# # Binning the target variable (entity_value_numeric) into classes
# data['entity_value_class'] = pd.cut(data['entity_value_numeric'], bins=[0, 50, 150, np.inf], labels=['low', 'medium', 'high'])

# # Drop rows where entity_value_class is NaN
# data.dropna(subset=['entity_value_class'], inplace=True)

# # Ensure that the class labels are correctly assigned
# print(data[['entity_value', 'entity_value_numeric', 'entity_value_class']])

# # Split the data into train and test sets
# X = data['extracted_text']
# y = data['entity_value_class']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # TF-IDF Vectorizer: Convert text into numeric form
# tfidf = TfidfVectorizer()

# # Ensure both train and test data are transformed using the same TF-IDF vectorizer
# X_train_tfidf = tfidf.fit_transform(X_train)
# X_test_tfidf = tfidf.transform(X_test)

# # Check if the vectorized data is numeric and in the correct format
# print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
# print(f"X_test_tfidf shape: {X_test_tfidf.shape}")

# # Model: Random Forest Classifier
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train_tfidf, y_train)

# # Predictions: Ensure the test data is properly vectorized before prediction
# y_pred = model.predict(X_test_tfidf)

# # Calculate F1 score
# f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted average for multiclass problems

# print(f'F1 Score: {f1}')

In [None]:
# # training on full data
# data = df
# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer, f1_score
# import numpy as np
# import re


# # Function to extract the numeric value from entity_value
# def extract_numeric_value(entity_value):
#     match = re.search(r'[\d.]+', entity_value)
#     if match:
#         return float(match.group())
#     return np.nan  # If no numeric value is found, return NaN

# # Apply the extraction function to the entity_value column
# data['entity_value_numeric'] = data['entity_value'].apply(extract_numeric_value)

# # Drop rows where entity_value_numeric could not be extracted
# data.dropna(subset=['entity_value_numeric'], inplace=True)

# # Ensure 'extracted_text' is in string format, join lists if necessary
# data['extracted_text'] = data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# # Binning the target variable (entity_value_numeric) into classes
# data['entity_value_class'] = pd.cut(data['entity_value_numeric'], bins=[0, 50, 150, np.inf], labels=['low', 'medium', 'high'])

# # Drop rows where entity_value_class is NaN
# data.dropna(subset=['entity_value_class'], inplace=True)

# # Features (X) and target (y)
# X = data['extracted_text']
# y = data['entity_value_class']

# # TF-IDF Vectorizer: Convert text into numeric form
# tfidf = TfidfVectorizer()
# X_tfidf = tfidf.fit_transform(X)

# # Model: Random Forest Classifier
# model = RandomForestClassifier(n_estimators=100, random_state=42)

# # Cross-validation with F1 score
# f1_scorer = make_scorer(f1_score, average='weighted')

# # Perform cross-validation to calculate the F1 score
# cv_f1_scores = cross_val_score(model, X_tfidf, y, cv=5, scoring=f1_scorer)

# # Print the F1 scores for each fold and the mean F1 score
# print(f'F1 scores for each fold: {cv_f1_scores}')
# print(f'Mean F1 score: {np.mean(cv_f1_scores)}')

F1 scores for each fold: [0.80490516 0.80451849 0.80525354 0.80687844 0.79884535]
Mean F1 score: 0.8040801960935438


### Combining extracted texts with df


In [None]:
train = pd.read_csv("/content/train.csv")

# Substring to strip from every image link
substring_to_remove = 'https://m.media-amazon.com/images/I/'

# Remove the substring from the 'image_link' column (literal match, not a regex)
train['image_link'] = train['image_link'].str.replace(substring_to_remove, '', regex=False)

In [None]:
df = pd.read_csv("/content/ocr_results_50k.csv")

In [None]:
# Rename 'Image Path' to 'image_link' by copying the column and then dropping
# the original; the per-image timing column is dropped as well.
df['image_link'] = df['Image Path']
df = df.drop(columns=['Image Path', 'Processing Time (seconds)'])

In [None]:
# Merge the DataFrames on 'image_link'.
# Inner join: only rows whose 'image_link' appears in both frames are kept.
df_combined = pd.merge(df, train, on='image_link', how='inner')
df_combined

Unnamed: 0,Extracted Text,image_link,group_id,entity_name,entity_value
0,em71-25 201AN nhwYon kngwkte 19682 2100 0 king...,61BAVCR4heL.jpg,944592,depth,13.0 inch
1,MB FUEL NE NUTRITION FOR PERFORMANCE WEIGHT GA...,612B4JamOKL.jpg,731432,item_weight,5 kilogram
2,STANLEY 125mm),515tc7ALiAL.jpg,116155,width,25.0 millimetre
3,40cm/15.7in 7cm/2.7in,51FcN6IeZ4L.jpg,838598,depth,7.0 centimetre
4,RS100 Alice in Wonderland - English Afternoon ...,81qmAL-r-2L.jpg,281678,item_weight,80.0 gram
...,...,...,...,...,...
58014,66 inch,51EUIX6Mh2L.jpg,358831,depth,66.0 inch
58015,3mm (0.12 inch) Black rhodium plated (stainles...,71KMJrbQ7yL.jpg,299791,item_weight,20.0 gram
58016,WEIGHT:28kg/61.6lb FBA OX SIZE:160*55.5*73cm/6...,91pHF10WBUL.jpg,442321,item_weight,28.0 kilogram
58017,3mm (0.12 inch) Platinum Plated 39mm (1.54 inc...,61XNzV1vHTL.jpg,299791,item_weight,27 gram


In [None]:
# Rename 'Extracted Text' to 'extracted_text' (copy the column, then drop the original)
df_combined["extracted_text"] = df_combined["Extracted Text"]
df_combined = df_combined.drop(columns=['Extracted Text'])
df_combined.head(10)

Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text
0,61BAVCR4heL.jpg,944592,depth,13.0 inch,em71-25 201AN nhwYon kngwkte 19682 2100 0 king...
1,612B4JamOKL.jpg,731432,item_weight,5 kilogram,MB FUEL NE NUTRITION FOR PERFORMANCE WEIGHT GA...
2,515tc7ALiAL.jpg,116155,width,25.0 millimetre,STANLEY 125mm)
3,51FcN6IeZ4L.jpg,838598,depth,7.0 centimetre,40cm/15.7in 7cm/2.7in
4,81qmAL-r-2L.jpg,281678,item_weight,80.0 gram,RS100 Alice in Wonderland - English Afternoon ...
5,81z40CGeMxL.jpg,449021,item_weight,2.0 ounce,sanaBuL 2oz sanaBuL
6,51Dc6OEn5-L.jpg,675317,width,32.0 millimetre,200mm/7.9in 128mm/5in 32mm Hole centers:128mm ...
7,71nGqHcfGoL.jpg,281678,item_weight,100.0 gram,R REPLANTEA Infusion Ayurveda Kapha INFUSION E...
8,615cwxiX73L.jpg,487566,item_weight,40.0 kilogram,SPEC Height Weight Size XS < 85CM <40KG 50x100...
9,61-PZy0LS0L.jpg,306956,width,30.0 centimetre,PIzZa MakeR Product Code Hawkins PIZZA & CaKe ...


In [None]:
# df_combined.to_csv("train_merged.csv")

In [None]:
df = df_combined

### Regex - All value-unit pairs (Train)

In [None]:
# capturing all value-unit pair with regex
import re
import pandas as pd

# Function to build the pattern for a list of units
def build_pattern(units):
    """
    Build a regex that captures ``(number, unit)`` for the given unit spellings.

    Group 1 is the number; group 2 is the unit exactly as listed, without any
    plural suffix that may follow it in the text.

    Args:
        units: Iterable of literal unit spellings (they are regex-escaped).

    Returns:
        The pattern string, intended to be used with ``re.IGNORECASE``.
    """
    # Sort units by length descending to match longer units first
    units_sorted = sorted(units, key=lambda x: -len(x))

    # Escape units that have special regex characters
    multi_char = [re.escape(u) for u in units_sorted if len(u) > 1]
    single_char = [re.escape(u) for u in units_sorted if len(u) == 1]

    alternatives = []
    if multi_char:
        # Accept an optional plural suffix ("pounds", "LBS", "inches") via a
        # lookahead, so the captured unit stays in the form the mapping expects.
        alternatives.append('(?:' + '|'.join(multi_char) + r')(?=(?:es|s)?\b)')
    if single_char:
        # One-letter units get a strict boundary: a plural form would be too ambiguous
        alternatives.append('(?:' + '|'.join(single_char) + r')\b')

    # Build the final pattern
    return r"(\d+\.?\d*)\s*(" + '|'.join(alternatives) + ")"

# Define units per entity.
# Every entry is a literal spelling: build_pattern() passes each one through
# re.escape, so regex syntax placed here would be matched literally.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams',
                'kilogram', 'kg',
                'microgram', 'µg',
                'ounce', 'oz',
                'pound', 'lb',
                'ton']

length_units = ['millimetre', 'mm', 'millimeter',
                'centimetre', 'cm', 'centimeter',
                # Plain 'm' is enough: build_pattern tries longer units first and
                # requires a word boundary after the unit, so '5mm' still matches 'mm'.
                'metre', 'meter', 'm',
                'foot', 'ft',
                'inch', 'in',
                'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv',
                 'millivolt', 'mV', 'mv',
                 'volt', 'V', 'v']

wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']

volume_units = ['centilitre', 'cl',
                'cubic foot', 'ft³',
                'cubic inch', 'in³',
                'cup',
                'decilitre', 'dl',
                'fluid ounce', 'fl oz',
                'gallon', 'imperial gallon',
                'litre', 'liter',
                'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter',
                'pint', 'quart']

# Map every entity name to the regex built from the unit family it is measured in
patterns = {
    entity: build_pattern(unit_family)
    for entity, unit_family in (
        ('item_weight', weight_units),
        ('depth', length_units),
        ('width', length_units),
        ('height', length_units),
        ('voltage', voltage_units),
        ('wattage', wattage_units),
        ('item_volume', volume_units),
        ('maximum_weight_recommendation', weight_units),
    )
}

# Lookup table from each accepted (lower-case) spelling to its canonical unit name,
# generated from (canonical name, spellings) groups
unit_mappings = {
    spelling: canonical
    for canonical, spellings in (
        # weight units
        ('milligram', ('mg', 'milligram', 'miligram', 'milligrams', 'miligrammes')),
        ('gram', ('g', 'gram', 'grams')),
        ('kilogram', ('kg', 'kilogram')),
        ('microgram', ('µg', 'microgram')),
        ('ounce', ('oz', 'ounce')),
        ('pound', ('lb', 'pound')),
        ('ton', ('ton',)),
        # length units
        ('centimetre', ('cm', 'centimetre', 'centimeter')),
        ('millimetre', ('mm', 'millimetre', 'millimeter')),
        ('metre', ('m', 'metre', 'meter')),
        ('foot', ('foot', 'ft')),
        ('inch', ('inch', 'in')),
        ('yard', ('yard', 'yd')),
        # voltage units
        ('kilovolt', ('kv', 'kilovolt')),
        ('millivolt', ('mv', 'millivolt')),
        ('volt', ('v', 'volt')),
        # wattage units
        ('kilowatt', ('kw', 'kilowatt')),
        ('watt', ('w', 'watt')),
        # volume units
        ('centilitre', ('cl', 'centilitre')),
        ('decilitre', ('dl', 'decilitre')),
        ('cubic foot', ('ft³', 'cubic foot')),
        ('cubic inch', ('in³', 'cubic inch')),
        ('cup', ('cup',)),
        ('fluid ounce', ('fluid ounce', 'fl oz')),
        ('gallon', ('gallon',)),
        ('imperial gallon', ('imperial gallon',)),
        ('litre', ('litre', 'liter')),
        ('millilitre', ('ml', 'millilitre', 'milliliter')),
        ('microlitre', ('microlitre', 'microliter')),
        ('pint', ('pint',)),
        ('quart', ('quart',)),
    )
    for spelling in spellings
}

# Function to extract and normalize quantity and unit from text
def extract_quantities(row):
    """
    Parse every "<number> <unit>" mention for the row's entity from its OCR text.

    Args:
        row: Mapping / Series with 'extracted_text' (OCR text, may be missing)
            and 'entity_name' (key into the module-level ``patterns``).

    Returns:
        A list of ``"<float value> <canonical unit>"`` strings in order of
        appearance; empty when the text is missing, the entity has no pattern,
        or nothing matches.
    """
    text = row['extracted_text']
    entity_name = row['entity_name']

    # Skip if text is missing (None / NaN)
    if pd.isna(text):
        return []

    pattern = patterns.get(entity_name)
    if not pattern:
        return []

    text = str(text)
    extracted_values = []
    # finditer (rather than findall) exposes match positions, which are needed
    # to look at the characters just before each number.
    for match in re.finditer(pattern, text, re.IGNORECASE):
        value_str, unit = match.group(1), match.group(2)

        # The pattern's number group does not accept thousands separators, so
        # "1,500 lb" is captured as "500". When the captured number starts with
        # exactly three digits and is directly preceded by comma-grouped digits,
        # re-attach those groups ("1," + "500" -> "1500").
        leading = re.search(r'(?<![\d.,])\d{1,3}(?:,\d{3})*,$', text[:match.start(1)])
        if leading and re.match(r'\d{3}(?!\d)', value_str):
            value_str = leading.group().replace(',', '') + value_str

        value = float(value_str)
        # Fall back to the lower-cased raw unit if the spelling is not in the table
        normalized_unit = unit_mappings.get(unit.lower(), unit.lower())
        extracted_values.append(f"{value} {normalized_unit}")

    return extracted_values

# Function to check if extracted text matches the entity value
def check_match(row):
    """
    Tell whether the extracted quantities contain the row's ground-truth value.

    'extracted_text' is normally the list produced by ``extract_quantities``;
    comparing that list to the ground-truth string with ``==`` is always False,
    so the list is searched instead. Quantities are compared as
    (float value, unit), which makes "5 kilogram" equal to "5.0 kilogram".

    Args:
        row: Mapping / Series with 'extracted_text' (list of quantity strings,
            or a single string) and 'entity_value' (ground-truth string).

    Returns:
        True if any extracted quantity equals the ground truth, else False.
    """
    def _canonical(quantity):
        # Split "<number> <unit>" and normalise the number; leave anything
        # that does not start with a number as plain stripped text.
        raw = str(quantity).strip()
        number, _, unit = raw.partition(' ')
        try:
            return (float(number), unit.strip())
        except ValueError:
            return raw

    extracted = row['extracted_text']
    entity_value = row['entity_value']

    if isinstance(extracted, (list, tuple, set)):
        target = _canonical(entity_value)
        return any(_canonical(candidate) == target for candidate in extracted)

    # Not a collection: keep the plain equality comparison
    return extracted == entity_value

# Apply the functions to the DataFrame, one row at a time.
# 'extracted_text' is overwritten with the list returned by extract_quantities
# first, so check_match on the next line receives that list rather than the raw OCR text.
df['extracted_text'] = df.apply(extract_quantities, axis=1)
df['match'] = df.apply(check_match, axis=1)

### Model Training on train

In [None]:
# # separate training on train dataset
# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import f1_score
# import re

# train_data = df

# # Function to extract the numeric value from entity_value
# def extract_numeric_value(entity_value):
#     match = re.search(r'[\d.]+', entity_value)
#     if match:
#         return float(match.group())
#     return np.nan  # If no numeric value is found, return NaN

# # Apply the extraction function to the entity_value column
# train_data['entity_value_numeric'] = train_data['entity_value'].apply(extract_numeric_value)

# # Drop rows where entity_value_numeric could not be extracted
# train_data.dropna(subset=['entity_value_numeric'], inplace=True)

# # Ensure 'extracted_text' is in string format, join lists if necessary
# train_data['extracted_text'] = train_data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# # Binning the target variable (entity_value_numeric) into classes
# train_data['entity_value_class'] = pd.cut(train_data['entity_value_numeric'], bins=[0, 50, 150, np.inf], labels=['low', 'medium', 'high'])

# # Drop rows where entity_value_class is NaN
# train_data.dropna(subset=['entity_value_class'], inplace=True)

# # Features (X) and target (y)
# X_train = train_data['extracted_text']
# y_train = train_data['entity_value_class']

# # TF-IDF Vectorizer: Convert text into numeric form
# tfidf = TfidfVectorizer()
# X_train_tfidf = tfidf.fit_transform(X_train)

# # Model: Random Forest Classifier
# model = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the model on the entire training dataset

# model.fit(X_train_tfidf, y_train)

# # --- Calculate F1 Score on the Training Data ---

# # Make predictions on the training data
# y_train_pred = model.predict(X_train_tfidf)

# # Calculate the F1 score on the training data
# f1_train = f1_score(y_train, y_train_pred, average='weighted')

# print(f'F1 Score on Training Data: {f1_train}')

# # --- Save the trained model and vectorizer for future use ---
# import pickle
# with open('trained_model.pkl', 'wb') as model_file:
#     pickle.dump(model, model_file)

# with open('tfidf_vectorizer.pkl', 'wb') as tfidf_file:
#     pickle.dump(tfidf, tfidf_file)

# print("Model and vectorizer saved. Training complete.")

Model and vectorizer saved. Training complete.


In [None]:
# --- Training Phase ---
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
import pickle

# Load your training data into 'train_data'
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so the column assignments and in-place dropna later in this cell also change `df`.
train_data = df  # Replace with your actual DataFrame

# Function to extract numeric value from 'entity_value'
def extract_numeric_value(entity_value):
    """
    Return the first number found in ``entity_value`` as a float.

    Args:
        entity_value: Any object; it is converted with ``str()`` before searching.

    Returns:
        The first number as a float, or ``np.nan`` when no number is present.
    """
    # Match one well-formed number ("13", "13.", "13.0", ".5"). A bare character
    # class like [\d.]+ would also capture "." or "1.2.3", which float() rejects.
    match = re.search(r'\d+\.?\d*|\.\d+', str(entity_value))
    if match:
        return float(match.group())
    return np.nan

# Apply the extraction function to the 'entity_value' column
train_data['entity_value_numeric'] = train_data['entity_value'].apply(extract_numeric_value)

# Drop rows where 'entity_value_numeric' could not be extracted
# (inplace=True modifies the frame object itself rather than returning a new one)
train_data.dropna(subset=['entity_value_numeric'], inplace=True)

# Ensure 'extracted_text' is in string format:
# lists are joined with single spaces, anything else goes through str()
train_data['extracted_text'] = train_data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Function to extract value-unit pairs from 'extracted_text'
def extract_value_unit_pairs(text):
    """
    Find every number that is followed (optionally after whitespace) by letters.

    Args:
        text: String to scan.

    Returns:
        A list of ``(number_string, letters)`` tuples in order of appearance.
        Every ``number_string`` is a single well-formed number, so it can be
        passed to ``float()``; a bare character class like [\\d.]+ would also
        yield strings such as "..." or "1.2.3".
    """
    return re.findall(r'(\d+\.?\d*|\.\d+)\s*([a-zA-Z]+)', text)

# Build one training example per value-unit pair found in each row's text
pairs_list = []

for index, row in train_data.iterrows():
    extracted_text = row['extracted_text']
    entity_value_numeric = row['entity_value_numeric']
    value_unit_pairs = extract_value_unit_pairs(extracted_text)

    for i, (value_str, unit) in enumerate(value_unit_pairs):
        try:
            value = float(value_str)
        except ValueError:
            # Skip number-like strings float() cannot parse instead of aborting the whole run
            continue
        # Label as correct if the value matches 'entity_value_numeric' within a small tolerance
        is_correct = int(abs(value - entity_value_numeric) < 1e-6)
        pairs_list.append({
            'index': index,
            'value': value,
            'unit': unit,
            'position': i,  # rank of the pair within its row's text
            'surrounding_text': extracted_text,
            'is_correct': is_correct
        })

# Create DataFrame from the list
pairs_df = pd.DataFrame(pairs_list)

# Fail with a clear message instead of a KeyError on the column accesses below
if pairs_df.empty:
    raise ValueError("No value-unit pairs were found in 'extracted_text'; there is nothing to train on.")

# Prepare text features
pairs_df['value_str'] = pairs_df['value'].astype(str)
pairs_df['unit'] = pairs_df['unit'].astype(str)
pairs_df['combined_text'] = pairs_df['value_str'] + ' ' + pairs_df['unit'] + ' ' + pairs_df['surrounding_text']

# Vectorize text features using TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(pairs_df['combined_text'])

# Add 'position' as a feature (one extra column next to the sparse TF-IDF matrix)
X_features = hstack([X_tfidf, pairs_df[['position']].values])

# Target variable
y = pairs_df['is_correct']

# Train the model (fixed random_state for repeatable results)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_features, y)

# Save the trained model and vectorizer
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

print("Model and vectorizer saved. Training complete.")


### Combining Test

In [None]:
test_ocr = pd.read_csv("/content/test_ocr_results.csv")

# Copy the OCR columns under the names used by the rest of the notebook
test_ocr["image_link"] = test_ocr["Image Path"]
test_ocr["extracted_text"] = test_ocr["Extracted Text"]

# Drop the originals of the copied columns together with the timing column
test_ocr = test_ocr.drop(columns=['Processing Time (seconds)', 'Image Path', 'Extracted Text'])
test_ocr

Unnamed: 0,image_link,extracted_text
0,510XT5g-5IL.jpg,8.8 in 22.5 cm 17.3 in 44 cm
1,51Rfwsw-3CL.jpg,Large Capacity High Power 0.7cu.nt 700w5PowrLe...
2,51gQ8hdziuL.jpg,Countersunk & color matched hardware 100% soli...
3,41wRDewHy2L.jpg,5.5 inches 3.5 inches
4,51QgcVkL31L.jpg,Thickness3mm 3cm/1.18in 18cm/7in 8cm/3.1in
...,...,...
89767,417mqJLsrEL.jpg,6.48in/16.5cm 1.38in/3.5cm
89768,41vgHZPrDeL.jpg,200.cm 82cm 75.cm
89769,71BgmY66rZL.jpg,HEAVY DUTY SUPPORT UP TO 2200LBS 2200 Ibs
89770,71fuw-jdZRL.jpg,"18.1"" 70.8"" Dimensions : 70.8 x 18.1 x 20 inch..."


In [None]:
test = pd.read_csv("/content/test.csv")

# Substring to strip from every image link
substring_to_remove = 'https://m.media-amazon.com/images/I/'

# Remove the substring from the 'image_link' column (literal match, not a regex)
test['image_link'] = test['image_link'].str.replace(substring_to_remove, '', regex=False)

In [None]:
test

Unnamed: 0,index,image_link,group_id,entity_name
0,0,110EibNyclL.jpg,156839,height
1,1,11TU2clswzL.jpg,792578,width
2,2,11TU2clswzL.jpg,792578,height
3,3,11TU2clswzL.jpg,792578,depth
4,4,11gHj8dhhrL.jpg,792578,depth
...,...,...,...,...
131182,131283,A1rVsIzEtkL.jpg,721522,maximum_weight_recommendation
131183,131284,A1rdvZ5zDdL.jpg,603688,item_weight
131184,131285,A1rdvZ5zDdL.jpg,603688,maximum_weight_recommendation
131185,131286,A1tnTUPyr7L.jpg,853009,item_weight


In [None]:
# Merge the two dataframes on the 'image_link' column.
# Left join: every row of `test` is kept, with missing OCR columns where
# `test_ocr` has no entry for that image.
merged_df = pd.merge(test, test_ocr, on='image_link', how='left')

In [None]:
df = merged_df

In [None]:
# merged_df.to_csv("test_merged.csv")

### Regex for test

In [None]:
df_test = pd.read_csv("/content/test_merged.csv")

In [None]:
# capturing all value-unit pair with regex
import re
import pandas as pd

# Function to build the pattern for a list of units
def build_pattern(units):
    """
    Build a regex that captures (number, unit) pairs for the given units.

    Args:
        units: Iterable of unit spellings. Each one is matched as literal
            text, because every entry is passed through ``re.escape``.

    Returns:
        A pattern string with two capture groups: the number as written in
        the text (``12``, ``12.5``, ``200.`` or ``.5``) and the unit spelling
        that follows it, optionally separated by whitespace.

    Raises:
        ValueError: If ``units`` is empty. An empty alternation would match
            the empty string and pair every number with a blank unit.
    """
    units = list(units)
    if not units:
        raise ValueError("build_pattern() requires at least one unit")

    # Sort units by length descending so longer spellings ('mm') are tried
    # before their prefixes ('m')
    units_sorted = sorted(units, key=lambda x: -len(x))
    # Units are literal text, so neutralise any regex metacharacters
    units_escaped = [re.escape(u) for u in units_sorted]
    units_pattern = '|'.join(units_escaped)
    # The second numeric alternative accepts decimals written without a leading
    # zero ('.5 kg'); without it only the trailing '5' would be captured.
    # The trailing \b stops a unit from matching the start of a longer word.
    pattern = r"(\d+\.?\d*|\.\d+)\s*(" + units_pattern + r")\b"
    return pattern

# Define units per entity
# Every entry in these lists is literal text: build_pattern() runs re.escape()
# over each unit, so regex syntax must not be embedded here.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams',
                'kilogram', 'kg',
                'microgram', 'µg',
                'ounce', 'oz',
                'pound', 'lb',
                'ton']

# The metre abbreviation is a plain 'm'. It is not confused with 'mm' because
# build_pattern() tries longer units first and requires a word boundary after
# the unit. (A lookahead written here would be escaped and never match.)
length_units = ['millimetre', 'mm', 'millimeter',
                'centimetre', 'cm', 'centimeter',
                'metre', 'meter', 'm',
                'foot', 'ft',
                'inch', 'in',
                'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv',
                 'millivolt', 'mV', 'mv',
                 'volt', 'V', 'v']

wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']

volume_units = ['centilitre', 'cl',
                'cubic foot', 'ft³',
                'cubic inch', 'in³',
                'cup',
                'decilitre', 'dl',
                'fluid ounce', 'fl oz',
                'gallon', 'imperial gallon',
                'litre', 'liter',
                'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter',
                'pint', 'quart']

# Define the extraction patterns for each entity type
# One extraction regex per supported entity name. Entities measured in the
# same kind of unit are built from the same unit list.
patterns = dict(
    item_weight=build_pattern(weight_units),
    depth=build_pattern(length_units),
    width=build_pattern(length_units),
    height=build_pattern(length_units),
    voltage=build_pattern(voltage_units),
    wattage=build_pattern(wattage_units),
    item_volume=build_pattern(volume_units),
    maximum_weight_recommendation=build_pattern(weight_units),
)

# Normalize unit names (ensure all keys are in lowercase)
# Lookup from every accepted (lower-case) spelling to its canonical unit name,
# generated from (canonical name, spellings) groups.
unit_mappings = {
    spelling: canonical
    for canonical, spellings in (
        # weight units
        ('milligram', ('mg', 'milligram', 'miligram', 'milligrams', 'miligrammes')),
        ('gram', ('g', 'gram', 'grams')),
        ('kilogram', ('kg', 'kilogram')),
        ('microgram', ('µg', 'microgram')),
        ('ounce', ('oz', 'ounce')),
        ('pound', ('lb', 'pound')),
        ('ton', ('ton',)),
        # length units
        ('centimetre', ('cm', 'centimetre', 'centimeter')),
        ('millimetre', ('mm', 'millimetre', 'millimeter')),
        ('metre', ('m', 'metre', 'meter')),
        ('foot', ('foot', 'ft')),
        ('inch', ('inch', 'in')),
        ('yard', ('yard', 'yd')),
        # voltage units
        ('kilovolt', ('kv', 'kilovolt')),
        ('millivolt', ('mv', 'millivolt')),
        ('volt', ('v', 'volt')),
        # wattage units
        ('kilowatt', ('kw', 'kilowatt')),
        ('watt', ('w', 'watt')),
        # volume units
        ('centilitre', ('cl', 'centilitre')),
        ('decilitre', ('dl', 'decilitre')),
        ('cubic foot', ('ft³', 'cubic foot')),
        ('cubic inch', ('in³', 'cubic inch')),
        ('cup', ('cup',)),
        ('fluid ounce', ('fluid ounce', 'fl oz')),
        ('gallon', ('gallon',)),
        ('imperial gallon', ('imperial gallon',)),
        ('litre', ('litre', 'liter')),
        ('millilitre', ('ml', 'millilitre', 'milliliter')),
        ('microlitre', ('microlitre', 'microliter')),
        ('pint', ('pint',)),
        ('quart', ('quart',)),
    )
    for spelling in spellings
}

# Function to extract and normalize quantity and unit from text
def extract_quantities(row):
    """
    Extract every "<value> <unit>" candidate for a row's entity from its OCR text.

    Args:
        row: Mapping-like row providing 'extracted_text' and 'entity_name'.

    Returns:
        A list of strings such as "42.0 centimetre", in the order they occur
        in the text. The unit is normalised through ``unit_mappings``; a
        spelling missing from the mapping is returned lower-cased. The list
        is empty when the text is missing, the entity has no pattern in
        ``patterns``, or nothing matches.
    """
    text = row['extracted_text']
    entity_name = row['entity_name']

    # Skip if text is missing
    if pd.isna(text):
        return []

    pattern = patterns.get(entity_name)
    if not pattern:
        return []

    # Text made only of digits can be loaded from CSV as a number; re.findall
    # needs a string.
    text = str(text)
    # Remove thousands separators ("1,500 LB" -> "1500 LB"). Otherwise the
    # numeric group starts after the comma and yields 500 instead of 1500.
    # Only a comma followed by exactly three digits is removed, so decimal
    # commas such as "3,5" are left alone.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

    extracted_values = []
    for value, unit in re.findall(pattern, text, re.IGNORECASE):
        unit_key = unit.lower()
        normalized_unit = unit_mappings.get(unit_key, unit_key)
        extracted_values.append(f"{float(value)} {normalized_unit}")
    return extracted_values

# Apply the function to the DataFrame row by row (axis=1); each row receives a
# list of "<value> <unit>" strings, which is empty when nothing matched.
df_test['extracted_values'] = df_test.apply(extract_quantities, axis=1)

In [None]:
# Display the test frame together with the new 'extracted_values' column.
df_test

Unnamed: 0.1,Unnamed: 0,index,image_link,group_id,entity_name,extracted_text,extracted_values
0,0,0,110EibNyclL.jpg,156839,height,2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in,"[2.63 inch, 6.68 centimetre, 91.44 centimetre,..."
1,1,1,11TU2clswzL.jpg,792578,width,"Size Width Length One Size 42cm/16.54"" 200cm/7...","[42.0 centimetre, 200.0 centimetre]"
2,2,2,11TU2clswzL.jpg,792578,height,"Size Width Length One Size 42cm/16.54"" 200cm/7...","[42.0 centimetre, 200.0 centimetre]"
3,3,3,11TU2clswzL.jpg,792578,depth,"Size Width Length One Size 42cm/16.54"" 200cm/7...","[42.0 centimetre, 200.0 centimetre]"
4,4,4,11gHj8dhhrL.jpg,792578,depth,"Size Width Length One Size 10.50cm/4.13"" 90cm/...","[10.5 centimetre, 90.0 centimetre]"
...,...,...,...,...,...,...,...
131182,131182,131283,A1rVsIzEtkL.jpg,721522,maximum_weight_recommendation,"FOREMAN 1,500 LB weight capacity",[500.0 pound]
131183,131183,131284,A1rdvZ5zDdL.jpg,603688,item_weight,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...,[]
131184,131184,131285,A1rdvZ5zDdL.jpg,603688,maximum_weight_recommendation,BUILT-INEHANDLES SAFETY AND MANEUVERABILITY. M...,[]
131185,131185,131286,A1tnTUPyr7L.jpg,853009,item_weight,DOGNAM JOSSIE HUMAN NAMI VIeToR 955123-1/567,[]


In [None]:
# Spot-check the raw OCR text of one row whose 'extracted_values' list is empty.
df_test.loc[131185, "extracted_text"]

'DOGNAM JOSSIE HUMAN NAMI VIeToR 955123-1/567'

### Model prediction on the test set

In [None]:
# import pandas as pd
# import pickle
# import numpy as np
# import re

# # --- Testing Phase: Work with the test dataset ---

# # Assuming the test data is loaded into `test_data` DataFrame
# test_data = df
# # Ensure 'extracted_text' is in string format, join lists if necessary
# test_data['extracted_text'] = test_data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# # Load the trained model and vectorizer from the training phase
# with open('trained_model.pkl', 'rb') as model_file:
#     model = pickle.load(model_file)

# with open('tfidf_vectorizer.pkl', 'rb') as tfidf_file:
#     tfidf = pickle.load(tfidf_file)

# # Vectorize the test data using the already fitted TF-IDF vectorizer
# X_test_tfidf = tfidf.transform(test_data['extracted_text'])

# # Predict the class ("low", "medium", "high") for the test data
# test_class_predictions = model.predict(X_test_tfidf)

# # Function to extract value-unit pair based on class prediction
# def extract_value_unit(extracted_text, predicted_class):
#     # Use a regular expression to extract numeric values and units
#     value_unit_pairs = re.findall(r'([\d.]+)\s*([a-zA-Z]+)', extracted_text)

#     # Convert the class ("low", "medium", "high") back to a numeric range
#     if predicted_class == 'low':
#         return value_unit_pairs[0] if value_unit_pairs else ('Unknown', 'Unknown')
#     elif predicted_class == 'medium':
#         return value_unit_pairs[1] if len(value_unit_pairs) > 1 else ('Unknown', 'Unknown')
#     elif predicted_class == 'high':
#         return value_unit_pairs[-1] if len(value_unit_pairs) > 2 else ('Unknown', 'Unknown')
#     else:
#         return ('Unknown', 'Unknown')

# # Map predicted classes to numeric value-unit pairs
# test_data['predicted_value_unit'] = test_data.apply(
#     lambda row: extract_value_unit(row['extracted_text'], row['predicted_class']),
#     axis=1
# )

# # Remove the 'predicted_class' column (optional)
# test_data.drop('predicted_class', axis=1, inplace=True)

# # Display the test data with predicted value-unit pairs
# print(test_data[['extracted_text', 'predicted_value_unit']])


                                           extracted_text predicted_value_unit
0       2.63 inch 6.68 centimetre 91.44 centimetre 199...         (2.63, inch)
1                        42.0 centimetre 200.0 centimetre   (Unknown, Unknown)
2                        42.0 centimetre 200.0 centimetre   (Unknown, Unknown)
3                        42.0 centimetre 200.0 centimetre   (Unknown, Unknown)
4                         10.5 centimetre 90.0 centimetre   (10.5, centimetre)
...                                                   ...                  ...
131182                                        500.0 pound   (Unknown, Unknown)
131183                                                      (Unknown, Unknown)
131184                                                      (Unknown, Unknown)
131185                                                      (Unknown, Unknown)
131186                                                      (Unknown, Unknown)

[131187 rows x 2 columns]


In [None]:
# --- Testing Phase ---
import pandas as pd
import pickle
import numpy as np
import re
from scipy.sparse import hstack

# Load your test data into 'test_data'.
# This is an alias of `df`, not a copy: the string conversion below also
# rewrites df['extracted_text'].
test_data = df  # Replace with your actual test DataFrame

# Ensure 'extracted_text' is in string format (lists are joined with spaces,
# anything else goes through str())
test_data['extracted_text'] = test_data['extracted_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Load the trained model and vectorizer.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load files that come from a trusted training run.
with open('trained_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# Prepare test data: one record per candidate value-unit pair, tagged with the
# index label of the test row it came from.
# NOTE(review): extract_value_unit_pairs is defined outside this cell; it is
# assumed to return (value_string, unit) tuples - confirm.
pairs_list_test = []

for index, row in test_data.iterrows():
    extracted_text = row['extracted_text']
    value_unit_pairs = extract_value_unit_pairs(extracted_text)

    for i, (value_str, unit) in enumerate(value_unit_pairs):
        value = float(value_str)
        pairs_list_test.append({
            'index': index,
            'value': value,
            'unit': unit,
            'position': i,
            'surrounding_text': extracted_text
        })

# Create DataFrame from the list
pairs_df_test = pd.DataFrame(pairs_list_test)
if pairs_df_test.empty:
    # Without this check the column lookups below fail with an opaque KeyError
    raise ValueError("No value-unit pairs were found in the test data; nothing to score.")

# Prepare text features
pairs_df_test['value_str'] = pairs_df_test['value'].astype(str)
pairs_df_test['unit'] = pairs_df_test['unit'].astype(str)
pairs_df_test['combined_text'] = pairs_df_test['value_str'] + ' ' + pairs_df_test['unit'] + ' ' + pairs_df_test['surrounding_text']

# Transform text features using the loaded TF-IDF vectorizer
X_tfidf_test = tfidf.transform(pairs_df_test['combined_text'])

# Add 'position' as a feature
X_features_test = hstack([X_tfidf_test, pairs_df_test[['position']].values])

# Predict probabilities
probs = model.predict_proba(X_features_test)

# Get the probability of the positive class (is_correct == 1)
class_index = list(model.classes_).index(1)
probs_correct = probs[:, class_index]

pairs_df_test['prob_correct'] = probs_correct

# For each sample, select the value-unit pair with the highest probability
predicted_pairs = pairs_df_test.loc[pairs_df_test.groupby('index')['prob_correct'].idxmax()]

# Attach the chosen pair to its test row by index label. Joining index to
# index keeps test_data's own index and columns intact. The earlier
# merge(left_index=True, right_on='index') replaced the index with pair-row
# labels / NaN and, when test_data already had an 'index' column, produced
# 'index_x' and 'index_y'.
best_pairs = predicted_pairs.set_index('index')[['value', 'unit']].rename_axis(None)
# Drop any 'value'/'unit' columns left by an earlier run so the join cannot
# fail on overlapping column names.
test_data = test_data.drop(columns=['value', 'unit'], errors='ignore').join(best_pairs, how='left')

# Display the test data with predicted value-unit pairs
print(test_data[['extracted_text', 'value', 'unit']])




                                             extracted_text   value  \
2.0       2.63 inch 6.68 centimetre 91.44 centimetre 199...   91.44   
6.0                        42.0 centimetre 200.0 centimetre   42.00   
8.0                        42.0 centimetre 200.0 centimetre   42.00   
10.0                       42.0 centimetre 200.0 centimetre   42.00   
13.0                        10.5 centimetre 90.0 centimetre   90.00   
...                                                     ...     ...   
300552.0                                        500.0 pound  500.00   
NaN                                                             NaN   
NaN                                                             NaN   
NaN                                                             NaN   
NaN                                                             NaN   

                unit  
2.0       centimetre  
6.0       centimetre  
8.0       centimetre  
10.0      centimetre  
13.0      centimetre  
...      

In [None]:
# Inspect rows 31-60 (by position) of the test data with the predicted pairs.
test_data.iloc[31:61]

Unnamed: 0,index,index_x,image_link,group_id,entity_name,extracted_text,index_y,value,unit
,31,31,21W7FvftSCL.jpg,329793,item_weight,,,,
48.0,32,32,21aD6ktvwxS.jpg,113134,width,5.8 inch 14.0 centimetre,32.0,14.0,centimetre
50.0,33,33,21aD6ktvwxS.jpg,113134,depth,5.8 inch 14.0 centimetre,33.0,14.0,centimetre
52.0,34,34,21aD6ktvwxS.jpg,113134,height,5.8 inch 14.0 centimetre,34.0,14.0,centimetre
,35,35,21bfrFeArAL.jpg,601746,item_weight,,,,
53.0,36,36,21bwWoCpGJL.jpg,156839,height,194.3 centimetre 182.9 centimetre,36.0,194.3,centimetre
,37,37,21cLufe8Y5L.jpg,648011,voltage,,,,
,38,38,21cLufe8Y5L.jpg,648011,wattage,,,,
58.0,39,39,21d6Dtc94mL.jpg,244283,depth,5.4 inch 90.0 centimetre 47.2 inch 120.0 centi...,39.0,120.0,centimetre
62.0,40,40,21d6Dtc94mL.jpg,244283,width,5.4 inch 90.0 centimetre 47.2 inch 120.0 centi...,40.0,120.0,centimetre


### F1 Score

In [None]:
import pandas as pd
import re
from sklearn.metrics import f1_score

# Define a function to normalize and extract numerical values and units
def normalize_text(text):
    """
    Split a "<number> <unit>" string into a comparable (value, unit) tuple.

    The first number found in the text becomes the value; everything after it
    on the same line becomes the unit, lower-cased and with runs of whitespace
    collapsed so that "5  CM" and "5 cm" compare equal.

    Args:
        text: String to parse. Missing values (None/NaN) are accepted, and
            other non-string values are converted with ``str``.

    Returns:
        ``(float value, unit string)``, or None when the text is missing or
        contains no number.
    """
    if pd.isna(text):
        return None
    # \s* (rather than a single optional space) keeps extra spacing between
    # the number and the unit out of the captured unit.
    match = re.search(r"(\d+\.?\d*)\s*(.*)", str(text).strip())
    if match:
        value, unit = match.groups()
        value = float(value)
        unit = ' '.join(unit.lower().split())
        return (value, unit)
    return None

# Function to check if extracted text matches the entity value
def is_match(row):
    """
    Tell whether a row's extracted text agrees with its expected entity value.

    Both row['entity_value'] and row['extracted_text'] are passed through
    normalize_text(). The row matches when the two numbers differ by less than
    1e-6 and the unit strings are identical. Returns False when either side
    cannot be normalized.
    """
    raw_expected = row['entity_value']
    raw_found = row['extracted_text']

    expected = normalize_text(raw_expected)
    found = normalize_text(raw_found)

    # Guard clause: nothing to compare unless both sides parsed
    if not (expected and found):
        return False

    value_gap = abs(expected[0] - found[0])
    return (value_gap < 1e-6) and (expected[1] == found[1])

# Replace missing values with '' and trim surrounding whitespace in both text columns
for text_column in ('entity_value', 'extracted_text'):
    df[text_column] = df[text_column].fillna('').str.strip()

# Row-wise flag: does the extracted text agree with the expected value?
df['match'] = df.apply(is_match, axis=1)

# Binary labels fed to f1_score.
# NOTE(review): y_true is 1 for every row with a non-empty entity_value and
# y_pred is 1 only where the row matched, so this scores "matched vs. has a
# label" - confirm that this is the intended metric.
y_true = df['entity_value'].ne('').astype(int)
y_pred = df['match'].astype(int)

f1 = f1_score(y_true, y_pred)

print(f"F1 Score: {f1:.4f}")

F1 Score: 0.5763


### Quantulum


In [None]:
!pip install quantulum3

Collecting quantulum3
  Downloading quantulum3-0.9.2-py3-none-any.whl.metadata (16 kB)
Collecting num2words (from quantulum3)
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Collecting docopt>=0.6.2 (from num2words->quantulum3)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading quantulum3-0.9.2-py3-none-any.whl (10.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading num2words-0.5.13-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=cee57187edc35c131d5fed97e696e6b037b6baf1ce3cb0301b21658513150233
  Stored in directory: /root/.cache/p

In [None]:
import os
import cv2
import time
import requests
from io import BytesIO
from PIL import Image
from joblib import Parallel, delayed
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd
from quantulum3 import parser  # Import quantulum3 for quantity extraction

# Input CSV listing the images, and the CSV that receives the extracted text
# and numeric values
dataset_path, output_csv = (
    '/content/train.csv',
    '/content/train_with_text_and_values.csv',
)

# Running tallies of images processed successfully / with errors
success_count = error_count = 0

# Reference point for the total run time reported at the end
start_time = time.time()

# Function to upscale the image
def upscale_image(image, scale_factor=2):
    """
    Return `image` resized by `scale_factor` along both axes.

    The target size is the truncated product of each source dimension and the
    scale factor, and the resize uses cv2.INTER_CUBIC interpolation.
    """
    src_height, src_width = image.shape[0], image.shape[1]
    # cv2.resize takes the target size as (width, height)
    target_size = (int(src_width * scale_factor), int(src_height * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

# Function to download image from URL and convert it to an OpenCV image
def download_image_from_url(url, timeout=30):
    """
    Download an image and return it as an RGB NumPy array.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded image as a NumPy array in RGB channel order, or None if
        the download or decoding failed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for failed requests
        image = Image.open(BytesIO(response.content)).convert('RGB')  # Convert to RGB
        return np.array(image)  # Convert the PIL image to a NumPy array
    except Exception as e:
        # Best effort: report the failure and let the caller skip this image
        print(f"Error downloading image {url}: {str(e)}")
        return None

# Define a function for text extraction using PaddleOCR
def extract_text(image_path, scale_factor=2):
    """
    Run PaddleOCR on one image and return the recognised text.

    Args:
        image_path: URL (anything starting with 'http') or local file path.
        scale_factor: Enlargement applied to the image before OCR.

    Returns:
        The recognised lines joined with newlines; an empty string when the
        image loads but contains no detectable text; None when the image
        cannot be loaded or processing raised (the error is printed).
    """
    try:
        # Initialize PaddleOCR within the function to avoid PicklingError.
        # This reloads the models on every call, which is slow in a
        # sequential loop.
        ocr = PaddleOCR(use_angle_cls=True, lang='en')

        # Check if image_path is a URL (starts with 'http' or 'https')
        if image_path.startswith('http'):
            # Download the image from the URL
            image_cv = download_image_from_url(image_path)
        else:
            # Load the image from a local file
            image_cv = cv2.imread(image_path)

        if image_cv is None:
            raise ValueError("Image could not be loaded, possibly corrupted or invalid format.")

        # Upscale the image to enhance text extraction
        image_upscaled = upscale_image(image_cv, scale_factor=scale_factor)

        # Swap the first and third colour channels.
        # NOTE(review): downloaded images are already RGB (PIL) while
        # cv2.imread yields BGR, so the two sources end up in different channel
        # orders after this swap - confirm which order the OCR engine expects.
        image_rgb = cv2.cvtColor(image_upscaled, cv2.COLOR_BGR2RGB)

        # Extract text using PaddleOCR
        results = ocr.ocr(image_rgb)  # One entry per image: list of [box, (text, score)]

        # An image without any detected text comes back as None instead of a
        # list. That is a valid "no text" outcome, not a processing error.
        if not results or results[0] is None:
            return ''

        # Join the recognised lines into a single string
        return '\n'.join(line[1][0] for line in results[0])

    except Exception as e:
        # Handle exceptions (e.g., if the image is corrupted)
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Load the dataset and keep only its first 30 rows for this trial run
df = pd.read_csv(dataset_path)
df = df.head(30)

# The OCR loop needs an 'image_link' column holding image paths or URLs
if 'image_link' not in df.columns:
    raise ValueError("The dataset must contain an 'image_link' column with image paths or URLs.")

# Result columns, filled in below
df['extracted_text'] = None
df['numeric_values_with_units'] = None

# OCR every image; a None result counts as an error and leaves the row empty
for idx, row in df.iterrows():
    image_path = row['image_link']
    extracted_text = extract_text(image_path)

    if extracted_text is None:
        error_count += 1
        continue

    df.at[idx, 'extracted_text'] = extracted_text
    success_count += 1

# Function to extract numeric values and units using quantulum3
def extract_numeric_values_with_units(text):
    """
    Parse `text` with the quantulum3 parser and list every quantity found.

    Returns a list of (value, unit name) tuples in the order the parser
    reported them, or None if parsing raised (the error is printed).
    """
    try:
        return [(quantity.value, quantity.unit.name) for quantity in parser.parse(text)]
    except Exception as e:
        print(f"Error extracting quantities from text: {str(e)}")
        return None

# Run the quantity parser over every row that has OCR text; rows without text
# stay None
df['numeric_values_with_units'] = df['extracted_text'].apply(
    lambda x: None if pd.isnull(x) else extract_numeric_values_with_units(x)
)

# Write the dataset with the extracted text and numeric values to a new CSV
df.to_csv(output_csv, index=False)

# Elapsed time since start_time, converted from seconds to minutes
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60

# Run summary: timing, success/error tallies and the output location
for summary_line in (
    f"Total time taken: {total_time_minutes:.2f} minutes",
    f"Images successfully processed: {success_count}",
    f"Images with errors: {error_count}",
    f"Updated dataset saved to: {output_csv}",
):
    print(summary_line)


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:15<00:00, 256kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:19<00:00, 529kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:15<00:00, 142kiB/s]

[2024/09/14 10:57:36] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




[2024/09/14 10:57:38] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.8568100929260254
[2024/09/14 10:57:39] ppocr DEBUG: cls num  : 19, elapsed : 0.12425017356872559
[2024/09/14 10:57:41] ppocr DEBUG: rec_res num  : 19, elapsed : 2.366570234298706
[2024/09/14 10:57:41] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scale



Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Error extracting quantities from text: Module stemming is not installed.
Total time taken: 3.15 minutes
Images successfully processed: 30
Images with errors: 0
Updated dataset saved to: /content/train_with_text_and_values.csv


In [None]:
# Show the parsed (value, unit name) lists for all rows as a plain Python list.
df['numeric_values_with_units'].tolist()

[[(100.0, 'dimensionless'),
  (100.0, 'percentage'),
  (100.0, 'percentage'),
  (500.0, 'gram')],
 [],
 [(1.0, 'dimensionless'),
  (0.709, 'gram'),
  (200.0, 'milligram'),
  (101.0, 'dimensionless'),
  (100.0, 'milligram'),
  (50.0, 'milligram'),
  (25.0, 'milligram'),
  (25.0, 'milligram'),
  (25.0, 'milligram'),
  (25.0, 'milligram'),
  (5.0, 'percentage'),
  (25.0, 'milligram'),
  (10.0, 'milligram'),
  (100.0, 'dimensionless'),
  (5.0, 'milligram'),
  (3.04, 'kilocalorie'),
  (0.13, 'dimensionless'),
  (0.51, 'gram'),
  (0.2, 'gram'),
  (0.04, 'gram'),
  (0.07, 'dimensionless'),
  (6600.0, 'dimensionless'),
  (2010.0, 'dimensionless')],
 None,
 [(1400.0, 'dimensionless'), (365.0, 'dimensionless')],
 [(1400.0, 'milligram'), (1400.0, 'milligram'), (365.0, 'dimensionless')],
 None,
 [(0.0, 'dimensionless'), (1400.0, 'milligram'), (365.0, 'dimensionless')],
 [(100.0, 'percentage'), (1400.0, 'milligram'), (365.0, 'dimensionless')],
 [(1400.0, 'milligram'),
  (1400.0, 'milligram'),
  (36

In [None]:
# Load the full training CSV and display it for reference.
testing = pd.read_csv("/content/train.csv")
testing

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [None]:
import os
import cv2
import time
import requests
from io import BytesIO
from PIL import Image
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd
from quantulum3 import parser  # Import quantulum3 for quantity extraction

# Input CSV listing the images, and the CSV that receives the extracted text
# and numeric values
dataset_path, output_csv = (
    '/content/train.csv',
    '/content/train_with_text_and_values.csv',
)

# Running tallies of images processed successfully / with errors
success_count = error_count = 0

# Reference point for measuring the total run time
start_time = time.time()

# Define the entity-unit map
# Units accepted for each entity. Entities of the same physical kind accept the
# same units, but every entity gets its own set object.
entity_unit_map = {
    **{
        dimension: {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
        for dimension in ('width', 'depth', 'height')
    },
    **{
        weight_entity: {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'}
        for weight_entity in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Function to upscale the image
def upscale_image(image, scale_factor=2):
    """
    Return `image` resized by `scale_factor` along both axes.

    The target size is the truncated product of each source dimension and the
    scale factor, and the resize uses cv2.INTER_CUBIC interpolation.
    """
    src_height, src_width = image.shape[0], image.shape[1]
    # cv2.resize takes the target size as (width, height)
    target_size = (int(src_width * scale_factor), int(src_height * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

# Function to download image from URL and convert it to an OpenCV image
def download_image_from_url(url, timeout=30):
    """
    Download an image and return it as an RGB NumPy array.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded image as a NumPy array in RGB channel order, or None if
        the download or decoding failed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for failed requests
        image = Image.open(BytesIO(response.content)).convert('RGB')  # Convert to RGB
        return np.array(image)  # Convert the PIL image to a NumPy array
    except Exception as e:
        # Best effort: report the failure and let the caller skip this image
        print(f"Error downloading image {url}: {str(e)}")
        return None

# Define a function for text extraction using PaddleOCR
def extract_text(image_path, scale_factor=2):
    """
    Run PaddleOCR on one image and return the recognised text.

    Args:
        image_path: URL (anything starting with 'http') or local file path.
        scale_factor: Enlargement applied to the image before OCR.

    Returns:
        The recognised lines joined with newlines; an empty string when the
        image loads but contains no detectable text; None when the image
        cannot be loaded or processing raised (the error is printed).
    """
    try:
        # Initialize PaddleOCR within the function to avoid PicklingError.
        # This reloads the models on every call, which is slow in a
        # sequential loop.
        ocr = PaddleOCR(use_angle_cls=True, lang='en')

        # Check if image_path is a URL (starts with 'http' or 'https')
        if image_path.startswith('http'):
            # Download the image from the URL
            image_cv = download_image_from_url(image_path)
        else:
            # Load the image from a local file
            image_cv = cv2.imread(image_path)

        if image_cv is None:
            raise ValueError("Image could not be loaded, possibly corrupted or invalid format.")

        # Upscale the image to enhance text extraction
        image_upscaled = upscale_image(image_cv, scale_factor=scale_factor)

        # Swap the first and third colour channels.
        # NOTE(review): downloaded images are already RGB (PIL) while
        # cv2.imread yields BGR, so the two sources end up in different channel
        # orders after this swap - confirm which order the OCR engine expects.
        image_rgb = cv2.cvtColor(image_upscaled, cv2.COLOR_BGR2RGB)

        # Extract text using PaddleOCR
        results = ocr.ocr(image_rgb)  # One entry per image: list of [box, (text, score)]

        # An image without any detected text comes back as None instead of a
        # list. That is a valid "no text" outcome, not a processing error.
        if not results or results[0] is None:
            return ''

        # Join the recognised lines into a single string
        return '\n'.join(line[1][0] for line in results[0])

    except Exception as e:
        # Handle exceptions (e.g., if the image is corrupted)
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Function to extract numeric values and units using quantulum3
def extract_numeric_values_with_units(text, entity_name):
    """Parse quantities from text and keep those whose unit suits the entity.

    The text is parsed with quantulum3's ``parser``; a quantity is kept only
    if its lower-cased unit name is listed in ``entity_unit_map`` for the
    lower-cased entity name.

    Returns:
        A list of (value, unit_name) tuples. Empty when the entity has no
        allowed units or when any step raises.
    """
    try:
        parsed = parser.parse(text)
        permitted = entity_unit_map.get(entity_name.lower(), set())
        if not permitted:
            print(f"No allowed units found for entity '{entity_name}'.")
            return []

        kept = []
        for q in parsed:
            if q.unit.name.lower() not in permitted:
                # Report what was dropped so unit mismatches are easy to spot.
                print(f"Discarding quantity '{q.value} {q.unit.name}' not in allowed units for entity '{entity_name}'.")
                continue
            kept.append((q.value, q.unit.name))
        return kept
    except Exception as e:
        print(f"Error extracting quantities from text: {str(e)}")
        return []

# Read the dataset into a pandas DataFrame
df = pd.read_csv(dataset_path)

# Ensure that required columns are in the dataset (checked before any slicing)
required_columns = ['image_link', 'entity_name']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"The dataset must contain a '{col}' column.")

# Limit the DataFrame to the first 30 rows. head() returns a slice of the full
# frame; take an explicit copy so the columns added below go to an independent
# frame instead of triggering SettingWithCopyWarning.
df = df.head(30).copy()

# Initialize new columns for extracted text and numeric values with units
df['extracted_text'] = None
df['numeric_values_with_units'] = None

# Iterate over the dataset and extract text and numeric values for each image
for idx, row in df.iterrows():
    image_path = row['image_link']
    entity_name = row['entity_name']
    extracted_text = extract_text(image_path)  # None signals a failed image

    if extracted_text is not None:
        df.at[idx, 'extracted_text'] = extracted_text
        # Extract numeric values with units, filtering by allowed units
        numeric_values_with_units = extract_numeric_values_with_units(extracted_text, entity_name)
        df.at[idx, 'numeric_values_with_units'] = numeric_values_with_units
        success_count += 1
    else:
        error_count += 1

# Save the updated dataset with the extracted text and numeric values to a new CSV file
df.to_csv(output_csv, index=False)

# Calculate the total execution time in minutes
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60  # Convert to minutes

# Print the total time taken in minutes
print(f"Total time taken: {total_time_minutes:.2f} minutes")

# Print the count of successful extractions and errors
print(f"Images successfully processed: {success_count}")
print(f"Images with errors: {error_count}")

# Print the path to the output CSV file
print(f"Updated dataset saved to: {output_csv}")


[2024/09/14 11:37:03] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

In [None]:
# Show the processed frame (OCR text plus the filtered quantities) as the cell output.
df

Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text,numeric_values_with_units
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS\nNATURE\nINGREDIENT MENAGER\nMULT-USAGE...,"[(500.0, gram)]"
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,R\nTEARRIFIC\nLEBENSMITTELECHT\nCup\nRDAY\nGEP...,[]
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION\nServingSize:1 Tablet (0.709 g)|Ea...,"[(0.709, gram), (200.0, milligram), (100.0, mi..."
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,WarningConsutt your physician before using thi...,[]
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach\nHIGHSTRENGTH\nPSYLLIOM\nHUSK\n1400\n...,[]
5,https://m.media-amazon.com/images/I/61QsBSE7jg...,731432,item_weight,1400 milligram,Horbaach\nHIGHSTRENGTH\nNaturally-Sourced Psyl...,"[(1400.0, milligram), (1400.0, milligram)]"
6,https://m.media-amazon.com/images/I/81xsq6vf2q...,731432,item_weight,1400 milligram,"Horbaach\nDirections: For adults, take two 2) ...",[]
7,https://m.media-amazon.com/images/I/71DiLRHeZd...,731432,item_weight,1400 milligram,VEGAN\nHorbaach\nWHEAT\nFREE\nHIGHSTRENGTH\n00...,"[(1400.0, milligram)]"
8,https://m.media-amazon.com/images/I/91Cma3Rzse...,731432,item_weight,1400 milligram,Horbaach\n100%\nHIGHEST\nQUALITY\nHorbaach\nGM...,"[(1400.0, milligram)]"
9,https://m.media-amazon.com/images/I/71jBLhmTNl...,731432,item_weight,1400 milligram,NEWOOK\nSAME TRUSTED OUALITY\nOLD\nNEW\nHorbaa...,"[(1400.0, milligram), (1400.0, milligram)]"


In [None]:
#quantulum + regex
import os
import cv2
import time
import requests
from io import BytesIO
from PIL import Image
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd
from quantulum3 import parser, classes  # Import quantulum3 for quantity extraction

# Input dataset (CSV file); the script below reads the 'image_link' and 'entity_name' columns from it
dataset_path = '/content/train.csv'

# Output CSV: the input rows plus the extracted text and the filtered quantities
output_csv = '/content/train_with_text_and_values.csv'

# Counters for images that produced text vs. images that failed;
# they are reset here, so re-running the cell starts from zero
success_count = 0
error_count = 0

# Wall-clock start, used at the end of the cell to report the total run time
start_time = time.time()

# Allowed unit names per entity. Keys are lower-case entity names; the values are
# compared against the lower-cased unit name reported for each parsed quantity.
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Function to upscale the image
def upscale_image(image, scale_factor=2):
    """
    Return a copy of the image resized by scale_factor using bicubic interpolation.

    Both target dimensions are truncated to integers; cv2.resize takes them
    in (width, height) order.
    """
    source_height, source_width = image.shape[0], image.shape[1]
    target_size = (int(source_width * scale_factor), int(source_height * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

# Function to download image from URL and convert it to an OpenCV image
def download_image_from_url(url, timeout=30):
    """Download an image and return it as an RGB NumPy array.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        The decoded image as an array in RGB channel order (PIL's order, not
        OpenCV's BGR), or None if the download or decoding failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for failed requests
        # The context manager releases PIL's handle once the pixels are copied out.
        with Image.open(BytesIO(response.content)) as pil_image:
            return np.array(pil_image.convert('RGB'))
    except Exception as e:
        # Best effort: report the failure and let the caller skip this image.
        print(f"Error downloading image {url}: {str(e)}")
        return None

# Define a function for text extraction using PaddleOCR
def extract_text(image_path, scale_factor=2, ocr=None):
    """Run PaddleOCR on an image and return the recognised text.

    Args:
        image_path: URL (anything starting with 'http') or local file path.
        scale_factor: Upscaling factor applied before OCR.
        ocr: Optional, already constructed PaddleOCR engine to reuse. When
            omitted a new engine is built on every call, which is slow.

    Returns:
        The recognised lines joined with newlines, '' when the image contains
        no detectable text, or None when the image could not be processed.
    """
    try:
        if ocr is None:
            # Initialize PaddleOCR within the function to avoid PicklingError
            ocr = PaddleOCR(use_angle_cls=True, lang='en')

        # Check if image_path is a URL (starts with 'http' or 'https')
        if image_path.startswith('http'):
            image_cv = download_image_from_url(image_path)
        else:
            image_cv = cv2.imread(image_path)

        if image_cv is None:
            raise ValueError("Image could not be loaded, possibly corrupted or invalid format.")

        # Upscale the image to enhance text extraction
        image_upscaled = upscale_image(image_cv, scale_factor=scale_factor)

        # NOTE(review): downloaded images arrive as RGB while cv2.imread yields BGR,
        # so this swap leaves the two sources in different channel orders.
        # Confirm which order the OCR engine expects before changing it.
        image_rgb = cv2.cvtColor(image_upscaled, cv2.COLOR_BGR2RGB)

        # Extract text using PaddleOCR
        results = ocr.ocr(image_rgb)

        # An image without detected text can come back as None (or an empty list)
        # in the first slot; that is "no text", not a processing failure.
        lines = results[0] if results else None
        if not lines:
            return ''

        # Each entry is (box, (text, confidence)); keep only the text.
        return '\n'.join(res[1][0] for res in lines)

    except Exception as e:
        # Handle exceptions (e.g., if the image is corrupted)
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Preprocess text to replace abbreviations with full unit names
def preprocess_text(text):
    """Replace unit abbreviations and spelling variants with canonical unit names.

    Matching is case-insensitive and limited to whole words. Longer
    abbreviations are substituted first so that a multi-word form such as
    'fl oz' becomes 'fluid ounce' before the shorter 'oz' rule can turn it
    into 'fl ounce'.

    Args:
        text: Raw text (for example OCR output).

    Returns:
        The text with every recognised abbreviation replaced.
    """
    import re  # imported here so the function does not rely on another cell's imports

    # Canonical unit name -> abbreviations / spelling variants (case-insensitive).
    unit_abbreviations = {
        # Weight units
        'milligram': ('mg', 'miligram', 'milligrams', 'miligrammes'),
        'gram': ('g', 'grams'),
        'kilogram': ('kg',),
        'microgram': ('µg',),
        'ounce': ('oz',),
        'pound': ('lb',),

        # Length units
        'millimetre': ('mm', 'millimeter'),
        'centimetre': ('cm', 'centimeter'),
        'metre': ('m',),  # whole-word match, so 'mg' / 'ml' / 'mm' are unaffected
        'foot': ('ft',),
        'inch': ('in',),
        'yard': ('yd',),

        # Voltage units
        'kilovolt': ('kv',),
        'millivolt': ('mv',),
        'volt': ('v',),

        # Wattage units
        'kilowatt': ('kw',),
        'watt': ('w',),

        # Volume units
        'centilitre': ('cl',),
        'decilitre': ('dl',),
        'cubic foot': ('ft³',),
        'cubic inch': ('in³',),
        'fluid ounce': ('fl oz',),
        'litre': ('liter',),
        'millilitre': ('ml', 'milliliter'),
        'microlitre': ('microliter',),
    }

    # Longest abbreviation first; the sort is stable, so ties keep table order.
    replacements = sorted(
        ((abbrev, full_name)
         for full_name, abbrevs in unit_abbreviations.items()
         for abbrev in abbrevs),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    # NOTE(review): short forms such as 'in', 'm', 'v' and 'w' are also ordinary
    # words or letters, so running prose is rewritten too (e.g. 'made in' ->
    # 'made inch'); this matches the previous behaviour.
    for abbrev, full_name in replacements:
        text = re.sub(r'\b' + re.escape(abbrev) + r'\b', full_name, text, flags=re.IGNORECASE)
    return text

# Function to extract numeric values and units using quantulum3
def extract_numeric_values_with_units(text, entity_name):
    """Parse quantities from text and keep those whose unit suits the entity.

    The text first goes through ``preprocess_text`` and is then parsed with
    quantulum3's ``parser``. A quantity is kept only if its lower-cased unit
    name is listed in ``entity_unit_map`` for the lower-cased entity name.

    Returns:
        A list of (value, unit_name) tuples. Empty when the entity has no
        allowed units or when any step raises.
    """
    try:
        # Expand abbreviations before handing the text to the parser.
        parsed = parser.parse(preprocess_text(text))
        permitted = entity_unit_map.get(entity_name.lower(), set())
        if not permitted:
            print(f"No allowed units found for entity '{entity_name}'.")
            return []

        kept = []
        for q in parsed:
            if q.unit.name.lower() not in permitted:
                # Report what was dropped so unit mismatches are easy to spot.
                print(f"Discarding quantity '{q.value} {q.unit.name}' not in allowed units for entity '{entity_name}'.")
                continue
            kept.append((q.value, q.unit.name))
        return kept
    except Exception as e:
        print(f"Error extracting quantities from text: {str(e)}")
        return []

# Read the dataset into a pandas DataFrame
df = pd.read_csv(dataset_path)

# Ensure that required columns are in the dataset (checked before any slicing)
required_columns = ['image_link', 'entity_name']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"The dataset must contain a '{col}' column.")

# Limit the DataFrame to the first 30 rows. head() returns a slice of the full
# frame; take an explicit copy so the columns added below go to an independent
# frame instead of triggering SettingWithCopyWarning.
df = df.head(30).copy()

# Initialize new columns for extracted text and numeric values with units
df['extracted_text'] = None
df['numeric_values_with_units'] = None

# Iterate over the dataset and extract text and numeric values for each image
for idx, row in df.iterrows():
    image_path = row['image_link']
    entity_name = row['entity_name']
    extracted_text = extract_text(image_path)  # None signals a failed image

    if extracted_text is not None:
        df.at[idx, 'extracted_text'] = extracted_text
        # Extract numeric values with units, filtering by allowed units
        numeric_values_with_units = extract_numeric_values_with_units(extracted_text, entity_name)
        df.at[idx, 'numeric_values_with_units'] = numeric_values_with_units
        success_count += 1
    else:
        error_count += 1

# Save the updated dataset with the extracted text and numeric values to a new CSV file
df.to_csv(output_csv, index=False)

# Calculate the total execution time in minutes
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60  # Convert to minutes

# Print the total time taken in minutes
print(f"Total time taken: {total_time_minutes:.2f} minutes")

# Print the count of successful extractions and errors
print(f"Images successfully processed: {success_count}")
print(f"Images with errors: {error_count}")

# Print the path to the output CSV file
print(f"Updated dataset saved to: {output_csv}")


[2024/09/14 11:40:02] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

In [None]:
# Show the processed frame (OCR text plus the filtered quantities) as the cell output.
df

Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text,numeric_values_with_units
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS\nNATURE\nINGREDIENT MENAGER\nMULT-USAGE...,"[(500.0, gram)]"
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,R\nTEARRIFIC\nLEBENSMITTELECHT\nCup\nRDAY\nGEP...,[]
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION\nServingSize:1 Tablet (0.709 g)|Ea...,"[(0.709, gram), (200.0, milligram), (100.0, mi..."
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,WarningConsutt your physician before using thi...,[]
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach\nHIGHSTRENGTH\nPSYLLIOM\nHUSK\n1400\n...,[]
5,https://m.media-amazon.com/images/I/61QsBSE7jg...,731432,item_weight,1400 milligram,Horbaach\nHIGHSTRENGTH\nNaturally-Sourced Psyl...,"[(1400.0, milligram), (1400.0, milligram)]"
6,https://m.media-amazon.com/images/I/81xsq6vf2q...,731432,item_weight,1400 milligram,"Horbaach\nDirections: For adults, take two 2) ...",[]
7,https://m.media-amazon.com/images/I/71DiLRHeZd...,731432,item_weight,1400 milligram,VEGAN\nHorbaach\nWHEAT\nFREE\nHIGHSTRENGTH\n00...,"[(1400.0, milligram)]"
8,https://m.media-amazon.com/images/I/91Cma3Rzse...,731432,item_weight,1400 milligram,Horbaach\n100%\nHIGHEST\nQUALITY\nHorbaach\nGM...,"[(1400.0, milligram)]"
9,https://m.media-amazon.com/images/I/71jBLhmTNl...,731432,item_weight,1400 milligram,NEWOOK\nSAME TRUSTED OUALITY\nOLD\nNEW\nHorbaa...,"[(1400.0, milligram), (1400.0, milligram)]"


### Testing

In [None]:
import pandas as pd
import numpy as np


# Load the merged training data used by the cells below.
data = pd.read_csv("/content/train_merged.csv")
# `df` is a second name for the same object, not a copy: columns added through
# `df` also appear on `data`.
df = data

In [None]:
# capturing all value-unit pair with regex
import re
import pandas as pd

# Function to build the pattern for a list of units
def build_pattern(units):
    """Return a regex source string matching '<number><optional spaces><unit>'.

    Units are tried longest first so a short unit cannot pre-empt a longer one
    that starts with the same letters. Every unit is escaped and therefore
    matched literally. Group 1 captures the number, group 2 the unit; only a
    trailing word boundary is enforced.
    """
    # Stable descending sort: equal-length units keep their input order.
    longest_first = sorted(units, key=len, reverse=True)
    alternation = '|'.join(re.escape(unit) for unit in longest_first)
    return r"(\d+\.?\d*)\s*(" + alternation + r")\b"

# Define units per entity. Entries are plain strings: build_pattern escapes each
# one, so regex syntax placed here would be matched literally.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams',
                'kilogram', 'kg',
                'microgram', 'µg',
                'ounce', 'oz',
                'pound', 'lb',
                'ton']

length_units = ['millimetre', 'mm', 'millimeter',
                'centimetre', 'cm', 'centimeter',
                # Plain 'm': the earlier r'm(?!m)' entry was escaped into a literal and
                # never matched. Longest-first ordering plus the trailing word boundary
                # already keep 'm' from matching inside 'mm'.
                'metre', 'meter', 'm',
                'foot', 'ft',
                'inch', 'in',
                'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv',
                 'millivolt', 'mV', 'mv',
                 'volt', 'V', 'v']

wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']

volume_units = ['centilitre', 'cl',
                'cubic foot', 'ft³',
                'cubic inch', 'in³',
                'cup',
                'decilitre', 'dl',
                'fluid ounce', 'fl oz',
                'gallon', 'imperial gallon',
                'litre', 'liter',
                'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter',
                'pint', 'quart']

# Define the extraction patterns for each entity type (keyed by entity_name)
patterns = {
    'item_weight': build_pattern(weight_units),
    'depth': build_pattern(length_units),
    'width': build_pattern(length_units),
    'height': build_pattern(length_units),
    'voltage': build_pattern(voltage_units),
    'wattage': build_pattern(wattage_units),
    'item_volume': build_pattern(volume_units),
    'maximum_weight_recommendation': build_pattern(weight_units)
}

# Canonical unit name -> accepted lower-case aliases (abbreviations, alternate spellings).
_UNIT_ALIASES = {
    # weight units
    'milligram': ('mg', 'miligram', 'milligrams', 'miligrammes'),
    'gram': ('g', 'grams'),
    'kilogram': ('kg',),
    'microgram': ('µg',),
    'ounce': ('oz',),
    'pound': ('lb',),
    'ton': (),

    # length units
    'centimetre': ('cm', 'centimeter'),
    'millimetre': ('mm', 'millimeter'),
    'metre': ('m', 'meter'),
    'foot': ('ft',),
    'inch': ('in',),
    'yard': ('yd',),

    # voltage units
    'kilovolt': ('kv',),
    'millivolt': ('mv',),
    'volt': ('v',),

    # wattage units
    'kilowatt': ('kw',),
    'watt': ('w',),

    # volume units
    'centilitre': ('cl',),
    'decilitre': ('dl',),
    'cubic foot': ('ft³',),
    'cubic inch': ('in³',),
    'cup': (),
    'fluid ounce': ('fl oz',),
    'gallon': (),
    'imperial gallon': (),
    'litre': ('liter',),
    'millilitre': ('ml', 'milliliter'),
    'microlitre': ('microliter',),
    'pint': (),
    'quart': (),
}

# Normalize unit names: every lower-case spelling (the canonical name included)
# maps to its canonical unit name.
unit_mappings = {
    spelling: canonical
    for canonical, aliases in _UNIT_ALIASES.items()
    for spelling in (canonical,) + aliases
}

# Function to extract and normalize quantity and unit from text
def extract_quantities(row):
    """Return every '<value> <unit>' string found in the row's extracted text.

    The regex for the row's entity_name is looked up in ``patterns`` and
    applied case-insensitively. Each match is formatted as the float value
    followed by the unit normalised through ``unit_mappings`` (falling back to
    the lower-cased unit). Rows with missing text or an unknown entity give [].
    """
    text = row['extracted_text']
    entity_name = row['entity_name']

    # Nothing to scan when the OCR text is missing.
    if pd.isna(text):
        return []

    pattern = patterns.get(entity_name)
    if not pattern:
        return []

    return [
        f"{float(number)} {unit_mappings.get(unit.lower(), unit.lower())}"
        for number, unit in re.findall(pattern, text, re.IGNORECASE)
    ]

# Function to check if extracted text matches the entity value
def check_match(row):
    """Return whether the row's extracted_text equals its entity_value exactly."""
    # NOTE(review): this compares the full OCR text with the label, which will
    # rarely be equal — confirm whether 'extracted_values' was the intended column.
    return row['extracted_text'] == row['entity_value']

# Apply the functions row by row (axis=1).
# 'extracted_values' receives the list returned by extract_quantities for each row.
df['extracted_values'] = df.apply(extract_quantities, axis=1)
# 'match' receives the boolean returned by check_match for each row.
df['match'] = df.apply(check_match, axis=1)

In [None]:
import re

def extract_values_with_units(text):
    """Return all number tokens in text, each with its optional cm/in suffix.

    A token may carry a second '/<number><unit>' part, so '40cm/15.7in' is
    returned as a single item. Bare numbers are returned as well.
    """
    number_token = r"(\d+(?:\.\d+)?(?:cm|in)?(?:/\d+(?:\.\d+)?(?:cm|in)?)?)"
    return re.findall(number_token, text)

def parse_value(value_unit):
    """Return the numeric part of a value token as a float.

    For a dual token such as '40cm/15.7in' the number after the slash is used;
    otherwise the first number in the token.
    """
    # For dual tokens, keep only the part after the first slash.
    chosen = value_unit.split("/")[1] if "/" in value_unit else value_unit
    return float(re.findall(r"\d+(?:\.\d+)?", chosen)[0])

def find_dimensions(text):
    """Guess (width, depth, height) from the numbers found in text.

    With exactly two numbers the smaller is returned as width, the larger as
    height and depth is None. With three or more, the smallest is depth, the
    second smallest width and the largest height.

    Raises:
        ValueError: if fewer than two numbers are found.
    """
    # NOTE(review): parse_value drops the unit, so a text mixing cm-only and
    # dual cm/in tokens is compared across units — confirm this is acceptable.
    ordered = sorted(parse_value(token) for token in extract_values_with_units(text))

    if len(ordered) < 2:
        raise ValueError("No values found to determine dimensions.")

    if len(ordered) == 2:
        # Only two values: no depth can be derived.
        smaller, larger = ordered
        return smaller, None, larger

    # Three or more values: smallest -> depth, next -> width, largest -> height.
    return ordered[1], ordered[0], ordered[-1]

# Example inputs
# input1: two dual-unit (cm/in) tokens
input1 = '40cm/15.7in 7cm/2.7in'
# input2: several bare numbers followed by a quote mark, mixed with words
input2 = '50.00" Oak 10.60" 19.00" 43.30" 5.10" 46.50" Ja 18.00" Manual Measurement, Allowable Error'
# input3: one dual-unit token and one cm-only token
input3 = '40cm/15.7in 7cm'

# Get the dimensions for all examples
width1, depth1, height1 = find_dimensions(input1)
width2, depth2, height2 = find_dimensions(input2)
width3, depth3, height3 = find_dimensions(input3)

# Print results
def print_dimensions(width, depth, height, label):
    # Prints the three values on one line, each followed by a double-quote mark.
    print(f"{label} - Width: {width}\" Depth: {depth}\" Height: {height}\"")

print_dimensions(width1, depth1, height1, "Input 1")
print_dimensions(width2, depth2, height2, "Input 2")
print_dimensions(width3, depth3, height3, "Input 3")


Input 1 - Width: 2.7" Depth: None" Height: 15.7"
Input 2 - Width: 10.6" Depth: 5.1" Height: 50.0"
Input 3 - Width: 7.0" Depth: None" Height: 15.7"


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

def preprocess_text(text):
    """Lower-case the text and drop every character that is neither a word character nor whitespace.

    The input is converted with str() first, so non-string values are accepted.
    """
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)

def extract_features(row):
    """Build one [position, length, context_relevance] vector per candidate value.

    Args:
        row: Mapping with 'extracted_text', 'entity_name' and an iterable
            'extracted_values'.

    Returns:
        A list with one feature vector per candidate, in candidate order:
        position  - relative offset of the candidate's first occurrence in the
                    text, or -1.0 when the candidate does not occur verbatim;
        length    - number of characters in the candidate;
        context_relevance - 1 if the entity name occurs within 20 characters
                    of the candidate, else 0 (always 0 when it is not found).
    """
    text = str(row['extracted_text'])
    entity_name = str(row['entity_name'])
    extracted_values = row['extracted_values']

    features = []
    for value in extracted_values:
        value_str = str(value)
        length = len(value_str)

        # str.find returns -1 when the candidate is absent; using that as an index
        # would give a meaningless position and a window taken from the text start.
        start = text.find(value_str)
        if start == -1:
            position = -1.0
            context_relevance = 0
        else:
            position = start / len(text) if len(text) > 0 else 0
            window = text[max(0, start - 20):start + length + 20]
            context_relevance = 1 if entity_name in window else 0

        features.append([position, length, context_relevance])

    return features

def train_selector_model(df):
    """Fit a RandomForestClassifier that scores candidate values.

    Each candidate in a row's 'extracted_values' becomes one training sample:
    its feature vector from extract_features, labelled 1 when the candidate's
    string form equals the row's entity_value and 0 otherwise.

    Returns:
        The fitted classifier.
    """
    samples = []
    labels = []
    for _, row in df.iterrows():
        target = str(row['entity_value'])
        # extract_features yields exactly one vector per candidate, in order.
        for vector, candidate in zip(extract_features(row), row['extracted_values']):
            samples.append(vector)
            labels.append(1 if str(candidate) == target else 0)

    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(samples, labels)
    return classifier

def select_entity_value(row, model):
    """Return the candidate the model scores highest for class 1, or None.

    None is returned when the row has no candidates.
    """
    candidate_features = extract_features(row)
    if not candidate_features:
        return None

    scores = model.predict_proba(candidate_features)
    if len(scores) == 0:
        return None

    # Column 1 holds the probability of the positive ("correct") class.
    winner = np.argmax(scores[:, 1])
    candidates = row['extracted_values']
    return candidates[winner] if winner < len(candidates) else None

# Main function to process the data
# Main function to process the data
def process_data(train_df, test_df):
    """Train the selector on train_df and pick a value for every row of both frames.

    Both frames are modified in place: 'extracted_values' is coerced to lists,
    train_df gains 'selected_value' and test_df gains 'predicted_value'.

    Returns:
        (train_df, test_df), the same objects that were passed in.
    """
    import ast  # local import: used only to parse list literals read back from CSV

    def ensure_list(x):
        """Coerce a cell to a list of candidate values."""
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            # Cells read from CSV hold the repr of a list. literal_eval parses
            # literals only, so the file content is never executed as code.
            try:
                parsed = ast.literal_eval(x)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                return [x]
            # A string that parses to a non-sequence is kept as one raw candidate.
            return list(parsed) if isinstance(parsed, (list, tuple, set)) else [x]
        if isinstance(x, (tuple, set, np.ndarray, pd.Series)):
            # Handled before pd.isna, which is element-wise on array-likes.
            return list(x)
        if pd.isna(x):
            return []
        return list(x)

    train_df['extracted_values'] = train_df['extracted_values'].apply(ensure_list)
    test_df['extracted_values'] = test_df['extracted_values'].apply(ensure_list)

    # Train the selector model
    model = train_selector_model(train_df)

    # Apply the model to select the best entity value for each row
    train_df['selected_value'] = train_df.apply(lambda row: select_entity_value(row, model), axis=1)
    test_df['predicted_value'] = test_df.apply(lambda row: select_entity_value(row, model), axis=1)

    return train_df, test_df

# Usage example
# Both names below are aliases, not copies; process_data adds columns to them in place.
train_df = df
# NOTE(review): `df_test` is not defined in the cells visible here — confirm it is
# created before this cell runs.
test_df = df_test
processed_train_df, processed_test_df = process_data(train_df, test_df)

In [None]:

#lgbm
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
import re

def preprocess_text(text):
    """Lower-case the text and drop every character that is neither a word character nor whitespace.

    The input is converted with str() first, so non-string values are accepted.
    """
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)

def extract_features(row):
    """Build one [position, length, context_relevance] vector per candidate value.

    Args:
        row: Mapping with 'extracted_text', 'entity_name' and an iterable
            'extracted_values'.

    Returns:
        A list with one feature vector per candidate, in candidate order:
        position  - relative offset of the candidate's first occurrence in the
                    text, or -1.0 when the candidate does not occur verbatim;
        length    - number of characters in the candidate;
        context_relevance - 1 if the entity name occurs within 20 characters
                    of the candidate, else 0 (always 0 when it is not found).
    """
    text = str(row['extracted_text'])
    entity_name = str(row['entity_name'])
    extracted_values = row['extracted_values']

    features = []
    for value in extracted_values:
        value_str = str(value)
        length = len(value_str)

        # str.find returns -1 when the candidate is absent; using that as an index
        # would give a meaningless position and a window taken from the text start.
        start = text.find(value_str)
        if start == -1:
            position = -1.0
            context_relevance = 0
        else:
            position = start / len(text) if len(text) > 0 else 0
            window = text[max(0, start - 20):start + length + 20]
            context_relevance = 1 if entity_name in window else 0

        features.append([position, length, context_relevance])

    return features

def train_selector_model(df):
    """Fit a LightGBM classifier that scores candidate values.

    Each candidate in a row's 'extracted_values' becomes one training sample:
    its feature vector from extract_features, labelled 1 when the candidate's
    string form equals the row's entity_value and 0 otherwise.

    Returns:
        (model, X, y): the fitted classifier plus the feature matrix and label
        vector it was trained on, both as NumPy arrays.
    """
    samples = []
    labels = []
    for _, row in df.iterrows():
        target = str(row['entity_value'])
        # extract_features yields exactly one vector per candidate, in order.
        for vector, candidate in zip(extract_features(row), row['extracted_values']):
            samples.append(vector)
            labels.append(1 if str(candidate) == target else 0)

    X = np.array(samples)
    y = np.array(labels)

    classifier = lgb.LGBMClassifier(n_estimators=100, random_state=42)
    classifier.fit(X, y)
    return classifier, X, y

def select_entity_value(row, model):
    """Return the candidate the model scores highest for class 1, or None.

    None is returned when the row has no candidates.
    """
    candidate_features = extract_features(row)
    if not candidate_features:
        return None

    scores = model.predict_proba(candidate_features)
    if len(scores) == 0:
        return None

    # Column 1 holds the probability of the positive ("correct") class.
    winner = np.argmax(scores[:, 1])
    candidates = row['extracted_values']
    return candidates[winner] if winner < len(candidates) else None

def train_model(train_df, model_path="trained_model.pkl"):
    """Normalise ``extracted_values``, fit the selector model and pickle it.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame. Its ``'extracted_values'`` column is overwritten in
        place so that every cell holds a Python list; the frame is then passed
        to ``train_selector_model``.
    model_path : str, optional
        File the pickled model is written to. Defaults to
        ``"trained_model.pkl"``.

    Returns
    -------
    The fitted model returned by ``train_selector_model``.

    Notes
    -----
    No evaluation is performed here. The earlier per-row prediction loop was
    removed because its result was never used.
    """
    import ast
    import pickle

    def ensure_list(x):
        """Coerce one ``extracted_values`` cell to a list."""
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            # Cells read back from CSV hold the text form of a list.
            # literal_eval parses literals only, whereas eval would execute
            # any expression found in the file.
            try:
                parsed = ast.literal_eval(x)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                return [x]
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
            # Text that parses to a single literal is kept as one candidate.
            return [x]
        if pd.api.types.is_scalar(x):
            # Check scalars first: pd.isna on a sequence returns an array
            # whose truth value is ambiguous.
            return [] if pd.isna(x) else [x]
        return list(x)

    train_df['extracted_values'] = train_df['extracted_values'].apply(ensure_list)

    # Train the selector model; the training matrices are not needed here.
    model, _, _ = train_selector_model(train_df)

    # Save the model
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    return model

# Example usage
# `df` must already hold the training DataFrame (it is not defined in this
# cell). Besides returning the fitted model, train_model pickles it to disk.
# NOTE(review): the cells below inspect `train_df`, while this call passes
# `df` — confirm both names refer to the same frame.

train_model(df)

[LightGBM] [Info] Number of positive: 40763, number of negative: 105147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 274
[LightGBM] [Info] Number of data points in the train set: 145910, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.279371 -> initscore=-0.947585
[LightGBM] [Info] Start training from score -0.947585


In [None]:
# Preview the training frame; the notebook display abbreviates it to the
# first and last rows.
# NOTE(review): train_df.head() would keep the saved output smaller.
train_df

Unnamed: 0.1,Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text,extracted_values,match
0,0,61BAVCR4heL.jpg,944592,depth,13.0 inch,em71-25 201AN nhwYon kngwkte 19682 2100 0 king...,[],False
1,1,612B4JamOKL.jpg,731432,item_weight,5 kilogram,MB FUEL NE NUTRITION FOR PERFORMANCE WEIGHT GA...,[5.0 kilogram],False
2,2,515tc7ALiAL.jpg,116155,width,25.0 millimetre,STANLEY 125mm),[125.0 millimetre],False
3,3,51FcN6IeZ4L.jpg,838598,depth,7.0 centimetre,40cm/15.7in 7cm/2.7in,"[40.0 centimetre, 15.7 inch, 7.0 centimetre, 2...",False
4,4,81qmAL-r-2L.jpg,281678,item_weight,80.0 gram,RS100 Alice in Wonderland - English Afternoon ...,"[80.0 gram, 2.8 ounce]",False
...,...,...,...,...,...,...,...,...
58014,58014,51EUIX6Mh2L.jpg,358831,depth,66.0 inch,66 inch,[66.0 inch],False
58015,58015,71KMJrbQ7yL.jpg,299791,item_weight,20.0 gram,3mm (0.12 inch) Black rhodium plated (stainles...,[20.0 gram],False
58016,58016,91pHF10WBUL.jpg,442321,item_weight,28.0 kilogram,WEIGHT:28kg/61.6lb FBA OX SIZE:160*55.5*73cm/6...,"[28.0 kilogram, 61.6 pound, 28.0 kilogram, 61....",False
58017,58017,61XNzV1vHTL.jpg,299791,item_weight,27 gram,3mm (0.12 inch) Platinum Plated 39mm (1.54 inc...,[27.0 gram],False


In [None]:
# Score the selected values against the ground truth by exact string match.
# Both columns are cast to str, so every distinct string is its own class and
# equal quantities written differently (e.g. "3 ounce" vs "3.0 ounce") count
# as a miss; rows with no selection are compared via their string form.
# NOTE(review): 'selected_value' is not created in this cell — it must be
# added to train_df beforehand (presumably by applying select_entity_value
# row-wise; confirm which cell does this).
y_true = train_df['entity_value'].astype(str)
y_pred = train_df['selected_value'].astype(str)
# average='weighted': per-class F1, averaged with weights proportional to
# each class's support in y_true.
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score for training data: {f1:.4f}")

F1 Score for training data: 0.3956


In [None]:
# Spot-check a positional slice (rows 1000-1049) to compare selected_value
# with entity_value by eye.
train_df.iloc[1000:1050]

Unnamed: 0.1,Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text,extracted_values,match,selected_value
1000,1000,61GAzpR-2CL.jpg,931856,height,39.3 inch,39.3in Plastic branch artifcial flower,[39.3 inch],False,39.3 inch
1001,1001,61irT-TKd+L.jpg,898898,item_weight,1.0 ounce,Smells Like A Stinky A$$ g&e gag gifts STINKY ...,[],False,
1002,1002,21l-4ip9XSL.jpg,916768,height,3.8 inch,3.8 in 9cm,"[3.8 inch, 9.0 centimetre]",False,9.0 centimetre
1003,1003,616FlxMUQDL.jpg,318770,wattage,100.0 watt,12.2mm USB440 6.65mm 40 38mm USB440 40 18mm T ...,[100.0 watt],False,100.0 watt
1004,1004,51I8tUQB6qL.jpg,658003,width,9.8 inch,15 9.8in/ 12.9 in/33 cm,"[9.8 inch, 12.9 inch, 33.0 centimetre]",False,33.0 centimetre
1005,1005,61nwh6MnpRL.jpg,641642,voltage,12.0 volt,8.8MM Width DC12V 5 METER ROLL 60LED'S PERMETER,[12.0 volt],False,12.0 volt
1006,1006,61FR9X56B2L.jpg,268003,item_weight,4.0 pound,"TECHDCALSPECIFICRTIOOT ""6""X2E'PLASTIC INJECTED...",[],False,
1007,1007,716SJO0CUUL.jpg,929999,item_weight,95 gram to 110 gram,VECTOR X 26 Inch VECTOR 95-110 WEIGHT gma,[],False,
1008,1008,81pSU02nQ1L.jpg,529606,item_weight,3 ounce,BENEFITS S DeBelle Lav 6ap 3oz. nder 3 WICK CA...,[3.0 ounce],False,3.0 ounce
1009,1009,714+ft5-c-L.jpg,675317,item_weight,157.0 gram,018.50 038.00 M4 X 0.50P 25.40 38.00 V 12 NOTE...,[157.0 gram],False,157.0 gram
