# EDA Train Data

In [None]:
import pandas as pd

In [None]:
train = pd.read_csv("/content/train.csv")

In [None]:
train['entity_name'].unique().tolist()

['item_weight',
 'item_volume',
 'voltage',
 'wattage',
 'maximum_weight_recommendation',
 'height',
 'depth',
 'width']

In [None]:
train.shape

(263859, 4)

In [None]:
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [None]:
train.isnull().sum()

Unnamed: 0,0
image_link,0
group_id,0
entity_name,0
entity_value,0


In [None]:
train['entity_name'].value_counts()

Unnamed: 0_level_0,count
entity_name,Unnamed: 1_level_1
item_weight,102786
depth,45127
width,44183
height,43597
voltage,9466
wattage,7755
item_volume,7682
maximum_weight_recommendation,3263


In [None]:
train.nunique()

Unnamed: 0,0
image_link,255906
group_id,750
entity_name,8
entity_value,16405


# EDA Test Data

In [None]:
test = pd.read_csv("/content/test.csv")

In [None]:
test.shape

(131187, 4)

In [None]:
test.isnull().sum()

Unnamed: 0,0
index,0
image_link,0
group_id,0
entity_name,0


In [None]:
test.nunique()

Unnamed: 0,0
index,131187
image_link,90666
group_id,924
entity_name,8


# Paddle OCR with Upscaling

In [None]:
!pip install paddlepaddle paddleocr

In [None]:
import os
import cv2
import time
import requests
from io import BytesIO
from PIL import Image
from joblib import Parallel, delayed
from paddleocr import PaddleOCR
import numpy as np
import pandas as pd

# Input CSV that supplies the image links, and the CSV the OCR output is written to
dataset_path = '/content/train.csv'
output_csv = '/content/train_with_text.csv'

# Running tallies: images that yielded text vs. images that failed
success_count = error_count = 0

# Wall-clock reference point for the total-run-time report
start_time = time.time()

# Enlarge an image before OCR
def upscale_image(image, scale_factor=2):
    """
    Return `image` resized by `scale_factor` along both axes.

    The target size is derived from ``image.shape`` (index 0 is the height,
    index 1 the width), truncated to whole pixels, and the resize uses
    bicubic interpolation.
    """
    rows, cols = image.shape[0], image.shape[1]

    # cv2.resize takes the target size as (width, height)
    target_size = (int(cols * scale_factor), int(rows * scale_factor))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)

# Function to download an image from a URL and convert it to a NumPy array
def download_image_from_url(url, timeout=30):
    """
    Download an image over HTTP and return it as an RGB NumPy array.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        The decoded image as a NumPy array in RGB channel order, or None if
        the request or the decoding fails (the error is printed).

    Note:
        The array is RGB because of the explicit ``convert('RGB')``; this is
        NOT the BGR order that ``cv2.imread`` produces.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for failed requests
        image = Image.open(BytesIO(response.content)).convert('RGB')  # Convert to RGB
        return np.array(image)  # PIL image -> NumPy array (still RGB)
    except Exception as e:
        # Best-effort download: report and let the caller skip this image
        print(f"Error downloading image {url}: {str(e)}")
        return None

# Define a function for text extraction using PaddleOCR
def extract_text(image_path, scale_factor=2, ocr=None):
    """
    Run PaddleOCR on one image and return the recognised text.

    Args:
        image_path: A URL (anything starting with 'http') or a local file path.
        scale_factor: Factor the image is enlarged by before OCR.
        ocr: Optional PaddleOCR instance to reuse. When omitted, a new
            instance is created for this call; creating one per image is
            slow, so sequential callers should build one and pass it in.

    Returns:
        The recognised lines joined with newlines, "" when OCR runs but finds
        no text, or None when the image cannot be loaded or OCR raises (the
        error is printed).
    """
    try:
        if ocr is None:
            # Created inside the function (not at module level) to avoid
            # PicklingError when this function is shipped to worker processes
            ocr = PaddleOCR(use_angle_cls=True, lang='en')

        # Check if image_path is a URL (starts with 'http' or 'https')
        if image_path.startswith('http'):
            image_cv = download_image_from_url(image_path)
            if image_cv is not None:
                # The download helper returns RGB (PIL); cv2.imread returns BGR.
                # Swap here so both sources reach the OCR engine in the same
                # channel order. BGR is what PaddleOCR gets when it loads a
                # file itself through OpenCV.
                # NOTE(review): confirm against the installed version.
                image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
        else:
            # Load the image from a local file (already BGR)
            image_cv = cv2.imread(image_path)

        if image_cv is None:
            raise ValueError("Image could not be loaded, possibly corrupted or invalid format.")

        # Upscale the image to enhance text extraction
        image_upscaled = upscale_image(image_cv, scale_factor=scale_factor)

        # One entry per input image; each entry is a list of (box, (text, score))
        results = ocr.ocr(image_upscaled)

        # An image without any detected text yields an empty/None entry.
        # That is a valid outcome, not a processing error.
        page = results[0] if results else None
        if not page:
            return ""

        # Keep only the text part of every detected line
        return '\n'.join(line[1][0] for line in page)

    except Exception as e:
        # Handle exceptions (e.g., if the image is corrupted)
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Read the dataset into a pandas DataFrame
df = pd.read_csv(dataset_path)

# Number of leading rows to run OCR on. Kept small because every row costs an
# image load plus an OCR pass; raise it to process more of the dataset.
max_rows = 5
df = df.head(max_rows)


# Ensure that there is an 'image_link' column in the dataset
if 'image_link' not in df.columns:
    raise ValueError("The dataset must contain an 'image_link' column with image paths or URLs.")

# Initialize a new column for extracted text
df['extracted_text'] = None

# Only the link column is needed, so iterate over it directly instead of
# building a full row object per image
for idx, image_path in df['image_link'].items():
    extracted_text = extract_text(image_path)  # Extract text from the image

    # extract_text signals a failure with None
    if extracted_text is not None:
        df.at[idx, 'extracted_text'] = extracted_text
        success_count += 1
    else:
        error_count += 1

# Save the updated dataset with the extracted text to a new CSV file
df.to_csv(output_csv, index=False)

# Calculate the total execution time in minutes
end_time = time.time()
total_time_seconds = end_time - start_time
total_time_minutes = total_time_seconds / 60  # Convert to minutes

# Print the total time taken in minutes
print(f"Total time taken: {total_time_minutes:.2f} minutes")

# Print the count of successful extractions and errors
print(f"Images successfully extracted: {success_count}")
print(f"Images with errors: {error_count}")

# Print the path to the output CSV file
print(f"Updated dataset saved to: {output_csv}")

[2024/09/14 12:21:52] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

# Regex

In [None]:
import pandas as pd
import re

# Define the extraction patterns for each entity type, including single-letter units.
#
# A regex alternation takes the FIRST alternative that matches, not the longest.
# With 'm' listed before 'mm', the text "5 mm" was captured as unit 'm' (metre).
# Building the alternation longest-first makes the most specific spelling win.
def _quantity_pattern(units):
    """Return a regex capturing (number, unit) for any of the spellings in `units`."""
    longest_first = sorted(units, key=len, reverse=True)
    alternation = '|'.join(re.escape(unit) for unit in longest_first)
    return r"(\d+\.?\d*)\s?(" + alternation + ")"


# Every spelling listed here has an entry in unit_mappings, so whatever is
# captured can be normalised.
# NOTE: there is deliberately no trailing word boundary; text such as "12VDC"
# must keep matching 'V'.
_weight_units = ['g', 'gram', 'grams', 'mg', 'milligram', 'miligram', 'milligrams',
                 'miligrammes', 'microgram', 'µg', 'kilogram', 'kg', 'oz', 'ounce',
                 'pound', 'lb', 'ton']
_length_units = ['cm', 'centimetre', 'centimeter', 'ft', 'foot', 'inch', 'm', 'metre',
                 'meter', 'mm', 'millimetre', 'millimeter', 'yard', 'yd']
_voltage_units = ['kV', 'kilovolt', 'mV', 'millivolt', 'V', 'volt', 'v']
_wattage_units = ['kW', 'kilowatt', 'W', 'watt', 'w']
_volume_units = ['cl', 'centilitre', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup',
                 'dl', 'decilitre', 'fluid ounce', 'fl oz', 'gallon', 'imperial gallon',
                 'litre', 'liter', 'ml', 'millilitre', 'milliliter', 'microlitre',
                 'microliter', 'pint', 'quart']

patterns = {
    'item_weight': _quantity_pattern(_weight_units),
    'depth': _quantity_pattern(_length_units),
    'width': _quantity_pattern(_length_units),
    'height': _quantity_pattern(_length_units),
    'voltage': _quantity_pattern(_voltage_units),
    'wattage': _quantity_pattern(_wattage_units),
    'item_volume': _quantity_pattern(_volume_units),
    'maximum_weight_recommendation': _quantity_pattern(_weight_units)
}

# Normalize unit names: every spelling/abbreviation -> canonical unit name.
#
# Lookups are done with the lower-cased captured unit, so every abbreviation
# needs a lower-case key. The mixed-case electrical keys ('kV', 'mV', 'kW') are
# kept for direct lookups and now have lower-case twins. Without the twins,
# "5 kV" was emitted as "5.0 kv" instead of "5.0 kilovolt".
unit_mappings = {
    'mg': 'milligram', 'milligram': 'milligram', 'miligram': 'milligram', 'milligrams': 'milligram', 'miligrammes': 'milligram',
    'g': 'gram', 'gram': 'gram', 'grams': 'gram',
    'kg': 'kilogram', 'kilogram': 'kilogram',
    'µg': 'microgram', 'microgram': 'microgram',
    'oz': 'ounce', 'ounce': 'ounce',
    'lb': 'pound', 'pound': 'pound',
    'ton': 'ton',

    'cm': 'centimetre', 'centimetre': 'centimetre', 'centimeter': 'centimetre',
    'mm': 'millimetre', 'millimetre': 'millimetre', 'millimeter': 'millimetre',
    'm': 'metre', 'metre': 'metre', 'meter': 'metre',
    'foot': 'foot', 'ft': 'foot', 'inch': 'inch', 'in': 'inch',
    'yard': 'yard', 'yd': 'yard',

    'kV': 'kilovolt', 'kv': 'kilovolt', 'kilovolt': 'kilovolt',
    'mV': 'millivolt', 'mv': 'millivolt', 'millivolt': 'millivolt',
    'V': 'volt', 'volt': 'volt', 'v': 'volt',

    'kW': 'kilowatt', 'kw': 'kilowatt', 'kilowatt': 'kilowatt',
    'W': 'watt', 'watt': 'watt', 'w': 'watt',

    'cl': 'centilitre', 'centilitre': 'centilitre', 'dl': 'decilitre', 'decilitre': 'decilitre',
    'ft³': 'cubic foot', 'cubic foot': 'cubic foot', 'in³': 'cubic inch', 'cubic inch': 'cubic inch',
    'cup': 'cup', 'fluid ounce': 'fluid ounce', 'fl oz': 'fluid ounce',
    'gallon': 'gallon', 'imperial gallon': 'imperial gallon',
    'litre': 'litre', 'liter': 'litre', 'ml': 'millilitre', 'millilitre': 'millilitre', 'milliliter': 'millilitre',
    'microlitre': 'microlitre', 'microliter': 'microlitre',
    'pint': 'pint', 'quart': 'quart'
}

# Parse the first "<number> <unit>" for the row's entity out of its OCR text
def extract_quantity(row):
    """
    Return the first quantity found in row['extracted_text'] as
    "<float value> <normalised unit>", using the regex registered in
    `patterns` for row['entity_name'].

    Returns None when the text is missing, when the entity has no pattern,
    or when the pattern does not match.
    """
    ocr_text, entity = row['extracted_text'], row['entity_name']

    # Nothing to parse for rows without OCR text
    if pd.isna(ocr_text):
        return None

    entity_regex = patterns.get(entity)
    if not entity_regex:
        return None

    found = re.search(entity_regex, ocr_text, re.IGNORECASE)
    if not found:
        return None

    number, raw_unit = found.groups()
    unit_key = raw_unit.lower()
    # Units without a mapping fall back to their lower-cased spelling
    return f"{float(number)} {unit_mappings.get(unit_key, unit_key)}"

# Exact string comparison between the prediction and the ground truth
def check_match(row):
    """Return whether row['extracted_text'] equals row['entity_value']."""
    return row['extracted_text'] == row['entity_value']

# Apply the function to the DataFrame
# NOTE: this overwrites 'extracted_text' with the value returned by
# extract_quantity (a "<value> <unit>" string or None), so the original
# column contents are no longer available afterwards.
df['extracted_text'] = df.apply(extract_quantity, axis=1)
# 'match' holds the result of check_match, an exact string comparison
# between 'extracted_text' and 'entity_value'
df['match'] = df.apply(check_match, axis=1)

print(df)

                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value extracted_text  match  
0      500.0 gram     500.0 gram   True  
1         1.0 cup           None  False  
2      0.709 gram     0.709 gram   True  
3      0.709 gram       0.2 gram  False  
4  1400 milligram           None  False  


# F1 Score Calculation

In [None]:
import pandas as pd
import re
from sklearn.metrics import f1_score

# Split a "<number> <unit>" string into a comparable (value, unit) pair
def normalize_text(text):
    """
    Return (value, unit) parsed from `text`, or None.

    The value is the first number found in the stripped text, as a float.
    The unit is everything after it on the same line (minus at most one
    separating whitespace character), lower-cased. Missing input and text
    without a digit give None.
    """
    if pd.isna(text):
        return None

    found = re.search(r"(\d+\.?\d*)\s?(.*)", text.strip())
    if found is None:
        return None

    return float(found.group(1)), found.group(2).lower()

# Compare prediction and ground truth as (value, unit) pairs
def is_match(row):
    """
    Return True when row['extracted_text'] and row['entity_value'] parse to
    the same unit and to values closer than 1e-6; False otherwise, including
    when either side cannot be parsed.
    """
    truth = normalize_text(row['entity_value'])
    guess = normalize_text(row['extracted_text'])

    # Either side failed to parse -> cannot be a match
    if not (truth and guess):
        return False

    (truth_value, truth_unit), (guess_value, guess_unit) = truth, guess
    return (abs(truth_value - guess_value) < 1e-6) and (truth_unit == guess_unit)

# Clean and normalize the columns
df['entity_value'] = df['entity_value'].fillna('').str.strip()
df['extracted_text'] = df['extracted_text'].fillna('').str.strip()

# Flag the rows whose prediction equals the ground truth (same value and unit)
df['match'] = df.apply(is_match, axis=1).astype(bool)

# Score the extraction explicitly instead of feeding (truth-present, match)
# into a binary classifier metric. In that setup the prediction could only be
# 1 where the label was 1, so a false positive was impossible and precision
# was always 1; a wrong value and a missing value were indistinguishable.
#   TP: a prediction was made and it matches the ground truth
#   FP: a prediction was made and it does not match (or there is no ground truth)
#   FN: no prediction was made although a ground truth exists
has_truth = df['entity_value'] != ''
has_prediction = df['extracted_text'] != ''

true_positives = int(df['match'].sum())
false_positives = int((has_prediction & ~df['match']).sum())
false_negatives = int((has_truth & ~has_prediction).sum())

# Guard every ratio so an empty sample gives 0.0 instead of dividing by zero
predicted_total = true_positives + false_positives
expected_total = true_positives + false_negatives
precision = true_positives / predicted_total if predicted_total else 0.0
recall = true_positives / expected_total if expected_total else 0.0

# Calculate F1 score
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"F1 Score: {f1:.2f}")
print(df)

F1 Score: 0.28
                                           image_link  group_id  entity_name  \
0   https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1   https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2   https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3   https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4   https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   
5   https://m.media-amazon.com/images/I/61QsBSE7jg...    731432  item_weight   
6   https://m.media-amazon.com/images/I/81xsq6vf2q...    731432  item_weight   
7   https://m.media-amazon.com/images/I/71DiLRHeZd...    731432  item_weight   
8   https://m.media-amazon.com/images/I/91Cma3Rzse...    731432  item_weight   
9   https://m.media-amazon.com/images/I/71jBLhmTNl...    731432  item_weight   
10  https://m.media-amazon.com/images/I/81N73b5khV...    149159  item_weight   
11  https://m.media-amazo

# Loading Merged Data (test_merged.csv)

In [None]:
import pandas as pd

df = pd.read_csv("/content/test_merged.csv")

In [None]:
df = df.drop(columns=['Unnamed: 0', 'image_link'])

In [None]:
df = df.head(50)

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords from NLTK
nltk.download('stopwords')

# Define preprocessing function

# Tokens that appear in the stopword list but are also unit abbreviations
# ('in' = inch, 'm' = metre). Dropping them would turn "36 in" into "36" and
# hide the unit from the quantity extraction that runs on the cleaned text.
_UNIT_STOPWORD_EXCEPTIONS = frozenset({'in', 'm'})

# NLTK English stopwords, loaded on first use and then reused for every row
_ENGLISH_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """
    Clean OCR text for quantity extraction.

    Steps: strip URLs, drop special characters, collapse whitespace,
    lower-case, and remove stopwords (unit abbreviations are kept).

    Args:
        text: Raw text. Anything that is not a string (e.g. NaN, None) is
            treated as empty.
        stop_words: Optional collection of lower-case words to remove.
            Defaults to NLTK's English stopword list.

    Returns:
        The cleaned text as a single space-separated string ("" for
        non-string input).
    """
    global _ENGLISH_STOP_WORDS

    if not isinstance(text, str):
        # Return an empty string if the input is not a string (e.g., NaN or None)
        return ""

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters except periods, colons, slashes, double quotes and hyphens
    text = re.sub(r'[^\w\s.:/"-]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    text = text.lower()

    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # Reading the corpus once is enough; it does not change between rows
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    # Remove stopwords, but never the unit abbreviations
    filtered_words = [
        word for word in text.split()
        if word not in stop_words or word in _UNIT_STOPWORD_EXCEPTIONS
    ]

    # Join words into a single string
    return ' '.join(filtered_words)

# Sample dataframe loading
# df = pd.read_csv('your_file.csv')

# Apply preprocessing function to the 'extracted_text' column; the result goes
# into a new 'cleaned_text' column, so 'extracted_text' itself is left untouched
df['cleaned_text'] = df['extracted_text'].apply(preprocess_text)

# Check the result: show both columns side by side for the first rows
print(df[['extracted_text', 'cleaned_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                      extracted_text  \
0       2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in   
1  Size Width Length One Size 42cm/16.54" 200cm/7...   
2  Size Width Length One Size 42cm/16.54" 200cm/7...   
3  Size Width Length One Size 42cm/16.54" 200cm/7...   
4  Size Width Length One Size 10.50cm/4.13" 90cm/...   

                                        cleaned_text  
0       2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in  
1  size width length one size 42cm/16.54" 200cm/7...  
2  size width length one size 42cm/16.54" 200cm/7...  
3  size width length one size 42cm/16.54" 200cm/7...  
4  size width length one size 10.50cm/4.13" 90cm/...  


In [None]:
import re
import pandas as pd
from transformers import pipeline

# Build the "<number><unit>" regex for one family of units
def build_pattern(units):
    """
    Return a regex with two groups: a number and one of `units`.

    Units are treated as literal text (they are escaped) and tried
    longest-first so that e.g. 'mm' wins over 'm'. The unit must end at a
    word boundary. Units of equal length keep their input order.
    """
    longest_first = sorted(units, key=len, reverse=True)
    alternation = '|'.join(map(re.escape, longest_first))
    return r"(\d+\.?\d*)\s*(" + alternation + r")\b"

# Define units per entity.
# Every entry must be a plain unit spelling, not a regex fragment:
# build_pattern escapes each entry, so a fragment such as r'm(?!m)' would only
# match that literal text and a bare "m" would never be found. The lookahead
# is not needed anyway, because build_pattern tries longer units first and
# requires a word boundary after the unit, so "5 mm" cannot be read as 'm'.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams', 'kilogram', 'kg', 'microgram', 'µg',
                'ounce', 'oz', 'pound', 'lb', 'ton']

length_units = ['millimetre', 'mm', 'millimeter', 'centimetre', 'cm', 'centimeter',
                'metre', 'meter', 'm', 'foot', 'ft', 'inch', 'in', 'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv', 'millivolt', 'mV', 'mv', 'volt', 'V', 'v']
wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']
volume_units = ['centilitre', 'cl', 'cubic foot', 'ft³', 'cubic inch', 'in³',
                'cup', 'decilitre', 'dl', 'fluid ounce', 'fl oz', 'gallon',
                'imperial gallon', 'litre', 'liter', 'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter', 'pint', 'quart']

# Define the extraction patterns for each entity type.
# The three dimensions share the length units; both weight entities share the
# weight units.
patterns = {
    'item_weight': build_pattern(weight_units),
    'depth': build_pattern(length_units),
    'width': build_pattern(length_units),
    'height': build_pattern(length_units),
    'voltage': build_pattern(voltage_units),
    'wattage': build_pattern(wattage_units),
    'item_volume': build_pattern(volume_units),
    'maximum_weight_recommendation': build_pattern(weight_units)
}

# Normalize unit names (all keys are lower-case, because the lookup
# lower-cases the captured unit first).
# Any spelling without an entry is emitted as captured. The alternate spellings
# and plurals that the patterns accept are therefore listed too; otherwise
# "5 centimeter" or "5 grams" would not be reduced to the canonical unit name.
unit_mappings = {
    'mg': 'milligram', 'g': 'gram', 'kg': 'kilogram', 'µg': 'microgram',
    'miligram': 'milligram', 'milligrams': 'milligram', 'miligrammes': 'milligram',
    'grams': 'gram',
    'oz': 'ounce', 'lb': 'pound', 'ton': 'ton',
    'cm': 'centimetre', 'mm': 'millimetre', 'm': 'metre', 'ft': 'foot', 'in': 'inch',
    'centimeter': 'centimetre', 'millimeter': 'millimetre', 'meter': 'metre',
    'yd': 'yard',
    'kv': 'kilovolt', 'mv': 'millivolt', 'v': 'volt', 'kw': 'kilowatt', 'w': 'watt',
    'cl': 'centilitre', 'dl': 'decilitre', 'ft³': 'cubic foot', 'in³': 'cubic inch',
    'cup': 'cup', 'fl oz': 'fluid ounce', 'gallon': 'gallon', 'litre': 'litre',
    'liter': 'litre', 'milliliter': 'millilitre', 'microliter': 'microlitre',
    'ml': 'millilitre', 'pint': 'pint', 'quart': 'quart'
}

# Collect every "<number> <unit>" candidate for the row's entity
def extract_quantities(row):
    """
    Return all quantities found in row['cleaned_text'] for row['entity_name'],
    each formatted as "<float value> <normalised unit>", in order of
    appearance.

    Returns an empty list when the text is missing, when the entity has no
    pattern, or when nothing matches.
    """
    cleaned, entity = row['cleaned_text'], row['entity_name']

    # No text, no candidates
    if pd.isna(cleaned):
        return []

    entity_regex = patterns.get(entity)
    if not entity_regex:
        return []

    # findall yields one (number, unit) tuple per occurrence;
    # units without a mapping keep their lower-cased spelling
    return [
        f"{float(number)} {unit_mappings.get(unit.lower(), unit.lower())}"
        for number, unit in re.findall(entity_regex, cleaned, re.IGNORECASE)
    ]

# Use a zero-shot classification model for faster performance
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Function to pick one value-unit pair for the row, using the zero-shot
# classifier to rank the candidates against the cleaned text
def select_value_using_llm(row):
    """
    Choose the most plausible candidate from row['extracted_values'].

    Args:
        row: Mapping with 'cleaned_text' (the text the candidates came from),
            'entity_name' (used in the hypothesis) and 'extracted_values'
            (list of "<value> <unit>" strings).

    Returns:
        The selected candidate, or "" when there are no candidates or the
        classifier call fails (a message is printed in both cases).
    """
    cleaned_text = row['cleaned_text']
    entity_name = row['entity_name']
    extracted_values = row['extracted_values']

    # If no extracted values, return empty string
    if not extracted_values:
        print(f"Warning: No extracted values for row with entity '{entity_name}'")
        return ""

    # Repeated candidates would be scored as separate labels; keep the first
    # occurrence of each, preserving order
    candidate_values = list(dict.fromkeys(extracted_values))

    # With a single distinct candidate there is nothing to rank, so skip the
    # model call; the classifier could only return that same label
    if len(candidate_values) == 1:
        return candidate_values[0]

    # Use the classifier to score each candidate value
    try:
        result = classifier(
            sequences=cleaned_text,
            candidate_labels=candidate_values,
            hypothesis_template=f"The correct value for {entity_name} is {{}}.",
            multi_label=False
        )
        # Labels come back sorted by score, best first
        return result['labels'][0]
    except Exception as e:
        # Best effort: a failed model call must not abort the whole apply()
        print(f"Error during LLM call for row with entity '{entity_name}': {e}")
        return ""

# Apply the extraction function to get value-unit pairs
# (one list of "<value> <unit>" strings per row; empty when nothing matched)
df['extracted_values'] = df.apply(extract_quantities, axis=1)

# Use the LLM to select the appropriate value-unit pair
# ("" is stored for rows without candidates or when the model call fails)
df['selected_value'] = df.apply(select_value_using_llm, axis=1)

# Display the final DataFrame with the predicted values
df