In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images, parse_string
from constants import allowed_units
from pathlib import Path

# Download images: read the train/test tables and fetch every referenced image
# into a single folder (download_images comes from the project's utils module).
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
download_images(train_df['image_link'].tolist() + test_df['image_link'].tolist(), download_folder)

# Load the backbone under its own name. Previously `model` was bound to the
# ResNet50 backbone and then rebound to the classifier, so the same name meant
# two different objects depending on how far the cell had run.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # only the new Dense head is trained

# Classifier head: backbone -> global average pooling -> softmax with one
# output per entry of allowed_units.
inputs = keras.layers.Input(shape=(224, 224, 3))
x = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(len(allowed_units), activation='softmax')(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Preprocess images
def preprocess_image(image_path):
    """Load one image file and convert it into a ResNet50-ready batch of one.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``resnet50.preprocess_input`` for the image
        resized to 224x224 with a leading batch axis added, or ``None`` if the
        file could not be read or decoded (the error is printed).
    """
    try:
        # The context manager releases the file handle even when decoding fails.
        with Image.open(image_path) as source:
            # Force three channels: grayscale, palette, RGBA or CMYK files would
            # otherwise produce arrays that cannot be concatenated with RGB ones
            # or fed to the (224, 224, 3) model input.
            image = source.convert('RGB').resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add the batch axis
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best effort: a broken image is reported and skipped by the callers.
        print(f"Error processing image {image_path}: {e}")
        return None

# Prepare training data
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
train_labels = train_df['entity_value'].tolist()

# Fix the class order once and reuse it for encoding and decoding. A set has no
# stable iteration order across kernel sessions, so sorting keeps index -> unit
# reproducible.
unit_classes = sorted(allowed_units)
unit_to_index = {unit: idx for idx, unit in enumerate(unit_classes)}

# Filter out invalid labels. Images and labels are filtered together: filtering
# the labels alone and zipping them with the full image list afterwards pairs
# every label after a skipped one with the wrong image.
valid_labels = []
filtered_images = []
for img_path, label in zip(train_images, train_labels):
    if not isinstance(label, str):
        # e.g. NaN for an empty CSV cell
        print(f"Skipping invalid label: {label} - not a string")
        continue
    # Ranges look like "10 kilogram to 15 kilogram". Test for the whole word:
    # a plain substring test would also reject every "ton" label.
    if "to" in label.split():
        print(f"Skipping invalid label: {label} - Range not supported")
        continue
    try:
        number, unit = parse_string(label)
    except ValueError as e:
        print(f"Skipping invalid label: {label} - {e}")
        continue
    if unit not in unit_to_index:
        print(f"Skipping label with invalid unit: {label}")
        continue
    valid_labels.append(unit)  # Only store the unit
    filtered_images.append(img_path)

# Only keep pairs whose image could actually be loaded
processed_images = []
valid_labels_filtered = []

for img_path, lbl in zip(filtered_images, valid_labels):
    processed_image = preprocess_image(img_path)
    if processed_image is not None:
        processed_images.append(processed_image)
        valid_labels_filtered.append(lbl)

# Convert labels to categorical format (every label is known to be in
# unit_to_index, so nothing is dropped here and the counts stay equal)
train_labels_categorical = tf.keras.utils.to_categorical(
    [unit_to_index[lbl] for lbl in valid_labels_filtered],
    num_classes=len(unit_classes)
)

# Train model
if processed_images and len(processed_images) == len(train_labels_categorical):
    model.fit(np.concatenate(processed_images),
              train_labels_categorical,
              epochs=10,
              batch_size=32,
              validation_split=0.2)
else:
    print("Mismatch in number of processed images and labels. Training aborted.")

# Generate predictions
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []  # preprocessed test images (name kept from the original cell)
test_rows = []   # position in test_df of each entry in test_preds

for row, path in enumerate(test_images):
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)
        test_rows.append(row)

# Format output
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))
    # One entry per test row; rows whose image failed to load keep an empty
    # prediction instead of shifting every later prediction (or making the
    # column assignment fail on a length mismatch).
    predictions = [""] * len(test_df)
    # NOTE(review): the numeric part is a constant placeholder; the model only
    # predicts the unit.
    for row, cls in zip(test_rows, test_preds_array.argmax(axis=1)):
        predictions[row] = f"{1.0} {unit_classes[cls]}"
    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = predictions
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    print("No valid test images processed. Predictions cannot be generated.")


2024-09-16 10:59:35.729608: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 10:59:38.972908: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 10:59:39.000907: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████████████████████████████████| 187/187 [00:00<00:00, 47948.09it/s]


Skipping invalid label: 10 kilogram to 15 kilogram - Range not supported
Error processing image /home/rguktrkvalley/Desktop/images/41wvffSxB4L.jpg: image file is truncated (6 bytes not processed)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images, parse_string
from constants import allowed_units
from pathlib import Path

# Original entity unit map: units accepted for each entity type
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Mapping of abbreviations to full forms
unit_abbreviation_map = {
    'cm': 'centimetre',
    'ft': 'foot',
    'in': 'inch',
    'm': 'metre',
    'mm': 'millimetre',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'µg': 'microgram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    't': 'ton',
    'kV': 'kilovolt',
    'mV': 'millivolt',
    'V': 'volt',
    'kW': 'kilowatt',
    'W': 'watt',
    'cl': 'centilitre',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'cup': 'cup',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'L': 'litre',
    'µL': 'microlitre',
    'mL': 'millilitre',
    'pt': 'pint',
    'qt': 'quart'
}

# Case-folded copy of the abbreviation table, used only as a fallback so that
# variants such as 'ml', 'KG' or 'Cm' still resolve. No two keys collide once
# case-folded.
_unit_abbreviation_map_folded = {
    abbreviation.casefold(): full_name
    for abbreviation, full_name in unit_abbreviation_map.items()
}

# Generate the allowed_units set: the union of every per-entity unit set.
# It replaces the `allowed_units` imported from constants and is built before
# the helpers below, which read it.
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}


# Function to map abbreviations to full forms and return the full form if present
def get_full_unit(unit):
    """Return the full unit name for an abbreviation.

    Lookup order:
      1. exact, case-sensitive match in ``unit_abbreviation_map``;
      2. case-insensitive match on the stripped abbreviation;
      3. a full unit name written with different casing or surrounding
         whitespace (for example ``' Gram '``).

    Anything that matches none of these (including non-string values) is
    returned unchanged, so callers can still test it against ``allowed_units``.
    """
    if unit in unit_abbreviation_map:
        return unit_abbreviation_map[unit]
    if not isinstance(unit, str):
        return unit
    folded = unit.strip().casefold()
    if folded in _unit_abbreviation_map_folded:
        return _unit_abbreviation_map_folded[folded]
    if folded in allowed_units:
        return folded
    return unit


# Function to check if a unit is allowed (including handling abbreviations)
def is_unit_allowed(unit):
    """Return True if ``unit``, after abbreviation expansion, is in ``allowed_units``."""
    full_unit = get_full_unit(unit)
    return full_unit in allowed_units

# Read the train/test tables, then fetch every image they reference into one folder
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
image_links = train_df['image_link'].tolist() + test_df['image_link'].tolist()
download_images(image_links, download_folder)

# ImageNet-pretrained ResNet50 without its classification top, kept frozen
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
base_model.trainable = False

# Classifier head: backbone features -> global average pooling -> softmax with
# one output per allowed unit
inputs = keras.layers.Input(shape=(224, 224, 3))
backbone_features = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(backbone_features)
outputs = keras.layers.Dense(len(allowed_units), activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Preprocess images
def preprocess_image(image_path):
    """Load one image file and convert it into a ResNet50-ready batch of one.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``resnet50.preprocess_input`` for the image
        resized to 224x224 with a leading batch axis added, or ``None`` if the
        file could not be read or decoded (the error is printed).
    """
    try:
        # The context manager releases the file handle even when decoding fails.
        with Image.open(image_path) as source:
            # Force three channels: grayscale, palette, RGBA or CMYK files would
            # otherwise produce arrays that cannot be concatenated with RGB ones
            # or fed to the (224, 224, 3) model input.
            image = source.convert('RGB').resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add the batch axis
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best effort: a broken image is reported and skipped by the callers.
        print(f"Error processing image {image_path}: {e}")
        return None

# Prepare training data
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
train_labels = train_df['entity_value'].tolist()

# Fix the class order once and reuse it for encoding and decoding. A set has no
# stable iteration order across kernel sessions, so sorting keeps index -> unit
# reproducible.
unit_classes = sorted(allowed_units)
unit_to_index = {unit: idx for idx, unit in enumerate(unit_classes)}

# Filter out invalid labels (each image stays paired with its own label)
valid_labels = []
filtered_images = []

for img_path, label in zip(train_images, train_labels):
    if not isinstance(label, str):
        # e.g. NaN for an empty CSV cell
        print(f"Skipping invalid label: {label} - not a string")
        continue
    # Ranges look like "10 kilogram to 15 kilogram". Test for the whole word:
    # a plain substring test would also reject every "ton" label.
    if "to" in label.split():
        print(f"Skipping invalid label: {label} - Range not supported")
        continue
    try:
        number, unit = parse_string(label)
    except ValueError as e:
        print(f"Skipping invalid label: {label} - {e}")
        continue
    full_unit = get_full_unit(unit)
    if full_unit not in unit_to_index:
        print(f"Skipping label with invalid unit: {label}")
        continue
    valid_labels.append(full_unit)
    filtered_images.append(img_path)

# Preprocess the valid images, dropping pairs whose image cannot be loaded
processed_images = []
valid_labels_filtered = []

for img_path, lbl in zip(filtered_images, valid_labels):
    processed_image = preprocess_image(img_path)
    if processed_image is not None:
        processed_images.append(processed_image)
        valid_labels_filtered.append(lbl)

# Convert labels to categorical format
if valid_labels_filtered:
    train_labels_categorical = tf.keras.utils.to_categorical(
        [unit_to_index[lbl] for lbl in valid_labels_filtered],
        num_classes=len(unit_classes)
    )
else:
    print("No valid labels available for training.")

# Train the model
if processed_images and len(processed_images) == len(train_labels_categorical):
    model.fit(np.concatenate(processed_images),
              train_labels_categorical,
              epochs=10,
              batch_size=32,
              validation_split=0.2)
else:
    print("Mismatch in number of processed images and labels. Training aborted.")

# Generate predictions for test images
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []  # preprocessed test images (name kept from the original cell)
test_rows = []   # position in test_df of each entry in test_preds

for row, path in enumerate(test_images):
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)
        test_rows.append(row)

# Format and save the prediction output
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))
    # One entry per test row; rows whose image failed to load keep an empty
    # prediction instead of shifting every later prediction (or making the
    # column assignment fail on a length mismatch).
    predictions = [""] * len(test_df)
    # NOTE(review): the numeric part is a constant placeholder; the model only
    # predicts the unit.
    for row, cls in zip(test_rows, test_preds_array.argmax(axis=1)):
        predictions[row] = f"{1.0} {unit_classes[cls]}"
    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = predictions
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    print("No valid test images processed. Predictions cannot be generated.")

2024-09-16 11:12:46.520773: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 11:12:46.595934: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 11:12:46.597269: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|█████████████████████████████████████| 187/187 [00:00<00:00, 189957.58it/s]


Skipping invalid label: 10 kilogram to 15 kilogram - Range not supported
Error processing image /home/rguktrkvalley/Desktop/images/41wvffSxB4L.jpg: image file is truncated (6 bytes not processed)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [2]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images, parse_string
from constants import allowed_units
from pathlib import Path
import logging

# Original entity unit map: units accepted for each entity type
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Mapping of abbreviations to full forms
unit_abbreviation_map = {
    'cm': 'centimetre',
    'ft': 'foot',
    'in': 'inch',
    'm': 'metre',
    'mm': 'millimetre',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'µg': 'microgram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    't': 'ton',
    'kV': 'kilovolt',
    'mV': 'millivolt',
    'V': 'volt',
    'kW': 'kilowatt',
    'W': 'watt',
    'cl': 'centilitre',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'cup': 'cup',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'L': 'litre',
    'µL': 'microlitre',
    'mL': 'millilitre',
    'pt': 'pint',
    'qt': 'quart'
}

# Case-folded copy of the abbreviation table, used only as a fallback so that
# variants such as 'ml', 'KG' or 'Cm' still resolve. No two keys collide once
# case-folded.
_unit_abbreviation_map_folded = {
    abbreviation.casefold(): full_name
    for abbreviation, full_name in unit_abbreviation_map.items()
}

# Generate the allowed_units set: the union of every per-entity unit set.
# It replaces the `allowed_units` imported from constants and is built before
# the helpers below, which read it.
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}


# Function to map abbreviations to full forms and return the full form if present
def get_full_unit(unit):
    """Return the full unit name for an abbreviation.

    Lookup order:
      1. exact, case-sensitive match in ``unit_abbreviation_map``;
      2. case-insensitive match on the stripped abbreviation;
      3. a full unit name written with different casing or surrounding
         whitespace (for example ``' Gram '``).

    Anything that matches none of these (including non-string values) is
    returned unchanged, so callers can still test it against ``allowed_units``.
    """
    if unit in unit_abbreviation_map:
        return unit_abbreviation_map[unit]
    if not isinstance(unit, str):
        return unit
    folded = unit.strip().casefold()
    if folded in _unit_abbreviation_map_folded:
        return _unit_abbreviation_map_folded[folded]
    if folded in allowed_units:
        return folded
    return unit


# Function to check if a unit is allowed (including handling abbreviations)
def is_unit_allowed(unit):
    """Return True if ``unit``, after abbreviation expansion, is in ``allowed_units``."""
    full_unit = get_full_unit(unit)
    return full_unit in allowed_units

# Send log records of level INFO and above to a file rather than the cell output
logging.basicConfig(level=logging.INFO, filename='image_processing.log')

# Preprocess images
def preprocess_image(image_path):
    """Load one image file and convert it into a ResNet50-ready batch of one.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``resnet50.preprocess_input`` for the image
        resized to 224x224 with a leading batch axis added, or ``None`` if the
        file could not be read or decoded (the error is logged).
    """
    try:
        # The context manager releases the file handle even when decoding fails.
        with Image.open(image_path) as source:
            # Force three channels: grayscale, palette, RGBA or CMYK files would
            # otherwise produce arrays that cannot be concatenated with RGB ones
            # or fed to the (224, 224, 3) model input.
            image = source.convert('RGB').resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add the batch axis
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best effort: a broken image is logged and skipped by the callers.
        logging.error(f"Error processing image {image_path}: {e}")
        return None

# Read the train/test tables, then fetch every image they reference into one folder
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
image_links = train_df['image_link'].tolist() + test_df['image_link'].tolist()
download_images(image_links, download_folder)

# ImageNet-pretrained ResNet50 without its classification top, kept frozen
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
base_model.trainable = False

# Classifier head: backbone features -> global average pooling -> softmax with
# one output per allowed unit
inputs = keras.layers.Input(shape=(224, 224, 3))
backbone_features = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(backbone_features)
outputs = keras.layers.Dense(len(allowed_units), activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Prepare training data
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
train_labels = train_df['entity_value'].tolist()

# Fix the class order once and reuse it for encoding and decoding. A set has no
# stable iteration order across kernel sessions, so sorting keeps index -> unit
# reproducible.
unit_classes = sorted(allowed_units)
unit_to_index = {unit: idx for idx, unit in enumerate(unit_classes)}

# Filter out invalid labels (each image stays paired with its own label)
valid_labels = []
filtered_images = []

for img_path, label in zip(train_images, train_labels):
    if not isinstance(label, str):
        # e.g. NaN for an empty CSV cell
        logging.warning(f"Skipping non-string label: {label}")
        continue
    # Ranges look like "10 kilogram to 15 kilogram". Test for the whole word:
    # a plain substring test would also reject every "ton" label.
    if "to" in label.split():
        logging.warning(f"Skipping range-based label: {label}")
        continue
    try:
        number, unit = parse_string(label)
    except ValueError as e:
        logging.error(f"Error parsing label {label}: {e}")
        continue
    full_unit = get_full_unit(unit)
    if full_unit not in unit_to_index:
        logging.warning(f"Invalid unit in label: {label}")
        continue
    valid_labels.append(full_unit)
    filtered_images.append(img_path)

# Preprocess the valid images, dropping pairs whose image cannot be loaded
processed_images = []
valid_labels_filtered = []

for img_path, lbl in zip(filtered_images, valid_labels):
    processed_image = preprocess_image(img_path)
    if processed_image is not None:
        processed_images.append(processed_image)
        valid_labels_filtered.append(lbl)

# Convert labels to categorical format
if valid_labels_filtered:
    train_labels_categorical = tf.keras.utils.to_categorical(
        [unit_to_index[lbl] for lbl in valid_labels_filtered],
        num_classes=len(unit_classes)
    )
else:
    print("No valid labels available for training.")

# Train the model
if processed_images and len(processed_images) == len(train_labels_categorical):
    model.fit(np.concatenate(processed_images),
              train_labels_categorical,
              epochs=10,
              batch_size=32,
              validation_split=0.2)
else:
    print("Mismatch in number of processed images and labels. Training aborted.")

# Generate predictions for test images
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []  # preprocessed test images (name kept from the original cell)
test_rows = []   # position in test_df of each entry in test_preds

for row, path in enumerate(test_images):
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)
        test_rows.append(row)

# Format and save the prediction output
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))

    # Get the predicted class and its probability
    predicted_classes = test_preds_array.argmax(axis=1)
    predicted_probabilities = test_preds_array.max(axis=1)

    # One entry per test row; rows whose image failed to load keep an empty
    # prediction instead of shifting every later prediction (or making the
    # column assignment fail on a length mismatch).
    predictions = [""] * len(test_df)
    # NOTE(review): the number written here is the softmax probability of the
    # predicted unit, not a measured quantity - confirm this is the intended
    # output format.
    for row, prob, cls in zip(test_rows, predicted_probabilities, predicted_classes):
        predictions[row] = f"{prob:.2f} {unit_classes[cls]}"

    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = predictions

    # Save to CSV
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    print("No valid test images processed. Predictions cannot be generated.")


100%|██████████████████████████████████████| 187/187 [00:00<00:00, 26983.69it/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [3]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images
from pathlib import Path
import pytesseract
import re
import logging

# Send log records of level INFO and above to a file rather than the cell output
logging.basicConfig(level=logging.INFO, filename='image_processing.log')

# OCR-based text extraction using pytesseract
def extract_text_from_image(image_path):
    """Run pytesseract OCR on an image file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string``, or ``None`` if
        opening the file or running OCR raised (the error is logged).
    """
    try:
        # The context manager closes the file once OCR has finished, instead of
        # leaving the handle open until the image object is garbage collected.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        logging.error(f"Error extracting text from image {image_path}: {e}")
        return None

# Function to extract unit and value from text
def extract_value_and_unit(text):
    """Return the first "<number> <weight unit>" found in ``text``.

    Recognised units (case-insensitive): gram, kilogram, kg, g, pound, lb,
    ounce, oz.

    Args:
        text: Text to search, typically OCR output.

    Returns:
        The matched number and unit as they appear in ``text`` (for example
        ``"500 g"``), or ``None`` when nothing matches or ``text`` is not a
        string. A plural "s" after the unit is accepted but not included in
        the returned string.
    """
    if not isinstance(text, str):
        logging.warning(f"No value and unit found in text: {text}")
        return None

    # The trailing `s?\b` sits outside the capture group: it requires the unit
    # to end at a word boundary (so "500 games" no longer yields "500 g")
    # while still accepting plurals such as "grams" or "lbs".
    pattern = re.compile(
        r'(\d+(\.\d+)?\s?(gram|kilogram|kg|g|pound|lb|ounce|oz))s?\b',
        re.IGNORECASE,
    )
    match = pattern.search(text)
    if match:
        return match.group(1)  # the first match (number + unit)

    logging.warning(f"No value and unit found in text: {text}")
    return None

# Preprocess images
def preprocess_image(image_path):
    """Load one image file and convert it into a ResNet50-ready batch of one.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``resnet50.preprocess_input`` for the image
        resized to 224x224 with a leading batch axis added, or ``None`` if the
        file could not be read or decoded (the error is logged).
    """
    try:
        # The context manager releases the file handle even when decoding fails.
        with Image.open(image_path) as source:
            # Force three channels: grayscale, palette, RGBA or CMYK files would
            # otherwise produce arrays that cannot be concatenated with RGB ones
            # or fed to the (224, 224, 3) model input.
            image = source.convert('RGB').resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add the batch axis
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best effort: a broken image is logged and skipped by the callers.
        logging.error(f"Error processing image {image_path}: {e}")
        return None

# ImageNet-pretrained ResNet50 without its classification top, kept frozen
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
base_model.trainable = False

# Classifier head: backbone features -> global average pooling -> softmax.
# The 1000-way output is a placeholder size (see the inline note).
inputs = keras.layers.Input(shape=(224, 224, 3))
backbone_features = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(backbone_features)
outputs = keras.layers.Dense(1000, activation='softmax')(x)  # Modify based on the actual classes

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Read the train/test tables, then fetch every image they reference into one folder
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
image_links = train_df['image_link'].tolist() + test_df['image_link'].tolist()
download_images(image_links, download_folder)

# Prepare training data
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
# NOTE(review): train_labels is not used below; the labels come from OCR instead.
train_labels = train_df['entity_value'].tolist()

# Keep only the images in which OCR finds a "<number> <unit>" string
valid_labels = []
filtered_images = []

for img_path in train_images:
    # Step 1: Extract text from the image using OCR
    extracted_text = extract_text_from_image(img_path)
    if not extracted_text:
        logging.warning(f"No text extracted from {img_path}")
        continue

    # Step 2: Extract the value and unit from the text
    value_and_unit = extract_value_and_unit(extracted_text)
    if not value_and_unit:
        logging.warning(f"No valid value and unit found in {img_path}")
        continue

    # Log the extracted value and unit
    logging.info(f"Extracted from {img_path}: {value_and_unit}")
    valid_labels.append(value_and_unit)
    filtered_images.append(img_path)

# Preprocess the valid images, dropping pairs whose image cannot be loaded
processed_images = []
valid_labels_filtered = []

for img_path, lbl in zip(filtered_images, valid_labels):
    processed_image = preprocess_image(img_path)
    if processed_image is not None:
        processed_images.append(processed_image)
        valid_labels_filtered.append(lbl)

# Train the model if valid images are available
if valid_labels_filtered and processed_images:
    print(f"Training with {len(processed_images)} images and labels.")
    # TODO: every sample is assigned class 0, so this fit cannot learn anything
    # useful; replace with real class indices before relying on the model.
    train_labels_categorical = tf.keras.utils.to_categorical(
        [0] * len(valid_labels_filtered),  # Dummy labels for now, adjust for your case
        num_classes=1000  # Adjust based on your output classes
    )
    model.fit(np.concatenate(processed_images),
              train_labels_categorical,
              epochs=10,
              batch_size=32,
              validation_split=0.2)
else:
    logging.error("No valid labels or images for training.")

# Generate predictions for test images
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []  # preprocessed test images (name kept from the original cell)
test_rows = []   # position in test_df of each entry in test_preds

for row, path in enumerate(test_images):
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)
        test_rows.append(row)

# Save predictions
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))

    # One entry per test row. Building the list from test_preds_array alone
    # makes it shorter than test_df whenever an image fails to load, and the
    # column assignment then raises; failed rows get an empty prediction.
    predictions = [""] * len(test_df)
    for row in test_rows:
        predictions[row] = 'Predicted_Class'  # Placeholder

    # Format and save the prediction output
    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = predictions
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    logging.error("No valid test images processed. Predictions cannot be generated.")


100%|███████████████████████████████████████| 187/187 [00:00<00:00, 6030.19it/s]


Training with 32 images and labels.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [5]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images
from constants import allowed_units
from pathlib import Path
import pytesseract
import re
import logging

# Original entity unit map: units accepted for each entity type
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Mapping of abbreviations to full forms
unit_abbreviation_map = {
    'cm': 'centimetre',
    'ft': 'foot',
    'in': 'inch',
    'm': 'metre',
    'mm': 'millimetre',
    'yd': 'yard',
    'g': 'gram',
    'kg': 'kilogram',
    'µg': 'microgram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    't': 'ton',
    'kV': 'kilovolt',
    'mV': 'millivolt',
    'V': 'volt',
    'kW': 'kilowatt',
    'W': 'watt',
    'cl': 'centilitre',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'cup': 'cup',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'L': 'litre',
    'µL': 'microlitre',
    'mL': 'millilitre',
    'pt': 'pint',
    'qt': 'quart'
}

# Case-folded copy of the abbreviation table, used only as a fallback so that
# variants such as 'ml', 'KG' or 'Cm' (common in OCR output) still resolve.
# No two keys collide once case-folded.
_unit_abbreviation_map_folded = {
    abbreviation.casefold(): full_name
    for abbreviation, full_name in unit_abbreviation_map.items()
}

# Generate the allowed_units set: the union of every per-entity unit set.
# It replaces the `allowed_units` imported from constants and is built before
# the helper below, which reads it.
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}


# Function to map abbreviations to full forms and return the full form if present
def get_full_unit(unit):
    """Return the full unit name for an abbreviation.

    Lookup order:
      1. exact, case-sensitive match in ``unit_abbreviation_map``;
      2. case-insensitive match on the stripped abbreviation;
      3. a full unit name written with different casing or surrounding
         whitespace (for example ``' Gram '``).

    Anything that matches none of these (including non-string values) is
    returned unchanged, so callers can still test it against ``allowed_units``.
    """
    if unit in unit_abbreviation_map:
        return unit_abbreviation_map[unit]
    if not isinstance(unit, str):
        return unit
    folded = unit.strip().casefold()
    if folded in _unit_abbreviation_map_folded:
        return _unit_abbreviation_map_folded[folded]
    if folded in allowed_units:
        return folded
    return unit

# Send log records of level INFO and above to a file rather than the cell output
logging.basicConfig(level=logging.INFO, filename='image_processing.log')

def extract_text_from_image(image_path):
    """Run pytesseract OCR on an image file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string``; an empty
        string when OCR returns nothing or when opening the file or running
        OCR raised (the error is logged).
    """
    try:
        # The context manager closes the file once OCR has finished, instead of
        # leaving the handle open until the image object is garbage collected.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text if text else ""
    except Exception as e:
        logging.error(f"Error extracting text from image {image_path}: {e}")
        return ""


def parse_extracted_text(text):
    """Find every "<number> <unit>" pair in ``text`` whose unit is allowed.

    Args:
        text: Text to scan, typically OCR output.

    Returns:
        A list of ``"<number> <full unit name>"`` strings in order of
        appearance; an empty list when ``text`` is empty or not a string.
        Units are normalised with ``get_full_unit`` and kept only if the
        result is in ``allowed_units``.
    """
    # Ensure text is not None or empty
    if not text or not isinstance(text, str):
        logging.warning("No valid text extracted to parse.")
        return []

    # A number followed by one word and, optionally, a second word on the same
    # line. Capturing a single word can never match the two-word units in the
    # tables ("fl oz", "cu ft", "fluid ounce", ...). Words are letters only.
    pattern = r"(\d+\.?\d*)\s*([^\W\d_]+)(?:[ \t]+([^\W\d_]+))?"

    valid_matches = []
    for number, first_word, second_word in re.findall(pattern, text):
        # Prefer the longer two-word reading, then fall back to the first word
        # alone (e.g. "100 g net" -> "g").
        candidates = [first_word]
        if second_word:
            candidates.insert(0, f"{first_word} {second_word}")
        for candidate in candidates:
            full_unit = get_full_unit(candidate)
            if full_unit in allowed_units:
                valid_matches.append(f"{number} {full_unit}")
                break

    return valid_matches

# Preprocess images for the neural network model
def preprocess_image(image_path):
    """Load one image file and convert it into a ResNet50-ready batch of one.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``resnet50.preprocess_input`` for the image
        resized to 224x224 with a leading batch axis added, or ``None`` if the
        file could not be read or decoded (the error is logged).
    """
    try:
        # The context manager releases the file handle even when decoding fails.
        with Image.open(image_path) as source:
            # Force three channels: grayscale, palette, RGBA or CMYK files would
            # otherwise produce arrays that cannot be concatenated with RGB ones
            # or fed to a (224, 224, 3) model input.
            image = source.convert('RGB').resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)  # add the batch axis
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best effort: a broken image is logged and skipped by the callers.
        logging.error(f"Error processing image {image_path}: {e}")
        return None

# Read the train/test tables, then fetch every image they reference into one folder
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
image_links = train_df['image_link'].tolist() + test_df['image_link'].tolist()
download_images(image_links, download_folder)

# OCR every train image, then every test image, and print the parsed
# "<number> <unit>" strings for each file
for source_df in (train_df, test_df):
    image_paths = [os.path.join(download_folder, Path(link).name) for link in source_df['image_link']]
    for img_path in image_paths:
        text = extract_text_from_image(img_path)
        parsed_data = parse_extracted_text(text)
        print(f"Extracted data from {img_path}: {parsed_data}")


100%|██████████████████████████████████████| 187/187 [00:00<00:00, 30444.24it/s]


Extracted data from /home/rguktrkvalley/Desktop/images/61I9XdN6OFL.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/71gSRbyXmoL.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/61BZ4zrjZXL.jpg: ['0.709 gram', '200 milligram', '50 milligram', '25 milligram', '25 milligram', '25 milligram', '25 milligram', '25 milligram', '10 milligram', '051 gram', '02 gram', '0.09 gram']
Extracted data from /home/rguktrkvalley/Desktop/images/612mrlqiI4L.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/617Tl40LOXL.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/61QsBSE7jgL.jpg: ['1400 milligram']
Extracted data from /home/rguktrkvalley/Desktop/images/81xsq6vf2qL.jpg: ['1400 milligram']
Extracted data from /home/rguktrkvalley/Desktop/images/71DiLRHeZdL.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/91Cma3RzseL.jpg: []
Extracted data from /home/rguktrkvalley/Desktop/images/71jBLhmTNlL.jpg: []
Extracted data from /home/rguktrkvalley/D