In [32]:
import zipfile
import os

# Archive uploaded to the working directory; change the name if yours differs
zip_file = 'student_resource 3.zip'

# Unpack the whole archive into the current working directory
# (ZipFile opens in read mode by default)
with zipfile.ZipFile(zip_file) as zip_ref:
    zip_ref.extractall()

# Show the working directory so the extracted folder can be confirmed
os.listdir()

['.config', 'student_resource 3', 'student_resource 3.zip', 'sample_data']

In [15]:
# The three linear dimensions share one unit vocabulary and the two weight
# entities share another; each entity still gets its own independent set.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')

# Entity name -> set of unit names that are valid for that entity
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt', 'horsepower'},  # Added 'horsepower'
    'item_volume': {
        'centilitre', 'fluid ounce', 'cubic foot', 'cubic inch', 'cup',
        'decilitre', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

# Flat set of every unit that is valid for at least one entity
allowed_units = set().union(*entity_unit_map.values())


In [16]:
import re
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

In [17]:
def common_mistake(unit):
    """Map a frequently misspelled unit onto its allowed spelling.

    An already-allowed unit is returned as is. Otherwise two substitutions
    are tried in order -- 'ter' -> 'tre', then 'feet' -> 'foot' -- and the
    first result found in ``allowed_units`` is returned. If neither helps,
    the input comes back unchanged.
    """
    if unit in allowed_units:
        return unit

    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        corrected = unit.replace(wrong, right)
        if corrected in allowed_units:
            return corrected

    return unit

In [18]:
def parse_string(s):
    """Parse a '<number> <unit>' string into ``(float, unit)``.

    Args:
        s: The value to parse. ``None``, a NaN-like value (anything whose
            ``str()`` is ``'nan'``) and blank strings mean "no prediction".
            Any other non-string is converted with ``str()`` first, so it is
            rejected with ``ValueError`` rather than an ``AttributeError``.

    Returns:
        ``(None, None)`` for an empty input, otherwise ``(number, unit)``
        where ``unit`` has been passed through ``common_mistake``.

    Raises:
        ValueError: if the text is not '<number> <letters/spaces>' or if the
            unit is not in ``allowed_units``.
    """
    if s is None or str(s) == 'nan':
        return None, None

    s_stripped = str(s).strip()
    if s_stripped == "":
        return None, None

    # Optional sign, integer or decimal number, whitespace, then a unit made
    # only of letters and spaces.
    if not re.match(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$', s_stripped):
        raise ValueError("Invalid format in {}".format(s))

    number_text, unit_text = s_stripped.split(maxsplit=1)
    number = float(number_text)
    unit = common_mistake(unit_text)
    if unit not in allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, allowed_units))
    return number, unit

In [19]:
def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is a best-effort fallback: any error while creating or saving the
    image is reported on stdout and otherwise ignored, so the caller never
    sees an exception.

    Args:
        image_save_path: Destination file path for the placeholder.

    Returns:
        None.
    """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Stay best-effort, but say what went wrong instead of hiding it
        print(f"Could not create placeholder image {image_save_path}: {e}")

In [20]:
def download_image(image_link, save_folder, retries=3, delay=3, timeout=30):
    """Download one image into ``save_folder``, retrying on failure.

    The file is stored under the last path component of ``image_link``. If a
    file with that name already exists nothing is downloaded. When every
    attempt fails, ``create_placeholder_image`` is called for the target path.

    Args:
        image_link: URL of the image; non-string values are ignored.
        save_folder: Existing directory to store the file in.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between attempts.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block a worker indefinitely.

    Returns:
        None.
    """
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Stream into a process-unique temporary file and rename it on success.
    # An interrupted transfer therefore never leaves a truncated file under
    # the final name, which a later run would treat as already downloaded.
    partial_path = f"{image_save_path}.{os.getpid()}.part"

    try:
        for attempt in range(retries):
            try:
                with requests.get(image_link, stream=True, timeout=timeout) as response:
                    if response.status_code == 200:
                        with open(partial_path, 'wb') as f:
                            for chunk in response.iter_content(1024):
                                f.write(chunk)
                        os.replace(partial_path, image_save_path)
                        return
                    print(f"Failed to download {image_link}, status code: {response.status_code}")
            except Exception as e:
                print(f"Error downloading image {image_link}: {e}")
            # No point in waiting once the last attempt has failed
            if attempt < retries - 1:
                time.sleep(delay)
    finally:
        # Runs on success, failure and interruption alike
        if os.path.exists(partial_path):
            os.remove(partial_path)

    create_placeholder_image(image_save_path)

In [21]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Constants used by the training cells
DATASET_FOLDER = '/content/student_resource 3/dataset'  # folder that train.csv is read from
IMAGE_FOLDER = '/content/student_resource 3/images'  # where training images are downloaded to
IMAGE_SIZE = (128, 128)  # target size every image is resized to on load
BATCH_SIZE = 32  # mini-batch size used when fitting both models
EPOCHS = 10  # epoch cap for the value model (the unit model passes its own cap)
# NOTE(review): the original trailing comment here read "Limit the number of images",
# but no such constant is defined -- presumably it was removed; confirm.

import re

In [22]:
def download_images(image_links, download_folder, allow_multiprocessing=True, retries=3, delay=3):
    """Download every link in ``image_links`` into ``download_folder``.

    Args:
        image_links: Iterable of image URLs (list, pandas Series, generator).
        download_folder: Target directory; created if it does not exist.
        allow_multiprocessing: When true, downloads run in a process pool of
            at most 10 workers; otherwise they run sequentially.
        retries: Attempts per image, forwarded to ``download_image``.
        delay: Seconds between attempts, forwarded to ``download_image``.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(download_folder, exist_ok=True)

    # Materialise once so any iterable has a length for the progress bar
    image_links = list(image_links)
    if not image_links:
        return

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=retries, delay=delay)

        num_processes = min(multiprocessing.cpu_count(), 10)  # Limit to a reasonable number
        with multiprocessing.Pool(num_processes) as pool:
            # list() drains the lazy imap iterator so every download actually runs
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=retries, delay=delay)

In [23]:
import re

# Building blocks for the three accepted layouts. A number is an unsigned
# integer or decimal. A unit is one or more whitespace-separated words, each
# starting with a non-digit, so multi-word units such as "fluid ounce" are
# captured whole and a bare number like "100" is not split into "10" + "0".
_VALUE_NUMBER = r"(\d+(?:\.\d+)?)"
_VALUE_UNIT = r"([^\W\d]\w*(?:\s+[^\W\d]\w*)*)"
_VALUE_RANGE_RE = re.compile(
    _VALUE_NUMBER + r"\s*" + _VALUE_UNIT + r"\s*to\s*" + _VALUE_NUMBER + r"\s*" + _VALUE_UNIT)
_VALUE_LIST_RE = re.compile(
    r"\[" + _VALUE_NUMBER + r",\s*" + _VALUE_NUMBER + r"\]\s*" + _VALUE_UNIT)
_VALUE_SINGLE_RE = re.compile(_VALUE_NUMBER + r"\s*" + _VALUE_UNIT)


def clean_entity_value(entity_value_str):
    """Extract ``(value, unit)`` from a labelled entity value.

    Accepted layouts, tried in this order:

    * range  -- "10 kilogram to 15 kilogram": the midpoint is returned, or
      ``(None, None)`` when the two units differ;
    * pair   -- "[100.0, 240.0] volt": the mean of the two numbers;
    * single -- "10 kilogram" or "2.5 fluid ounce".

    The unit is returned as written, with runs of whitespace collapsed to a
    single space. It is not validated or normalised here.

    Args:
        entity_value_str: Text to parse. Non-string input (for example a NaN
            from a CSV) yields ``(None, None)``.

    Returns:
        ``(float, str)`` on success, ``(None, None)`` otherwise.
    """
    if not isinstance(entity_value_str, str):
        return None, None
    text = entity_value_str.strip()

    # Handle ranges (e.g., "10 kilogram to 15 kilogram")
    range_match = _VALUE_RANGE_RE.match(text)
    if range_match:
        low, unit_low, high, unit_high = range_match.groups()
        unit_low = " ".join(unit_low.split())
        unit_high = " ".join(unit_high.split())
        if unit_low != unit_high:
            return None, None
        return (float(low) + float(high)) / 2, unit_low

    # Handle lists of values (e.g., "[100.0, 240.0] volt")
    list_match = _VALUE_LIST_RE.match(text)
    if list_match:
        first, second, unit = list_match.groups()
        return (float(first) + float(second)) / 2, " ".join(unit.split())

    # Handle standard format (e.g., "10 kilogram", "2.5 fluid ounce")
    single_match = _VALUE_SINGLE_RE.match(text)
    if single_match:
        value, unit = single_match.groups()
        return float(value), " ".join(unit.split())

    return None, None


In [24]:
# Canonical unit name -> spellings/abbreviations accepted for it
_UNIT_VARIATIONS = {
    'fluid ounce': ('fluid ounce', 'fl oz', 'ounce fl', 'fl. oz.', 'fl.oz'),
    'ounce': ('ounce', 'oz'),
    'gallon': ('gallon', 'gal'),
    'imperial gallon': ('imperial gallon', 'imp gal'),
    'millilitre': ('millilitre', 'ml', 'milliliter'),
    'microlitre': ('microlitre', 'µl', 'microliter'),
    'litre': ('litre', 'l', 'liter'),
    'centilitre': ('centilitre', 'cl'),
    'decilitre': ('decilitre', 'dl'),
    'cup': ('cup',),
    'pint': ('pint',),
    'quart': ('quart',),
    'cubic inch': ('cubic inch', 'in³', 'cu in'),
    'cubic foot': ('cubic foot', 'ft³', 'cu ft'),
    'centimetre': ('centimetre', 'cm', 'centimeter'),
    'foot': ('foot', 'ft'),
    'inch': ('inch', 'in'),
    'metre': ('metre', 'm', 'meter'),
    'millimetre': ('millimetre', 'mm', 'millimeter'),
    'yard': ('yard', 'yd'),
    'gram': ('gram', 'g'),
    'kilogram': ('kilogram', 'kg'),
    'microgram': ('microgram', 'µg'),
    'milligram': ('milligram', 'mg'),
    'pound': ('pound', 'lb'),
    'ton': ('ton', 't'),
    'kilovolt': ('kilovolt', 'kv'),
    'millivolt': ('millivolt', 'mv'),
    'volt': ('volt', 'v'),
    'kilowatt': ('kilowatt', 'kw'),
    'watt': ('watt', 'w'),
}

# Reverse lookup: every accepted spelling -> its canonical unit name.
# No spelling is listed under two units, so the inversion is unambiguous.
_CANONICAL_UNIT_BY_SPELLING = {
    spelling: canonical
    for canonical, spellings in _UNIT_VARIATIONS.items()
    for spelling in spellings
}


def normalize_unit(unit):
    """Return the canonical name for a unit spelling or abbreviation.

    The input is lower-cased and stripped before lookup. ``None`` is passed
    through, and a spelling that is not in the table is returned in its
    lower-cased, stripped form.
    """
    if unit is None:
        return None

    cleaned = unit.lower().strip()
    return _CANONICAL_UNIT_BY_SPELLING.get(cleaned, cleaned)


In [25]:
import gc
def process_chunk(chunk, image_folder, images, numerical_labels, unit_labels, entity_unit_map, IMAGE_SIZE):
    """Load the usable rows of ``chunk`` and append them to the output lists.

    A row is used when its entity value parses, its normalised unit is valid
    for the row's entity, and its image file exists and can be opened. Rows
    that fail any check are skipped; most skips print a short reason.

    Args:
        chunk: Object whose ``iterrows()`` yields ``(index, row)`` pairs, where
            each row provides 'image_link', 'entity_value' and 'entity_name'.
        image_folder: Directory holding the downloaded images.
        images: List that receives one image array per accepted row.
        numerical_labels: List that receives the numeric value per accepted row.
        unit_labels: List that receives the normalised unit per accepted row.
        entity_unit_map: Mapping of entity name to its set of valid units.
        IMAGE_SIZE: Target size handed to the image loader.

    Returns:
        None. The three lists are extended in place.
    """
    for _, row in chunk.iterrows():
        image_link = row['image_link']
        entity_value_str = row['entity_value']
        entity_name = row['entity_name']

        # A missing value arrives as a non-string (e.g. NaN); treat it as unparsable
        if isinstance(entity_value_str, str):
            entity_value, unit = clean_entity_value(entity_value_str)
        else:
            entity_value, unit = None, None

        if entity_value is None or unit is None:
            print(f"Skipping row due to invalid entity_value or missing unit: {entity_value_str}, entity: {entity_name}")
            continue

        normalized_unit = normalize_unit(unit)  # Normalize the unit

        # Validate the unit against the units allowed for this specific entity
        if normalized_unit not in entity_unit_map.get(entity_name, set()):
            print(f"Skipping row with unexpected entity_value format or unit: {entity_value_str}, entity: {entity_name}")
            continue

        if not isinstance(image_link, str):
            continue

        # download_image stores each file under the link's own base name
        # (original extension included), so look it up the same way instead
        # of forcing a '.jpg' suffix onto the stem.
        img_path = os.path.join(image_folder, os.path.basename(image_link))
        if not os.path.exists(img_path):
            continue

        try:
            image = tf.keras.preprocessing.image.load_img(img_path, target_size=IMAGE_SIZE)
            image = tf.keras.preprocessing.image.img_to_array(image)
        except OSError as e:
            print(f"Skipping image {img_path} due to error: {e}")
            continue

        images.append(image)
        numerical_labels.append(float(entity_value))
        unit_labels.append(normalized_unit)

    # Clear unused variables to free memory
    gc.collect()

In [26]:
def _load_labelled_image(row, image_folder):
    """Return ``(image_array, value, unit)`` for one CSV row, or None if the row is unusable."""
    image_link = row['image_link']
    entity_value_str = row['entity_value']
    entity_name = row['entity_name']

    # A missing value arrives as a non-string (e.g. NaN); treat it as unparsable
    if isinstance(entity_value_str, str):
        entity_value, unit = clean_entity_value(entity_value_str)
    else:
        entity_value, unit = None, None

    if entity_value is None or unit is None:
        print(f"Skipping row due to invalid entity_value or missing unit: {entity_value_str}, entity: {entity_name}")
        return None

    normalized_unit = normalize_unit(unit)
    if normalized_unit not in entity_unit_map.get(entity_name, set()):
        print(f"Skipping row with unexpected entity_value format or unit: {entity_value_str}, entity: {entity_name}")
        return None

    if not isinstance(image_link, str):
        return None

    # download_image stores each file under the link's own base name (original
    # extension included), so look it up the same way rather than as '<stem>.jpg'.
    img_path = os.path.join(image_folder, os.path.basename(image_link))
    if not os.path.exists(img_path):
        return None

    try:
        image = tf.keras.preprocessing.image.load_img(img_path, target_size=IMAGE_SIZE)
        image = tf.keras.preprocessing.image.img_to_array(image)
    except OSError as e:
        print(f"Skipping image {img_path} due to error: {e}")
        return None

    return image, float(entity_value), normalized_unit


def load_and_preprocess_data(csv_file, image_folder, max_images=200, max_entries=20000):
    """Read the labelled CSV, download its images and build training arrays.

    Args:
        csv_file: Path to a CSV with 'image_link', 'entity_name' and
            'entity_value' columns.
        image_folder: Directory the images are downloaded into and read from.
        max_images: Accepted for backward compatibility; currently unused.
        max_entries: Maximum number of CSV rows to read. The rows that were
            read are then shuffled with a fixed seed.

    Returns:
        ``(images, numerical_labels, unit_labels, le)`` where ``images`` is
        scaled to [0, 1], ``unit_labels`` holds integer class ids and ``le``
        is the fitted ``LabelEncoder`` that maps ids back to unit names.
    """
    df = pd.read_csv(csv_file, nrows=max_entries)  # Limit rows read
    # Shuffle what was read. Capping n at len(df) keeps sample() from raising
    # when the file holds fewer than max_entries rows.
    df = df.sample(n=min(max_entries, len(df)), random_state=52)
    download_images(df['image_link'].tolist(), image_folder)

    images = []
    numerical_labels = []
    unit_labels = []

    total_rows = len(df)

    for processed_rows, (_, row) in enumerate(df.iterrows(), start=1):
        example = _load_labelled_image(row, image_folder)
        if example is not None:
            image, value, unit = example
            images.append(image)
            numerical_labels.append(value)
            unit_labels.append(unit)

        if processed_rows % 1000 == 0:  # Progress logging
            print(f"Processed {processed_rows}/{total_rows} rows")

    # Normalize images to range [0, 1]
    images = np.array(images) / 255.0
    numerical_labels = np.array(numerical_labels)
    unit_labels = np.array(unit_labels)

    # Encode units using LabelEncoder
    le = LabelEncoder()
    unit_labels = le.fit_transform(unit_labels)

    # Output data shape for debugging
    print(f'Images shape: {images.shape}')
    print(f'Numerical labels shape: {numerical_labels.shape}')
    print(f'Unit labels shape: {unit_labels.shape}')
    print(f'Number of unique units: {len(le.classes_)}')

    return images, numerical_labels, unit_labels, le


In [27]:
def build_value_model():
    """Build and compile the CNN that regresses the numeric entity value.

    Three Conv2D -> BatchNormalization -> MaxPooling2D stages (64, 128 and
    256 filters, He-normal initialised) feed a 256-unit dense layer with
    dropout and a single linear output. The model is compiled with Adam
    (learning rate 1e-6), mean-squared-error loss and an MAE metric.
    """
    layers = tf.keras.layers
    input_shape = (IMAGE_SIZE[0], IMAGE_SIZE[1], 3)

    stack = []
    for filters in (64, 128, 256):
        conv_options = {'activation': 'relu', 'kernel_initializer': 'he_normal'}
        if not stack:
            # Only the very first layer declares the input shape
            conv_options['input_shape'] = input_shape
        stack.append(layers.Conv2D(filters, (3, 3), **conv_options))
        stack.append(layers.BatchNormalization())
        stack.append(layers.MaxPooling2D((2, 2)))

    stack.append(layers.Flatten())
    stack.append(layers.Dense(256, activation='relu', kernel_initializer='he_normal'))
    stack.append(layers.Dropout(0.5))
    stack.append(layers.Dense(1))

    model = tf.keras.models.Sequential(stack)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),  # Adjust learning rate
        loss='mean_squared_error',
        metrics=['mae'],
    )
    return model


In [28]:
def build_unit_model(num_classes):
    """Build and compile the CNN that classifies the unit of an image.

    Three Conv2D -> MaxPooling2D stages (32, 64 and 128 filters) feed a
    128-unit dense layer with dropout and a softmax over ``num_classes``.
    The model is compiled with Adam (learning rate 0.0001), sparse
    categorical cross-entropy loss and an accuracy metric.

    Args:
        num_classes: Number of distinct unit classes to predict.
    """
    layers = tf.keras.layers
    input_shape = (IMAGE_SIZE[0], IMAGE_SIZE[1], 3)

    # First stage carries the input shape; the later stages only differ in width
    stack = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
    ]
    for filters in (64, 128):
        stack.append(layers.Conv2D(filters, (3, 3), activation='relu'))
        stack.append(layers.MaxPooling2D((2, 2)))

    stack.extend([
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),  # Added Dropout
        layers.Dense(num_classes, activation='softmax'),  # Output for unit classification
    ])

    model = tf.keras.models.Sequential(stack)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),  # Adjusted learning rate
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [29]:
from sklearn.preprocessing import StandardScaler

def train_model(metadata_path='label_metadata.json'):
    """Train the value-regression and unit-classification models and save them.

    Reads up to 10,000 rows of ``train.csv`` from ``DATASET_FOLDER``, splits
    them 80/20, standardises the numeric labels and fits both models with
    early stopping. The best weights are checkpointed to 'value_model.keras'
    and 'unit_model.keras'.

    Args:
        metadata_path: JSON file that receives what is needed to decode the
            models' outputs later: the unit class names in label-encoder
            order, and the mean and scale used to standardise the values.

    Returns:
        None.
    """
    import json

    # Load and preprocess data with a limit of 10,000 entries
    train_images, train_numerical_labels, train_unit_labels, unit_le = load_and_preprocess_data(
        os.path.join(DATASET_FOLDER, 'train.csv'), IMAGE_FOLDER, max_entries=10000
    )

    # load_and_preprocess_data already returns pixels scaled to [0, 1]. Dividing
    # by 255 again here would squash the inputs into [0, 1/255] and no longer
    # match preprocess_image, which scales exactly once at inference time.

    # Split the data
    X_train, X_val, y_train_num, y_val_num, y_train_unit, y_val_unit = train_test_split(
        train_images, train_numerical_labels, train_unit_labels, test_size=0.2, random_state=42
    )

    # Scale numerical labels for the value model (fit on the training split only)
    scaler_num = StandardScaler()
    y_train_num_scaled = scaler_num.fit_transform(y_train_num.reshape(-1, 1)).ravel()
    y_val_num_scaled = scaler_num.transform(y_val_num.reshape(-1, 1)).ravel()

    # The value model predicts standardised values and the unit model predicts
    # class ids; persist what is needed to turn both back into real answers.
    with open(metadata_path, 'w') as metadata_file:
        json.dump(
            {
                'unit_classes': [str(name) for name in unit_le.classes_],
                'value_mean': float(scaler_num.mean_[0]),
                'value_scale': float(scaler_num.scale_[0]),
            },
            metadata_file,
        )

    # Build models
    value_model = build_value_model()
    unit_model = build_unit_model(num_classes=len(unit_le.classes_))

    def new_early_stopping():
        # One callback instance per fit, so neither run carries over state
        return tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )

    model_checkpoint_value = tf.keras.callbacks.ModelCheckpoint(
        'value_model.keras',
        save_best_only=True,
        monitor='val_loss'
    )
    model_checkpoint_unit = tf.keras.callbacks.ModelCheckpoint(
        'unit_model.keras',
        save_best_only=True,
        monitor='val_loss'
    )

    # Train numerical value model
    value_model.fit(
        X_train, y_train_num_scaled,
        epochs=EPOCHS,
        validation_data=(X_val, y_val_num_scaled),
        batch_size=BATCH_SIZE,
        callbacks=[new_early_stopping(), model_checkpoint_value]
    )

    # Train unit classification model
    unit_model.fit(
        X_train, y_train_unit,
        epochs=50,
        validation_data=(X_val, y_val_unit),
        batch_size=BATCH_SIZE,
        callbacks=[new_early_stopping(), model_checkpoint_unit]
    )

    print("Models trained and saved as 'value_model.keras' and 'unit_model.keras'")
    print(f"Label metadata saved as '{metadata_path}'")

In [33]:

# Training entry point: train_model() loads the data (downloading the images
# first), then fits and saves both models.
if __name__ == "__main__":
    train_model()

  self.pid = os.fork()
 16%|█▋        | 1628/10000 [00:33<03:02, 45.89it/s]

Failed to download https://m.media-amazon.com/images/I/1yw53vfQtS.jpg, status code: 400
Failed to download https://m.media-amazon.com/images/I/1yw53vfQtS.jpg, status code: 400
Failed to download https://m.media-amazon.com/images/I/1yw53vfQtS.jpg, status code: 400


 24%|██▍       | 2446/10000 [00:57<02:56, 42.70it/s]Process ForkPoolWorker-2:



KeyboardInterrupt: 

In [34]:
import os
import shutil
import subprocess
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf


In [35]:
# Constants for the inference cells.
# Note: this rebinds IMAGE_FOLDER, which the training cells pointed at '.../images'.
DATASET_FOLDER = '/content/student_resource 3/dataset'  # folder test.csv is read from and test_out.csv is written to
IMAGE_FOLDER = '/content/student_resource 3/test_images'  # where test images are downloaded to
MODEL_VALUE_PATH = 'value_model.keras'  # checkpoint written for the value model during training
MODEL_UNIT_PATH = 'unit_model.keras'  # checkpoint written for the unit model during training
IMAGE_SIZE = (128, 128)  # Same as defined during training


In [36]:
def remove_directory(directory_path):
    """Delete ``directory_path`` and everything in it, or report that it is absent."""
    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return

    shutil.rmtree(directory_path)


In [37]:

def load_models():
    """Load both saved models and return them as ``(value_model, unit_model)``."""
    load = tf.keras.models.load_model
    return load(MODEL_VALUE_PATH), load(MODEL_UNIT_PATH)


In [38]:
def preprocess_image(image_path):
    """Load one image as a model-ready batch.

    The image is resized to ``IMAGE_SIZE``, its pixel values are divided by
    255, and a leading batch axis of length 1 is added.
    """
    pil_image = tf.keras.utils.load_img(image_path, target_size=IMAGE_SIZE)
    pixels = tf.keras.utils.img_to_array(pil_image)
    scaled = pixels / 255.0  # Normalize
    return scaled[np.newaxis, ...]  # Add batch dimension


In [39]:
import absl.logging

# Suppress TensorFlow warnings and absl logging
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow's native
# library loads; set here, after `import tensorflow`, it may have no effect --
# confirm, or move it above the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')  # Python-side TensorFlow logger
absl.logging.set_verbosity(absl.logging.ERROR)  # absl messages below ERROR are dropped

def predict_entity_value(image_path, value_model, unit_model, entity_name,
                         unit_classes=None, value_scale=None):
    """Predict '<value> <unit>' for one image, or '' when no valid unit is found.

    Args:
        image_path: Path of the image to run through both models.
        value_model: Model whose ``predict`` returns the numeric value.
        unit_model: Model whose ``predict`` returns one score per unit class.
        entity_name: Entity being predicted; restricts which units are valid.
        unit_classes: Optional sequence of unit names, one per output of
            ``unit_model`` in the same order (presumably the ``classes_`` of
            the label encoder fitted during training -- verify against the
            training code). When given, the highest-scoring unit that is
            valid for ``entity_name`` is chosen.
        value_scale: Optional ``(mean, scale)`` pair. When given, the value
            prediction is mapped back with ``prediction * scale + mean``.

    Returns:
        A string such as '12.50 gram', or '' if no valid unit can be derived.

    Without ``unit_classes`` the predicted class id is used as an index into
    the alphabetically sorted units of the entity. That keeps results
    reproducible (indexing a bare ``set`` is not), but it is not guaranteed to
    match the encoding the unit model was trained with.
    """
    img_array = preprocess_image(image_path)

    # Predict numerical value without showing the progress bar
    numerical_prediction = float(value_model.predict(img_array, verbose=0)[0][0])
    if value_scale is not None:
        mean, scale = value_scale
        numerical_prediction = numerical_prediction * scale + mean

    # Predict unit without showing the progress bar
    unit_scores = unit_model.predict(img_array, verbose=0)[0]

    # Get the units that are valid for the specific entity
    units = entity_unit_map.get(entity_name, set())

    if unit_classes is not None:
        # Best-scoring class among those that are legal for this entity
        candidates = [
            (float(score), name)
            for score, name in zip(unit_scores, unit_classes)
            if name in units
        ]
        unit = max(candidates, key=lambda pair: pair[0])[1] if candidates else None
    else:
        unit_label = int(np.argmax(unit_scores))
        ordered_units = sorted(units)  # fixed order; a set has none
        unit = ordered_units[unit_label] if unit_label < len(ordered_units) else None

    if unit not in allowed_units:
        unit = None

    if unit is not None:
        return f"{numerical_prediction:.2f} {unit}"
    else:
        return ""  # If unit is invalid or not found

In [40]:
def run_sanity_check(test_filename, output_filename, script_path=None):
    """Run the sanity-check script on a prediction file.

    Args:
        test_filename: Passed to the script as ``--test_filename``.
        output_filename: Passed to the script as ``--output_filename``.
        script_path: Location of ``sanity.py``. When omitted, 'src/sanity.py'
            relative to the working directory is tried first, then the 'src'
            folder next to ``DATASET_FOLDER``.

    Returns:
        None. A missing script or a failing check is reported on stdout.
    """
    import sys

    if script_path is None:
        script_path = "src/sanity.py"
        if not os.path.isfile(script_path):
            # The notebook's working directory is not necessarily the resource
            # folder, so also look beside the dataset folder.
            bundled = os.path.join(os.path.dirname(DATASET_FOLDER), "src", "sanity.py")
            if os.path.isfile(bundled):
                script_path = bundled

    if not os.path.isfile(script_path):
        print(f"Sanity check skipped: script {script_path} not found")
        return

    # sys.executable is the interpreter running this kernel, which a bare
    # "python" on PATH is not guaranteed to be.
    command = [
        sys.executable, script_path,
        f"--test_filename={test_filename}",
        f"--output_filename={output_filename}"
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Sanity check failed: {e}")


In [41]:
def main():
    """Predict every row of ``test.csv`` and write ``test_out.csv``.

    Steps: read the test CSV from ``DATASET_FOLDER``, load both models,
    download the images into ``IMAGE_FOLDER``, predict row by row, and run
    the sanity check on the result.

    The output file is created fresh on each run with the columns 'index' and
    'prediction'. A row whose image is missing or unreadable gets an empty
    prediction rather than aborting the run.
    """
    import csv

    # Read the test dataset
    test_filename = os.path.join(DATASET_FOLDER, 'test.csv')
    sample_test = pd.read_csv(test_filename)

    # Load models
    value_model, unit_model = load_models()

    # Download images
    download_images(sample_test['image_link'], IMAGE_FOLDER)

    # Ensure images were downloaded
    if len(os.listdir(IMAGE_FOLDER)) == 0:
        print(f"No images found in {IMAGE_FOLDER}.")
        return

    # Generate predictions with progress output
    total_images = len(sample_test)
    print(f"Processing {total_images} images...")

    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')

    rows = zip(sample_test['index'], sample_test['image_link'], sample_test['entity_name'])

    # Mode 'w' starts a fresh file, so a re-run cannot append duplicate rows.
    # Rows are streamed through one open handle instead of re-slicing the whole
    # frame and reopening the file for every image.
    with open(output_filename, 'w', newline='') as out_file:
        writer = csv.writer(out_file, lineterminator='\n')
        writer.writerow(['index', 'prediction'])

        for done, (row_index, image_link, entity_name) in enumerate(rows, start=1):
            prediction = ""
            if isinstance(image_link, str):
                image_path = os.path.join(IMAGE_FOLDER, os.path.basename(image_link))
                if os.path.isfile(image_path):
                    try:
                        prediction = predict_entity_value(
                            image_path, value_model, unit_model, entity_name)
                    except OSError as e:
                        print(f"Skipping image {image_path} due to error: {e}")

            writer.writerow([row_index, prediction])

            # Print progress every 1000 images or at the end
            if done % 1000 == 0 or done == total_images:
                out_file.flush()  # keep finished rows on disk if the run is interrupted
                print(f"Processed {done}/{total_images} images")

    # Check the output against the file the predictions were generated from
    # NOTE(review): this used to pass 'sample_test.csv' although the
    # predictions come from 'test.csv' -- confirm that test.csv is intended.
    run_sanity_check(test_filename, output_filename)


In [None]:
# Inference entry point: main() downloads the test images, writes the
# predictions to test_out.csv and then runs the sanity check.
if __name__ == "__main__":
    main()

  self.pid = os.fork()
 40%|████      | 52993/131187 [10:59<17:13, 75.62it/s]

Error downloading image https://m.media-amazon.com/images/I/51ZIuXoFxrL.jpg: HTTPSConnectionPool(host='m.media-amazon.com', port=443): Max retries exceeded with url: /images/I/51ZIuXoFxrL.jpg (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1007)')))


  self.pid = os.fork()



Processing 131187 images...
Processed 1000/131187 images
Processed 2000/131187 images
Processed 3000/131187 images
Processed 4000/131187 images
Processed 5000/131187 images
Processed 6000/131187 images
Processed 7000/131187 images
Processed 8000/131187 images
Processed 9000/131187 images
Processed 10000/131187 images
Processed 11000/131187 images
Processed 12000/131187 images
Processed 13000/131187 images
Processed 14000/131187 images
Processed 15000/131187 images
Processed 16000/131187 images
Processed 17000/131187 images
Processed 18000/131187 images
Processed 19000/131187 images
Processed 20000/131187 images
Processed 21000/131187 images
Processed 22000/131187 images
Processed 23000/131187 images
Processed 24000/131187 images
Processed 25000/131187 images
Processed 26000/131187 images
Processed 27000/131187 images
Processed 28000/131187 images


In [48]:
# Remove the (empty) models directory; rmdir fails if the directory has contents.
# NOTE(review): every other path in this notebook uses 'student_resource 3'
# (with a space); this underscore spelling may be a typo -- confirm.
!rmdir /content/student_resource_3/models


In [None]:
# Run the sanity-check script directly from the shell.
# NOTE(review): --test_filename points at 'sample_test_out.csv' while
# --output_filename points at 'test_out.csv' -- confirm these are the intended pair.
!python /content/student_resource\ 3/src/sanity.py --test_filename "/content/student_resource 3/dataset/sample_test_out.csv" --output_filename "/content/student_resource 3/dataset/test_out.csv"
