<a href="https://colab.research.google.com/github/vipulchinmay/amazon-ml-challenge/blob/main/amazon_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install optuna==3.6.1




In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# %% [markdown]
# # Smart Product Pricing — CNN-based multimodal model
# Using images, text, and numeric features to predict price

# %%
import os
import re
import random
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# %% [markdown]
# ## Configuration

DATA_DIR = Path("/content/drive/MyDrive/dataset")
OUTPUT_DIR = Path("/content/drive/MyDrive/output")
IMAGE_DIR = Path("/content/drive/MyDrive/dataset/images")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

IMG_SIZE = (128, 128)
BATCH_SIZE = 32
EPOCHS = 5
TEXT_SEQ_LEN = 64
EMBED_DIM = 64
MAX_TOKENS = 20000

# %% [markdown]
# ## Load Dataset

TRAIN_CSV = DATA_DIR / "train.csv"
TEST_CSV = DATA_DIR / "test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

train['catalog_content'] = train['catalog_content'].astype(str)
test['catalog_content'] = test['catalog_content'].astype(str)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# %% [markdown]
# ## Feature Engineering

IPQ_PAT = re.compile(r'(\d+\s*(?:pack|pcs|pieces|count|ct|pk|packets|bottles)?)', flags=re.IGNORECASE)

def extract_ipq(text):
    if pd.isna(text):
        return 1
    m = IPQ_PAT.search(text)
    if m:
        nums = re.findall(r'\d+', m.group(1))
        if nums:
            return int(nums[0])
    return 1

train['ipq'] = train['catalog_content'].apply(extract_ipq)
test['ipq'] = test['catalog_content'].apply(extract_ipq)
train['text_len'] = train['catalog_content'].str.len()
test['text_len'] = test['catalog_content'].str.len()
train['price_log1p'] = np.log1p(train['price'].clip(lower=0))

print("Feature engineering complete")

# %% [markdown]
# ## Text Vectorization

from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=TEXT_SEQ_LEN)
vectorizer.adapt(train['catalog_content'].values)
vocab_size = len(vectorizer.get_vocabulary())
print(f'Vocabulary size: {vocab_size}')

# %% [markdown]
# ## Efficient TF Dataset Generator

AUTOTUNE = tf.data.AUTOTUNE

def load_and_preprocess_image(image_path):
    """Load and preprocess a single image"""
    # Read the image file
    img = tf.io.read_file(image_path)
    # Decode it into a dense vector
    img = tf.image.decode_jpeg(img, channels=3)
    # Resize to the desired size
    img = tf.image.resize(img, IMG_SIZE)
    # Normalize to [0, 1]
    img = img / 255.0
    return img

def create_image_path(sample_id):
    """Create image path from sample_id"""
    return str(IMAGE_DIR / f"{sample_id}.jpg")

def load_image_with_fallback(image_path):
    """Load image with fallback to zeros if file doesn't exist"""
    # Check if file exists
    file_exists = tf.io.gfile.exists(image_path)

    if file_exists:
        try:
            return load_and_preprocess_image(image_path)
        except:
            return tf.zeros((*IMG_SIZE, 3), dtype=tf.float32)
    else:
        return tf.zeros((*IMG_SIZE, 3), dtype=tf.float32)

def make_tf_dataset(df, is_train=True):
    """Create TensorFlow dataset efficiently"""
    print(f"Creating dataset for {len(df)} samples...")

    # Create image paths
    image_paths = [create_image_path(sid) for sid in df['sample_id'].values]

    # Vectorize all texts at once
    texts_vectorized = vectorizer(df['catalog_content'].values).numpy()

    # Get numeric features
    numerics = df[['ipq', 'text_len']].values.astype('float32')

    if is_train:
        targets = df['price_log1p'].values.astype('float32')

        # Create dataset from components
        def data_generator():
            for i in range(len(df)):
                img = load_image_with_fallback(image_paths[i])
                yield (
                    {
                        "image": img,
                        "text": texts_vectorized[i],
                        "numeric": numerics[i]
                    },
                    targets[i]
                )

        output_signature = (
            {
                "image": tf.TensorSpec(shape=(*IMG_SIZE, 3), dtype=tf.float32),
                "text": tf.TensorSpec(shape=(TEXT_SEQ_LEN,), dtype=tf.int64),
                "numeric": tf.TensorSpec(shape=(2,), dtype=tf.float32)
            },
            tf.TensorSpec(shape=(), dtype=tf.float32)
        )

        dataset = tf.data.Dataset.from_generator(
            data_generator,
            output_signature=output_signature
        )
        dataset = dataset.shuffle(2048, seed=SEED).batch(BATCH_SIZE).prefetch(AUTOTUNE)
    else:
        def data_generator():
            for i in range(len(df)):
                img = load_image_with_fallback(image_paths[i])
                yield {
                    "image": img,
                    "text": texts_vectorized[i],
                    "numeric": numerics[i]
                }

        output_signature = {
            "image": tf.TensorSpec(shape=(*IMG_SIZE, 3), dtype=tf.float32),
            "text": tf.TensorSpec(shape=(TEXT_SEQ_LEN,), dtype=tf.int64),
            "numeric": tf.TensorSpec(shape=(2,), dtype=tf.float32)
        }

        dataset = tf.data.Dataset.from_generator(
            data_generator,
            output_signature=output_signature
        )
        dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

    print("Dataset created successfully")
    return dataset

# %% [markdown]
# ## Split train/validation

print("\nSplitting data...")
train_df, val_df = train_test_split(train, test_size=0.1, random_state=SEED)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

print("\nCreating training dataset...")
train_ds = make_tf_dataset(train_df, is_train=True)

print("\nCreating validation dataset...")
val_ds = make_tf_dataset(val_df, is_train=True)

# %% [markdown]
# ## Build Multimodal Model

print("\nBuilding model...")

# Image branch
img_input = keras.Input(shape=(*IMG_SIZE, 3), name="image")
x = layers.Conv2D(32, 3, activation='relu', padding='same')(img_input)
x = layers.MaxPool2D(2)(x)
x = layers.Conv2D(64, 3, activation='relu', padding='same')(x)
x = layers.MaxPool2D(2)(x)
x = layers.Conv2D(128, 3, activation='relu', padding='same')(x)
x = layers.GlobalAveragePooling2D()(x)
img_out = layers.Dense(128, activation='relu')(x)

# Text branch
text_input = keras.Input(shape=(TEXT_SEQ_LEN,), dtype='int64', name="text")
emb = layers.Embedding(vocab_size, EMBED_DIM)(text_input)
t = layers.Conv1D(128, 3, activation='relu', padding='same')(emb)
t = layers.GlobalMaxPool1D()(t)
text_out = layers.Dense(64, activation='relu')(t)

# Numeric branch
num_input = keras.Input(shape=(2,), name="numeric")
n = layers.Dense(32, activation='relu')(num_input)
num_out = layers.Dense(16, activation='relu')(n)

# Combine
combined = layers.concatenate([img_out, text_out, num_out])
combined = layers.Dense(256, activation='relu')(combined)
combined = layers.Dropout(0.3)(combined)
combined = layers.Dense(64, activation='relu')(combined)
out = layers.Dense(1, activation='linear', name="price_log1p")(combined)

model = keras.Model(inputs=[img_input, text_input, num_input], outputs=out)
model.summary()

# %% [markdown]
# ## Compile Model

def smape_tf(y_true, y_pred):
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])
    y_true_v = tf.math.expm1(y_true)
    y_pred_v = tf.math.expm1(y_pred)
    denom = (tf.abs(y_true_v) + tf.abs(y_pred_v)) / 2.0
    diff = tf.abs(y_true_v - y_pred_v)
    return tf.reduce_mean(tf.where(denom == 0, 0.0, diff / denom)) * 100.0

print("\nCompiling model...")
model.compile(optimizer='adam', loss='mae', metrics=[smape_tf])

# %% [markdown]
# ## Callbacks

callbacks = [
    keras.callbacks.ModelCheckpoint(
        str(OUTPUT_DIR / 'best_model.h5'),
        save_best_only=True,
        monitor='val_loss',
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=4,
        restore_best_weights=True,
        verbose=1
    )
]

# %% [markdown]
# ## Train

print("\n" + "="*50)
print("Starting training...")
print("="*50 + "\n")

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

print("\nTraining complete!")

# %% [markdown]
# ## Evaluate

print("\nEvaluating on validation set...")
val_preds_log = model.predict(val_ds, verbose=1)
val_trues_log = val_df['price_log1p'].values[:len(val_preds_log)]

val_preds = np.expm1(val_preds_log.ravel())
val_trues = np.expm1(val_trues_log)

mae = mean_absolute_error(val_trues, val_preds)
print(f"\nValidation MAE: {mae:.4f}")

# Calculate SMAPE manually
smape = np.mean(2 * np.abs(val_preds - val_trues) / (np.abs(val_preds) + np.abs(val_trues))) * 100
print(f"Validation SMAPE: {smape:.4f}%")

# %% [markdown]
# ## Predict on Test Set

print("\nCreating test dataset...")
test_ds = make_tf_dataset(test, is_train=False)

print("\nGenerating predictions...")
preds_log = model.predict(test_ds, verbose=1)
preds = np.expm1(preds_log.ravel())
preds = np.clip(preds, 0.01, None)

submission = pd.DataFrame({
    'sample_id': test['sample_id'].values[:len(preds)],
    'price': preds
})
submission_path = OUTPUT_DIR / 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"\nFirst few predictions:\n{submission.head()}")

Train shape: (75000, 4)
Test shape: (75000, 3)
Feature engineering complete
Vocabulary size: 20000

Splitting data...
Train samples: 67500, Validation samples: 7500

Creating training dataset...
Creating dataset for 67500 samples...
Dataset created successfully

Creating validation dataset...
Creating dataset for 7500 samples...
Dataset created successfully

Building model...



Compiling model...

Starting training...

Epoch 1/5
   2110/Unknown [1m143s[0m 62ms/step - loss: 1755135.1250 - smape_tf: nan




Epoch 1: val_loss improved from inf to 0.62215, saving model to /content/drive/MyDrive/output/best_model.h5




[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 68ms/step - loss: 1754756.6250 - smape_tf: nan - val_loss: 0.6222 - val_smape_tf: 61.2237 - learning_rate: 0.0010
Epoch 2/5
[1m2107/2110[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 61ms/step - loss: 0.5235 - smape_tf: 52.5343
Epoch 2: val_loss did not improve from 0.62215
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 65ms/step - loss: 0.5235 - smape_tf: 52.5308 - val_loss: 0.6430 - val_smape_tf: 63.1610 - learning_rate: 0.0010
Epoch 3/5
[1m2108/2110[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 61ms/step - loss: 0.4618 - smape_tf: 46.8401
Epoch 3: val_loss did not improve from 0.62215

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 66ms/step - loss: 0.4618 - smape_tf: 46.8380 - val_loss: 0.6407 - val_smape_tf:




Submission saved to: /content/drive/MyDrive/output/submission.csv
Submission shape: (75000, 2)

First few predictions:
   sample_id      price
0     100179  10.760299
1     245611   9.746073
2     146263  15.343919
3      95658   6.046479
4      36806  17.322092
