In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
import os
import pandas as pd
import shutil
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.applications import ResNet50, resnet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm
import joblib

In [9]:


# Paths to CSVs (read-only)
TRAIN_CSV = "/kaggle/input/ml-challenge-dataset/dataset/train.csv"   # path to your training CSV
TEST_CSV = "/kaggle/input/ml-challenge-dataset/dataset/test.csv"     # path to your test CSV

# Training images (read-only)
DOWNLOAD_DIR_TRAIN = "/kaggle/input/ml-challenge-dataset/images/images"

# Writable directory for test images (to be downloaded later)
WORKING_DIR = "/kaggle/working/ml-challenge-dataset"
DOWNLOAD_DIR_TEST = os.path.join(WORKING_DIR, "test_images")

# Create writable directory for test images
os.makedirs(DOWNLOAD_DIR_TEST, exist_ok=True)

# Model/training parameters
IMG_SIZE = (224, 224)
KFOLDS = 3
EPOCHS = 3
BATCH_SIZE = 32

print("‚úÖ Setup complete.")
print(f"Training images (read-only): {DOWNLOAD_DIR_TRAIN}")
print(f"Test images (writable): {DOWNLOAD_DIR_TEST}")


‚úÖ Setup complete.
Training images (read-only): /kaggle/input/ml-challenge-dataset/images/images
Test images (writable): /kaggle/working/ml-challenge-dataset/test_images


In [10]:
def download_image(url, save_path):
    try:
        if not os.path.exists(save_path):
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                img = Image.open(BytesIO(response.content)).convert("RGB")
                img.save(save_path, "JPEG", quality=90)
    except Exception as e:
        print(f"‚ö†Ô∏è Failed: {url} ‚Äî {e}")

In [13]:
train_df = pd.read_csv(TRAIN_CSV)
train_df.columns = [c.strip().lower() for c in train_df.columns]
train_df["filepath"] = train_df["sample_id"].apply(lambda x: os.path.join(DOWNLOAD_DIR_TRAIN, f"{x}.jpg"))

# üì• Download missing training images
print("üì• Checking and downloading missing training images...")
for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    if not os.path.exists(row["filepath"]):
        download_image(row["image_link"], row["filepath"])

train_df = train_df[train_df["filepath"].apply(os.path.exists)].reset_index(drop=True)
print(f"‚úÖ {len(train_df)} training images ready.")

üì• Checking and downloading missing training images...


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 38999/75000 [02:02<1:29:18,  6.72it/s]

‚ö†Ô∏è Failed: https://m.media-amazon.com/images/I/51mjZYDYjyL.jpg ‚Äî HTTPSConnectionPool(host='m.media-amazon.com', port=443): Max retries exceeded with url: /images/I/51mjZYDYjyL.jpg (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7d64019e5490>: Failed to resolve 'm.media-amazon.com' ([Errno -3] Temporary failure in name resolution)"))


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75000/75000 [03:30<00:00, 356.73it/s] 


‚úÖ 74999 training images ready.


In [15]:
# ==================================================
# üî¢ PRICE SCALING
# ==================================================
scaler = MinMaxScaler()
train_df["price_scaled"] = scaler.fit_transform(train_df[["price"]])
joblib.dump(scaler, "price_scaler.pkl")

# ==================================================
# üì∏ IMAGE DATA GENERATORS
# ==================================================
datagen_train = ImageDataGenerator(
    preprocessing_function=resnet50.preprocess_input,
    rotation_range=25,
    width_shift_range=0.15,
    height_shift_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True
)
datagen_val = ImageDataGenerator(preprocessing_function=resnet50.preprocess_input)

In [19]:
def build_model():
    base_model = ResNet50(weights=None, include_top=False, input_shape=(224,224,3))
    base_model.trainable = False

    model = models.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.BatchNormalization(),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1)
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='mse', metrics=['mae'])
    return model

In [None]:
# ==================================================
# üîÑ K-FOLD CROSS VALIDATION
# ==================================================
kf = KFold(n_splits=KFOLDS, shuffle=True, random_state=42)
fold = 1
fold_scores = []

for train_idx, val_idx in kf.split(train_df):
    print(f"\nüåÄ Training Fold {fold}/{KFOLDS}")
    train_data = train_df.iloc[train_idx]
    val_data = train_df.iloc[val_idx]

    train_gen = datagen_train.flow_from_dataframe(
        dataframe=train_data,
        x_col="filepath",
        y_col="price_scaled",
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode="raw"
    )

    val_gen = datagen_val.flow_from_dataframe(
        dataframe=val_data,
        x_col="filepath",
        y_col="price_scaled",
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode="raw",
        shuffle=False
    )

    model = build_model()

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=4, min_lr=1e-6),
        ModelCheckpoint(f"best_model_fold{fold}.h5", save_best_only=True, monitor='val_loss')
    ]

    # Phase 1 ‚Äî Train with frozen base
    model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=EPOCHS,
        callbacks=callbacks,
        verbose=1
    )

    # Phase 2 ‚Äî Fine-tuning last 30 layers
    model.layers[0].trainable = True
    for layer in model.layers[0].layers[:-30]:
        layer.trainable = False

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='mse', metrics=['mae'])

    model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=10,
        callbacks=callbacks,
        verbose=1
    )

    val_loss, val_mae = model.evaluate(val_gen)
    fold_scores.append((val_loss, val_mae))
    print(f"‚úÖ Fold {fold} MAE: {val_mae:.4f}")
    fold += 1


üåÄ Training Fold 1/3
Found 49999 validated image filenames.
Found 25000 validated image filenames.


  self._warn_if_super_not_called()


Epoch 1/3
[1m  48/1563[0m [37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m1:08:19[0m 3s/step - loss: 2.5192 - mae: 1.0215

In [None]:

# ==================================================
# üìà FINAL RESULTS
# ==================================================
avg_loss = np.mean([l for l, _ in fold_scores])
avg_mae  = np.mean([m for _, m in fold_scores])
print(f"\nüéØ Average Validation Loss: {avg_loss:.4f}, MAE: {avg_mae:.4f}")
print("‚úÖ Training completed successfully!")

In [None]:
# ==================================================
# üß™ TESTING ‚Äî PRICE PREDICTION ON TEST DATASET
# ==================================================
print("\nüß™ Loading test dataset...")
test_df = pd.read_csv(TEST_CSV)
test_df.columns = [c.strip().lower() for c in test_df.columns]
test_df["filepath"] = test_df["sample_id"].apply(lambda x: os.path.join(DOWNLOAD_DIR_TEST, f"{x}.jpg"))

# üì• Download test images
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    if not os.path.exists(row["filepath"]):
        download_image(row["image_link"], row["filepath"])

test_df = test_df[test_df["filepath"].apply(os.path.exists)].reset_index(drop=True)
print(f"‚úÖ {len(test_df)} test images ready for prediction.")

# Use best model (from last fold)
best_model_path = f"best_model_fold{KFOLDS}.h5"
model = tf.keras.models.load_model(best_model_path)

# Preprocess test images and predict
X_test = np.array([
    resnet50.preprocess_input(img_to_array(load_img(p, target_size=IMG_SIZE)))
    for p in tqdm(test_df["filepath"], desc="üßÆ Preprocessing test images")
])

y_pred_scaled = model.predict(X_test)
scaler = joblib.load("price_scaler.pkl")
y_pred = scaler.inverse_transform(y_pred_scaled)

# Save predictions
test_df["predicted_price"] = y_pred
test_df[["sample_id", "predicted_price"]].to_csv("predicted_prices.csv", index=False)
print("üíæ Predictions saved to predicted_prices.csv")