In [None]:
# ================= Kaggle Dataset Setup ==================
# Decides where the Amazon ML Challenge files are read from and publishes the
# choice as DATASET_FOLDER. When USE_LOCAL is False the dataset is first
# downloaded from Kaggle and unzipped into KAGGLE_DATASET_DIR.

import os

# Toggle: use local dataset (if already downloaded) or fetch from Kaggle
USE_LOCAL = True  # set to False to auto-download from Kaggle

# Paths
# NOTE(review): LOCAL_DATASET_DIR is under /content/drive, so it presumably needs
# Google Drive to be mounted before anything is read from it — confirm cell order.
LOCAL_DATASET_DIR = '/content/drive/MyDrive/amazon-ml-challenge/dataset'
KAGGLE_DATASET_DIR = '/content/amazon-ml-challenge-dataset'

if not USE_LOCAL:
    # Upload your kaggle.json first if not already present
    # NOTE(review): the check below hard-codes /root/.kaggle while the copy
    # targets ~/.kaggle; the two agree only when HOME is /root — confirm.
    from google.colab import files
    if not os.path.exists('/root/.kaggle/kaggle.json'):
        print("Please upload kaggle.json file")
        # Opens the Colab upload dialog; the uploaded file is expected to be
        # named kaggle.json in the current working directory for the cp below.
        files.upload()
        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        # Restrict the credentials file to owner read/write only.
        !chmod 600 ~/.kaggle/kaggle.json

    # Download and extract dataset from Kaggle
    # {KAGGLE_DATASET_DIR} is interpolated from the Python variable above.
    # NOTE(review): assumes the downloaded archive is named
    # amazon-ml-challenge.zip — verify against the Kaggle dataset slug.
    !kaggle datasets download -d preethamaap/amazon-ml-challenge -p {KAGGLE_DATASET_DIR}
    !unzip -o {KAGGLE_DATASET_DIR}/amazon-ml-challenge.zip -d {KAGGLE_DATASET_DIR}

# Select dataset path dynamically
DATASET_FOLDER = LOCAL_DATASET_DIR if USE_LOCAL else KAGGLE_DATASET_DIR

print(f"✅ Using dataset from: {DATASET_FOLDER}")


##  Basic Library imports

In [None]:
!pip install utils tensorflow

Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: utils
  Building wheel for utils (setup.py) ... [?25l[?25hdone
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13906 sha256=1276ba179de3fbdca50d0f757cfe7116478847c5460fe5e82ef43716592f0d4c
  Stored in directory: /root/.cache/pip/wheels/b6/a1/81/1036477786ae0e17b522f6f5a838f9bc4288d1016fc5d0e1ec
Successfully built utils
Installing collected packages: utils
Successfully installed utils-1.0.2


In [None]:
import os
import pandas as pd
import numpy as np
import cv2
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive so that paths under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ================= Load Dataset ==================
# Read the four CSV files found in DATASET_FOLDER into DataFrames, in a fixed order.
import os
import pandas as pd

train, test, sample_test, sample_test_out = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_test.csv', 'sample_test_out.csv')
)

print("✅ Loaded CSV files from:", DATASET_FOLDER)
print("Train shape:", train.shape)


In [None]:
import requests
import os

def download_images(url_list, save_dir, timeout=10, skip_existing=True):
    """Download every URL in ``url_list`` into ``save_dir``.

    Each file is stored under the last ``/``-separated component of its URL.
    A failure on one URL never aborts the remaining downloads.

    Parameters
    ----------
    url_list : iterable
        URLs to fetch. Entries that are not non-empty strings (e.g. ``None``
        or NaN) and URLs without a trailing file name are skipped.
    save_dir : str
        Target directory; created if it does not exist.
    timeout : float, optional
        Seconds to wait for the server before giving up on a single URL.
    skip_existing : bool, optional
        When True, a URL whose target file already exists and is non-empty is
        not fetched again, so re-running the cell is cheap.

    Returns
    -------
    None
    """
    os.makedirs(save_dir, exist_ok=True)
    for url in url_list:
        # Non-string entries cannot be split into a file name.
        if not isinstance(url, str) or not url:
            continue
        basename = url.split("/")[-1]
        if not basename:
            # URL ends with "/": the target would be the directory itself.
            continue
        filename = os.path.join(save_dir, basename)
        if skip_existing and os.path.isfile(filename) and os.path.getsize(filename) > 0:
            continue
        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue
        if response.status_code == 200:
            with open(filename, "wb") as f:
                f.write(response.content)


In [None]:
# The import below is an inert string literal (it is not executed); the call uses
# whichever download_images is already defined in the notebook namespace.
'from utils import download_images'
save_path = '/content/drive/MyDrive/amazon-ml-challenge/images'
download_images(url_list=train['image_link'], save_dir=save_path)


In [None]:
# The download cell writes into save_path, so that is the directory that must be
# non-empty; nothing in this notebook populates '../images'.
assert len(os.listdir(save_path)) > 0, f"No images found in {save_path}"

##  Read Dataset

In [None]:
# Disabled cleanup: this is a plain string literal, so no shell command is run.
'rm -rf ../images'

In [None]:
import tensorflow as tf

# Batch size and target image size handed to the directory loader below.
batch_size, img_size = 32, (128, 128)

# Build a shuffled, batched image dataset from the downloaded files.
train_dataset = tf.keras.utils.image_dataset_from_directory(
    directory='/content/drive/MyDrive/amazon-ml-challenge/images',
    batch_size=batch_size,
    image_size=img_size,
    shuffle=True,
    labels=None,  # provide your own labels if not using subdirectories
)

In [None]:
# (height, width): the model's input_shape is built as (IMG_SIZE[0], IMG_SIZE[1], 3).
IMG_SIZE = (128, 128)

def load_preprocess_local_image(filepath, img_size=IMG_SIZE):
    """Read an image from disk, resize it and divide its pixel values by 255.

    Parameters
    ----------
    filepath : str
        Path handed to ``cv2.imread``.
    img_size : tuple of int, optional
        Target size as ``(height, width)``; defaults to ``IMG_SIZE``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(height, width, 3)``. When the file cannot be read
        (``cv2.imread`` returns ``None``) an all-zero array of that shape is
        returned instead, so every result has the same shape.
    """
    height, width = img_size
    img = cv2.imread(filepath)
    if img is None:
        # Unreadable or missing file: substitute a blank placeholder image.
        return np.zeros((height, width, 3))
    # cv2.resize expects dsize as (width, height), whereas the resulting array
    # is indexed (height, width, channels); swap so non-square sizes agree
    # with the placeholder above.
    img = cv2.resize(img, (width, height))
    return img / 255.0

# Training images are read from the Drive folder that the download cell writes to
# (and that image_dataset_from_directory reads from); '../images' is never
# populated by this notebook, so every image would fall back to the zero placeholder.
IMAGE_DIR = '/content/drive/MyDrive/amazon-ml-challenge/images'
train_image_paths = [os.path.join(IMAGE_DIR, os.path.basename(url)) for url in train['image_link']]
# One preprocessed array per row of `train`, stacked into a single ndarray.
train_images = np.array([load_preprocess_local_image(path) for path in train_image_paths])
train_prices = train['price'].values


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; random_state is fixed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_images, train_prices, test_size=0.1, random_state=42
)


In [None]:
# Build the CNN model
# Three convolution stages (32 -> 64 -> 128 filters), the first two followed by
# 2x2 max pooling, then a dense head ending in a single linear unit.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(*IMG_SIZE, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1))  # Regression output

# Mean squared error is optimised; mean absolute error is tracked as a metric.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

In [None]:
# Training
# Fit for 20 epochs in mini-batches of 32, passing the held-out split as validation data.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
)

In [None]:
# Plot loss history
# One curve per recorded series: (key in history.history, legend label).
for series_key, legend_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history[series_key], label=legend_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()
plt.show()