In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import requests
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sentence_transformers import SentenceTransformer

# -------------------------------
# 1. Download train/test from gdrive (as before)
import gdown, re
os.makedirs('dataset', exist_ok=True)
def download_from_gdrive(gdrive_url, output_path):
    m = re.search(r'/d/([a-zA-Z0-9_-]+)', gdrive_url)
    file_id = m.group(1)
    direct_url = f'https://drive.google.com/uc?export=download&id={file_id}'
    gdown.download(direct_url, output_path, quiet=False)
TRAIN_URL="https://drive.google.com/file/d/1zl1Ge8rOdFr1DT4F4zzrEuSMqY3YLmhM/view?usp=drive_link"
TEST_URL="https://drive.google.com/file/d/1U1AAOz9z5WlXvFjn17oAsPaj9vzcxgOO/view?usp=drive_link"
if not os.path.exists('dataset/train.csv'):
    download_from_gdrive(TRAIN_URL, 'dataset/train.csv')
if not os.path.exists('dataset/test.csv'):
    download_from_gdrive(TEST_URL, 'dataset/test.csv')

train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# -------------------------------
# 2. Feature Engineering

# Helper: extract item pack quantity (IPQ)
def extract_ipq(text):
    import re
    m = re.search(r'(\d+)\s?(pcs|pieces|pack|pk|ct|count)', str(text).lower())
    if m:
        return int(m.group(1))
    nums = re.findall(r'\d+', str(text))
    if nums:
        return int(nums[-1])
    return 1
train['ipq'] = train['catalog_content'].map(extract_ipq)
test['ipq'] = test['catalog_content'].map(extract_ipq)
# Brand from first word
def extract_brand(text):
    return text.split()[0].lower() if isinstance(text, str) and len(text.split()) else 'unknown'
train['brand'] = train['catalog_content'].map(extract_brand)
test['brand'] = test['catalog_content'].map(extract_brand)
le = LabelEncoder()
train['brand_enc'] = le.fit_transform(train['brand'])
test['brand_enc'] = le.transform(test['brand'])

# -------------------------------
# 3. Text Embeddings (GPU, fast!)
print("Extracting text embeddings...")
sbert = SentenceTransformer('all-MiniLM-L6-v2')  # Only 22MB, GPU-accelerated
train_text_emb = sbert.encode(train['catalog_content'].fillna("").tolist(), batch_size=128, show_progress_bar=True)
test_text_emb = sbert.encode(test['catalog_content'].fillna("").tolist(), batch_size=128, show_progress_bar=True)

# -------------------------------
# 4. Image Embeddings (GPU, fast batch!)
IMG_SIZE = 96
def get_image_tensor(url):
    try:
        response = requests.get(url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert('RGB').resize((IMG_SIZE, IMG_SIZE))
        return np.array(img) / 255.0
    except Exception:
        return np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)

def batch_image_embeddings(urls):
    # EfficientNetV2S is small and fast, or use MobileNetV2
    base_cnn = tf.keras.applications.MobileNetV2(include_top=False, pooling='avg', input_shape=(IMG_SIZE,IMG_SIZE,3))
    feats = []
    for i in tqdm(range(0, len(urls), 128), desc="Img batches"):
        batch_imgs = [get_image_tensor(u) for u in urls[i:i+128]]
        batch_imgs = np.stack(batch_imgs)
        batch_feats = base_cnn.predict(batch_imgs, verbose=0)
        feats.append(batch_feats)
    return np.concatenate(feats, axis=0)

# Only do this ONCE and save to .npy for repeated runs!
if not os.path.exists('train_img_emb.npy'):
    train_img_emb = batch_image_embeddings(train['image_link'].tolist())
    np.save('train_img_emb.npy', train_img_emb)
else:
    train_img_emb = np.load('train_img_emb.npy')
if not os.path.exists('test_img_emb.npy'):
    test_img_emb = batch_image_embeddings(test['image_link'].tolist())
    np.save('test_img_emb.npy', test_img_emb)
else:
    test_img_emb = np.load('test_img_emb.npy')

# -------------------------------
# 5. Assemble features
scaler = MinMaxScaler()
train_tab = scaler.fit_transform(train[['ipq','brand_enc']])
test_tab = scaler.transform(test[['ipq','brand_enc']])
X_train = np.hstack([train_text_emb, train_tab, train_img_emb])
X_test = np.hstack([test_text_emb, test_tab, test_img_emb])
y_train = train['price'].values

# -------------------------------
# 6. Deep Learning Model
inputs = layers.Input(shape=(X_train.shape[1],))
x = layers.Dense(256, activation='relu')(inputs)
x = layers.BatchNormalization()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(1, activation='relu')(x)  # price must be positive
model = models.Model(inputs, x)
model.compile(optimizer=optimizers.Adam(1e-3), loss='mae')
print(model.summary())

# Early stopping for competition
cb = [callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
history = model.fit(X_train, y_train, validation_split=0.08, epochs=30, batch_size=512, callbacks=cb, verbose=2)

# -------------------------------
# 7. Predict and Output
pred = model.predict(X_test, batch_size=256).reshape(-1)
pred = np.clip(pred, 1, None)
out = pd.DataFrame({'sample_id': test['sample_id'], 'price': np.round(pred,2)})
out.to_csv('test_out.csv', index=False)
print("test_out.csv generated!")

Downloading...
From: https://drive.google.com/uc?export=download&id=1zl1Ge8rOdFr1DT4F4zzrEuSMqY3YLmhM
To: /content/dataset/train.csv
100%|██████████| 73.5M/73.5M [00:00<00:00, 89.2MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1U1AAOz9z5WlXvFjn17oAsPaj9vzcxgOO
To: /content/dataset/test.csv
100%|██████████| 73.2M/73.2M [00:01<00:00, 71.6MB/s]


Extracting text embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Img batches: 100%|██████████| 586/586 [1:05:38<00:00,  6.72s/it]
Img batches: 100%|██████████| 586/586 [1:05:40<00:00,  6.72s/it]


None
Epoch 1/30
135/135 - 6s - 48ms/step - loss: 16.0558 - val_loss: 16.5253
Epoch 2/30
135/135 - 1s - 4ms/step - loss: 13.9069 - val_loss: 25.3706
Epoch 3/30
135/135 - 1s - 4ms/step - loss: 13.4591 - val_loss: 51.2472
Epoch 4/30
135/135 - 1s - 4ms/step - loss: 13.2042 - val_loss: 39.3786
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
test_out.csv generated!
