In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Notebook: smart_pricing_solution.ipynb
# Requirements (pip):
# pip install pandas numpy scikit-learn lightgbm tqdm pillow requests torchvision timm sentence-transformers joblib
# Optional: catboost, transformers (if you want to experiment)

# Imports
import os
import re
import gc
import json
import time
import math
import joblib
import random
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# For images and CNN features
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T
import timm  # recommended for many small CNN backbones

In [13]:
# Paths - adjust as needed
ROOT = Path.cwd()
DATA_DIR = ROOT / "dataset"
TRAIN_CSV = DATA_DIR / "/content/Amazon_ML/train.csv"
TEST_CSV  = DATA_DIR / "/content/Amazon_ML/test.csv"
OUTPUT_DIR = ROOT / "outputs"
IMAGES_DIR = ROOT / "images"  # where we download images
OUTPUT_DIR.mkdir(exist_ok=True)
IMAGES_DIR.mkdir(exist_ok=True)

In [15]:
# Utility: parse item pack quantity (IPQ) from catalog_content, naive but effective
def extract_ipq(text):
    if not isinstance(text, str):
        return np.nan
    # look for patterns like "pack of 2", "2 Pack", "Pack of 20", "10 pcs", "10pieces", "500 ml x 2"
    patterns = [
        r'pack of\s*(\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*pcs',
        r'(\d+)\s*pieces',
        r'(\d+)\s*p?cs',
        r'(\d+)\s*count',
        r'(\d+)\s*x\s*\d*',  # "500 ml x 2" -> captures 500 possibly; be careful later
        r'(\d+)\s*ml\s*x\s*(\d+)',  # to capture "500 ml x 2"
    ]
    textl = text.lower()
    for pat in patterns:
        m = re.search(pat, textl)
        if m:
            # Return the last numeric capture that is plausibly small (<1000)
            nums = [int(g) for g in m.groups() if g and g.isdigit()] if m.groups() else []
            if nums:
                # prefer smaller number if multiple, else first
                for n in reversed(nums):
                    if 1 <= n <= 10000:
                        return n
                return nums[-1]
    # fallback: try to find isolated small ints in the text
    nums_all = re.findall(r'\b(\d{1,4})\b', textl)
    for n in nums_all:
        ni = int(n)
        if 1 <= ni <= 1000:
            return ni
    return np.nan

In [16]:
def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-6
    return np.mean(np.abs(y_pred - y_true) / denom) * 100.0

In [17]:
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

print("train shape:", train.shape)
print("test shape:", test.shape)
train.head()

train shape: (75000, 4)
test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [18]:
train['is_train'] = 1
test['is_train'] = 0
test['price'] = np.nan
df = pd.concat([train, test], axis=0, ignore_index=True)

In [19]:
# Parse IPQ
tqdm.pandas()
df['ipq'] = df['catalog_content'].progress_apply(extract_ipq)

# text length features
df['catalog_len'] = df['catalog_content'].fillna("").apply(len)
df['catalog_word_count'] = df['catalog_content'].fillna("").apply(lambda x: len(x.split()))

# image link existence
df['has_image'] = df['image_link'].notnull().astype(int)

# quick price log transform for stability (for training)
df['price_log'] = np.log1p(df['price'].fillna(0))  # only meaningful on training rows

# Inspect
df[['sample_id','ipq','catalog_len','has_image','price']].head()

  0%|          | 0/150000 [00:00<?, ?it/s]

Unnamed: 0,sample_id,ipq,catalog_len,has_image,price
0,33127,6.0,91,1,4.89
1,198967,4.0,511,1,13.12
2,261251,6.0,328,1,1.97
3,55858,11.0,1318,1,30.34
4,292686,12.0,155,1,66.49


In [20]:
def download_image(url, out_path, max_retries=3, timeout=10):
    if out_path.exists():
        return True
    for attempt in range(max_retries):
        try:
            r = requests.get(url, stream=True, timeout=timeout)
            if r.status_code == 200:
                with open(out_path, 'wb') as f:
                    for chunk in r.iter_content(1024):
                        f.write(chunk)
                return True
        except Exception as e:
            time.sleep(1 + attempt)
    return False

def download_all_images(df, images_dir=IMAGES_DIR, n_workers=8, limit=None):
    rows = df[['sample_id','image_link']].drop_duplicates()
    if limit:
        rows = rows.head(limit)
    for _, r in tqdm(rows.iterrows(), total=len(rows)):
        sid, url = r['sample_id'], r['image_link']
        if not isinstance(url, str) or not url.startswith('http'):
            continue
        outp = images_dir / f"{sid}.jpg"
        download_image(url, outp)

In [None]:
# For development, limit first 5000 images; for final run remove limit
download_all_images(df[df['has_image']==1], images_dir=IMAGES_DIR, limit=5000)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a lightweight pretrained backbone
model_name = "tf_efficientnet_b0_ns"  # timm model name (small)
backbone = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool='avg')  # returns feature vector
backbone.to(device)
backbone.eval()

transform = T.Compose([
    T.Resize((224,224)),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

class ImageDataset(Dataset):
    def __init__(self, df, images_dir, transform=None):
        self.df = df
        self.images_dir = Path(images_dir)
        self.transform = transform
        self.ids = df['sample_id'].values

    def __len__(self): return len(self.ids)
    def __getitem__(self, idx):
        sid = str(self.ids[idx])
        p = self.images_dir / f"{sid}.jpg"
        if p.exists():
            img = Image.open(p).convert('RGB')
        else:
            # fallback blank image
            img = Image.new('RGB', (224,224), (255,255,255))
        if self.transform:
            img = self.transform(img)
        return sid, img

def extract_image_features(df_subset, images_dir=IMAGES_DIR, batch_size=64):
    ds = ImageDataset(df_subset, images_dir, transform=transform)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    feats = {}
    with torch.no_grad():
        for sids, imgs in tqdm(dl, total=len(dl)):
            imgs = imgs.to(device)
            out = backbone(imgs)
            out = out.cpu().numpy()
            for sid, vec in zip(sids, out):
                feats[int(sid)] = vec
    return feats

# Extract for rows that have images (again limit for dev)
image_feats = extract_image_features(df[df['has_image']==1].head(5000))
# Save for reuse
joblib.dump(image_feats, OUTPUT_DIR/"image_feats_sample.pkl")

  model = create_fn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]



  0%|          | 0/79 [00:00<?, ?it/s]

['/content/outputs/image_feats_sample.pkl']

In [25]:
# TF-IDF on catalog_content
text_col = 'catalog_content'
corpus = df[text_col].fillna("").astype(str).tolist()

tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2), min_df=5)
X_tfidf = tfidf.fit_transform(corpus)  # sparse matrix

# reduce dimensionality
svd = TruncatedSVD(n_components=200, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

# attach back to df (as columns)
cols = [f"txt_svd_{i}" for i in range(X_svd.shape[1])]
txt_svd_df = pd.DataFrame(X_svd, columns=cols)
txt_svd_df.index = df.index
df = pd.concat([df.reset_index(drop=True), txt_svd_df.reset_index(drop=True)], axis=1)

In [26]:
# Tabular numeric features
num_feats = ['ipq','catalog_len','catalog_word_count','has_image']
# Fill ipq missing with 1 (assumption) and scale
df['ipq'] = df['ipq'].fillna(1).astype(float)
df[num_feats] = df[num_feats].fillna(0)

# Add image reduced features if present
# Load image_feats dict
image_feats = joblib.load(OUTPUT_DIR/"image_feats_sample.pkl")  # adapt path
# Build DataFrame aligned with df index
img_dim = list(image_feats.values())[0].shape[0]
img_rows = []
for sid in df['sample_id'].values:
    if int(sid) in image_feats:
        img_rows.append(image_feats[int(sid)])
    else:
        img_rows.append(np.zeros(img_dim, dtype=float))
img_arr = np.vstack(img_rows)
# Reduce image features with SVD to 64 dims
from sklearn.decomposition import PCA
pca_img = PCA(n_components=64, random_state=42)
img_reduced = pca_img.fit_transform(img_arr)
img_cols = [f"img_pca_{i}" for i in range(img_reduced.shape[1])]
img_df = pd.DataFrame(img_reduced, columns=img_cols)
img_df.index = df.index
df = pd.concat([df, img_df.reset_index(drop=True)], axis=1)

# Compile final feature list
tfidf_svd_cols = [c for c in df.columns if c.startswith('txt_svd_')]
img_cols = [c for c in df.columns if c.startswith('img_pca_')]
feature_cols = num_feats + tfidf_svd_cols + img_cols
len(feature_cols)

468

In [27]:
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols].fillna(0))
joblib.dump(scaler, OUTPUT_DIR/"scaler.joblib")

['/content/outputs/scaler.joblib']

In [28]:
# Prepare training arrays
train_df = df[df['is_train']==1].reset_index(drop=True)
test_df  = df[df['is_train']==0].reset_index(drop=True)

X = train_df[feature_cols].values
y = train_df['price'].values  # target in original price space
y_log = np.log1p(y)

# KFold training with out-of-fold predictions
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

lgb_params = {
    "objective": "regression",
    "metric": "mae",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 127,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42,
    "verbose": -1,
    "n_jobs": -1
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y_log[tr_idx], y_log[val_idx]  # train on log price for stability

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val)
    model = lgb.train(lgb_params, dtrain, num_boost_round=5000,
                      valid_sets=[dtrain,dval], callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)])
    # preds are in log space -> convert back
    val_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred = np.expm1(val_pred_log)
    oof_preds[val_idx] = val_pred

    test_pred_log = model.predict(test_df[feature_cols].values, num_iteration=model.best_iteration)
    test_pred = np.expm1(test_pred_log)
    test_preds += test_pred / n_splits

    # save model
    model.save_model(str(OUTPUT_DIR/f"lgb_fold{fold+1}.txt"))

# Evaluate
score = smape(train_df['price'].values, oof_preds)
print("OOF SMAPE:", score)

# Save OOF and test predictions
train_df['oof_pred'] = oof_preds
test_df['predicted_price'] = test_preds

Fold 1
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.416824	valid_1's l1: 0.566934
[400]	training's l1: 0.316384	valid_1's l1: 0.556541
[600]	training's l1: 0.247408	valid_1's l1: 0.551868
[800]	training's l1: 0.196502	valid_1's l1: 0.548716
[1000]	training's l1: 0.156618	valid_1's l1: 0.546683
[1200]	training's l1: 0.126043	valid_1's l1: 0.545272
[1400]	training's l1: 0.101851	valid_1's l1: 0.54445
[1600]	training's l1: 0.0826333	valid_1's l1: 0.543729
[1800]	training's l1: 0.0678064	valid_1's l1: 0.543257
[2000]	training's l1: 0.0555279	valid_1's l1: 0.543071
Early stopping, best iteration is:
[1934]	training's l1: 0.0593101	valid_1's l1: 0.543055
Fold 2
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.418504	valid_1's l1: 0.559144
[400]	training's l1: 0.317182	valid_1's l1: 0.547951
[600]	training's l1: 0.2475	valid_1's l1: 0.54292
[800]	training's l1: 0.195721	valid_1's l1: 0.540056
[1000]	training's l1: 0.1

In [32]:
# Ensure positive floats
test_df['predicted_price'] = np.clip(test_df['predicted_price'], 0.01, None)

submission = pd.DataFrame({
    "sample_id": test_df['sample_id'].astype(int),
    "price": test_df['predicted_price'].astype(float)
})

submission.to_csv(OUTPUT_DIR/"test_out.csv", index=False)
print("Saved submission to", OUTPUT_DIR/"test_out.csv")

Saved submission to /content/outputs/test_out.csv


In [None]:
!ls /content

In [None]:
import shutil
from google.colab import files

# ✅ Create ZIP of entire Colab session (code + outputs + notebook)
zip_path = "/content/Colab_Project_Backup"
shutil.make_archive(zip_path, 'zip', "/content")

print("✅ ZIP created at", zip_path + ".zip")

# ✅ Download ZIP
files.download(zip_path + ".zip")