In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
!unzip /content/drive/MyDrive/Amazon_dataset.zip -d /content/

Archive:  /content/drive/MyDrive/Amazon_dataset.zip
replace /content/Amazon_dataset/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/.DS_Store  
replace /content/Amazon_dataset/dataset/sample_test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/dataset/sample_test.csv  
replace /content/Amazon_dataset/dataset/sample_test_out.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/dataset/sample_test_out.csv  
replace /content/Amazon_dataset/dataset/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/dataset/test.csv  
replace /content/Amazon_dataset/dataset/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/dataset/train.csv  
replace /content/Amazon_dataset/Documentation_template.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/Amazon_dataset/Documentation_template.md  
replace /content/Amazon_datase

In [51]:
!nvidia-smi
!pip install -q transformers accelerate datasets sentencepiece torch


Mon Oct 13 12:24:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P0             26W /   70W |    6352MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [59]:
import os, random, math
import numpy as np, pandas as pd, torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
# === CONFIG ===
DATA_DIR   = "/content/Amazon_dataset/dataset"   # ✅ your dataset path
MODEL_NAME = "chandar-lab/NeoBERT"
SAVE_DIR   = "/content/neo_model"
os.makedirs(SAVE_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5
WARMUP_STEPS = 200
GRAD_CLIP = 1.0
WEIGHT_DECAY = 0.01

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if DEVICE == "cuda": torch.cuda.manual_seed_all(SEED)
print("✅ Device:", DEVICE)


✅ Device: cuda


In [60]:
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df  = pd.read_csv(f"{DATA_DIR}/test.csv")

print(train_df.shape, test_df.shape)
train_df = train_df.dropna(subset=['catalog_content','price']).reset_index(drop=True)
print(train_df.head(2))


(75000, 4) (75000, 3)
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  


In [61]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

class PricingDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256, is_train=True):
        self.texts = df['catalog_content'].astype(str).tolist()
        self.is_train = is_train
        self.tokenizer = tokenizer
        self.max_len = max_len
        if is_train:
            self.targets = np.log1p(df['price'].astype(np.float32))

    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = self.tokenizer(
            self.texts[i], truncation=True, padding='max_length',
            max_length=self.max_len, return_tensors=None
        )
        item = {k: torch.tensor(v, dtype=torch.long) for k,v in enc.items()}
        if self.is_train:
            item['target'] = torch.tensor(self.targets.iloc[i], dtype=torch.float)

        return item

def collate_fn(batch):
    out = {
        'input_ids': torch.stack([b['input_ids'] for b in batch]),
        'attention_mask': torch.stack([b['attention_mask'] for b in batch])
    }
    if 'target' in batch[0]:
        out['target'] = torch.stack([b['target'] for b in batch])
    return out


In [62]:
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=SEED)
train_df = train_df.reset_index(drop=True)  # ✅ reset index
val_df   = val_df.reset_index(drop=True)    # ✅ reset index

train_ds = PricingDataset(train_df, tokenizer, MAX_LEN, True)
val_ds   = PricingDataset(val_df, tokenizer, MAX_LEN, True)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)


In [56]:
!pip install -q xformers


In [79]:
class NeoBertRegressor(nn.Module):
    def __init__(self, model_name, dropout=0.2, hidden=512):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        emb = self.encoder.config.hidden_size
        self.regressor = nn.Sequential(
            nn.Linear(emb, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1)
        )
        for layer in self.regressor:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0.0)

        # Set final layer bias to roughly match target mean (~2.5)
        self.regressor[-1].bias.data.fill_(2.5)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls = out.last_hidden_state[:,0,:]
        output = self.regressor(cls).squeeze(-1)
        return output

model = NeoBertRegressor(MODEL_NAME).to(DEVICE)


In [80]:
def smape_loss_from_log(pred_log, targ_log, eps=1e-6):
    pred, targ = torch.expm1(pred_log), torch.expm1(targ_log)
    pred = torch.clamp(pred, min=eps); targ = torch.clamp(targ, min=eps)
    return torch.mean(torch.abs(pred - targ) / ((torch.abs(pred)+torch.abs(targ))/2))

def smape_np(preds, targs, eps=1e-6):
    preds = np.maximum(preds, eps); targs = np.maximum(targs, eps)
    return np.mean(np.abs(preds-targs)/((np.abs(preds)+np.abs(targs))/2))


In [81]:
LR = 5e-5  # Increase from 2e-5 to 5e-5 for better gradient flow

no_decay = ["bias", "LayerNorm.weight"]
params = [
    {"params":[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay)],"weight_decay":WEIGHT_DECAY},
    {"params":[p for n,p in model.named_parameters() if any(nd in n for nd in no_decay)],"weight_decay":0.0},
]
optimizer = AdamW(params, lr=LR)
total_steps = len(train_loader)*EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, WARMUP_STEPS, total_steps)
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [82]:
# Quick diagnostic before training
model.eval()
with torch.no_grad():
    batch = next(iter(train_loader))
    ids = batch['input_ids'].to(DEVICE)
    mask = batch['attention_mask'].to(DEVICE)
    preds = model(ids, mask)

    print("=== DIAGNOSTIC CHECK ===")
    print("Sample predictions (log scale):", preds[:5])
    print("Pred range (log):", preds.min().item(), preds.max().item())
    print("Sample targets (log scale):", batch['target'].to(DEVICE)[:5])
    print("Target range (log):", batch['target'].min().item(), batch['target'].max().item())
    print("=======================")

=== DIAGNOSTIC CHECK ===
Sample predictions (log scale): tensor([3.1364, 4.2192, 4.0817, 3.8016, 3.1776], device='cuda:0')
Pred range (log): 3.136380195617676 4.219173431396484
Sample targets (log scale): tensor([0.9322, 3.7386, 2.0136, 1.0953, 2.8326], device='cuda:0')
Target range (log): 0.9321640729904175 3.738621473312378


In [None]:
best_smape = 999
for epoch in range(1, EPOCHS+1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
    train_losses=[]
    for batch in pbar:
        ids, mask, y = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE), batch['target'].to(DEVICE)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            preds = model(ids, mask)
            loss = smape_loss_from_log(preds, y)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optimizer); scaler.update(); scheduler.step()
        train_losses.append(loss.item()); pbar.set_postfix(loss=np.mean(train_losses))
    print(f"Epoch {epoch} train_loss: {np.mean(train_losses):.5f}")

    # Validation
    model.eval(); val_preds=[]; val_targs=[]
    with torch.no_grad():
        for b in val_loader:
            ids, mask, y = b['input_ids'].to(DEVICE), b['attention_mask'].to(DEVICE), b['target'].to(DEVICE)
            with torch.cuda.amp.autocast():
                out = model(ids, mask)
            val_preds.extend(torch.expm1(out).cpu().numpy())
            val_targs.extend(torch.expm1(y).cpu().numpy())
    smape = smape_np(np.array(val_preds), np.array(val_targs))
    print(f"Epoch {epoch} val_SMAPE: {smape:.4f}")
    if smape < best_smape:
        best_smape = smape
        torch.save(model.state_dict(), f"{SAVE_DIR}/best_model.pth")
        tokenizer.save_pretrained(SAVE_DIR)
        print("✅ Saved new best model")


Epoch 1:   0%|          | 0/7969 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Epoch 1 train_loss: 0.98467


  with torch.cuda.amp.autocast():


Epoch 1 val_SMAPE: 0.5867
✅ Saved new best model


Epoch 2:   0%|          | 0/7969 [00:00<?, ?it/s]

In [None]:
model.load_state_dict(torch.load(f"{SAVE_DIR}/best_model.pth", map_location=DEVICE))
model.eval()

test_ds = PricingDataset(test_df, tokenizer, MAX_LEN, is_train=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

preds=[]
with torch.no_grad():
    for b in tqdm(test_loader, desc="Predict"):
        ids, mask = b['input_ids'].to(DEVICE), b['attention_mask'].to(DEVICE)
        with torch.cuda.amp.autocast():
            out = model(ids, mask)
        preds.extend(torch.expm1(out).cpu().numpy().tolist())

preds = np.maximum(np.array(preds), 1e-3)
submission = pd.DataFrame({"sample_id": test_df['sample_id'], "price": preds})
out_path = "/content/test_out.csv"
submission.to_csv(out_path, index=False)
print("✅ Submission saved at", out_path)
submission.head()
