# Model Training Notebook
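This notebook trains two components: an NLP pipeline (Sentence-Transformers text embeddings clustered with KMeans) and a CV image-classifier skeleton (transfer learning on a pretrained ResNet-18).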

In [ ]:
import pandas as pd, numpy as np
# Load the product catalogue. The path looks like a Colab-style /content
# layout -- adjust when running elsewhere.
csv_path = '/content/data/products.csv'
df = pd.read_csv(csv_path)


def _text_column(frame, name):
    """Return column *name* of *frame* as strings, one '' per row if absent.

    ``frame.get(name, '')`` is not usable for this: when the column is
    missing it returns the plain string ``''``, which has no ``.fillna``.
    """
    if name in frame.columns:
        # astype(str) keeps the later string concatenation valid even when
        # pandas parsed the column as numeric.
        return frame[name].fillna('').astype(str)
    return pd.Series('', index=frame.index, dtype=object)


# One free-text field per product, "title | brand | description"; this is the
# text that gets embedded further down.
df['text_blob'] = (
    _text_column(df, 'title')
    + ' | ' + _text_column(df, 'brand')
    + ' | ' + _text_column(df, 'description')
)
# Trailing bare expression: the notebook displays the row count.
len(df)

## Sentence-Transformers embeddings

In [ ]:
from sentence_transformers import SentenceTransformer
# Sentence-level embedding model used for the text side of the pipeline.
# NOTE: the CV cell further down rebinds the name `model` to a ResNet, so
# re-running this cell after that one requires re-creating the encoder first
# (which this line does).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# encode() is documented for a str or a list of str. Hand it a plain list
# rather than a pandas Series so that any positional indexing inside the
# library cannot be confused by the DataFrame's index labels.
sentences = df['text_blob'].fillna('').tolist()
# float32 keeps the matrix compact for the clustering step.
X = model.encode(sentences).astype('float32')
# Trailing bare expression: the notebook displays (n_rows, embedding_dim).
X.shape

## KMeans clustering

In [ ]:
from sklearn.cluster import KMeans
# Target number of clusters. KMeans refuses to fit when there are fewer
# samples than clusters, so clamp to the number of embedded rows; for any
# catalogue with at least 8 products this is still 8.
# NOTE: an empty catalogue (k == 0) is still rejected by KMeans.
k = min(8, len(X))
# Fixed seed and 10 restarts so cluster assignments are repeatable.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = km.fit_predict(X)
# Attach the cluster id to every product row.
df['cluster'] = labels
# Trailing bare expression: the notebook displays the cluster sizes.
df['cluster'].value_counts()

## CV Training (Transfer Learning Skeleton)

In [ ]:
import torch, torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import requests, io
# Furniture classes for the CV classifier. The position in this list is the
# class index; the dataset also scans it in this order when matching keywords,
# so earlier entries win when several appear in a product's text.
LABELS = [
    'chair',
    'table',
    'sofa',
    'bed',
    'cabinet',
    'lamp',
    'shelf',
    'stool',
    'bench',
    'desk',
]
# Reverse lookup: class name -> class index (not referenced by the code
# visible in this notebook; kept for downstream use).
label_to_idx = dict(zip(LABELS, range(len(LABELS))))
class URLImageDataset(Dataset):
    """Map-style dataset that downloads one image per catalogue row.

    Each item is ``(image, label)``:

    * ``image`` is fetched from the first comma-separated entry of the row's
      ``images`` field. If the URL is empty or the download/decoding fails, a
      solid grey 224x224 RGB placeholder is used instead so that a single bad
      URL does not abort training.
    * ``label`` is a weak label: the index of the first class name found as a
      substring of the lower-cased ``title`` + ``categories`` text.

    Caveats of the weak labelling (unchanged from the original behaviour):
    matching is by substring, so e.g. ``'table'`` also matches
    ``'portable'``; and rows matching no class fall back to index 0.

    Args:
        df: DataFrame with (optionally) ``images``, ``title`` and
            ``categories`` columns. Its index is reset so items are
            addressed 0..len-1.
        transform: optional callable applied to the PIL image.
        labels: optional sequence of lower-case class names; defaults to the
            module-level ``LABELS``. Its length must agree with the
            classifier head that consumes the labels.
    """

    _PLACEHOLDER_SIZE = (224, 224)
    _PLACEHOLDER_COLOR = (200, 200, 200)

    def __init__(self, df, transform=None, labels=None):
        self.df = df.reset_index(drop=True)
        self.t = transform
        self.labels = list(LABELS if labels is None else labels)

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return *value* as a string, mapping None and NaN to ''.

        pandas represents missing cells as float NaN, which is truthy, so
        the usual ``value or ''`` guard lets NaN through and the following
        string concatenation raises TypeError.
        """
        if value is None:
            return ''
        if isinstance(value, float) and value != value:  # NaN != NaN
            return ''
        return str(value)

    def _label_for(self, row):
        """Return the index of the first class name found in the row's text."""
        text = (
            self._as_text(row.get('title'))
            + ' '
            + self._as_text(row.get('categories'))
        ).lower()
        for idx, name in enumerate(self.labels):
            if name in text:
                return idx
        # No keyword matched: fall back to class 0, as the original did.
        return 0

    def _load_image(self, url):
        """Download *url* as an RGB image, or return the grey placeholder."""
        if url:
            try:
                img_bytes = requests.get(url, timeout=10).content
                return Image.open(io.BytesIO(img_bytes)).convert('RGB')
            except Exception:
                # Deliberate best-effort: network errors, non-image bodies and
                # corrupt files all degrade to the placeholder below.
                pass
        return Image.new('RGB', self._PLACEHOLDER_SIZE, self._PLACEHOLDER_COLOR)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        # Only the first listed image is used; a missing cell yields '' and
        # skips the request entirely.
        url = self._as_text(row.get('images', '')).split(',')[0].strip()
        img = self._load_image(url)
        if self.t is not None:
            img = self.t(img)
        return img, self._label_for(row)
# --- Data -------------------------------------------------------------------
# Resize to the 224x224 input used below and convert to a float tensor.
# NOTE(review): no mean/std normalisation is applied before the pretrained
# backbone -- confirm what preprocessing the inference side uses before
# adding one here, since the two must match.
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
# Train on at most 500 rows; seeded like the KMeans cell so the same subset is
# drawn on every run.
ds = URLImageDataset(df.sample(min(500, len(df)), random_state=42), transform)
dl = DataLoader(ds, batch_size=8, shuffle=True)

# --- Model ------------------------------------------------------------------
# Pretrained ResNet-18 with its final layer replaced by a fresh head sized to
# the label set. NOTE: this rebinds `model`, which the embeddings cell also
# uses for the sentence encoder.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = nn.Linear(model.fc.in_features, len(LABELS))
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
crit = nn.CrossEntropyLoss()

# --- Training (single epoch: this is a skeleton) ----------------------------
for epoch in range(1):
    model.train()
    total = 0
    correct = 0
    for x, y in dl:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        out = model(x)
        loss = crit(out, y)
        loss.backward()
        opt.step()
        pred = out.argmax(1)
        total += y.size(0)
        correct += (pred == y).sum().item()
    # An empty sample yields no batches; report NaN instead of dividing by 0.
    acc = correct / total if total else float('nan')
    print('epoch', epoch, 'train acc', acc)

# --- Export -----------------------------------------------------------------
import os
os.makedirs('/content/models', exist_ok=True)
# A scripted module keeps the `training` flag it had when exported. Switch to
# eval mode first so the saved model uses inference behaviour (running
# statistics in normalisation layers) when it is loaded.
model.cpu().eval()
torch.jit.save(torch.jit.script(model), '/content/models/cv.pt')
print('Saved TorchScript to /content/models/cv.pt')