# Old Attempted Strategy – Embedding Category Names with CLIP

In [None]:
import torch
import clip
from PIL import Image
from tqdm import tqdm
import os
import time
import random
from pathlib import Path
import numpy as np
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import accuracy_score

# Two-level shoe taxonomy: each top-level category name maps to the list of
# its sub-category names. Both levels are later embedded as text with CLIP.
category_dict = {
    "Boots": [
        "Ankle",
        "Knee High",
        "Mid-Calf",
        "Over the Knee",
        "Prewalker Boots",
    ],
    "Sandals": [
        "Athletic",
        "Flat",
        "Heel",
    ],
    "Shoes": [
        "Boat Shoes",
        "Clogs and Mules",
        "Crib Shoes",
        "Firstwalker",
        "Flats",
        "Heels",
        "Loafers",
        "Oxfords",
        "Prewalker",
        "Sneakers and Athletic Shoes",
    ],
    "Slippers": [
        "Boot",
        "Slipper Flats",
        "Slipper Heels",
    ],
}

# Load CLIP (ViT-B/32) together with its image preprocessing transform.
# The model goes on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def encode_several_texts(lst):
    """Embed a batch of strings with the CLIP text encoder.

    Args:
        lst: The strings to embed; handed unchanged to ``clip.tokenize``.

    Returns:
        The text features produced by ``model.encode_text``, with every row
        divided by its own L2 norm (i.e. scaled to unit length).
    """
    tokens = clip.tokenize(lst)
    tokens = tokens.to(device)

    # Inference only: no autograd graph is needed for the encoder pass.
    with torch.no_grad():
        embeddings = model.encode_text(tokens)

    row_norms = embeddings.norm(dim=-1, keepdim=True)
    return embeddings / row_norms

# Pre-compute the text embeddings once so they can be reused for every image:
#   encoded_categories      - one row per top-level category name
#   encoded_sub_categories  - category name -> embeddings of its sub-categories
encoded_categories = encode_several_texts(category_dict.keys())
encoded_sub_categories = {}
for category, sub_names in category_dict.items():
    encoded_sub_categories[category] = encode_several_texts(sub_names)

# Encoding images with CLIP and running similarities

In [2]:
def encode_image_given_image_path(image_path):
    """Embed the image stored at ``image_path`` with the CLIP image encoder.

    Args:
        image_path: Location of the image file, as accepted by ``Image.open``.

    Returns:
        The image features from ``model.encode_image`` for a batch holding
        just this image, divided by their L2 norm (unit length).
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # preprocess() yields a single image tensor; add a leading batch dimension
    # of 1 and move it to the device the model lives on.
    batch = preprocess(rgb_image).unsqueeze(0)
    batch = batch.to(device)

    # Inference only: skip autograd bookkeeping for the encoder pass.
    with torch.no_grad():
        embedding = model.encode_image(batch)

    return embedding / embedding.norm(dim=-1, keepdim=True)

# Zero-shot demo: classify one image hierarchically (category, then
# sub-category) by comparing its CLIP embedding with the text embeddings.
image_path = "test2.jpg"
image_features = encode_image_given_image_path(image_path)

# Stage 1 - top-level category. Both sides were scaled to unit length by the
# encode_* helpers, so the matrix product yields cosine similarities.
category_similarities = (image_features @ encoded_categories.T).squeeze(0).cpu().numpy()
category = list(category_dict.keys())[category_similarities.argmax()]

# Stage 2 - sub-category, scored only against the winning category's names.
subcategory_similarities = (image_features @ encoded_sub_categories[category].T).squeeze(0).cpu().numpy()
# The index must come from the *sub-category* scores. Re-using the category
# argmax here picked an unrelated entry and could even raise IndexError
# (e.g. 'Slippers' is category index 3 but has only 3 sub-categories).
subcategory = category_dict[category][subcategory_similarities.argmax()]
print(category, subcategory, category_similarities.max())

Sandals Flat 0.27488837


# Forward Encoding all images in the dataset
## (Prerequisite for training our MLP layers)

In [None]:
# --- CLIP on the CPU ---
# Re-bind the notebook-level `device`, `model` and `preprocess` to a CPU copy
# of CLIP ViT-B/32 and switch it to eval mode for the bulk encoding pass.
device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# --- Inputs: every *.jpg found recursively below the dataset root ---
root = Path("shoes")
all_jpgs = [*root.rglob("*.jpg")]
print(f"Found {len(all_jpgs)} JPGs under {root}")

# --- Outputs: one "<image stem>.npy" per image, written flat into out_dir ---
# NOTE(review): because only the file stem is used, two images in different
# sub-folders that share a file name map to the same output file, and the
# second one is skipped by the existence check below. The folder structure
# (and with it any class information) is not reproduced here, while the
# training cell reads Path("shoe_features") and derives labels from its
# sub-directories - confirm how the features get from here to there.
out_dir = Path("shoe_features_cpu")
out_dir.mkdir(exist_ok=True)

# --- Encode image by image ---
# Files that already exist are left alone, so an interrupted run can resume.
for img_path in tqdm(all_jpgs, desc="Encoding images", unit="img"):
    out_file = out_dir / (img_path.stem + ".npy")
    if not out_file.exists():
        # Load as RGB, preprocess, and add a leading batch dimension of 1.
        img = Image.open(img_path).convert("RGB")
        img_t = preprocess(img).unsqueeze(0).to(device)

        # Forward pass without autograd, then scale to unit L2 norm.
        with torch.no_grad():
            feat = model.encode_image(img_t)
            feat = feat / feat.norm(dim=-1, keepdim=True)

        # Drop the batch dimension and store the vector as a NumPy file.
        np.save(out_file, feat.cpu().numpy().squeeze())

print("All done!")

Found 50066 JPGs under shoes


Encoding images: 100%|██████████| 50066/50066 [1:32:44<00:00,  9.00img/s]  

All done!





# Training our MLP weights

In [None]:
# ======================= Training configuration =======================
# Folder that is scanned for pre-computed .npy feature files.
FEAT_ROOT = Path("shoe_features")
# Width of the probe's hidden layer.
HIDDEN_DIM = 256
# Training mini-batch size (the validation loader uses twice this).
BATCH_SZ = 256
# Number of passes over the training split.
EPOCHS = 15
# Learning rate handed to the Adam optimizer.
LR = 5e-4
# Fraction of all samples held out for validation.
VAL_SPLIT = 0.10
# Seed for the RNGs and for the train/val split generator.
SEED = 42
# Train on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# ======================================================================

def set_seed(s):
    """Seed every random number generator used in this notebook with ``s``.

    Covers, in this order, Python's ``random`` module, NumPy's global RNG,
    PyTorch's default generator and PyTorch's generators for all CUDA devices.

    Args:
        s: The integer seed value.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(s)
set_seed(SEED)

# ----- Step 1: load everything into RAM -----
# Every leaf directory below FEAT_ROOT that contains .npy files is one class,
# named by its path relative to FEAT_ROOT (e.g. Boots/ankle_boots). Each .npy
# file in it is one sample of that class.
print("Scanning feature files …")
samples, class_to_idx, classes = [], {}, []
for dirpath, dirnames, filenames in os.walk(FEAT_ROOT):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # `dirnames` in place fixes the (top-down) traversal order, so class
    # indices - and therefore the seeded train/val split and any saved
    # checkpoint - are reproducible across runs and machines.
    dirnames.sort()
    if dirnames:                                        # not a leaf dir
        continue

    npy_files = sorted(f for f in filenames if f.endswith(".npy"))
    if not npy_files:
        # A leaf dir holding only stray files must not become an empty class.
        continue

    rel = Path(dirpath).relative_to(FEAT_ROOT)
    cname = str(rel)                                    # e.g. Boots/ankle_boots
    if cname not in class_to_idx:
        class_to_idx[cname] = len(classes)
        classes.append(cname)

    lbl = class_to_idx[cname]
    samples.extend((Path(dirpath) / f, lbl) for f in npy_files)

if not samples:
    # Fail early with a clear message instead of an obscure error further
    # down (empty DataLoader / zero-class model) when the folder is missing
    # or empty.
    raise FileNotFoundError(f"No .npy feature files found under {FEAT_ROOT}")

print(f"Discovered {len(classes)} classes over {len(samples)} samples. Loading …")

# One row per sample; each file is expected to hold a single (512,) vector.
features = np.zeros((len(samples), 512), dtype=np.float32)
labels   = np.zeros(len(samples),       dtype=np.int64)

for i, (path, lbl) in enumerate(samples):
    features[i] = np.load(path)         # (512,)
    labels[i]   = lbl

# convert to tensors once
features = torch.from_numpy(features)
labels   = torch.from_numpy(labels)
dataset  = TensorDataset(features, labels)

# ----- Step 2: train/val split -----
# Hold out a VAL_SPLIT fraction (rounded down) for validation; the split is
# drawn from a dedicated generator seeded with SEED.
n_total = len(dataset)
n_val = int(VAL_SPLIT * n_total)
n_train = n_total - n_val

split_rng = torch.Generator().manual_seed(SEED)
train_ds, val_ds = random_split(dataset, [n_train, n_val], generator=split_rng)

# Only the training loader shuffles; validation runs with a doubled batch size.
train_loader = DataLoader(train_ds, batch_size=BATCH_SZ, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SZ * 2)

print(f"Train: {n_train}  |  Val: {n_val}")

# ----- Step 3: two-layer MLP probe -----
class MLPProbe(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        d_in: Size of each input feature vector.
        d_hid: Width of the hidden layer.
        n_cls: Number of output logits. The default is ``len(classes)`` as it
            was when this class definition was executed.
    """

    def __init__(self, d_in=512, d_hid=HIDDEN_DIM, n_cls=len(classes)):
        super().__init__()
        layers = [
            nn.Linear(d_in, d_hid),
            nn.ReLU(inplace=True),
            nn.Linear(d_hid, n_cls),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits that the layer stack produces for ``x``."""
        return self.net(x)

import json  # only needed in this cell, to persist the class mapping

# NOTE: this re-binds the notebook-level name `model` to the MLP probe.
model = MLPProbe().to(DEVICE)
opt    = torch.optim.Adam(model.parameters(), lr=LR)
crit   = nn.CrossEntropyLoss()

# Persist the class-name -> label-index mapping next to the checkpoint.
# Without it the probe's output indices cannot be turned back into class
# names, and the closing message used to claim it had been saved although
# nothing wrote it anywhere. Written before training so an interrupted run
# still leaves a usable mapping for whatever checkpoint was saved.
with open("best_mlp_probe_classes.json", "w", encoding="utf-8") as fh:
    json.dump(class_to_idx, fh, indent=2, ensure_ascii=False)

# ----- Step 4: training with a 10-second heartbeat -----
best_acc = 0.0
for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0      # sum of per-sample losses seen so far this epoch
    processed    = 0        # number of training samples seen so far this epoch
    t_start      = time.time()
    t_beat       = t_start

    for feats, lbls in train_loader:
        feats, lbls = feats.to(DEVICE), lbls.to(DEVICE)
        logits = model(feats)
        loss   = crit(logits, lbls)

        opt.zero_grad()
        loss.backward()
        opt.step()

        # crit returns the batch mean; weight by batch size so the running
        # total stays a per-sample sum even when the last batch is smaller.
        running_loss += loss.item() * feats.size(0)
        processed    += feats.size(0)

        # heartbeat every 10 s
        now = time.time()
        if now - t_beat >= 10:
            pct = processed / n_train * 100
            avg = running_loss / processed
            print(f"Epoch {epoch:2d}  {pct:6.2f}% | AvgLoss {avg:.4f}", flush=True)
            t_beat = now

    # ---------- validation ----------
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for feats, lbls in val_loader:
            logits = model(feats.to(DEVICE))
            preds.extend(logits.argmax(dim=-1).cpu().numpy())
            gts.extend(lbls.numpy())

    acc = accuracy_score(gts, preds)
    train_loss = running_loss / n_train
    print(f"Epoch {epoch:2d} DONE | TrainLoss {train_loss:.4f} | ValAcc {acc*100:5.2f}%")

    # Keep only the weights of the best epoch (by validation accuracy) so far.
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_mlp_probe.pth")
        print("  ↳ saved new best model")

print(f"\nBest validation accuracy: {best_acc*100:.2f}%")
print("Class mapping saved to best_mlp_probe_classes.json.  Script finished.")


Scanning feature files …
Discovered 21 classes over 50031 samples. Loading …
Train: 45028  |  Val: 5003
Epoch  1 DONE | TrainLoss 1.9367 | ValAcc 64.86%
  ↳ saved new best model
Epoch  2 DONE | TrainLoss 0.9878 | ValAcc 73.90%
  ↳ saved new best model
Epoch  3 DONE | TrainLoss 0.7858 | ValAcc 76.41%
  ↳ saved new best model
Epoch  4 DONE | TrainLoss 0.7086 | ValAcc 77.41%
  ↳ saved new best model
Epoch  5 DONE | TrainLoss 0.6638 | ValAcc 78.51%
  ↳ saved new best model
Epoch  6 DONE | TrainLoss 0.6335 | ValAcc 79.09%
  ↳ saved new best model
Epoch  7 DONE | TrainLoss 0.6109 | ValAcc 79.43%
  ↳ saved new best model
Epoch  8 DONE | TrainLoss 0.5936 | ValAcc 79.95%
  ↳ saved new best model
Epoch  9 DONE | TrainLoss 0.5795 | ValAcc 80.45%
  ↳ saved new best model
Epoch 10 DONE | TrainLoss 0.5684 | ValAcc 80.35%
Epoch 11 DONE | TrainLoss 0.5585 | ValAcc 80.77%
  ↳ saved new best model
Epoch 12 DONE | TrainLoss 0.5504 | ValAcc 80.91%
  ↳ saved new best model
Epoch 13 DONE | TrainLoss 0.5420 