# Compact Projection of Combined Embeddings
We load `combined_raw_embeddings.npy`, train an autoencoder (encoder + decoder) on it, and then save the **projected** embeddings produced by the encoder. The trained decoder weights are also saved, to `data/decoder.pth`.

In [2]:
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [3]:

# Every artifact this notebook reads or writes lives under the local data/ folder.
data_dir = Path("data")

# Inputs: the raw embedding matrix and the id array stored alongside it.
combined_path = data_dir.joinpath("combined_raw_embeddings.npy")
ids_path = data_dir.joinpath("combined_ids.npy")

# Output: the encoder's projection of the raw embeddings.
projected_out_path = data_dir.joinpath("combined_projected_embeddings.npy")

In [4]:
# Load data into a DataLoader
raw = np.load(combined_path)              # (N, D_in)
ids = np.load(ids_path)                   # (N,)

# Row i of `raw` is later reported under ids[i], so fail fast if the two files
# are out of sync instead of silently mislabelling embeddings downstream.
assert raw.ndim == 2, f"expected a 2-D (N, D_in) array, got shape {raw.shape}"
assert len(ids) == len(raw), f"{len(ids)} ids vs {len(raw)} embeddings"

# Convert to torch Tensor and make dataset
x_all = torch.from_numpy(raw).float()
dataset = TensorDataset(x_all)

# Shuffle with a dedicated, seeded generator so the batch order is the same on
# every Restart & Run All (an unseeded shuffle makes each training run differ).
shuffle_rng = torch.Generator().manual_seed(0)
loader  = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    drop_last=False,   # keep the final, smaller batch so every row is trained on
    generator=shuffle_rng,
)

print("Loaded raw:", raw.shape)

Loaded raw: (170, 2136)


In [5]:
# Define Autoencoder / Projection MLP
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global RNG before any layer is constructed: nn.Linear draws its
# initial weights at construction time, so without this the saved projections
# would differ on every run of the notebook.
torch.manual_seed(0)

# D_in comes from the data; H is the hidden width; D_out is the projection size.
# NOTE(review): H (512) is narrower than D_out (768), so the 768-d projection is
# computed from a 512-d hidden activation — confirm these widths are intended.
D_in, H, D_out = raw.shape[1], 512, 768

# Encoder: raw embedding -> hidden -> projected embedding.
encoder = nn.Sequential(
    nn.Linear(D_in, H),
    nn.GELU(),
    nn.Linear(H, D_out)
).to(device)

# Decoder: mirrors the encoder to reconstruct the raw embedding from the projection.
decoder = nn.Sequential(
    nn.Linear(D_out, H),
    nn.GELU(),
    nn.Linear(H, D_in)
).to(device)

print(f"Autoencoder: {D_in} → {H} → {D_out} → {H} → {D_in}")

Autoencoder: 2136 → 512 → 768 → 512 → 2136


In [6]:
# Training Loop
# Single source of truth for the epoch count: it drives both the loop bound and
# the "epoch/total" progress message, so the two can no longer drift apart.
num_epochs = 5

# One optimizer over both halves so encoder and decoder are trained jointly.
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
loss_fn   = nn.MSELoss()

# Be explicit about the mode: the modules may have been put in eval mode by an
# earlier run of an inference cell, and training should always start in train mode.
encoder.train()
decoder.train()

for epoch in range(1, num_epochs + 1):
    total_loss = 0.0
    for (xb,) in loader:
        xb = xb.to(device)
        optimizer.zero_grad()
        z  = encoder(xb)          # projected embedding
        xr = decoder(z)           # reconstruction of the input batch
        loss = loss_fn(xr, xb)
        loss.backward()
        optimizer.step()
        # MSELoss returns a per-batch mean; weight it by the batch size so the
        # epoch average stays correct when the last batch is smaller.
        total_loss += loss.item() * xb.size(0)
    avg = total_loss / len(dataset)
    print(f"Epoch {epoch}/{num_epochs} — Avg Reconstruction Loss: {avg:.6f}")

Epoch 1/5 — Avg Reconstruction Loss: 0.105617
Epoch 2/5 — Avg Reconstruction Loss: 0.049041
Epoch 3/5 — Avg Reconstruction Loss: 0.033984
Epoch 4/5 — Avg Reconstruction Loss: 0.029566
Epoch 5/5 — Avg Reconstruction Loss: 0.026583


In [9]:
# Save the trained decoder weights (state_dict only) next to the other artifacts.
# The path is built from data_dir, like every other file in this notebook, so
# relocating the data folder needs a change in one place only.
decoder_out_path = data_dir / "decoder.pth"
torch.save(decoder.state_dict(), decoder_out_path)
# as_posix() keeps the printed path in forward-slash form on every platform.
print(f"Saved decoder weights to {decoder_out_path.as_posix()}")

Saved decoder weights to data/decoder.pth


In [7]:
# Generate & Save Projected Embeddings
# Switch to inference mode before projecting. The current Linear/GELU stack
# behaves the same in both modes, but this keeps the cell correct if dropout or
# normalization layers are ever added to the encoder.
encoder.eval()

# no_grad: this is a pure forward pass, so skip building the autograd graph.
with torch.no_grad():
    all_raw = torch.from_numpy(raw).float().to(device)
    all_proj = encoder(all_raw).cpu().numpy()   # (N, D_out)

# Rows are written in the same order as `raw`, i.e. not shuffled.
np.save(projected_out_path, all_proj)
print("Saved projected embeddings:", all_proj.shape)

Saved projected embeddings: (170, 768)


In [8]:
# Reload the saved artifacts from disk to confirm what was actually written.
data_dir            = Path("data")
proj_path           = data_dir / "combined_projected_embeddings.npy"
ids_path            = data_dir / "combined_ids.npy"

# 1. Load
proj_embs = np.load(proj_path)   # (N, 768)
ids       = np.load(ids_path)    # (N,)

# 2. Quick sanity checks
# ids[i] is printed as the label of proj_embs[i] below, so the two arrays must
# have the same number of rows.
assert len(ids) == len(proj_embs), f"{len(ids)} ids vs {len(proj_embs)} embeddings"
print("Projected embeddings shape:", proj_embs.shape)
print("Example champion IDs:", ids[:5])

# 3. Peek at the first 5 vectors (dims 0–9)
# Slicing (rather than indexing 0..4) also works when fewer than 5 rows exist.
for champ, vec in zip(ids[:5], proj_embs[:5, :10]):
    print(f"{champ}: {np.round(vec, 4)}")

Projected embeddings shape: (170, 768)
Example champion IDs: [0 1 2 3 4]
0: [-0.3596 -0.4467  0.5269  0.0525  0.7603  0.0266  0.6629 -0.076  -0.2315
 -0.2516]
1: [-0.3005 -0.2805  0.4057  0.0298  0.6539 -0.0758  0.5893 -0.0717 -0.1746
 -0.1367]
2: [-0.41   -0.4752  0.4427  0.0734  0.6565  0.2281  0.7276 -0.2246 -0.1942
 -0.0852]
3: [-0.3945 -0.4543  0.3965  0.0719  0.6304  0.2515  0.6805 -0.2052 -0.1503
 -0.0963]
4: [-0.1061 -0.2279  0.1393  0.1387  0.6505 -0.2582  0.5135  0.0387  0.0104
 -0.1911]
