In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-29 17:34:55,899] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/gwei4/miniconda3/envs/kaggle_env/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX

In [2]:
# Filesystem layout: inputs and outputs live under data/, weights under models/
data_dir     = Path("data")
parquet_path = data_dir.joinpath("lol_champions_data.parquet")  # source table
model_dir    = Path("models", "gpt2-xl")                        # local GPT-2 XL checkpoint
out_emb      = data_dir.joinpath("gpt2_embeddings.npy")         # embedding matrix output
out_ids      = data_dir.joinpath("champion_ids.npy")            # row-index output

# text to embed

In [3]:
# 1. Read the champion table and assemble one text blob per row
df = pd.read_parquet(parquet_path)

# Replace missing values with empty strings before concatenating the fields.
filled = {
    column: df[column].fillna("")
    for column in ("name", "role", "race", "short_bio", "full_story")
}

# Layout: "<name> — <role>", then race, short bio and full story,
# each separated by a newline.
df["text_to_embed"] = (
    filled["name"] + " — " + filled["role"] + "\n"
    + filled["race"] + "\n"
    + filled["short_bio"] + "\n"
    + filled["full_story"]
)
texts = df["text_to_embed"].tolist()

In [4]:
# 2. Load GPT-2 XL & tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(str(model_dir))
# Reuse EOS as the padding token so the tokenizer can pad batches.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2Model.from_pretrained(str(model_dir))

# Wrap for multi-GPU inference: DataParallel spreads each batch across all
# visible GPUs. The wrapper is kept even without GPUs so that `model.module`
# is always valid.
model = torch.nn.DataParallel(model)
# Fall back to CPU instead of crashing when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout

# Pull max context & hidden size from the wrapped model's config
max_len     = model.module.config.n_positions  # 1024
hidden_size = model.module.config.n_embd       # 1600
print(f"Using max_length={max_len}, embedding dim={hidden_size}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.78it/s]


Using max_length=1024, embedding dim=1600


In [7]:
# 3. Define a larger-batch, full-context embedder
def embed_texts(texts, batch_size=16, max_length=max_len):
    """Embed texts as the masked mean of the model's last hidden states.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts tokenized and run through the model at once.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A NumPy array of shape (N, D): one row per input text, D being the
        model's hidden size. For empty input the result has shape (0, D).

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(texts) == 0:
        # np.vstack([]) raises on an empty list; return a well-shaped empty
        # matrix instead. `model` may or may not be wrapped in DataParallel.
        width = getattr(model, "module", model).config.n_embd
        # float32 assumes the model runs in default precision.
        return np.empty((0, width), dtype=np.float32)

    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i : i + batch_size]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding="longest",   # pad only up to the longest text in this batch
            truncation=True,
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            last_hidden = model(**enc).last_hidden_state              # (B, T, D)
            # Cast the mask to the hidden-state dtype so all pooling arithmetic
            # is explicitly floating point.
            mask = enc.attention_mask.unsqueeze(-1).to(last_hidden.dtype)  # (B, T, 1)
            summed = (last_hidden * mask).sum(dim=1)                  # (B, D)
            # Token counts are whole numbers; clamping to 1 only guards
            # all-padding rows (whose sum is 0) against division by zero.
            counts = mask.sum(dim=1).clamp(min=1.0)                   # (B, 1)
            pooled = summed / counts                                  # (B, D)
        all_embs.append(pooled.cpu().numpy())
    return np.vstack(all_embs)                                        # (N, D)

In [9]:
# 4. Embed every text, then persist the matrix and the matching row index
embs = embed_texts(texts, batch_size=16)

# Saved separately; both files are written in DataFrame row order.
row_index = df.index.values
np.save(file=out_emb, arr=embs)
np.save(file=out_ids, arr=row_index)

print(f"Saved embeddings → {out_emb} (shape {embs.shape})")

Embedding: 100%|██████████| 11/11 [00:48<00:00,  4.37s/it]

Saved embeddings → data/gpt2_embeddings.npy (shape (170, 1600))



