In [None]:
# =========================================================
# H100 OPTIMIZED CHESS TRANSFORMER (ONE CELL)
# =========================================================

!pip install -q torch transformers datasets python-chess accelerate tqdm

import torch, chess, os
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)

# ---------------- CONFIG ----------------
MODEL_NAME = "gpt2-xl"        # 1.5B params (H100 friendly)
MAX_GAMES = 1_000_000         # start with 1M (scale later)
MAX_LEN = 1024                # longer context
OUTPUT_DIR = "/root"

# ---------------- CHECK GPU ----------------
assert torch.cuda.is_available(), "CUDA not available"
print("GPU:", torch.cuda.get_device_name(0))

# ---------------- LOAD DATA ----------------
dataset = load_dataset(
    "angeluriot/chess_games",
    split=f"train[:{MAX_GAMES}]"
)

def to_text(game):
    if game["winner"] == "white":
        result = "1-0"
    elif game["winner"] == "black":
        result = "0-1"
    else:
        result = "1/2-1/2"
    moves = " ".join(game["moves_san"])
    return f'[Result "{result}"] {moves}'

dataset = dataset.map(lambda g: {"text": to_text(g)})
dataset = dataset.remove_columns(dataset.column_names[:-1])

# ---------------- TOKENIZER ----------------
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
dataset.set_format("torch")

# ---------------- MODEL ----------------
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))
model.cuda()

# ---------------- TRAINING ARGS (H100 OPTIMIZED) ----------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,                 # large dataset → 1 epoch
    per_device_train_batch_size=4,       # H100 VRAM
    gradient_accumulation_steps=4,       # effective batch = 16
    learning_rate=3e-5,
    bf16=True,                           # 🔥 H100 native
    fp16=False,
    logging_steps=1000,
    save_steps=5000,
    save_total_limit=3,
    dataloader_num_workers=8,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# ---------------- TRAIN ----------------
trainer.train()

# ---------------- SAVE FINAL MODEL ----------------
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"\nModel saved to: {OUTPUT_DIR}")

# ---------------- GENERATE SAMPLE GAME ----------------
prompt = '[Result "1-0"]'
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output = model.generate(
    **inputs,
    max_length=400,
    temperature=0.8,
    top_p=0.95,
    top_k=50,
    do_sample=True
)

generated = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nRAW GENERATED GAME:\n", generated)

# ---------------- FILTER ILLEGAL MOVES ----------------
def filter_illegal(pgn):
    board = chess.Board()
    legal = []
    for token in pgn.split():
        try:
            board.push(board.parse_san(token))
            legal.append(token)
        except:
            break
    return " ".join(legal)

print("\nLEGAL GAME ONLY:\n", filter_illegal(generated))



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
GPU: NVIDIA A100-SXM4-40GB


README.md: 0.00B [00:00, ?B/s]

dataset.parquet:   0%|          | 0.00/7.31G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [2]:
# =========================================================
# PHASE-1 CHESS TRANSFORMER (20k LICHESS DATASET) - FIXED
# =========================================================

!pip install -q torch transformers datasets pandas python-chess tqdm

import torch
import pandas as pd
import chess
from datasets import Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments
)

MODEL_NAME = "gpt2-medium"
CSV_PATH = "/root/games.csv"
MAX_LEN = 512
OUTPUT_DIR = "/root/chess_phase1"

# ---------------- LOAD DATA ----------------
df = pd.read_csv(CSV_PATH)

def format_game(row):
    if row["winner"] == "white":
        result = "1-0"
    elif row["winner"] == "black":
        result = "0-1"
    else:
        result = "1/2-1/2"
    return f'[Result "{result}"] {row["moves"]}'

df["text"] = df.apply(format_game, axis=1)
dataset = Dataset.from_pandas(df[["text"]])

# ---------------- TOKENIZER ----------------
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )
    tokens["labels"] = tokens["input_ids"].copy()  # ✅ CRITICAL FIX
    return tokens

dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
dataset.set_format("torch")

# ---------------- MODEL ----------------
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))
model.to("cuda" if torch.cuda.is_available() else "cpu")

# ---------------- TRAINING ----------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    logging_steps=200,
    save_steps=1000,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Model saved to:", OUTPUT_DIR)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Map:   0%|          | 0/20058 [00:00<?, ? examples/s]

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
200,1.0032
400,0.5761
600,0.5459
800,0.5276
1000,0.5194
1200,0.5048
1400,0.4999
1600,0.488
1800,0.4807
2000,0.4711


Model saved to: /root/chess_phase1


In [None]:
from huggingface_hub import login
login()
