# Load the ChessExplained dataset

In [1]:
import pandas as pd
# Read the ChessExplained parquet file into a DataFrame. Later cells read its
# 'fen', 'move' and (when present) 'explanation' columns.
df = pd.read_parquet("../data/ChessExplained_2500k_qwen3.parquet")

# Encode chess position
We use special tokens to encode the chess position, because FEN is a poor fit for an LLM's tokenizer: adjacent characters get merged into multi-character tokens.

An ASCII representation does not have the character-merge issue, but it is inefficient compared to a special encoding, which uses significantly fewer tokens. We found that the special encoding reduces illegal moves during training and is much faster to train because there are fewer input tokens.

In [2]:
import chess

def encode_legal_moves(board):
    """Render every legal move of ``board`` as a special-token string.

    Each move becomes ``<from_square><to_square>``. A promotion move is
    followed by a piece token such as ``<White_Queen>``, coloured for the side
    to move. Moves appear in the order ``board.legal_moves`` yields them and
    are separated by single spaces; no legal moves gives an empty string.
    """
    side = 'White' if board.turn == chess.WHITE else 'Black'
    promotion_names = {
        chess.QUEEN: 'Queen',
        chess.ROOK: 'Rook',
        chess.BISHOP: 'Bishop',
        chess.KNIGHT: 'Knight',
    }

    def promotion_suffix(piece_type):
        # Non-promoting moves carry a falsy promotion value and get no suffix.
        if not piece_type:
            return ""
        return f'<{side}_{promotion_names[piece_type]}>'

    encoded = []
    for move in board.legal_moves:
        origin = chess.square_name(move.from_square)
        target = chess.square_name(move.to_square)
        encoded.append(f"<{origin}><{target}>" + promotion_suffix(move.promotion))

    return " ".join(encoded)

def piece_to_token(piece):
    """Return the special token describing the contents of one square.

    ``None`` (an empty square) maps to ``<blank>``. Any other value is read
    through its ``color`` and ``piece_type`` attributes and maps to
    ``<White_X>`` or ``<Black_X>``, where ``X`` is Pawn, Knight, Bishop, Rook,
    Queen or King. An unrecognised colour or piece type raises ``KeyError``.
    """
    if piece is None:
        return '<blank>'

    colour_names = {chess.WHITE: 'White', chess.BLACK: 'Black'}
    kind_names = {
        chess.PAWN: 'Pawn',
        chess.KNIGHT: 'Knight',
        chess.BISHOP: 'Bishop',
        chess.ROOK: 'Rook',
        chess.QUEEN: 'Queen',
        chess.KING: 'King',
    }

    return f'<{colour_names[piece.color]}_{kind_names[piece.piece_type]}>'

def encode_board_position(fen):
    """Turn a FEN string into the special-token position encoding.

    The result is 64 ``<square><piece-or-blank>`` pairs in ``chess.SQUARES``
    order, followed by ``|side|castling|en-passant|halfmove|fullmove|moves``.
    ``side`` is ``White`` or ``Black``; the next four fields are copied
    verbatim from the FEN text; ``moves`` is the output of
    ``encode_legal_moves`` for the parsed board.

    A FEN with fewer than six whitespace-separated fields raises
    ``IndexError`` when the metadata is read.
    """
    board = chess.Board(fen)

    # One "<square><token>" pair per square.
    square_tokens = []
    for square in chess.SQUARES:
        occupant = piece_to_token(board.piece_at(square))
        square_tokens.append(f"<{chess.square_name(square)}>" + occupant)

    # Metadata is taken from the raw FEN fields rather than from the board.
    fields = fen.split()
    side_to_move = "White" if fields[1] == 'w' else "Black"
    metadata = "|".join((side_to_move, fields[2], fields[3], fields[4], fields[5]))

    move_tokens = encode_legal_moves(board)

    return "|".join(("".join(square_tokens), metadata, move_tokens))

# Demo: encode the position after 1. e3 e5 (White to move) and print the token string.
fen = "rnbqkbnr/pppp1ppp/8/4p3/8/4P3/PPPP1PPP/RNBQKBNR w KQkq - 0 2"
# Alternative position with a white pawn on a7, which exercises the promotion tokens:
# fen = "4k3/P7/8/8/8/8/8/4K3 w - - 0 1"
seq = encode_board_position(fen)
print(seq)

<a1><White_Rook><b1><White_Knight><c1><White_Bishop><d1><White_Queen><e1><White_King><f1><White_Bishop><g1><White_Knight><h1><White_Rook><a2><White_Pawn><b2><White_Pawn><c2><White_Pawn><d2><White_Pawn><e2><blank><f2><White_Pawn><g2><White_Pawn><h2><White_Pawn><a3><blank><b3><blank><c3><blank><d3><blank><e3><White_Pawn><f3><blank><g3><blank><h3><blank><a4><blank><b4><blank><c4><blank><d4><blank><e4><blank><f4><blank><g4><blank><h4><blank><a5><blank><b5><blank><c5><blank><d5><blank><e5><Black_Pawn><f5><blank><g5><blank><h5><blank><a6><blank><b6><blank><c6><blank><d6><blank><e6><blank><f6><blank><g6><blank><h6><blank><a7><Black_Pawn><b7><Black_Pawn><c7><Black_Pawn><d7><Black_Pawn><e7><blank><f7><Black_Pawn><g7><Black_Pawn><h7><Black_Pawn><a8><Black_Rook><b8><Black_Knight><c8><Black_Bishop><d8><Black_Queen><e8><Black_King><f8><Black_Bishop><g8><Black_Knight><h8><Black_Rook>|White|KQkq|-|0|2|<g1><h3> <g1><f3> <g1><e2> <f1><a6> <f1><b5> <f1><c4> <f1><d3> <f1><e2> <e1><e2> <d1><h5> <d1><g4> <

In [3]:
def create_chat_messages(row, disable_thinking=False):
    """Build the user/assistant chat pair for one training example.

    Args:
        row: Mapping-like record (a dict or a pandas Series) with a ``'fen'``
            and a ``'move'`` entry and, optionally, an ``'explanation'`` entry.
        disable_thinking: When True the assistant message contains only the
            move. Otherwise the explanation is wrapped in a ``<think>`` block
            placed before the move.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the user
        message with the encoded position, then the assistant message.

    A missing explanation (no such entry, ``None``, NaN or ``pd.NA``) yields an
    empty ``<think>`` block rather than the literal text "None"/"nan"/"<NA>".
    """
    fen = row['fen']
    move = row['move']

    # User message with the encoded board position.
    user_msg = f"<chess_position>{encode_board_position(fen)}</chess_position>"

    # Assistant message: optional thinking block followed by the move.
    move_tokens = f"<uci_move>{move}</uci_move>"
    if disable_thinking:
        assistant_msg = move_tokens
    else:
        explanation = row['explanation'] if 'explanation' in row else ""
        # Null cells in a DataFrame arrive as None, float NaN or pd.NA.
        # `x != x` is true only for NaN.
        is_missing = (
            explanation is None
            or explanation is pd.NA
            or (isinstance(explanation, float) and explanation != explanation)
        )
        if is_missing:
            explanation = ""
        assistant_msg = f"<think>\n{explanation}\n</think>\n\n{move_tokens}"

    return [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": assistant_msg},
    ]

# Test Jinja2 encoding matches python
The submission system only uses Jinja2 to encode the position. If you use Python to encode the dataset, as in the code above, ensure that the Jinja2 template produces exactly the same text.

In [12]:
import jinja2
# Load the Jinja2 template. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('chess_encode_special_tokens.jinja') as template_file:
    template = jinja2.Template(template_file.read())

# Take the position in the row labelled 2 and build the template inputs:
# the raw FEN and the legal moves as space-separated UCI strings.
fen = df.loc[2, 'fen']
legal_moves = ' '.join(m.uci() for m in chess.Board(fen).legal_moves)

# NOTE(review): `prompt` and `messages` are not used by the check below;
# presumably they illustrate the raw "FEN|moves" input format. Confirm.
prompt = f"{fen}|{legal_moves}"
messages = [{"role": "user", "content": prompt}]

# The Python encoder and the Jinja2 template must produce identical text.
orig_encoding = f"<chess_position>{encode_board_position(fen)}</chess_position>"
jinja_rendered = template.render(FEN=fen, legal_moves_uci=legal_moves)
assert jinja_rendered == orig_encoding, (
    f"Jinja2 template output differs from encode_board_position() for FEN {fen!r}"
)
print("Jinja2 encoding matches Python encoding")

Jinja2 encoding matches Python encoding


# Encoding example
The code below is provided as an example in case you want to use a different encoding structure.

If you are encoding a large dataset, it may be useful to parallelize the work with threads or processes to speed things up.

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from "chess_tokenizer_qwen3/" (presumably a local directory
# holding a tokenizer extended with the chess special tokens; TODO confirm).
# Later cells call its apply_chat_template() and tokenize() methods.
chess_tokenizer = AutoTokenizer.from_pretrained("chess_tokenizer_qwen3/")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import pandas as pd
# Smoke-test the pipeline on the first 20 rows before encoding the full dataset.
test_df = df.head(20).copy()

# Build the message pair for every row; the keyword is forwarded to the function.
test_df['messages'] = test_df.apply(create_chat_messages, axis=1, disable_thinking=False)

# Render each conversation to plain text with the tokenizer's chat template.
test_df['text'] = test_df['messages'].apply(chess_tokenizer.apply_chat_template, tokenize=False)

# Show one rendered example and its length in tokens.
sample_text = test_df['text'].iloc[10]
print(sample_text)
print(len(chess_tokenizer.tokenize(sample_text)))

<|im_start|>user
<chess_position><a1><blank><b1><blank><c1><White_King><d1><White_Rook><e1><blank><f1><blank><g1><blank><h1><blank><a2><blank><b2><blank><c2><White_Pawn><d2><White_Queen><e2><White_Bishop><f2><blank><g2><Black_Bishop><h2><blank><a3><blank><b3><White_Pawn><c3><blank><d3><blank><e3><blank><f3><blank><g3><Black_Queen><h3><Black_Pawn><a4><White_Pawn><b4><blank><c4><blank><d4><White_Pawn><e4><blank><f4><White_Pawn><g4><blank><h4><Black_Rook><a5><blank><b5><blank><c5><blank><d5><Black_Pawn><e5><White_Pawn><f5><blank><g5><blank><h5><blank><a6><Black_Pawn><b6><blank><c6><blank><d6><blank><e6><blank><f6><blank><g6><blank><h6><blank><a7><blank><b7><Black_Pawn><c7><Black_Pawn><d7><blank><e7><blank><f7><Black_Pawn><g7><Black_Pawn><h7><blank><a8><Black_Rook><b8><Black_Knight><c8><blank><d8><blank><e8><Black_King><f8><blank><g8><blank><h8><blank>|White|q|-|1|19|<e2><a6> <e2><h5> <e2><b5> <e2><g4> <e2><c4> <e2><f3> <e2><d3> <e2><f1> <d2><a5> <d2><b4> <d2><e3> <d2><d3> <d2><c3> <d2><e1