# Importing Lichess data and converting it to CSV

In [1]:
import chess.pgn

def test_pgn_file(pgn_file):
    try:
        with open(pgn_file) as f:
            game = chess.pgn.read_game(f)
            if game is None:
                print("No valid games in file.")
            else:
                print("Successfully parsed a game.")
    except Exception as e:
        print(f"Error reading PGN file: {e}")

# Run the smoke test on the July 2023 dump: it reads only the first game
# and prints whether that game could be parsed.
test_pgn_file("data/lichess_db_standard_rated_2023-07.pgn")


Successfully parsed a game.


In [2]:
import chess.pgn

def count_valid_games(pgn_file):
    """Count the games in a PGN file and print the total.

    The file is read to the end. On success the total is printed as
    "Total valid games in file: N"; if opening or reading raises, the error
    text is printed instead and no total is reported.

    Args:
        pgn_file: Path of the PGN file to scan.

    Returns:
        None. The count is only printed.
    """
    count = 0
    try:
        # Pin the codec so decoding does not depend on the platform locale.
        with open(pgn_file, encoding="utf-8") as f:
            # skip_game() steps over one game without building its move tree
            # and returns False once there is nothing left to read, so the
            # tally matches a read_game() loop at a fraction of the cost.
            while chess.pgn.skip_game(f):
                count += 1

        print(f"Total valid games in file: {count}")

    except Exception as e:
        print(f"Error reading PGN file: {e}")

# Count every game in the July 2023 dump; this reads the file to the end.
count_valid_games("data/lichess_db_standard_rated_2023-07.pgn")



Total valid games in file: 21005010


In [3]:
import chess.pgn
import pandas as pd

def _mainline_san(game):
    """Return the SAN strings of ``game``'s mainline moves, in playing order."""
    board = game.board()
    san_moves = []
    for move in game.mainline_moves():
        # SAN is rendered relative to the current position, so it has to be
        # produced before the move is pushed onto the board.
        san_moves.append(board.san(move))
        board.push(move)
    return san_moves


def pgn_to_csv(pgn_file, output_csv, max_games=1000000):
    """Export the move lists of the first games in a PGN file to a CSV file.

    Each output row holds ``game_id`` (1-based position in the file),
    ``moves`` (space-separated SAN of the mainline) and ``num_moves``.
    Progress is printed every 10,000 games. Any exception is caught and
    reported on stdout; in that case no CSV is written.

    Args:
        pgn_file: Path of the PGN file to read.
        output_csv: Path of the CSV file to write.
        max_games: Maximum number of games to export; ``None`` exports the
            whole file. Defaults to 1,000,000.

    Returns:
        None.
    """
    columns = ["game_id", "moves", "num_moves"]
    games_data = []
    game_count = 0

    try:
        # Pin the codec so decoding does not depend on the platform locale.
        with open(pgn_file, encoding="utf-8") as f:
            while max_games is None or game_count < max_games:
                game = chess.pgn.read_game(f)
                if game is None:
                    break  # End of file

                moves = _mainline_san(game)
                game_count += 1
                games_data.append({
                    "game_id": game_count,
                    "moves": " ".join(moves),
                    "num_moves": len(moves)
                })

                if game_count % 10000 == 0:
                    print(f"Processed {game_count} games...")

        # Naming the columns keeps the header row even when no game was
        # collected, so the resulting file can still be read back.
        df = pd.DataFrame(games_data, columns=columns)

        # Export to CSV
        df.to_csv(output_csv, index=False)
        print(f"✅ Exported {game_count} games to {output_csv}")

    except Exception as e:
        print(f"❌ Error processing PGN file: {e}")

# Export the leading games of the July 2023 dump (move text and move count
# only) to games_1m.csv.
pgn_to_csv("data/lichess_db_standard_rated_2023-07.pgn", "games_1m.csv")

    

Processed 10000 games...
Processed 20000 games...
Processed 30000 games...
Processed 40000 games...
Processed 50000 games...
Processed 60000 games...
Processed 70000 games...
Processed 80000 games...
Processed 90000 games...
Processed 100000 games...
Processed 110000 games...
Processed 120000 games...
Processed 130000 games...
Processed 140000 games...
Processed 150000 games...
Processed 160000 games...
Processed 170000 games...
Processed 180000 games...
Processed 190000 games...
Processed 200000 games...
Processed 210000 games...
Processed 220000 games...
Processed 230000 games...
Processed 240000 games...
Processed 250000 games...
Processed 260000 games...
Processed 270000 games...
Processed 280000 games...
Processed 290000 games...
Processed 300000 games...
Processed 310000 games...
Processed 320000 games...
Processed 330000 games...
Processed 340000 games...
Processed 350000 games...
Processed 360000 games...
Processed 370000 games...
Processed 380000 games...
Processed 390000 game

# More Data Prep:
## Creating vocab & tokenizing data, then saving

In [1]:
import pandas as pd

# Read the exported games and discard rows with no move text, then keep the
# move strings as a plain Python list for tokenizing.
df = pd.read_csv('games_1m.csv').dropna(subset=['moves'])
moves_list = df['moves'].tolist()

In [2]:
from collections import Counter

# Break every game string into its whitespace-separated move tokens, then
# flatten all games into one long list of moves.
tokenized_games = [game_text.split() for game_text in moves_list]
all_moves = [token for tokens in tokenized_games for token in tokens]


# Tally the moves. The vocabulary itself is ordered alphabetically (not by
# frequency), so the id assignment is deterministic for a given data set.
move_counts = Counter(all_moves)
unique_moves = sorted(move_counts)

# Ids 0-2 are reserved for the special tokens, so real moves start at 3.
move_to_id = {move: idx for idx, move in enumerate(unique_moves, start=3)}
id_to_move = {idx: move for move, idx in move_to_id.items()}

# Register the special tokens in both directions.
move_to_id.update({'<PAD>': 0, '<START>': 1, '<END>': 2})
id_to_move.update({0: '<PAD>', 1: '<START>', 2: '<END>'})

# Vocabulary size includes the three special tokens.
vocab_size = len(move_to_id)
print(f"✅ Vocab size: {vocab_size}")


✅ Vocab size: 11017


In [3]:
# convert list of moves to int ids with start, end, and padding
def encode_game(move_list, max_len=200, vocab=None):
    """Encode one game as a fixed-length list of integer token ids.

    The sequence is ``<START>``, the id of every move, then ``<END>``. It is
    right-padded with ``<PAD>`` up to ``max_len``, or cut down to its first
    ``max_len`` tokens. A game that has to be cut therefore loses its
    trailing ``<END>`` token.

    Args:
        move_list: Iterable of move tokens to encode.
        max_len: Exact length of the returned list; must not be negative.
        vocab: Mapping from token to id that contains ``'<PAD>'``,
            ``'<START>'`` and ``'<END>'``. When omitted, the notebook-level
            ``move_to_id`` is used.

    Returns:
        A list of ``max_len`` integer ids.

    Raises:
        ValueError: If ``max_len`` is negative.
        KeyError: If a move or special token is missing from the vocabulary.
    """
    if max_len < 0:
        # A negative bound would silently slice from the end of the sequence.
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    if vocab is None:
        vocab = move_to_id  # fall back to the vocabulary built in the notebook

    tokens = [vocab['<START>']]
    tokens.extend(vocab[m] for m in move_list)
    tokens.append(vocab['<END>'])

    # truncate or pad
    if len(tokens) < max_len:
        tokens += [vocab['<PAD>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]
    return tokens


# Encode every tokenized game with encode_game's default max_len, so all
# resulting id lists have the same length.
encoded_games = [encode_game(game) for game in tokenized_games]

In [10]:
# Disabled draft of the PyTorch Dataset/DataLoader setup. It is wrapped in a
# string literal, so none of it runs; the cell only echoes the text.
'''
import torch
from torch.utils.data import Dataset, DataLoader

# prep for pytorch dataset
class ChessDataset(Dataset):
    def __init__(self, encoded_tensor):
        self.data = encoded_tensor

    def __len__(self):
        return self.data.size(0)
    
    def __getitem__(self, idx):
        x = self.data[idx, :-1]
        y = self.data[idx, 1:]
        return x, y
    
encoded_tensor = torch.tensor(encoded_games, dtype=torch.long)
dataset = ChessDataset(encoded_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)
'''

'\nimport torch\nfrom torch.utils.data import Dataset, DataLoader\n\n# prep for pytorch dataset\nclass ChessDataset(Dataset):\n    def __init__(self, encoded_tensor):\n        self.data = encoded_tensor\n\n    def __len__(self):\n        return self.data.size(0)\n    \n    def __getitem__(self, idx):\n        x = self.data[idx, :-1]\n        y = self.data[idx, 1:]\n        return x, y\n    \nencoded_tensor = torch.tensor(encoded_games, dtype=torch.long)\ndataset = ChessDataset(encoded_tensor)\ndataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)\n'

In [4]:
import json
import torch
import torch.nn as nn

# Build the tensor first and check its shape before anything is written, so
# a malformed data set cannot leave a partial set of artifacts on disk.
encoded_tensor = torch.tensor(encoded_games, dtype=torch.long)
if encoded_tensor.dim() != 2:
    raise ValueError(
        "expected a 2-D (games x tokens) tensor, "
        f"got shape {tuple(encoded_tensor.shape)}"
    )

# save vocab
# NOTE: JSON object keys are always strings, so the integer keys of
# id_to_move are stored as "0", "1", ... and need int() when loaded back.
with open('move_to_id.json', 'w') as f:
    json.dump(move_to_id, f)
with open('id_to_move.json', 'w') as f:
    json.dump(id_to_move, f)

# save tensor
torch.save(encoded_tensor, 'encoded_games.pt')

# Record the sequence length the tensor actually has rather than repeating a
# literal, so the saved config cannot drift from the encoded data.
MAX_LEN = encoded_tensor.size(1)
config = {
    'max_len': MAX_LEN,
    'vocab_size': vocab_size,
}

with open('preprocessing_config.json', 'w') as f:
    json.dump(config, f)





In [None]:
# example loading into separate training script
# (kept as a string literal so it is shown, not executed)

'''
import json
import torch

# Load vocab
with open('move_to_id.json', 'r') as f:
    move_to_id = json.load(f)
with open('id_to_move.json', 'r') as f:
    # JSON object keys are always strings, so restore the integer ids
    id_to_move = {int(idx): move for idx, move in json.load(f).items()}

# Load encoded games
encoded_tensor = torch.load('encoded_games.pt')

# Load config
with open('preprocessing_config.json', 'r') as f:
    config = json.load(f)

MAX_LEN = config['max_len']
vocab_size = config['vocab_size']

# Rebuild dataset
from torch.utils.data import Dataset, DataLoader

class ChessDataset(Dataset):
    def __init__(self, encoded_tensor):
        self.data = encoded_tensor

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        x = self.data[idx, :-1]
        y = self.data[idx, 1:]
        return x, y

dataset = ChessDataset(encoded_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

'''