In [None]:
import pandas as pd

def find_botvinnik_names(df):
    # Concatenate white and black columns into a single series
    all_players = pd.concat([df['white'], df['black']])
    
    
    
    # Get unique player names
    unique_players = all_players.unique()
    
    # Filter for names containing "Botvinnik" (case-insensitive)
    botvinnik_names = [name for name in unique_players 
                       if "fischer" in name.lower()]
    
    return botvinnik_names

# Example usage:
botvinnik_names = find_botvinnik_names(df)
print(botvinnik_names)

In [4]:
def standardize_botvinnik_efficient(df):
    # List of variations that represent Mikhail Botvinnik
    mikhail_variations = [
        'Botvinnik, M.', 
        'Botvinnik, Mikhail URS', 
        'Botvinnik, Mikhail',
        'Botvinnik, M2.',
        'Botvinnik, Mikhail2',
        'Botvinnik,M2'
    ]
    
    # Create a copy of the dataframe
    standardized_df = df.copy()
    
    # Create a mapping dictionary for replacement
    name_mapping = {variation: 'Botvinnik, Mikhail' for variation in mikhail_variations}
    
    # Replace in both columns at once
    standardized_df['white'] = standardized_df['white'].replace(name_mapping)
    standardized_df['black'] = standardized_df['black'].replace(name_mapping)
    
    return standardized_df

standardized_df = standardize_botvinnik_efficient(df)

In [None]:


# Tal, Mikhail number of games for white and black
# print(df[df['white'] == 'Tal, Mikhail']['white'].count())
# print(df[df['black'] == 'Tal, Mikhail']['black'].count())

In [3]:
import pandas as pd

# Define path to your file
file_path = '/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean.csv'
df = pd.read_csv(file_path)

# len of df
print(f'len of df: {len(df)}')

# remove where event starts with Titled Tue, first change dtype to string
df['event'] = df['event'].astype(str)
df = df[~df['event'].str.startswith('Titled Tue')]

# save df
df.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue.csv', index=False)

# len of df
print(f'len of df: {len(df)}')


  df = pd.read_csv(file_path)


len of df: 7244333
len of df: 6489794


In [1]:
import pandas as pd

# Define path to your file
file_path = '/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue.csv'
df = pd.read_csv(file_path)

# len of df
print(f'len of df: {len(df)}')

# remove where white_title or black_title is BOT
df = df[df['white_title'] != 'BOT']
df = df[df['black_title'] != 'BOT']

# len of df
print(f'len of df: {len(df)}')


# remove events that contains rapid or blitz or bullet lower case
# First ensure event column is string type to avoid float error
df['event'] = df['event'].astype(str)
# Then filter out rows containing rapid, blitz, or bullet
df = df[~df['event'].str.contains('rapid|blitz|bullet', case=False)]

# len of df
print(f'len of df: {len(df)}')

# save df
df.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet.csv', index=False)

  df = pd.read_csv(file_path)


len of df: 6489794
len of df: 6480244
len of df: 6202573


In [2]:
# null count
print(df.isnull().sum())

# value counts of source
print(df['source'].value_counts())

event                    0
site                152214
date                     0
round                73506
white                    0
black                    0
result                   0
white_elo           841848
black_elo           851828
white_title        5820036
black_title        5823422
eco                      0
opening            6153069
time_control       6143867
import_date        5980753
source                   0
moves                    0
eval_info          6163440
clock_info         6074706
avg_elo            1150067
elo_difference     1150067
move_count               0
result_category          0
has_eval_info            0
has_clock_info           0
eco_family               0
year                     0
dtype: int64
source
LumbrasGigaBase    6202573
Name: count, dtype: int64


In [3]:
# in df remove where unique player has less than 100 games
# Get players with at least 100 games
frequent_white_players = df['white'].value_counts()[df['white'].value_counts() >= 100].index
frequent_black_players = df['black'].value_counts()[df['black'].value_counts() >= 100].index

# Filter the dataframe to only include games where both players have at least 100 games
df3 = df[df['white'].isin(frequent_white_players) | df['black'].isin(frequent_black_players)]

# len of df
print(f'len of df: {len(df)}')

# len of df3
print(f'len of df3: {len(df3)}')

# save df3
df3.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet_min_100_games.csv', index=False)


len of df: 6202573
len of df3: 3790224


In [4]:
# remove where event contains speed
df4 = df3[~df3['event'].fillna('').str.contains('speed', case=False)]

# len of df4
print(f'len of df4: {len(df4)}')

# save df4
df4.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet_min_100_games_no_speed.csv', index=False)

len of df4: 3759666


In [1]:
# remove where site contains chess.com
import pandas as pd

# Define path to your file
file_path = '/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet_min_100_games_no_speed.csv'
df4 = pd.read_csv(file_path)

df5 = df4[~df4['site'].fillna('').str.contains('chess.com', case=False)]

# len of df5
print(f'len of df5: {len(df5)}')


# save df5
df5.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet_min_100_games_no_speed_no_chess_com.csv', index=False)

# len of df5
print(f'len of df5: {len(df5)}')



  df4 = pd.read_csv(file_path)


len of df5: 3661688
len of df5: 3661688


In [2]:
# remove where white or black contains stockfish, alphazero, leela, komodo, houdini
df6 = df5[~df5['white'].str.contains('stockfish|alphazero|leela|komodo|houdini', case=False) & ~df5['black'].str.contains('stockfish|alphazero|leela|komodo|houdini', case=False)]


# len of df6
print(f'len of df6: {len(df6)}')

# save df6
df6.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_no_titled_tue_no_bot_no_rapid_blitz_bullet_min_100_games_no_speed_no_chess_com_elo_2400_no_stockfish_alphazero_leela_komodo_houdini.csv', index=False)

len of df6: 3660298


In [3]:
# unique names that contain stockfish, alphazero, leela, komodo, houdini, bot, fish, zero
stockfish_names = df6[df6['white'].str.contains('stockfish|alphazero|leela|komodo|houdini|bot|fish|zero', case=False)]

# len of stockfish_names
print(f'len of stockfish_names: {len(stockfish_names)}')

# save unique names as txt
with open('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/bot_names.txt', 'w') as f:
    for name in stockfish_names['white'].unique():
        f.write(name + '\n')


len of stockfish_names: 9840


In [4]:
# remove where white or black is in the list of bot names
bot_names = [
    'FireFishBOT_v2',
    'LBOT007',
    'Nikitosikbot_v2',
    'RaspFish',
    'ResoluteBot',
    'yobot_v2',
    'YoBot_v2',
    'Fishbein,A',
    'M-Z_Bot',
    'MrChessTheBot',
    'Nikitosikbot',
    'NikitosikVariantsbot',
    'sf_bot',
    'Shineshou90_BOT'
]

df7 = df6.copy()
for bot_name in bot_names:
    df7 = df7[~df7['white'].str.contains(bot_name, case=True, regex=False)]
    df7 = df7[~df7['black'].str.contains(bot_name, case=True, regex=False)]

# len of df7
print(f'len of df7: {len(df7)}')

# save df7
df7.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_final.csv', index=False)

len of df7: 3652312


In [5]:
# games of each player
print(df7['white'].value_counts())
print(df7['black'].value_counts())


white
Korneev, Oleg                  1691
Wall                           1655
Carlsen, Magnus                1571
Drazic, Sinisa                 1540
Lalic, Bogdan                  1494
                               ... 
Amori, Michael                    1
Fritsch, Rudolf                   1
Fritsch, A.                       1
Amorim, Genaro Jose Melo De       1
Von Schuetz, H                    1
Name: count, Length: 187455, dtype: int64
black
Korneev, Oleg         1670
Drazic, Sinisa        1534
Carlsen, Magnus       1477
Lalic, Bogdan         1476
Ivanchuk, Vasyl       1450
                      ... 
Alhamdan, Ahmed          1
Alhasan, Hasan           1
Saad, Amer Mohamed       1
Khalifa, Ramy            1
Irion                    1
Name: count, Length: 194221, dtype: int64


In [8]:
# save only unique players names to txt
with open('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/player_names.txt', 'w') as f:
    # Combine white and black players, get unique names, and ensure each player is only listed once
    white_players = df7['white'].unique()
    black_players = df7['black'].unique()
    all_players = set(white_players) | set(black_players)  # Use set union to ensure uniqueness
    for name in all_players:
        f.write(name + '\n')



In [15]:
# only players from book 

player_names = [
    # Activists
    "Alekhine, Alexander",
    "Tal, Mikhail",
    "Spassky, Boris V.",
    "Spassky, Boris Vasilievich",
    "Kasparov, Garry",
    "Kasparov, G.",
    "Anand, Viswanathan",
    "Anand,V",
    "Shirov, Alexei",
    "Morozevich, Alexander",
    "Topalov, Veselin",
    "Pillsbury, Harry",
    "Anderssen, Adolf",
    "Bronstein, David I",
    "Bronstein, Luis Marcos",
    "Larsen, B.",
    "Larsen, Bent",
    "Taimanov, Mark E",
    "Aronian, Levon",
    "Polgar, Judit",
    "Muller, K.",
    
    # Theorists
    "Steinitz, Wilhelm",
    "Botvinnik, M.",
    "Botvinnik, Mikhail URS",
    "Kramnik, Vladimir",
    "Tarrasch, Siegbert",
    "Nimzowitsch, Aron",
    "Leko, Peter",
    "Giri, Anish",
    "Meier, Georg",
    "Andersson, Ulf",
    "Sedlak, Nikola",
    "Tiviakov, Sergei",
    "Ponomariov, Ruslan",
    "Wahls, Matthias",
    "Moskalenko, Viktor1",
    "Moskalenko, Viktor",
    "Moskalenko, V.",
    "Dorfman, Iossif",
    "Bangiev, Alexander",
    "Hansen, Lars Bo",
    
    # Reflectors
    "Capablanca, Jose",
    "Smyslov, V.",
    "Petrosian, T.",
    "Karpov, A.",
    "Karpov, Anatoly",
    "Carlsen, Magnus",
    "Adams, Michael",
    "Keymer, Vincent",
    "Bischoff, K.",
    "Bischoff,K",
    
    # Pragmatists
    "Fischer, R.",
    "Euwe, Max",
    "Korchnoi, Viktor",
    "Caruana, Fabiano",
    "Ding, Liren",
    "Karjakin, Sergey",
    "Vachier-Lagrave, Maxime"
]

df8 = df7.copy()

# remove where white or black is not in the list of player names
df8 = df8[df8['white'].isin(player_names) | df8['black'].isin(player_names)]

# len of df8
print(f'len of df8: {len(df8)}')

# save df8
df8.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_final_players_from_book.csv', index=False)

len of df8: 50448


In [None]:
# # count number of games for each player from book player_names list
# for name in player_names:
#     print(f'{name}: {df8[df8["white"] == name].shape[0] + df8[df8["black"] == name].shape[0]}')

# change names of same players to the same name
name_changes = {
    "Kasparov, G.": "Kasparov, Garry",
    "Spassky, Boris Vasilievich": "Spassky, Boris V.",
    "Anand,V": "Anand, Viswanathan",
    "Larsen, B.": "Larsen, Bent",
    "Botvinnik, M.": "Botvinnik, Mikhail URS",
    "Moskalenko, Viktor1": "Moskalenko, Viktor",
    "Moskalenko, V.": "Moskalenko, Viktor",
    "Smyslov, V.": "Smyslov",
    "Petrosian, T.": "Petrosian",
    "Karpov, A.": "Karpov, Anatoly",
    "Bischoff,K": "Bischoff, K.",
    "Fischer, R.": "Fischer",
}

def replace_name(name):
    return name_changes.get(name, name)

df8['white'] = df8['white'].apply(replace_name)
df8['black'] = df8['black'].apply(replace_name)

player_names = [replace_name(name) for name in player_names]
player_names = list(set(player_names))

# remove where white or black is not in the list of player names
df8 = df8[df8['white'].isin(player_names) | df8['black'].isin(player_names)]

# len of df8
print(f'len of df8 after name changes: {len(df8)}')

# save df8
df8.to_csv('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/chess_games_clean_final_players_from_book.csv', index=False)

# count number of games for each player from book player_names list
for name in player_names:
    print(f'{name}: {df8[df8["white"] == name].shape[0] + df8[df8["black"] == name].shape[0]}')


len of df8 after name changes: 50448
Taimanov, Mark E: 803
Botvinnik, Mikhail URS: 271
Petrosian: 312
Larsen, Bent: 459
Anderssen, Adolf: 305
Capablanca, Jose: 725
Bischoff, K.: 1165
Polgar, Judit: 815
Aronian, Levon: 2561
Spassky, Boris V.: 531
Vachier-Lagrave, Maxime: 2511
Pillsbury, Harry: 197
Bangiev, Alexander: 341
Muller, K.: 213
Kasparov, Garry: 823
Sedlak, Nikola: 2352
Tal, Mikhail: 566
Karjakin, Sergey: 1951
Carlsen, Magnus: 3048
Korchnoi, Viktor: 1707
Andersson, Ulf: 347
Dorfman, Iossif: 733
Leko, Peter: 1426
Shirov, Alexei: 2833
Kramnik, Vladimir: 1912
Tarrasch, Siegbert: 235
Ponomariov, Ruslan: 1707
Adams, Michael: 2212
Euwe, Max: 610
Nimzowitsch, Aron: 228
Morozevich, Alexander: 1394
Tiviakov, Sergei: 2555
Moskalenko, Viktor: 994
Fischer: 133
Giri, Anish: 2278
Anand, Viswanathan: 1923
Hansen, Lars Bo: 270
Karpov, Anatoly: 1399
Steinitz, Wilhelm: 308
Bronstein, Luis Marcos: 445
Smyslov: 217
Alekhine, Alexander: 669
Bronstein, David I: 412
Meier, Georg: 1436
Keymer, Vincent:

In [7]:
import pandas as pd

# Base path for all files
base_path = '/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/data/processed/lumbrasgigabase/'

# Define starting file path
input_file = f'{base_path}chess_games_clean.csv'
output_file = f'{base_path}chess_games_cleaned_final.csv'

print("Starting chess dataset cleaning process...")

# Load the initial dataset
df = pd.read_csv(input_file)
print(f'Initial dataset size: {len(df)} games')

# Step 1: Remove games where event starts with Titled Tue
df['event'] = df['event'].astype(str)
df = df[~df['event'].str.startswith('Titled Tue')]
print(f'After removing Titled Tuesday events: {len(df)} games')

# Step 2: Remove games with BOT players
df = df[df['white_title'] != 'BOT']
df = df[df['black_title'] != 'BOT']
print(f'After removing BOT players: {len(df)} games')

# Step 3: Remove rapid, blitz, and bullet games
df = df[~df['event'].str.contains('rapid|blitz|bullet', case=False)]
print(f'After removing rapid/blitz/bullet games: {len(df)} games')


# Step 5: Remove games containing "speed" in the event name
df = df[~df['event'].fillna('').str.contains('speed', case=False)]
print(f'After removing speed chess events: {len(df)} games')

# Step 6: Remove chess.com games
df = df[~df['site'].fillna('').str.contains('chess.com', case=False)]
print(f'After removing chess.com games: {len(df)} games')


# Step 8: Remove engine players (standard engine names)
engine_names = ['stockfish', 'alphazero', 'leela', 'komodo', 'houdini']
df = df[~df['white'].str.contains('|'.join(engine_names), case=False)]
df = df[~df['black'].str.contains('|'.join(engine_names), case=False)]
print(f'After removing standard engine names: {len(df)} games')

# Step 9: Remove specific bot players from the provided list
bot_names = [
    'FireFishBOT_v2', 'LBOT007', 'Nikitosikbot_v2', 'RaspFish', 'ResoluteBot',
    'yobot_v2', 'YoBot_v2', 'Fishbein,A', 'M-Z_Bot', 'MrChessTheBot',
    'Nikitosikbot', 'NikitosikVariantsbot', 'sf_bot', 'Shineshou90_BOT'
]

for bot_name in bot_names:
    df = df[~df['white'].str.contains(bot_name, case=True, regex=False)]
    df = df[~df['black'].str.contains(bot_name, case=True, regex=False)]

print(f'Final dataset size: {len(df)} games')

# Save the final clean dataset
df.to_csv(output_file, index=False)
print(f'Final cleaned dataset saved to {output_file}')

Starting chess dataset cleaning process...


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.path import Path
import matplotlib as mpl

# Set up the figure with a specific size - increasing size for better visibility
fig, ax = plt.subplots(figsize=(16, 10))
fig.patch.set_facecolor('#f8f9fa')
ax.set_facecolor('#f8f9fa')

# Remove axis ticks and labels
ax.set_xlim(0, 13)  # Increased x-range for more space
ax.set_ylim(0, 8)   # Increased y-range for more space
ax.axis('off')

# Title - adjusted position
ax.text(7, 7.5, 'Chess Data Preprocessing Pipeline', 
        fontsize=27, fontweight='bold', ha='center')  # Increased font size

# Create boxes with rounded corners using patches
def create_box(x, y, width, height, title, content, color, edge_color):
    # Main box
    box = patches.FancyBboxPatch(
        (x, y), width, height, boxstyle=patches.BoxStyle("Round", pad=0.02, rounding_size=0.1),
        facecolor=color, edgecolor=edge_color, linewidth=2
    )
    ax.add_patch(box)
    
    # Title
    ax.text(x + width/2, y + height - 0.3, title, 
            fontsize=16, fontweight='bold', ha='center')  # Increased font size
    
    # Content
    if isinstance(content, list):
        for i, line in enumerate(content):
            ax.text(x + 0.3, y + height - 0.6 - i * 0.35, line, fontsize=13)  # Increased spacing and font size
    else:
        ax.text(x + width/2, y + height/2, content, fontsize=13, ha='center')  # Increased font size

# Draw arrows
def draw_arrow(start_x, start_y, end_x, end_y):
    ax.annotate('', xy=(end_x, end_y), xytext=(start_x, start_y),
                arrowprops=dict(arrowstyle='->', color='#666666', lw=2))  # Increased line width

# Data Source box - adjusted positions
create_box(1, 5, 3, 1.5, 'Data Source', 
           ['LumbrasGigaBase', '(15M games, 700K+ players)'], 
           '#e6f2ff', '#3385ff')

# PGN Processing box - adjusted positions and increased size
create_box(6, 5, 3, 1.5, 'PGN Processing', 
           ['• Memory-mapped I/O (mmap)', 
            '• Multi-core parallel processing', 
            '• Regex optimization'], 
           '#e6ffe6', '#33cc33')

# Data Cleaning box - adjusted positions and increased size
create_box(6, 3, 3, 1.5, 'Data Cleaning & Filtering', 
           ['• BOT player removal', 
            '• Time control filtering', 
            '• Invalid game removal'], 
           '#fff2e6', '#ff9933')

# Game Selection box - adjusted positions and increased size
create_box(6, 1, 3, 1.5, 'Game Selection', 
           ['• Prioritize classified players', 
            '• Balanced sampling', 
            '• Quality control checks'], 
           '#ffe6f2', '#ff3385')

# Final Dataset box - adjusted positions and increased size
create_box(9.5, 1, 3, 1.5, 'Final Dataset', 
           ['~5,000 selected games', 
            'Ready for feature extraction'], 
           '#e6e6ff', '#3333cc')

# Process Overview box - adjusted positions and increased size
create_box(1, 1, 3, 3.5, 'Process Overview', '', '#f0f0f0', '#999999')
ax.text(1.3, 3.9, 'Input Format:', fontsize=14, fontweight='bold')
ax.text(1.5, 3.6, '• PGN files', fontsize=13)
ax.text(1.3, 3.2, 'Transformations:', fontsize=14, fontweight='bold')
ax.text(1.5, 2.8, '• Extract metadata', fontsize=13)
ax.text(1.5, 2.4, '• Clean move sequences', fontsize=13)
ax.text(1.5, 2.0, '• Extract evaluations', fontsize=13)
ax.text(1.3, 1.6, 'Output Format:', fontsize=14, fontweight='bold')
ax.text(1.5, 1.2, '• Structured CSV', fontsize=13)

# Key Metrics box - adjusted positions and increased size
create_box(9.5, 3, 3, 1.5, 'Key Metrics', '', '#f0f0f0', '#999999')
# Add color rectangles as legend
ax.add_patch(patches.Rectangle((9.7, 3.9), 0.25, 0.25, facecolor='#e6f2ff', edgecolor='#3385ff'))
ax.text(10, 4.0, 'Initial: 15M games', fontsize=13)
ax.add_patch(patches.Rectangle((9.7, 3.5), 0.25, 0.25, facecolor='#fff2e6', edgecolor='#ff9933'))
ax.text(10, 3.6, 'After cleaning: ~10M', fontsize=13)
ax.add_patch(patches.Rectangle((9.7, 3.1), 0.25, 0.25, facecolor='#e6e6ff', edgecolor='#3333cc'))
ax.text(10, 3.2, 'Final: ~5K games', fontsize=13)

# Draw arrows connecting the boxes - adjusted positions
# Horizontal arrows
draw_arrow(4, 5.75, 6, 5.75)  # Data Source to PGN Processing
draw_arrow(9, 1.75, 9.5, 1.75)  # Game Selection to Final Dataset

# Vertical arrows
draw_arrow(7.5, 5, 7.5, 4.5)  # PGN Processing to Data Cleaning
draw_arrow(7.5, 3, 7.5, 2.5)  # Data Cleaning to Game Selection

# Add tight layout and save the figure with higher DPI
plt.tight_layout(pad=1)  # Increased padding
plt.savefig('/Users/samir/Desktop/Uppsala/Thesis/thesis_chess_code/src/data_processing/chess_data_pipeline.png', dpi=600, bbox_inches='tight')  # Increased DPI and specified full path
plt.show() 