In [1]:
from pathlib import Path
import csv
import gzip
import random
import re


import pandas as pd

from tqdm.notebook import tqdm

from src.lichess_decorate import fixt_trepo
from src.lichess import *

In [2]:
# Root directory for the exported artifacts; created on first run.
opath = Path("/share/path/for/big/data")
# parents=True so the cell also works when intermediate directories do not exist yet.
opath.mkdir(parents=True, exist_ok=True)

# Gzip-compressed CSV outputs: games metadata and per-move records.
ofile = opath / "games.csv.gz"
mfile = opath / "moves.csv.gz"

In [3]:
# Header of the moves CSV. The columns after "gameId" are filled, in order,
# from the capture groups of the move regex.
move_cols = ["gameId"] + ["moveNo", "move", "clock"]

In [4]:
# Raw strings keep the backslashes for the regex engine instead of relying on
# Python passing unknown string escapes through (which emits a warning).

# Captures the game id from a header line like [Site "https://lichess.org/<id>"].
id_rex = re.compile(r'\[Site "https://lichess\.org/(.*)"\]')
# Captures (move number, move, clock) from entries like `12... Nf6 { [%clk 0:01:23] }`.
move_rex = re.compile(r"(\d+\.+) (.*?) \{ \[%clk (.*?)\] \}")

In [12]:
# Preview the first 10 rows of the games file; ofile must already exist on disk.
pd.read_csv(ofile, nrows=10)

Unnamed: 0,tournamentId,gameId,White,Black,Result,UTCDate,UTCTime,WhiteElo,BlackElo,TimeControl,ECO
0,lBp3jGE1,zu4ZFgN0,ZorinaSofia,MajorCorneel,1-0,2023.09.17,09:21:22,2073,1729,180+0,B24
1,Yg5U2hzK,TR1Wjq3u,Star_is_shining,zaddik,1-0,2023.09.17,03:18:06,1938,1970,180+0,B13
2,vmPPq44A,ct2G24PH,Boophesh10,Fly_Emirates_emerald,1-0,2023.09.17,11:35:52,2026,2156,60+0,B00
3,vmPPq44A,yPz787kv,ASM_NCC,spramuditha,1-0,2023.09.17,11:37:39,2079,1863,60+0,A01
4,qdVUxKPl,TqAVTYiM,SNeGoBuK,patelsmit1288,1/2-1/2,2023.09.17,15:10:19,1700,1667,180+0,A10
5,jKXAtD3Q,ro56IbYV,andronkuzmin,mouseslept,1-0,2023.09.17,08:47:56,2074,1732,300+0,D00
6,oLmtu5a8,WPWpHaP9,pesfisher,gusmate,0-1,2023.09.17,12:40:59,1708,1953,180+0,A00
7,Kw8JvegN,mCO9ACxB,biasa_jah,jochkie152623,1-0,2023.09.17,08:34:12,1997,1534,180+0,C40
8,ThXo5D3j,JC7uPecs,Fisherick,rngbgd,0-1,2023.09.17,23:27:28,1674,1636,180+0,D10
9,3wBVrrY3,jEQj7ipV,JBrnz,Jins_jons,1-0,2023.09.17,15:37:39,1792,1453,180+0,B06


In [11]:
# Preview the first 10 rows of the moves file; mfile must already exist on disk.
pd.read_csv(mfile, nrows=10)

Unnamed: 0,gameId,moveNo,move,clock
0,Lo8HlOYS,50.,Kc3,0:00:03
1,dXQWNFzv,10...,Nxe3,0:02:14
2,Sei80oNN,2.,Bg5,0:00:59
3,54LUw5xY,10...,b4,0:00:55
4,5pqy6X1w,19...,gxf6,0:00:26
5,bKbqZBwP,33...,Rxd8,0:00:10
6,F78bM1kl,1...,e5,0:01:00
7,KvgAa7Qz,24...,Rc7,0:01:42
8,jzDI8MKw,2.,Nf3,0:00:57
9,7yE8CKga,19...,Rhe8,0:00:07


In [5]:
# Dedicated, explicitly seeded generator so every draw and shuffle made
# through it is repeatable.
SEED = 742
rng = random.Random(SEED)

In [6]:
# Export every move of every game to the gzip-compressed moves CSV.
# Rows are buffered and written in shuffled batches, so the file does not keep
# the original game/move order. The batch size is re-drawn after each flush.
dicts = []
mlimit = rng.randint(1_000_000, 5_000_000)
dza = LichessDza(global_run=True)
# newline="" leaves line endings to the csv writer, as the csv module recommends.
with gzip.open(mfile.as_posix(), "wt", newline="") as fp:
    csv_handle = csv.DictWriter(fp, move_cols)
    csv_handle.writeheader()
    for el in tqdm(dza.get_all_events(GetTournamentGames)):
        # Games are separated by two blank lines; within a game the header
        # section and the move text are separated by one blank line.
        for game_str in el.content.decode().strip().split("\n\n\n"):
            meta, _, moves = game_str.partition("\n\n")
            id_match = id_rex.search(meta)
            if id_match is None:
                # Empty payload or a chunk without a Site tag: there is no
                # game id to attach moves to, so skip it instead of crashing.
                continue
            game_id = id_match.group(1)
            for move in move_rex.findall(moves):
                dicts.append({"gameId": game_id} | dict(zip(move_cols[1:], move)))
                if len(dicts) > mlimit:
                    mlimit = rng.randint(1_000_000, 5_000_000)
                    rng.shuffle(dicts)
                    csv_handle.writerows(dicts)
                    dicts = []
    # Flush what is still buffered; otherwise the final partial batch
    # (anything below the current limit) would never reach the file.
    if dicts:
        rng.shuffle(dicts)
        csv_handle.writerows(dicts)
        dicts = []

2024-02-18 15:12:48 [info     ] pulling 1 status contexts
2024-02-18 15:12:48 [info     ] pulling 1 status dbs
2024-02-18 15:12:48 [info     ] pulling 1 runs


0it [00:00, ?it/s]

In [8]:
def f(df):
    """Return a boolean mask that is True where the "Variant" column equals "Standard"."""
    variant = df.loc[:, "Variant"]
    return variant.eq("Standard")

In [9]:
# Columns written to the games CSV, in output order.
cols = (
    ["tournamentId", "gameId"]
    + ["White", "Black", "Result"]
    + ["UTCDate", "UTCTime"]
    + ["WhiteElo", "BlackElo"]
    + ["TimeControl", "ECO"]
)

In [10]:
# Export game metadata to the gzip-compressed games CSV, one shuffled batch per frame.
# newline="" leaves line endings to the csv writer, as the csv module recommends.
with gzip.open(ofile.as_posix(), "wt", newline="") as fp:
    csv_handle = csv.DictWriter(fp, cols)
    csv_handle.writeheader()
    for df in tqdm(fixt_trepo.dfs):
        games = (
            # The game id is the last path segment of the "Site" URL.
            df.assign(gameId=lambda frame: frame["Site"].str.split("/").str[-1])
            # Seed each shuffle from the shared rng so the row order is
            # reproducible when the notebook is run top to bottom.
            .sample(frac=1, random_state=rng.randint(0, 2**32 - 1))
            # f masks the rows to keep; cols fixes the output column order.
            .loc[f, cols]
        )
        csv_handle.writerows(games.to_dict("records"))

0it [00:00, ?it/s]

There are moves in the moves table associated with games that are not present in the games file. Disregard these moves.

- most imbalanced ECO opening, both in favor of white and black
  - for ECOs that represent at least 0.5% of all games
  - for ECOs that represent at least 0.1% of all games
- average difference in number of wins so far in tournament
- number of unique users
  - with at least 1, 2, 3 and 10 wins
  - with fewer than 2 wins
- what is the move that is most often played by both sides (black and white) in the same game
- what is the move that is made in at least 0.5% of the games and takes the most thinking time
- unique number of different moves made in at least 10, 100, 1000, 10_000 games
- 2nd move by white associated with the most different ECOs
- player who has the largest Shannon entropy of ECOs (computed separately for the player as black and as white)