In [2]:
import pandas as pd
import numpy as np
import urllib.request
import bz2
import re
import threading
import gc


In [3]:
def parse_events(f, out):
    """Collect the game category from every ``[Event "..."]`` tag in a PGN dump.

    The category is the second space-separated word of the tag value, e.g.
    ``[Event "Rated Bullet game"]`` yields ``"Bullet"``.

    Args:
        f: Bytes-like object holding the raw PGN text.
        out: List used as an output sink. One NumPy array of ``str`` (one
            entry per Event tag, in file order) is appended to it. A sink is
            used instead of a return value because this function is run as a
            ``threading.Thread`` target.

    Raises:
        IndexError: If an Event tag has no quoted value or the value has
            fewer than two words.
    """
    # Raw bytes literal: in a plain literal the backslash-bracket pair is an
    # invalid escape sequence and triggers a warning on newer Pythons.
    # "." does not match newlines, so each match stays on its own tag line.
    event_tags = re.findall(rb"\[Event.*\]", f)
    events = np.array(
        [
            # Text between the first pair of quotes, then its second word.
            tag.split(b'"')[1].split(b" ")[1].decode("UTF-8")
            for tag in event_tags
        ]
    )
    out.append(events)


def parse_results(f, out):
    """Collect the value of every ``[Result "..."]`` tag in a PGN dump.

    Args:
        f: Bytes-like object holding the raw PGN text.
        out: List used as an output sink. One NumPy array of ``str`` (one
            entry per Result tag, in file order) is appended to it.

    Raises:
        IndexError: If a Result tag has no quoted value.
    """
    # Raw bytes literal avoids the invalid-escape warning that a plain
    # literal containing a backslash-bracket pair produces.
    result_tags = re.findall(rb"\[Result.*\]", f)
    # The tag value is the text between the first pair of double quotes.
    results = np.array([tag.split(b'"')[1].decode("UTF-8") for tag in result_tags])
    out.append(results)


def parse_white_ELO(f, out):
    """Collect White's rating from every ``[WhiteElo "..."]`` tag in a PGN dump.

    Args:
        f: Bytes-like object holding the raw PGN text.
        out: List used as an output sink. One NumPy array (one rating per
            WhiteElo tag, in file order) is appended to it. A rating written
            as ``"?"`` is stored as ``0``.

    Raises:
        IndexError: If a WhiteElo tag has no quoted value.
        ValueError: If a rating is neither ``"?"`` nor an integer.
    """
    ratings = []
    # Raw bytes literal avoids the invalid-escape warning of a plain literal.
    for tag in re.findall(rb"\[WhiteElo.*\]", f):
        # Split once and reuse the quoted value for both the check and the cast.
        value = tag.split(b'"')[1]
        ratings.append(0 if value == b"?" else int(value))
    whiteELOs = np.array(ratings)
    out.append(whiteELOs)


def parse_black_ELO(f, out):
    """Collect Black's rating from every ``[BlackElo "..."]`` tag in a PGN dump.

    Args:
        f: Bytes-like object holding the raw PGN text.
        out: List used as an output sink. One NumPy array (one rating per
            BlackElo tag, in file order) is appended to it. A rating written
            as ``"?"`` is stored as ``0``.

    Raises:
        IndexError: If a BlackElo tag has no quoted value.
        ValueError: If a rating is neither ``"?"`` nor an integer.
    """
    ratings = []
    # Raw bytes literal avoids the invalid-escape warning of a plain literal.
    for tag in re.findall(rb"\[BlackElo.*\]", f):
        # Split once and reuse the quoted value for both the check and the cast.
        value = tag.split(b'"')[1]
        ratings.append(0 if value == b"?" else int(value))
    blackELOs = np.array(ratings)
    out.append(blackELOs)


def parse_move_nums(f, out):
    """Collect the last move number of every game's movetext in a PGN dump.

    A game's movetext is the single line that follows the blank line after
    the closing bracket of the last header tag, up to and including the first
    result token (``0-1``, ``1-0`` or ``1/2-1/2``). Movetext lines that do not
    contain one of those three tokens produce no entry.

    Args:
        f: Bytes-like object holding the raw PGN text.
        out: List used as an output sink. One NumPy array (one move count per
            matched game, in file order) is appended to it. A game whose
            movetext contains no move number is stored as ``0``.
    """
    # A move number is a run of digits followed by a dot that is NOT followed
    # by another digit. The lookahead keeps decimals inside comments such as
    # "[%eval -3.2]" from being mistaken for the final move number.
    move_number = re.compile(rb"\d+\.(?!\d)")

    # Raw bytes literal; the pattern itself is unchanged. "." does not match
    # newlines, so each match is confined to one movetext line.
    games = re.findall(rb"(\]\n\n.*?(0-1|1-0|1/2-1/2))", f)

    counts = []
    for movetext, _result in games:
        # Scan each game once and reuse the matches.
        numbers = move_number.findall(movetext)
        # Strip the trailing dot before converting; no numbers means no moves.
        counts.append(int(numbers[-1][:-1]) if numbers else 0)

    moveNums = np.array(counts)
    out.append(moveNums)


In [4]:
# bzip2-compressed PGN archive (rated standard games, 2015-05) served by
# database.lichess.org.
PGN_URL = (
    "https://database.lichess.org/standard/lichess_db_standard_rated_2015-05.pgn.bz2"
)

# Stream the archive over HTTP and decompress it fully into one bytes object.
with urllib.request.urlopen(PGN_URL) as f:
    # The context manager closes the decompressor even if read() raises.
    # Closing BZ2File does not close the wrapped response; the outer `with`
    # takes care of that.
    with bz2.BZ2File(f, "r") as decompressed:
        bString = decompressed.read()

# Drop the decompressor reference and collect right away so its buffers are
# released before the parsing cells run.
del decompressed
gc.collect()


In [5]:
# Sink lists: every parser appends exactly one array to its own list.
events = []
results = []
whiteELOs = []
blackELOs = []
moveNums = []

# (column name, parser, sink) in the column order of the final DataFrame.
column_parsers = [
    ("Game Type", parse_events, events),
    ("Result", parse_results, results),
    ("White ELO", parse_white_ELO, whiteELOs),
    ("Black ELO", parse_black_ELO, blackELOs),
    ("Moves", parse_move_nums, moveNums),
]

# Run one thread per parser over the same in-memory PGN bytes.
parser_threads = [
    threading.Thread(target=parser, args=(bString, sink))
    for _, parser, sink in column_parsers
]
for thread in parser_threads:
    thread.start()
for thread in parser_threads:
    thread.join()

# An exception raised inside a thread does not propagate to this cell; it
# only leaves that parser's sink empty. Fail here with a clear message
# instead of a confusing shape error from np.vstack further down.
failed = [name for name, _, sink in column_parsers if len(sink) != 1]
if failed:
    raise RuntimeError(
        "Parser thread(s) produced no output for column(s): {}".format(failed)
    )

# Every column must have one entry per game, otherwise rows would be
# misaligned (or vstack would fail without saying which column is off).
row_counts = {name: len(sink[0]) for name, _, sink in column_parsers}
if len(set(row_counts.values())) != 1:
    raise ValueError(
        "Parsed columns differ in length and cannot be aligned: {}".format(
            row_counts
        )
    )

# Each sink holds a single array, so these become arrays of shape (1, n_games).
events = np.array(events)
results = np.array(results)
whiteELOs = np.array(whiteELOs)
blackELOs = np.array(blackELOs)
moveNums = np.array(moveNums)

# NOTE(review): stacking string and integer arrays makes NumPy promote the
# whole block to strings, so the ELO and Moves columns are stored as text.
# Kept as-is because later cells may rely on it; confirm they cast to int.
all_data = np.vstack((events, results, whiteELOs, blackELOs, moveNums)).T
df = pd.DataFrame(
    all_data,
    columns=[name for name, _, _ in column_parsers],
)
df.to_pickle("./Chess_Data.pickle")


Let's verify that the parsers extracted the intended values by printing the first game's raw PGN text, followed by the corresponding parsed row.


In [6]:
# Side-by-side sanity check: the first 830 bytes of the raw dump, a divider,
# and the first parsed row of the DataFrame.
raw_preview = bString[:830].decode("UTF-8")
divider = "-" * 130
first_row = df.loc[0]

for section in (raw_preview, divider, first_row):
    print(section)


[Event "Rated Bullet game"]
[Site "https://lichess.org/w1PkhuLY"]
[White "gimmylove"]
[Black "UESC"]
[Result "0-1"]
[UTCDate "2015.04.30"]
[UTCTime "22:00:18"]
[WhiteElo "1489"]
[BlackElo "1797"]
[WhiteRatingDiff "-3"]
[BlackRatingDiff "+4"]
[ECO "C01"]
[Opening "French Defense: Carlson Gambit"]
[TimeControl "60+0"]
[Termination "Time forfeit"]

1. e4 e6 2. Nf3 d5 3. d4 dxe4 4. Ne5 Nd7 5. Qe2 Nxe5 6. dxe5 Qd4 7. Nc3 Qxe5 8. Qxe4 Qxe4+ 9. Nxe4 Nf6 10. Nxf6+ gxf6 11. Be3 Bd6 12. Bb5+ c6 13. Bd3 Bd7 14. O-O-O O-O-O 15. Be4 Kc7 16. Rd2 e5 17. Rhd1 Be6 18. g3 b6 19. Rd3 Be7 20. Rc3 Rxd1+ 21. Kxd1 Rd8+ 22. Ke2 Bd5 23. Bxd5 Rxd5 24. Rd3 Rxd3 25. Kxd3 b5 26. Ke4 Kd7 27. b3 a6 28. Bb6 Ke6 29. h3 Bb4 30. g4 Bd2 31. f3 Bf4 32. Be3 Bxe3 33. Kxe3 Kd5 34. Kd3 c5 35. Ke3 a5 36. Kd3 a4 37. Ke3 axb3 38. axb3 c4 39. b4 Ke6 40. c3 f5 0-1
----------------------------------------------------------------------------------------------------------------------------------
Game Type    Bullet
Result          0-