In [2]:
import pandas as pd
import numpy as np
import urllib.request
import bz2
import re

In [11]:
# Source archive: the Lichess 2015-04 dump of rated standard games (bz2-compressed PGN).
PGN_URL = 'https://database.lichess.org/standard/lichess_db_standard_rated_2015-04.pgn.bz2'


def _tag_values(tag, pgn):
    """Return the quoted value (as bytes) of every `[<tag> "..."]` header line in `pgn`."""
    # Raw bytes pattern: `\[` in a non-raw literal is an invalid escape sequence.
    return re.findall(rb'\[' + re.escape(tag) + rb' "([^"\n]*)"', pgn)


def _elo(raw):
    """Convert a raw Elo header value to int; an unknown rating ("?") becomes 0."""
    return 0 if raw == b"?" else int(raw)


def _last_move_number(movetext):
    """Return the last move number found in `movetext`, or 0 if there is none."""
    # Strip `{ ... }` comments first: a decimal inside one (e.g. `[%eval -5.2]`)
    # would otherwise match the `<digits>.` move-number pattern.
    without_comments = re.sub(rb"\{[^}]*\}", b"", movetext)
    numbers = re.findall(rb"(\d+)\.", without_comments)
    return int(numbers[-1]) if numbers else 0


# Download and decompress the whole archive into memory; both handles are closed on exit.
with urllib.request.urlopen(PGN_URL) as f, bz2.BZ2File(f, 'r') as decompressed:
    bString = decompressed.read()

# find the events strings and pull the game type (second word, e.g. "Rated Blitz game" -> "Blitz")
events = np.array([v.split(b" ")[1].decode('UTF-8') for v in _tag_values(b"Event", bString)])
# find the results string and pull the result only
results = np.array([v.decode('UTF-8') for v in _tag_values(b"Result", bString)])
# find white's elo and pull it as an int (if ? then we set it to 0)
whiteELOs = np.array([_elo(v) for v in _tag_values(b"WhiteElo", bString)])
# find black's elo and pull it as an int (if ? then we set it to 0)
blackELOs = np.array([_elo(v) for v in _tag_values(b"BlackElo", bString)])
# find the time control string and pull the game style
times = np.array([v.decode('UTF-8') for v in _tag_values(b"TimeControl", bString)])
# find the termination type of the game
terminations = np.array([v.decode('UTF-8') for v in _tag_values(b"Termination", bString)])
# find the last move recorded in the move list and pull it as an int.
# The movetext is the line after the blank line that follows the headers, up to the
# game termination marker ("*" is the PGN marker for an unfinished/unknown result).
moveNums = np.array([
    _last_move_number(movetext)
    for movetext in re.findall(rb"\]\n\n(.*?)(?:0-1|1-0|1/2-1/2|\*)", bString)
])

# Every field is extracted independently, so a game missing one of them would silently
# shift that column; fail loudly with the per-field counts instead.
field_lengths = {
    'events': len(events),
    'results': len(results),
    'whiteELOs': len(whiteELOs),
    'blackELOs': len(blackELOs),
    'times': len(times),
    'terminations': len(terminations),
    'moveNums': len(moveNums),
}
if len(set(field_lengths.values())) != 1:
    raise ValueError("PGN fields are misaligned (counts differ): {}".format(field_lengths))

all_data = np.vstack((events, results, whiteELOs, blackELOs, times, terminations, moveNums)).T
df = pd.DataFrame(all_data, columns=['Game Type', 'Result', 'White ELO', 'Black ELO', 'Time Control', 'Termination', 'Moves'])
# np.vstack coerces the mixed columns to strings; restore the numeric columns to ints.
df = df.astype({'White ELO': int, 'Black ELO': int, 'Moves': int})
df.to_pickle('./Chess_Data.pickle')


Let's verify that the extraction captured the values we want by printing the raw PGN of the first game, followed by the first row of the resulting DataFrame.

In [17]:
# Sanity check: show the start of the raw PGN (the first 605 bytes), a divider,
# and then the first parsed row so the two can be compared by eye.
first_game_text = bString[:605].decode('UTF-8')
divider = '-' * 130
first_row = df.loc[0, :]

for piece in (first_game_text, divider, first_row):
    print(piece)


[Event "Rated Blitz game"]
[Site "https://lichess.org/2xXa7xLj"]
[White "TheMagBumper"]
[Black "hbustamantep"]
[Result "1-0"]
[UTCDate "2015.03.31"]
[UTCTime "22:00:09"]
[WhiteElo "1577"]
[BlackElo "1383"]
[WhiteRatingDiff "+5"]
[BlackRatingDiff "-5"]
[ECO "B06"]
[Opening "Modern Defense"]
[TimeControl "300+0"]
[Termination "Normal"]

1. e4 g6 2. c3 e6 3. d4 Bg7 4. Nf3 a6 5. Be3 b6 6. Qd2 Bb7 7. Bd3 Ne7 8. O-O a5 9. Bh6 O-O 10. Bxg7 Kxg7 11. Qf4 Nbc6 12. Nbd2 d6 13. Qh4 e5 14. d5 Nb8 15. Ng5 Rh8 16. f4 Nd7 17. g4 c6 18. c4 h6 19. Nh3 b5 20. b3 a4 21. f5 axb3 22. f6+ Nxf6 23. Qxf6+ Kg8 24. Qxf7# 1-0
----------------------------------------------------------------------------------------------------------------------------------
Game Type        Blitz
Result             1-0
White ELO         1577
Black ELO         1383
Time Control     300+0
Termination     Normal
Moves               24
Name: 0, dtype: object
