In [1]:
import chess.pgn
import os
import pandas as pd
import numpy as np

# Table of Contents <a name = 'toc'></a>
* [How to visualize the data?](#howtovisualizedata)

### How to visualize the data? <a name = "howtovisualizedata"></a>
* [By move](#bymove) 

#### By Move
Need a way to construct a dataset from these chess games.

* One-hot encode every possible move
    * won't work because sequence info is lost
* sequence encode every possilbe move
    * Method: Every column is a possilbe move and the value is the sequence number (i.e. first move gets a 1, second move gets a 2 etc...).  As far as I know this is an orginal method.
    * Rationale: Only way I can think of to encode the sequence of moves into a training set. 
    * Hypothesis 1: There is a finite number of moves that can be made using the pgn notation (i.e. e2e4).  By my estimate that number is 2304.  
        * test: make a set of all moves in the entire database and call the length.


In [2]:
def move_extracter(game):
    board = game.board()
    list_of_moves = []
    for i, move in enumerate(game.mainline_moves()):
            board.push(move)
            list_of_moves.append(str(move))
    return list_of_moves

Headers(Event='Russia', Site='Russia', Date='1837.??.??', Round='?', White='Petrov, Alexander', Black='NN', Result='1-0', BlackElo='', ECO='C43', WhiteElo='')
CPU times: user 6.59 ms, sys: 1.61 ms, total: 8.21 ms
Wall time: 6.96 ms


In [54]:
%%time
path = './chess_games_pgn/pgnmentor_library/classical_queen_pawn/PetroffOther3.pgn'
#moves = move_extracter(path)
pgn = open(path)
game_1 = chess.pgn.read_game(pgn)
game_2 = chess.pgn.read_game(pgn)
print(move_extracter(game_1))
print(game_1.headers)
print(game_2.headers)

['e2e4', 'e7e5', 'g1f3', 'g8f6', 'd2d4', 'f6e4', 'f1d3', 'd7d5', 'f3e5', 'f8d6', 'e1g1', 'e8g8', 'c2c4', 'f7f5', 'f2f4', 'c7c6', 'c1e3', 'c8e6', 'c4d5', 'c6d5', 'b1c3', 'b8c6', 'a1c1', 'f8f6', 'd3e4', 'f5e4', 'c3b5', 'c6e7', 'b5d6', 'd8d6', 'g2g4', 'g7g6', 'f4f5', 'g6f5', 'e3g5', 'f6f8', 'g5h6', 'f8c8', 'd1d2', 'd6d8', 'c1c8', 'a8c8', 'g4f5', 'e7f5', 'd2g2', 'g8h8', 'f1f5']
Headers(Event='Russia', Site='Russia', Date='1837.??.??', Round='?', White='Petrov, Alexander', Black='NN', Result='1-0', BlackElo='', ECO='C43', WhiteElo='')
Headers(Event='Berlin m2', Site='Berlin', Date='1842.??.??', Round='?', White='Von Heydebrand und der L, Tassilo', Black='Von Jaenisch, Carl Friedrich', Result='1-0', BlackElo='', ECO='C43', WhiteElo='')
CPU times: user 11.2 ms, sys: 1.18 ms, total: 12.4 ms
Wall time: 11.5 ms


In [44]:
pgn = open(path)
for g in chess.pgn.read_game(pgn):
    print(g)
events = []
while True:
    g = chess.pgn.read_game(pgn)
    events.append(g.headers['Event'])

1. e4 e5 2. Nf3 Nf6 3. d4 Nxe4 4. Bd3 d5 5. Nxe5 Bd6 6. O-O O-O 7. c4 f5 8. f4 c6 9. Be3 Be6 10. cxd5 cxd5 11. Nc3 Nc6 12. Rc1 Rf6 13. Bxe4 fxe4 14. Nb5 Ne7 15. Nxd6 Qxd6 16. g4 g6 17. f5 gxf5 18. Bg5 Rff8 19. Bh6 Rfc8 20. Qd2 Qd8 21. Rxc8 Rxc8 22. gxf5 Nxf5 23. Qg2+ Kh8 24. Rxf5


AttributeError: 'NoneType' object has no attribute 'headers'

In [51]:
len(events)

16991

In [62]:
list(game_1.headers)

['Event',
 'Site',
 'Date',
 'Round',
 'White',
 'Black',
 'Result',
 'BlackElo',
 'ECO',
 'WhiteElo']

In [8]:
def pgn_extractor(path, num_games):
    pgn = open(path)
    df = pd.DataFrame()
    num_g = 0 # index for game number
    for i in range(num_games):
        g = chess.pgn.read_game(pgn)
        if g == None: break
        for meta in list(g.headers):
            df.loc[num_g,meta] = g.headers[meta]
        for i, move in enumerate(move_extracter(g)):
            df.loc[num_g,move] = i
        num_g +=1
    return df

        
        

In [33]:
%%time
path = './chess_games_pgn/pgnmentor_library/classical_queen_pawn/PetroffOther3.pgn'
df = pgn_extractor(path,1000)

CPU times: user 40.3 s, sys: 3.25 s, total: 43.6 s
Wall time: 43.7 s


In [34]:
len(df.columns)

1795

In [25]:
df.columns[0:10]

Index(['Event', 'Site', 'Date', 'Round', 'White', 'Black', 'Result',
       'BlackElo', 'ECO', 'WhiteElo'],
      dtype='object')

In [27]:
os.listdir('chess_games_pgn/pgnmentor_library/classical_queen_pawn')



['PetroffOther3.pgn',
 'RuyLopezOpen.pgn',
 'RuyLopezMarshall.pgn',
 'QG-Chigorin.pgn',
 'SlavMain.pgn',
 'QGDOther34.pgn',
 'RuyMoeller-SteinDef.pgn',
 'QGAOther3.pgn',
 'RuyLopezChigorin.pgn',
 'RuyLopezModSteinitz.pgn',
 'QGDOrthoMain.pgn',
 'QGAOther4.pgn',
 'QGAMain.pgn',
 'QGDExchange.pgn',
 'SlavOther5.pgn',
 'QGDTarrasch.pgn',
 'Ponziani.pgn',
 'RuyLopezBreyer.pgn',
 'FourKnights.pgn',
 'CenterGame-Danish.pgn',
 'Colle.pgn',
 'RuyLopezKar-Smy-Khol.pgn',
 'QGA3e4.pgn',
 'SlavOther34.pgn',
 'SemiSlavOther5.pgn',
 'BishopsOpening.pgn',
 'RuyLopezOther9.pgn',
 'ScotchOther4.pgn',
 'SemiTarr5e3-Nc6.pgn',
 'GoringGambit.pgn',
 'Latvian-Elephant.pgn',
 'RuyLopezOther6.pgn',
 'QG-Albin.pgn',
 'RuyLopezClassical.pgn',
 'SemiTarraschMain.pgn',
 'PetroffMain.pgn',
 'SlavExchange.pgn',
 'RuyLopezOther5.pgn',
 'ScotchGambit.pgn',
 'RuyLopezFlohr-Zaitsev.pgn',
 'Hungarian.pgn',
 'RuyLopezBerlin.pgn',
 'Hodgson.pgn',
 'Vienna.pgn',
 'RuyLopezOther3.pgn',
 'ThreeKnights.pgn',
 'Slav4a6.pgn',
 

In [59]:
ser = pd.Series()
df = pd.DataFrame()

In [60]:
#ser.loc[0,'blah'] = 'test'
df.loc[0,'blah'] = 'test'

In [56]:
dir(ser)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__p

In [6]:
df

Unnamed: 0,col1,col2
0,0,3
1,1,4
2,2,5
