# Prepairing chesscom data to use

## Import and functions

In [3]:
# coding: utf-8

# our all
import numpy as np
import pandas as pd

# usefull pandas settings
pd.set_option('display.max_rows', 45000)
pd.set_option('display.max_columns', 50000)
pd.set_option('display.max_colwidth', 5000)

# for API working and current time
import requests
import datetime

# chess pgn-reading tool
from pgn_parser import pgn, parser

# multistreaming
import threading

# отключим предупреждения Anaconda
import warnings
warnings.simplefilter('ignore')

In [4]:
# convert unixtime to time:
# fr_unixtime(1565211491) -> '2019-08-07 20:58:11'
def fr_unixtime(ts):
    from datetime import datetime
    return datetime.utcfromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')

# convert integer month number to string API format:
# 1 -> '01' 
# 9 -> '09' 
# 12 > '12'
# 112 -> -1
def get_number(x):
    if x>=10 and x<100:
        return str(x)
    elif x<10:
        return '0'+str(x)
    else:
        return -1
    
# retunr classoc Elo propabilities
# elo_prob(2882, 2722) -> 0.7152 (72% chanses Carlsen (2882) to beat Wan Hao (2722))
def elo_prob(rw, rb):
    return 1/(1+np.power(10, (rb-rw)/400))


# working with pgn
# pip install pgn-parser
def get_pgn(text):
    from pgn_parser import pgn, parser
    game = parser.parse(text, actions=pgn.Actions())
    
    try:
        score=game.tag_pairs['Result']
    except:
        score='Unknown'  
    try:
        date=game.tag_pairs['Date']
    except:
        date='Unknown'
    try:
        time=game.tag_pairs['UTCTime']
    except:
        time='Unknown'
    try:
        eco=game.tag_pairs['ECO']
    except:
        eco='Unknown'
    try:
        ecourl=game.tag_pairs['ECOUrl']
    except:
        ecourl='Unknown'
    return {'score':score, 
            'date': date, 
            'time':time, 
            'ECO':eco, 
            'ECO_url':ecourl}

# functions for predictions
# find col with target user (white or black)
def find_col(target_user, col1, wh_val, bl_val):
    if col1==target_user:
        return wh_val
    else:
        return bl_val
# reverse fun of find_col    
def find_opp(target_user, col1, wh_val, bl_val):
    if col1!=target_user:
        return wh_val
    else:
        return bl_val

In [5]:
# get json form site and return string
# exmple get_api_data_to_str('eric', '2014', '01')
def get_api_data_to_str(player, year, month):
    import requests
    response = requests.get('https://api.chess.com/pub/player/'+player+'/games/'+year+'/'+month)
    st=response.text
    if response.status_code !=200:
        print(response.status_code)
        print(st[0:3000])
    if response.status_code ==429:
        with open('data/429.txt', 'w+') as f:
            f.write('variant, ')
            f.write(st)
            f.close()
    return st

# convert api data to pandas for next working
def get_str_data_to_pandas(strng):
    import json
    js = json.loads(strng)
    df=pd.DataFrame(js)
    
    return df

# convert multilevel json to table
def prepair_pandas_multilevel_data(df):
    num=len(df)
    new_df=pd.DataFrame()
    for i in range(num):
        b=pd.io.json.json_normalize(df.values[i])
        new_df=pd.concat([new_df, b])
        
    return new_df 

# get player games in month with preparation
def get_pl_stat(game_stat):    
    # chess result dictionary
    res_dict={
    'win': 1.0,
    'checkmated': 0.0,
    'agreed': 0.5,
    'repetition': 0.5,
    'timeout': 0.0,
    'resigned': 0.0,
    'stalemate': 0.5,
    'lose': 0.0,
    'insufficient': 0.5,
    '50move': 0.5,
    'abandoned': 0.0,
    'kingofthehill': 0.5,
    'threecheck': 0.5,
    'timevsinsufficient': 0.5,
    'bughousepartnerlose': 0.0
    }

    # number of games
    num=len(game_stat)
    
    # cut long and unusefull columns
    game_stat=game_stat[['rules', 'time_class', 'time_control', 'rated',
        'white.@id', 'white.rating', 'white.result', 'white.username',
         'black.@id', 'black.rating', 'black.result', 'black.username',
         'end_time', 'pgn', 
         'url']]
    # score 0.0, 0.5 or 1.0
    game_stat['white.score']=game_stat['white.result'].map(res_dict)
    game_stat['black.score']=game_stat['black.result'].map(res_dict)
    
    # pgn - long string
    game_stat['pgn']=game_stat['pgn'].apply(get_pgn)
    # result - string like '1-0'
    game_stat['result']=game_stat['pgn'].apply(lambda x: x['score'])
    
    game_stat['date']=game_stat['pgn'].apply(lambda x: x['date'])
    game_stat['time']=game_stat['pgn'].apply(lambda x: x['time'])
    
    # ECO - string like 'B10'
    game_stat['eco']=game_stat['pgn'].apply(lambda x: x['ECO'])
    # ECOurl - url like 'https://www.chess.com/openings/B10-Caro-Kann-Defense-2.Nf3-d5'
    game_stat['eco_url']=game_stat['pgn'].apply(lambda x: x['ECO_url'])
    
    # propability from classic formula
    game_stat['white_elo_forecast']=game_stat[['white.rating', 'black.rating']].apply(lambda x: elo_prob(*x), axis=1)
    game_stat['black_elo_forecast']=game_stat[['black.rating', 'white.rating']].apply(lambda x: elo_prob(*x), axis=1)
    
    # after it np.sum(df['game']) means number of games
    game_stat['game']=1
    game_stat['date'] = game_stat['date'].astype('datetime64[ns]')
    # period - stirng like '2019-08'
    game_stat['period']=game_stat['date'].dt.year.astype('str')+'-'+game_stat['date'].dt.month.astype('str')
        
    game_stat.columns=['rules', 'time_class', 'time_control', 'rated', 'white_url',
       'white.rating', 'white.result', 'white.username', 'black_url',
       'black.rating', 'black.result', 'black.username', 'end_time', 'pgn', 'game_url',
       'white.score', 'black.score', 'result', 'date', 'time', 'eco',
       'eco_url', 'white_elo_forecast', 'black_elo_forecast',
       'game', 'period']
    game_stat=game_stat[[
        'rules', 'time_class', 'time_control', 'rated', 'game',
        'result', 'date', 'time', 'period',
        'white.username', 'black.username',
        'white.rating', 'black.rating', 
        'white.result', 'black.result', 
        'white.score', 'black.score', 
        'white_elo_forecast', 'black_elo_forecast',
        'eco', 'eco_url', 
        'game_url', 'white_url', 'black_url'
    ]]
    return game_stat
    


In [6]:
# main function
def get_btch(player, year, month):
    import datetime
    
    # API -> string
    
    delay_lst=[0, 1, 4]
    # we give 3 attemps to get data
    for cs in range(3):
        try:
            data_string=get_api_data_to_str(player, year, month)
        except Exception:
            print('Error: appempt num', cs)
            # wait 0, 1 or 2 seconds after next attempt
            time.sleep(delay_lst[cs])
            if cs==2:
                print('No load data')
    
    # string -> json -> DataFrame
    multidata=get_str_data_to_pandas(data_string)
    
    # DataFrame -> use json normalise -> good df
    usefull_data=prepair_pandas_multilevel_data(multidata)
    
    # game prepair
    workng_data=get_pl_stat(usefull_data)
    
    now=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    workng_data.to_csv('source/'+player+'_'+year+'_'+month+'_'+now+'.csv')
    
    now=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(now, ': btch '+player+'_'+year+'_'+month+' finished')

In [7]:
# start getting with threading magic
def user_parse(user_list, year_range):   
    import threading
    # status counter
    i=0
    for user in user_list:
        for year in year_range:
            for month in range(1,13):
                t = threading.Thread(target=get_btch, args=(user, str(year), get_number(month)))
                t.start()
                i=i+1
    print('All btch', i,  'are started')   

In [8]:
# read all json in folder and concate DataFrame
def read_files(path):
    # reading
    l=[]
    cntr=0
    import os
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".csv"):
                 l.append(os.path.join(root, file))
            cntr=cntr+1
    print('Total', cntr, 'files founded')

    # merging
    i=0
    df=pd.DataFrame()
    for link in l:
        try:
            dfb=pd.read_csv(link)
        except:
            dfb=pd.DataFrame()
        df=pd.concat([df, dfb])
        
    df=df.drop_duplicates()
    return df

In [9]:
# data for predict current user
def learn_prepair(df, target_user):
    df=df[(df['white.username']==target_user)|(df['black.username']==target_user)]
    df['target_user']=target_user
    df['score']=df[['target_user', 'white.username', 'white.score', 'black.score']].apply(lambda x: find_col(*x), axis=1)
    df['rating']=df[['target_user','white.username', 'white.rating', 'black.rating']].apply(lambda x: find_col(*x), axis=1)
    df['opponent_rating']=df[['target_user', 'white.username', 'white.rating', 'black.rating']].apply(lambda x: find_opp(*x), axis=1)
    df['base_elo_forec']=df[['target_user', 'white.username', 'white_elo_forecast', 'black_elo_forecast']].apply(lambda x: find_col(*x), axis=1)
    df['color']=np.where(df['white.username']==target_user, 'w', 'b')
    
    df['date'] = df['date'].astype('datetime64[ns]')
    df['year']=df['date'].dt.year
    df['month']=df['date'].dt.month
    
    return df

In [10]:
# get players list by countries
def get_county_players_list(country):
    import requests
    response = requests.get('https://api.chess.com/pub/country/'+country+'/players')
    st=response.text
    if response.status_code !=200:
        print(response.status_code)
        print(st[0:3000])
        
    import datetime
    now=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
    try:
        file = open('player_lists/players_list_'+country+'_'+now+'.json','w') 
        file.write(st)
        file.close()
    except Exception:
        error_st=response.status_code+st[0:3000]
        file = open('player_lists/players_list_'+country+'_'+now+'.json','w') 
        file.write( error_st)
        file.close()


## Start working

In [13]:
# collect players by countries
# countries_list=['RU']
# get_county_players_list(countries_list[0])

# user_list=pd.read_json('player_lists/players_list_RU_2019-08-11 19:55:21.json').sample(50)['players'].values
user_list=['andreyvict', 'sever043', 'Rosolimo']

In [14]:
user_list[0:30]

['andreyvict', 'sever043', 'Rosolimo']

In [17]:
year_range=range(2019, 2020)
user_parse(user_list, year_range)

All btch 36 are started
429

429

429429



429

429429



429

429

429

429

429

429

429

429

429429
429


429

429

429

429



429

429

429

429

429

429
404

429

{"message":"Date cannot be set in the future","code":0}
429
404
{"message":"Date cannot be set in the future","code":0}

404
{"message":"Date cannot be set in the future","code":0}
429
429


429

429

429404
{"message":"Date cannot be set in the future","code":0}
429

429



429
429


429

404
{"message":"Date cannot be set in the future","code":0}
404
{"message":"Date cannot be set in the future","code":0}
404
{"message":"Date cannot be set in the future","code":0}
404
{"message":"Date cannot be set in the future","code":0}
404
{"message":"Date cannot be set in the future","code":0}
404429


{"message":"Date cannot be set in the future","code":0}
429

429

429

429

429

429

429429





Exception in thread Thread-25:
Traceback (most recent call last):
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-3cefa340ef5c>", line 20, in get_btch
    multidata=get_str_data_to_pandas(data_string)
  File "<ipython-input-5-901706374060>", line 21, in get_str_data_to_pandas
    df=pd.DataFrame(js)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 392, in __init__
    mgr = init_dict(data, index, columns, dtype=dtype)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 212, in init_dict
    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 51, in arrays_to_mgr
    index = extract

429

429

429

404
{"message":"Date cannot be set in the future","code":0}
429

404
{"message":"Date cannot be set in the future","code":0}
429

429



Exception in thread Thread-14:
Traceback (most recent call last):
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-3cefa340ef5c>", line 20, in get_btch
    multidata=get_str_data_to_pandas(data_string)
  File "<ipython-input-5-901706374060>", line 21, in get_str_data_to_pandas
    df=pd.DataFrame(js)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 392, in __init__
    mgr = init_dict(data, index, columns, dtype=dtype)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 212, in init_dict
    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
  File "/home/roman/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 51, in arrays_to_mgr
    index = extract

429

429

429

429

429

429

429

404
{"message":"Date cannot be set in the future","code":0}


Exception in thread Thread-16:
Traceback (most recent call last):
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-3cefa340ef5c>", line 20, in get_btch
    multidata=get_str_data_to_pandas(data_string)
  File "<ipython-input-5-901706374060>", line 20, in get_str_data_to_pandas
    js = json.loads(strng)
  File "/home/roman/anaconda3/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Exception

429

429

429429





Exception in thread Thread-9:
Traceback (most recent call last):
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-3cefa340ef5c>", line 20, in get_btch
    multidata=get_str_data_to_pandas(data_string)
  File "<ipython-input-5-901706374060>", line 20, in get_str_data_to_pandas
    js = json.loads(strng)
  File "/home/roman/anaconda3/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Exception 

429

429

429

429

429



Exception in thread Thread-31:
Traceback (most recent call last):
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/roman/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-3cefa340ef5c>", line 20, in get_btch
    multidata=get_str_data_to_pandas(data_string)
  File "<ipython-input-5-901706374060>", line 20, in get_str_data_to_pandas
    js = json.loads(strng)
  File "/home/roman/anaconda3/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/roman/anaconda3/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Exception

2019-08-25 00:01:582019-08-25 00:01:58 : btch andreyvict_2019_08 finished : btch sever043_2019_05 finished

2019-08-25 00:02:00 : btch sever043_2019_04 finished
2019-08-25 00:02:03 : btch sever043_2019_02 finished
2019-08-25 00:02:07 : btch andreyvict_2019_03 finished
2019-08-25 00:02:07 : btch andreyvict_2019_02 finished
2019-08-25 00:02:10 : btch andreyvict_2019_04 finished
2019-08-25 00:02:11 : btch andreyvict_2019_05 finished
2019-08-25 00:02:12 : btch sever043_2019_06 finished
2019-08-25 00:02:14 : btch andreyvict_2019_01 finished
2019-08-25 00:02:17 : btch andreyvict_2019_07 finished
2019-08-25 00:02:20 : btch sever043_2019_03 finished
2019-08-25 00:02:22 : btch Rosolimo_2019_02 finished


In [18]:
%%time
df=read_files('source/')

Total 13 files founded
CPU times: user 193 ms, sys: 7.57 ms, total: 201 ms
Wall time: 235 ms


In [19]:
len(df)

658

In [20]:
df.sample(6)

Unnamed: 0.1,Unnamed: 0,rules,time_class,time_control,rated,game,result,date,time,period,white.username,black.username,white.rating,black.rating,white.result,black.result,white.score,black.score,white_elo_forecast,black_elo_forecast,eco,eco_url,game_url,white_url,black_url
16,0,chess,daily,1/86400,True,1,1-0,2019-07-31,12:20:40,2019-7,Andreyvict,sharp1000,1373,1342,win,resigned,1.0,0.0,0.544495,0.455505,C55,https://www.chess.com/openings/C55-Italian-Game-Anti-Fried-Liver-Defense,https://www.chess.com/daily/game/233333578,https://api.chess.com/pub/player/andreyvict,https://api.chess.com/pub/player/sharp1000
57,0,chess,blitz,300,True,1,1-0,2019-03-21,16:37:43,2019-3,mojarali,sever043,1388,1382,win,timeout,1.0,0.0,0.508634,0.491366,C23,https://www.chess.com/openings/C23-Bishops-Opening-Boi-Philidor-Variation-3...Nf6,https://www.chess.com/live/game/3553005939,https://api.chess.com/pub/player/mojarali,https://api.chess.com/pub/player/sever043
41,0,chess,blitz,180+2,True,1,0-1,2019-02-12,12:17:18,2019-2,Rosolimo,Madi1997,1567,1572,checkmated,win,0.0,1.0,0.492805,0.507195,E14,https://www.chess.com/openings/E14-Queens-Indian-Defense-Spassky-System-4...Bb7-5.Nc3,https://www.chess.com/live/game/3453118242,https://api.chess.com/pub/player/rosolimo,https://api.chess.com/pub/player/madi1997
43,0,chess,blitz,600,True,1,0-1,2019-07-21,19:38:59,2019-7,zigzag47,Andreyvict,1237,1212,abandoned,win,0.0,1.0,0.535916,0.464084,A20,https://www.chess.com/openings/A20-English-Opening-Kings-English-Variation,https://www.chess.com/live/game/3882805661,https://api.chess.com/pub/player/zigzag47,https://api.chess.com/pub/player/andreyvict
7,0,chess,blitz,600,True,1,1-0,2019-05-02,07:39:36,2019-5,ced741,Andreyvict,1147,1107,win,resigned,1.0,0.0,0.557312,0.442688,C60,https://www.chess.com/openings/C60-Ruy-Lopez-Opening-Gunderam-Variation,https://www.chess.com/live/game/3664262969,https://api.chess.com/pub/player/ced741,https://api.chess.com/pub/player/andreyvict
47,0,chess,blitz,300,True,1,0-1,2019-03-20,03:40:34,2019-3,sever043,noelia111,1395,1415,timeout,win,0.0,1.0,0.471249,0.528751,C30,https://www.chess.com/openings/C30-Kings-Gambit,https://www.chess.com/live/game/3548995976,https://api.chess.com/pub/player/sever043,https://api.chess.com/pub/player/noelia111


In [75]:
ds=learn_prepair(df, 'Rosolimo')
#ds=learn_prepair(df, 'sever043')

In [76]:
ds.sample(3)

Unnamed: 0.1,Unnamed: 0,rules,time_class,time_control,rated,game,result,date,time,period,white.username,black.username,white.rating,black.rating,white.result,black.result,white.score,black.score,white_elo_forecast,black_elo_forecast,eco,eco_url,game_url,white_url,black_url,target_user,score,rating,opponent_rating,base_elo_forec,color,year,month
57,0,chess,blitz,300,True,1,1-0,2016-09-16,19:53:50,2016-9,buzanin,Rosolimo,1608,1651,win,resigned,1.0,0.0,0.438432,0.561568,A00,https://www.chess.com/openings/A00-Grob-Opening-Grob-Gambit,https://www.chess.com/live/game/1732040676,https://api.chess.com/pub/player/buzanin,https://api.chess.com/pub/player/rosolimo,Rosolimo,0.0,1651,1608,0.561568,b,2016,9
14,0,chess,blitz,180+2,True,1,1-0,2017-04-05,18:17:56,2017-4,zingarobarone,Rosolimo,1658,1534,win,resigned,1.0,0.0,0.671241,0.328759,A08,https://www.chess.com/openings/A08-Kings-Indian-Attack-French-Variation-4.d4,https://www.chess.com/live/game/2034406588,https://api.chess.com/pub/player/zingarobarone,https://api.chess.com/pub/player/rosolimo,Rosolimo,0.0,1534,1658,0.328759,b,2017,4
209,0,chess,blitz,300,True,1,1-0,2013-10-24,12:41:27,2013-10,P4ULIU5,Rosolimo,1529,1486,win,resigned,1.0,0.0,0.561568,0.438432,D93,https://www.chess.com/openings/D93-Gruenfeld-Defense-Hungarian-Attack-5...O-O-6.e3,https://www.chess.com/live/game/629463175,https://api.chess.com/pub/player/p4uliu5,https://api.chess.com/pub/player/rosolimo,Rosolimo,0.0,1486,1529,0.438432,b,2013,10


In [77]:
len(ds)

11504

In [78]:
ds.to_csv('data/cur_user.csv')

In [79]:
ds.groupby(['year', 'month']).sum()[['game', 'score']]#.plot()

Unnamed: 0_level_0,Unnamed: 1_level_0,game,score
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,9,95,55.0
2013,9,215,105.5
2013,10,284,137.5
2013,11,287,146.0
2013,12,384,183.5
2014,1,355,182.5
2014,2,294,132.5
2014,3,262,127.0
2014,4,173,85.5
2014,5,202,89.5


In [18]:
# tst=get_api_data_to_str('Rosolimo', '2015', '11')