# Preparing chess.com data for use

## Imports and functions

In [44]:
# coding: utf-8

# core data libraries
import numpy as np
import pandas as pd

# useful pandas display settings: limits set high enough that output is
# practically never truncated
pd.set_option('display.max_rows', 45000)
pd.set_option('display.max_columns', 50000)
pd.set_option('display.max_colwidth', 5000)

# HTTP client for the chess.com API and the current time for file names
import requests
import datetime

# chess pgn-reading tool
from pgn_parser import pgn, parser

# multithreading (one download thread per batch)
import threading

# turn off Anaconda warnings
# NOTE: this silences every Python warning, including the pandas ones
import warnings
warnings.simplefilter('ignore')

In [46]:
# get the JSON from the site and return it as a string
# example: get_api_data_to_str('eric', '2014', '01')
def get_api_data_to_str(player, year, month):
    """Request one player's monthly game archive and return the response body.

    Parameters
    ----------
    player, year, month : str
        Pieces of the archive URL; they are concatenated as-is, so `month`
        must already have the form the API expects (e.g. '01').

    Returns
    -------
    str
        The response text, returned whatever the HTTP status was.

    Side effects
    ------------
    Any non-200 status is printed together with the first 3000 characters
    of the body. On status 429 the body is also written to 'data/429.txt'.
    """
    import requests

    url = 'https://api.chess.com/pub/player/'+player+'/games/'+year+'/'+month
    response = requests.get(url)
    body = response.text
    status = response.status_code

    if status != 200:
        print(status)
        print(body[0:3000])
        if status == 429:
            # keep the full 429 response on disk for later inspection
            with open('data/429.txt', 'w+') as dump:
                dump.write('variant, ')
                dump.write(body)

    return body

# convert the API response string to a pandas DataFrame for further processing
def get_str_data_to_pandas(strng):
    """Parse a JSON string and wrap the decoded object in a DataFrame.

    Parameters
    ----------
    strng : str
        JSON text.

    Returns
    -------
    pandas.DataFrame
        `pd.DataFrame` built directly from the decoded JSON object.
    """
    import json

    return pd.DataFrame(json.loads(strng))

# flatten the multilevel (nested) JSON records into a flat table
def prepair_pandas_multilevel_data(df):
    """Flatten a DataFrame whose cells hold nested JSON records.

    Every row of `df` is passed to `json_normalize`, so nested keys become
    dot-separated columns (e.g. {'white': {'rating': 1}} -> 'white.rating').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are dict-like records.

    Returns
    -------
    pandas.DataFrame
        The flattened frames of all rows stacked in order. Each row keeps the
        index produced by its own `json_normalize` call, so index labels
        repeat between rows. An empty DataFrame is returned for an empty `df`.
    """
    # `pd.json_normalize` is the public name since pandas 1.0; older versions
    # only provide it as `pandas.io.json.json_normalize`.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    # Collect the per-row frames and concatenate once: calling pd.concat
    # inside the loop copies all previous rows on every iteration.
    frames = [normalize(list(row)) for row in df.values]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# prepare one month of a player's games: scores, PGN fields and Elo forecasts
def get_pl_stat(game_stat):
    """Build the per-game feature table from the flattened API records.

    Parameters
    ----------
    game_stat : pandas.DataFrame
        One row per game. It must contain the columns selected below
        ('rules', 'time_class', ..., 'white.@id', 'black.@id', 'pgn', 'url').

    Returns
    -------
    pandas.DataFrame
        One row per game with numeric scores, fields taken from the parsed
        PGN (result, date, time, ECO code and URL), Elo forecasts for both
        sides, a constant 'game' counter and a 'period' label.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.

    Notes
    -----
    Relies on `get_pgn` and `elo_prob`, which are defined elsewhere in the
    notebook. `get_pgn` must return an object that can be indexed with
    'score', 'date', 'time', 'ECO' and 'ECO_url'.
    """
    # chess result dictionary: result code -> points scored by that side
    res_dict = {
        'win': 1.0,
        'checkmated': 0.0,
        'agreed': 0.5,
        'repetition': 0.5,
        'timeout': 0.0,
        'resigned': 0.0,
        'stalemate': 0.5,
        'lose': 0.0,
        'insufficient': 0.5,
        '50move': 0.5,
        'abandoned': 0.0,
        'kingofthehill': 0.5,
        'threecheck': 0.5,
        'timevsinsufficient': 0.5,
        'bughousepartnerlose': 0.0
    }

    # cut long and unneeded columns; .copy() makes the assignments below
    # write to an independent frame rather than to a slice of the input
    game_stat = game_stat[['rules', 'time_class', 'time_control', 'rated',
                           'white.@id', 'white.rating', 'white.result', 'white.username',
                           'black.@id', 'black.rating', 'black.result', 'black.username',
                           'end_time', 'pgn',
                           'url']].copy()

    # score 0.0, 0.5 or 1.0 (a result code missing from res_dict maps to NaN)
    game_stat['white.score'] = game_stat['white.result'].map(res_dict)
    game_stat['black.score'] = game_stat['black.result'].map(res_dict)

    # pgn - long string, replaced here by its parsed form
    game_stat['pgn'] = game_stat['pgn'].apply(get_pgn)
    # result - string like '1-0'
    game_stat['result'] = game_stat['pgn'].apply(lambda x: x['score'])

    game_stat['date'] = game_stat['pgn'].apply(lambda x: x['date'])
    game_stat['time'] = game_stat['pgn'].apply(lambda x: x['time'])

    # ECO - string like 'B10'
    game_stat['eco'] = game_stat['pgn'].apply(lambda x: x['ECO'])
    # ECOurl - url like 'https://www.chess.com/openings/B10-Caro-Kann-Defense-2.Nf3-d5'
    game_stat['eco_url'] = game_stat['pgn'].apply(lambda x: x['ECO_url'])

    # forecast for each side: elo_prob(own rating, opponent rating)
    # NOTE(review): elo_prob is defined elsewhere; presumably the classic Elo
    # expected-score formula - confirm
    game_stat['white_elo_forecast'] = game_stat[['white.rating', 'black.rating']].apply(lambda x: elo_prob(*x), axis=1)
    game_stat['black_elo_forecast'] = game_stat[['black.rating', 'white.rating']].apply(lambda x: elo_prob(*x), axis=1)

    # after it np.sum(df['game']) means number of games
    game_stat['game'] = 1
    game_stat['date'] = game_stat['date'].astype('datetime64[ns]')
    # period - string like '2019-8' (the month is not zero-padded)
    game_stat['period'] = game_stat['date'].dt.year.astype('str')+'-'+game_stat['date'].dt.month.astype('str')

    # rename by name instead of overwriting `columns` positionally, so the
    # result does not depend on the order in which columns were added above
    game_stat = game_stat.rename(columns={
        'white.@id': 'white_url',
        'black.@id': 'black_url',
        'url': 'game_url',
    })

    # final column order; 'end_time' and the parsed 'pgn' are dropped here
    return game_stat[[
        'rules', 'time_class', 'time_control', 'rated', 'game',
        'result', 'date', 'time', 'period',
        'white.username', 'black.username',
        'white.rating', 'black.rating',
        'white.result', 'black.result',
        'white.score', 'black.score',
        'white_elo_forecast', 'black_elo_forecast',
        'eco', 'eco_url',
        'game_url', 'white_url', 'black_url'
    ]]
    


In [47]:
# main function
def get_btch(player, year, month):
    """Download, prepare and save one player's games for one month.

    Pipeline: API response string -> DataFrame -> flattened table ->
    prepared table -> 'source/<player>_<year>_<month>_<timestamp>.csv'.

    Parameters
    ----------
    player, year, month : str
        Passed unchanged to `get_api_data_to_str` and used in the output
        file name.

    Returns
    -------
    None
        Returns early, without writing a file, when every download attempt
        raised or when the month contains no games.
    """
    import datetime
    import time

    # API -> string

    # pause (seconds) before each attempt; its length is the number of attempts
    delay_lst = [0, 1, 4]
    last_attempt = len(delay_lst) - 1
    data_string = None
    for cs, delay in enumerate(delay_lst):
        if delay:
            # back off before a retry (nothing to wait for before the first try)
            time.sleep(delay)
        try:
            data_string = get_api_data_to_str(player, year, month)
            break  # got a response: do not request the same month again
        except Exception:
            print('Error: appempt num', cs)
            if cs == last_attempt:
                print('No load data')

    if data_string is None:
        # every attempt failed, there is nothing to process
        return

    # string -> json -> DataFrame
    multidata = get_str_data_to_pandas(data_string)

    # DataFrame -> use json normalise -> good df
    usefull_data = prepair_pandas_multilevel_data(multidata)

    if len(usefull_data) == 0:
        # a month without games has none of the columns get_pl_stat selects
        print('No games in btch '+player+'_'+year+'_'+month)
        return

    # game prepair
    workng_data = get_pl_stat(usefull_data)

    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    workng_data.to_csv('source/'+player+'_'+year+'_'+month+'_'+now+'.csv')

    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(now, ': btch '+player+'_'+year+'_'+month+' finished')

In [48]:
# start the downloads, one thread per user / year / month
def user_parse(user_list, year_range):
    """Start one `get_btch` thread for every user / year / month combination.

    Parameters
    ----------
    user_list : iterable of str
        Player names.
    year_range : iterable of int
        Years to download; all 12 months of each year are requested.

    The threads are started and not joined, so this function returns as soon
    as the last thread has been launched. `get_number` (defined elsewhere in
    the notebook) converts the month number into the value passed to
    `get_btch`.
    """
    import threading

    # lazily enumerate the argument tuples in user -> year -> month order
    batches = (
        (user, str(year), get_number(month))
        for user in user_list
        for year in year_range
        for month in range(1, 13)
    )

    # status counter
    started = 0
    for batch_args in batches:
        threading.Thread(target=get_btch, args=batch_args).start()
        started += 1
    print('All btch', started,  'are started')

In [49]:
# read all CSV files in the folder and concatenate them into one DataFrame
def read_files(path):
    """Load every '*.csv' file below `path` into one de-duplicated DataFrame.

    Parameters
    ----------
    path : str
        Directory that is walked recursively.

    Returns
    -------
    pandas.DataFrame
        All readable CSV files concatenated, with fully identical rows
        dropped. Empty when no CSV file could be read.

    A file that cannot be parsed is reported and skipped, so a single broken
    file does not abort the whole load.
    """
    import os

    # reading: collect the paths of all CSV files
    csv_paths = [
        os.path.join(root, name)
        for root, dirs, files in os.walk(path)
        for name in files
        if name.endswith(".csv")
    ]
    # the number reported is the number of CSV files that will be read
    print('Total', len(csv_paths), 'files founded')

    # merging: read everything first and concatenate once
    frames = []
    for link in csv_paths:
        try:
            frames.append(pd.read_csv(link))
        except Exception as err:
            print('Skipped', link, ':', err)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).drop_duplicates()

In [50]:
# prepare the data for predicting the results of the given user
def learn_prepair(df, target_user):
    """Select one player's games and add columns seen from that player's side.

    Parameters
    ----------
    df : pandas.DataFrame
        Games table with 'white.*' / 'black.*' columns, the two
        '*_elo_forecast' columns and 'date'.
    target_user : str
        User name, compared for exact equality with 'white.username' and
        'black.username'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the added columns
        'target_user', 'score', 'rating', 'opponent_rating',
        'base_elo_forec', 'color' ('w' or 'b'), 'year' and 'month'.

    Notes
    -----
    `find_col` and `find_opp` are defined elsewhere in the notebook.
    NOTE(review): presumably they pick the white or the black value depending
    on whether the target user played white - confirm.
    """
    played_white = df['white.username'] == target_user
    played_black = df['black.username'] == target_user
    # .copy() so the new columns go into an independent frame, not a slice
    df = df[played_white | played_black].copy()

    df['target_user'] = target_user
    df['score'] = df[['target_user', 'white.username', 'white.score', 'black.score']].apply(lambda x: find_col(*x), axis=1)
    df['rating'] = df[['target_user', 'white.username', 'white.rating', 'black.rating']].apply(lambda x: find_col(*x), axis=1)
    df['opponent_rating'] = df[['target_user', 'white.username', 'white.rating', 'black.rating']].apply(lambda x: find_opp(*x), axis=1)
    df['base_elo_forec'] = df[['target_user', 'white.username', 'white_elo_forecast', 'black_elo_forecast']].apply(lambda x: find_col(*x), axis=1)
    df['color'] = np.where(df['white.username'] == target_user, 'w', 'b')

    # calendar parts used for grouping
    df['date'] = df['date'].astype('datetime64[ns]')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    return df

In [51]:
# get the list of players for a country
def get_county_players_list(country):
    """Download the player list of one country and save it as a JSON file.

    Parameters
    ----------
    country : str
        Country code used in the API URL and in the file name (e.g. 'RU').

    The response body is written to
    'player_lists/players_list_<country>_<timestamp>.json'. A non-200 status
    is printed together with the first 3000 characters of the body; the body
    is saved in that case as well. Nothing is returned.
    """
    import datetime
    import requests

    response = requests.get('https://api.chess.com/pub/country/'+country+'/players')
    st = response.text
    if response.status_code != 200:
        print(response.status_code)
        print(st[0:3000])

    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    out_path = 'player_lists/players_list_'+country+'_'+now+'.json'

    try:
        # `with` closes the file even when the write fails
        with open(out_path, 'w') as file:
            file.write(st)
    except Exception:
        # fall back to a short error record; the status code is an int and
        # has to be converted before it can be joined with the text
        error_st = str(response.status_code)+st[0:3000]
        with open(out_path, 'w') as file:
            file.write(error_st)


## Start working

In [52]:
# collect players by countries
# (disabled) download the player list of a country:
# countries_list=['RU']
# get_county_players_list(countries_list[0])

# (disabled) take a random sample of 50 players from a saved list:
# user_list=pd.read_json('player_lists/players_list_RU_2019-08-11 19:55:21.json').sample(50)['players'].values
# fixed list of players used below
user_list=['andreyvict', 'sever043', 'Rosolimo']

In [53]:
# preview at most the first 30 user names
user_list[0:30]

['andreyvict', 'sever043', 'Rosolimo']

In [17]:
# years to download: 2012 .. 2019 (the end of range() is exclusive)
year_range=range(2012, 2020)
# download step, disabled; uncomment to start one thread per user/year/month
#user_parse(user_list, year_range)

In [54]:
%%time
# load every CSV under source/ into one de-duplicated DataFrame
df=read_files('source/')

Total 551 files founded
CPU times: user 43.5 s, sys: 77.5 ms, total: 43.6 s
Wall time: 43.7 s


In [55]:
# number of rows loaded
len(df)

56873

In [56]:
# look at 6 random rows
df.sample(6)

Unnamed: 0.1,Unnamed: 0,rules,time_class,time_control,rated,game,result,date,time,period,white.username,black.username,white.rating,black.rating,white.result,black.result,white.score,black.score,white_elo_forecast,black_elo_forecast,eco,eco_url,game_url,white_url,black_url
257,0,chess,blitz,300,True,1,0-1,2013-12-21,21:30:56,2013-12,Rosolimo,lesiles,1441,1447,resigned,win,0.0,1.0,0.491366,0.508634,A85,https://www.chess.com/openings/A85-Dutch-Defense-Queens-Knight-Variation,https://www.chess.com/live/game/678407825,https://api.chess.com/pub/player/rosolimo,https://api.chess.com/pub/player/lesiles
102,0,chess,blitz,180+2,True,1,0-1,2019-01-06,12:34:25,2019-1,casusbelliinchess,Urgen1970,1392,1239,timeout,win,0.0,1.0,0.706975,0.293025,C33,https://www.chess.com/openings/C33-Kings-Gambit-Accepted-Tumbleweed-Variation,https://www.chess.com/live/game/3354092298,https://api.chess.com/pub/player/casusbelliinchess,https://api.chess.com/pub/player/urgen1970
403,0,chess,blitz,180+2,True,1,0-1,2018-12-27,09:55:48,2018-12,Urgen1970,kantunal,1352,1571,resigned,win,0.0,1.0,0.220859,0.779141,A40,https://www.chess.com/openings/A40-Queens-Pawn-Opening-Horwitz-Defense,https://www.chess.com/live/game/3326919057,https://api.chess.com/pub/player/urgen1970,https://api.chess.com/pub/player/kantunal
90,0,chess,blitz,600,True,1,1-0,2016-01-06,23:58:30,2016-1,Luna2014,dbrax,1530,1514,win,resigned,1.0,0.0,0.52301,0.47699,B00,https://www.chess.com/openings/B00-Kings-Pawn-Opening-Duras-Gambit,https://www.chess.com/live/game/1413437320,https://api.chess.com/pub/player/luna2014,https://api.chess.com/pub/player/dbrax
151,0,chess,blitz,300,True,1,1-0,2018-04-12,13:51:27,2018-4,kass1963,oneelevenchemp,868,898,win,timeout,1.0,0.0,0.456934,0.543066,C20,https://www.chess.com/openings/C20-Kings-Pawn-Opening-Wayward-Queen-Attack-2...Nc6-3.Bc4,https://www.chess.com/live/game/2740284944,https://api.chess.com/pub/player/kass1963,https://api.chess.com/pub/player/oneelevenchemp
8,0,chess,blitz,180+2,True,1,0-1,2018-10-01,20:58:20,2018-10,Rosolimo,cemalaza,1654,1670,checkmated,win,0.0,1.0,0.47699,0.52301,A40,https://www.chess.com/openings/A40-Modern-Defense-with-1-d4-2.c4,https://www.chess.com/live/game/3113554233,https://api.chess.com/pub/player/rosolimo,https://api.chess.com/pub/player/cemalaza


In [57]:
# build the table for one target player (alternative player kept commented out)
ds=learn_prepair(df, 'Rosolimo')
#ds=learn_prepair(df, 'sever043')

In [58]:
# look at 3 random rows of the target player's table
ds.sample(3)

Unnamed: 0.1,Unnamed: 0,rules,time_class,time_control,rated,game,result,date,time,period,white.username,black.username,white.rating,black.rating,white.result,black.result,white.score,black.score,white_elo_forecast,black_elo_forecast,eco,eco_url,game_url,white_url,black_url,target_user,score,rating,opponent_rating,base_elo_forec,color,year,month
13,0,chess,blitz,300,True,1,0-1,2014-02-05,10:02:49,2014-2,Rosolimo,EDONG,1612,1745,resigned,win,0.0,1.0,0.31743,0.68257,A53,https://www.chess.com/openings/A53-Old-Indian-Defense,https://www.chess.com/live/game/718520748,https://api.chess.com/pub/player/rosolimo,https://api.chess.com/pub/player/edong,Rosolimo,0.0,1612,1745,0.31743,w,2014,2
33,0,chess,blitz,300,True,1,1-0,2015-12-02,18:09:14,2015-12,Rosolimo,jt_109,1556,1558,win,resigned,1.0,0.0,0.497122,0.502878,D10,https://www.chess.com/openings/D10-Slav-Defense,https://www.chess.com/live/game/1371034272,https://api.chess.com/pub/player/rosolimo,https://api.chess.com/pub/player/jt_109,Rosolimo,1.0,1556,1558,0.497122,w,2015,12
61,0,chess,blitz,300,True,1,0-1,2015-07-05,13:46:42,2015-7,Matenpocas,Rosolimo,1486,1539,abandoned,win,0.0,1.0,0.424313,0.575687,A09,https://www.chess.com/openings/A09-Reti-Opening-Reti-Gambit-Declined-Advance-Variation-3.b4,https://www.chess.com/live/game/1206039930,https://api.chess.com/pub/player/matenpocas,https://api.chess.com/pub/player/rosolimo,Rosolimo,1.0,1539,1486,0.575687,b,2015,7


In [59]:
# number of games of the target player
len(ds)

11504

In [61]:
# save the target player's table for later use
ds.to_csv('data/cur_user.csv')

In [60]:
# games played and points scored per (year, month); the two columns are
# selected before aggregating so that only they are summed, not the text
# and date columns
ds.groupby(['year', 'month'])[['game', 'score']].sum()  # append .plot() to chart it

Unnamed: 0_level_0,Unnamed: 1_level_0,game,score
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,9,95,55.0
2013,9,215,105.5
2013,10,284,137.5
2013,11,287,146.0
2013,12,384,183.5
2014,1,355,182.5
2014,2,294,132.5
2014,3,262,127.0
2014,4,173,85.5
2014,5,202,89.5


In [18]:
# tst=get_api_data_to_str('Rosolimo', '2015', '11')