In [2]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Dataset location, relative to the notebook's working directory.
path = 'games.csv'
# Load the whole CSV into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=path)

In [11]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [12]:
# Print column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              20058 non-null  object 
 1   rated           20058 non-null  bool   
 2   created_at      20058 non-null  float64
 3   last_move_at    20058 non-null  float64
 4   turns           20058 non-null  int64  
 5   victory_status  20058 non-null  object 
 6   winner          20058 non-null  object 
 7   increment_code  20058 non-null  object 
 8   white_id        20058 non-null  object 
 9   white_rating    20058 non-null  int64  
 10  black_id        20058 non-null  object 
 11  black_rating    20058 non-null  int64  
 12  moves           20058 non-null  object 
 13  opening_eco     20058 non-null  object 
 14  opening_name    20058 non-null  object 
 15  opening_ply     20058 non-null  int64  
dtypes: bool(1), float64(2), int64(4), object(9)
memory usage: 2.3+ MB


# Preprocessing

In [13]:
# Encode the target as integer class labels: black -> 0, white -> 1, draw -> 2.
winner_codes = {'white': 1, 'black': 0, 'draw': 2}
winner_encoded = df['winner'].map(winner_codes)

# Series.map silently yields NaN for any value missing from the mapping, which
# also happens if this cell is executed a second time on the already-encoded
# column. Stop with a clear error rather than producing NaN targets.
if winner_encoded.isna().any():
    unexpected = df.loc[winner_encoded.isna(), 'winner'].unique().tolist()
    raise ValueError(f"Cannot encode 'winner'; unexpected values: {unexpected}")
df['winner'] = winner_encoded

# Absolute rating gap between the two players. The sign (which side is the
# stronger one) is discarded here.
df['rating_difference'] = (df['white_rating'] - df['black_rating']).abs()

# One-hot encode how the game ended; this replaces 'victory_status' with one
# 'victory_status_<value>' column per distinct value.
df = pd.get_dummies(df, columns=['victory_status'])

In [14]:
# Preview the first five rows after encoding to confirm the new columns.
df.head(n=5)

Unnamed: 0,id,rated,created_at,last_move_at,turns,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply,rating_difference,victory_status_draw,victory_status_mate,victory_status_outoftime,victory_status_resign
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,1,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5,309,False,False,True,False
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,0,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4,61,False,False,False,True
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,1,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3,4,False,True,False,False
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,1,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3,15,False,True,False,False
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,1,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5,54,False,True,False,False


In [20]:
# Columns kept for modelling: the target ('winner'), the rating columns, the
# time control, and every one-hot 'victory_status_*' indicator column.
# NOTE(review): 'increment_code' is still a string column (e.g. '15+2') at
# this point - presumably it is encoded before model fitting; confirm.
features = [
    'winner',
    'white_rating',
    'black_rating',
    'rating_difference',
    'increment_code',
    *(name for name in df.columns if name.startswith('victory_status_')),
]
# Narrow the working frame to just those columns, in that order.
df = df.loc[:, features]

In [21]:
# Feature matrix: every selected column except the target.
X = df.drop('winner', axis='columns')
# Target vector: the integer-encoded game outcome.
y = df.loc[:, 'winner']

In [22]:
# X is already a DataFrame (it is the result of df.drop), so display it
# directly instead of wrapping it in a second DataFrame constructor.
X

Unnamed: 0,white_rating,black_rating,rating_difference,increment_code,victory_status_draw,victory_status_mate,victory_status_outoftime,victory_status_resign
0,1500,1191,309,15+2,False,False,True,False
1,1322,1261,61,5+10,False,False,False,True
2,1496,1500,4,5+10,False,True,False,False
3,1439,1454,15,20+0,False,True,False,False
4,1523,1469,54,30+3,False,True,False,False
...,...,...,...,...,...,...,...,...
20053,1691,1220,471,10+10,False,False,False,True
20054,1233,1196,37,10+0,False,True,False,False
20055,1219,1286,67,10+0,False,True,False,False
20056,1360,1227,133,10+0,False,False,False,True
