In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Chess Games Clustering

This notebook explores the lichess chess game dataset. There is some light EDA and feature engineering, and then we cluster the games to see if any patterns emerge.

En Passant will not be a feature this time around.

<img src="https://i.redd.it/hgjiywrbxyh71.jpg" width="400px">

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans


import matplotlib.pyplot as plt 
import seaborn as sns 
from category_encoders.count import CountEncoder

import plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

%matplotlib inline 

In [None]:
# Read the games CSV from the Kaggle input directory and preview its first rows.
GAMES_CSV = '../input/chess/games.csv'
df = pd.read_csv(GAMES_CSV)
df.head()

I keep only the relevant features and restrict the data to serious games that last at least 10 moves per side (at least 20 turns). I leave out 'rated' for the purposes of this notebook.

In [None]:
# Keep only games with at least 20 turns (equivalent to the original `turns > 19`
# for integer turn counts) and only the columns used by the rest of the notebook.
MIN_TURNS = 20
KEPT_COLUMNS = ['turns', 'victory_status', 'winner', 'increment_code',
                'white_rating', 'black_rating', 'moves',
                'opening_eco', 'opening_name', 'opening_ply']

# .copy() makes the result an independent frame: later cells add columns to `df`,
# and doing that on a filtered slice is the pattern pandas reports with
# SettingWithCopyWarning.
df = df.loc[df['turns'] >= MIN_TURNS, KEPT_COLUMNS].copy()

In [None]:
# Collapse each opening name to its first two words so that variations group
# together, e.g. "Sicilian Defense: Alapin Variation" --> "Sicilian Defense".

def _opening_family(name):
    """Return the first two space-separated words of `name`, minus one trailing ':'."""
    prefix = ' '.join(name.split(' ')[:2])
    if str(prefix).endswith(':'):
        return prefix[:-1]
    return prefix

df['opening_pref'] = df['opening_name'].map(_opening_family)


# captures_per_ply: number of 'x' characters in the move text divided by the
# number of turns; used as a measure of how aggressively pieces were exchanged.

capture_counts = df['moves'].map(lambda move_text: str(move_text).count('x'))
df['captures_per_ply'] = capture_counts / df['turns']


# Mean of the two players' ratings.

df['rating_avg'] = df['white_rating'].add(df['black_rating']).div(2)


# Lower of the two players' ratings.

def rating_floor(df):
    """Return the smaller of the white and black ratings for one game.

    `df` is a single game row (any object indexable by column name), as
    supplied by `DataFrame.apply(..., axis=1)`.
    """
    white = df['white_rating']
    black = df['black_rating']
    return black if black < white else white
    
# Row-wise minimum of the two rating columns. pandas computes this on whole
# columns, avoiding one Python-level function call per game via apply(axis=1).
df['rating_floor'] = df[['white_rating', 'black_rating']].min(axis=1)


# winner_diff is (winner rating - loser rating). A large negative value means a large upset.

def upset(df):
    """Return the rating difference from the winner's point of view for one game.

    `df` is a single game row indexable by column name.

    - White wins: white_rating - black_rating.
    - Black wins: black_rating - white_rating.
    - Any other result (e.g. a draw): minus half of the absolute rating gap.
    """
    result = df['winner']
    white_minus_black = df['white_rating'] - df['black_rating']

    if result == 'white':
        return white_minus_black
    if result == 'black':
        return -white_minus_black
    return abs(white_minus_black) * -.5
    

# Evaluate `upset` once per game row and store the result as winner_diff.
df['winner_diff'] = df.apply(upset, axis='columns')

        

Now let's create a feature for the time control of the game. The existing time control feature is formatted A+B (ie. 3+2) where each player gets A minutes for the entire game and B seconds added to their clock after each move.

There are different ways to factor in the increment. As the total time added across a game depends on the number of moves both players make, we should review game length.

In [None]:
# Mean number of turns over all games, and over games where even the
# lower-rated player is rated 2000 or above.
avg_all = df['turns'].mean()
avg_expert = df.loc[df['rating_floor'] >= 2000, 'turns'].mean()

print('Average game length for all games is:', avg_all)
print('Average game length for higher-rated games is:', avg_expert)

Looks like the average game length is a little over 30 moves per side. We can calculate time_control as base minutes + (increment in seconds * moves per side) / 60 (seconds/min), rounding the increment part down to whole minutes.

So a 5+10 game in which each side makes 30 moves would have a time_control equivalent of a 5 + (10 * 30)/60 = 10+0 minute game. In practice a 5+10 game plays a little differently than a 10+0 game, but this is a decent approximation.

In [None]:
# Calculate time_control

def time_control(df):
    """Estimate the effective time control, in minutes, for one game.

    `df` is a single game row indexable by column name. Its 'increment_code'
    is a string of '+'-separated integers whose first value is the base time
    in minutes and whose second is the per-move increment in seconds.

    The increment is multiplied by half the number of turns (moves per side),
    converted to minutes, rounded down, and added to the base time.
    """
    parts = list(map(int, df['increment_code'].split('+')))
    base_minutes = parts[0]
    increment_minutes = np.floor((parts[1] * df['turns']/2) / 60)
    return base_minutes + increment_minutes
   

# Evaluate `time_control` once per game row and store the estimate in minutes.
df['time_control'] = df.apply(time_control, axis='columns')


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Switch to seaborn's default theme, then plot the distribution of the
# estimated time control.
sns.set_theme()
sns.displot(df['time_control'])
# Limit the x-axis to 0-50; anything above 50 is cut from view.
plt.xlim(0, 50)

Viewing the time control distribution, we see peaks at 10, 15, 20, and 30 minutes. Most games are blitz or rapid chess.

In [None]:
# Columns that feed the clustering. .copy() makes `cluster` an independent
# frame: later cells add columns to it, and doing that on a selection taken
# from `df` is the pattern pandas reports with SettingWithCopyWarning.
cluster = df[['turns','opening_eco','opening_pref','opening_ply','captures_per_ply','rating_avg','rating_floor','winner_diff','time_control']].copy()

Opening ECO and the name of an opening are related, but because there are so many variations and transpositions they are not interchangeable, so we keep both fields. For example, here are some ECO codes of the Sicilian Defense:

In [None]:
# First few games whose opening family is the Sicilian Defense, with their ECO codes.
is_sicilian = cluster['opening_pref'] == 'Sicilian Defense'
cluster.loc[is_sicilian, ['opening_eco', 'opening_pref']].head()

And here are some opening names of ECO code C20.

In [None]:
# Distinct opening families recorded under ECO code C20.
is_c20 = cluster['opening_eco'] == 'C20'
cluster.loc[is_c20, ['opening_eco', 'opening_pref']].drop_duplicates()

More information is here:

https://en.wikipedia.org/wiki/Encyclopaedia_of_Chess_Openings

In [None]:
# Ten most frequent opening families, most common first.
cluster['opening_pref'].value_counts().head(10)

There are familiar faces in the top 10, with the Sicilian Defense leading the pack. We count-encode openings so that more popular choices have larger values.

In [None]:
# Count-encode the opening family and ECO code, then store the encoded
# values next to the originals as op_name and op_eco.
encoder = CountEncoder()
encoded = encoder.fit_transform(cluster[['opening_pref', 'opening_eco']])

cluster['op_name'] = encoded['opening_pref']
cluster['op_eco'] = encoded['opening_eco']

In [None]:
# Preview the first five rows, including the new op_name / op_eco columns.
cluster.head(5)

Let's correlate all our features so far and see if any trends come up.

In [None]:
# Pairwise Pearson correlations of the numeric features, drawn as an annotated heatmap.
corr_columns = ['turns', 'op_name', 'op_eco', 'opening_ply', 'captures_per_ply',
                'rating_avg', 'rating_floor', 'winner_diff', 'time_control']

fig, ax = plt.subplots(figsize=(12, 10))
ax.set_title('Pearson Correlation of Chess Game Features')

sns.heatmap(cluster[corr_columns].corr(), cmap="magma", annot=True, ax=ax)

Correlations:

* rating_floor/avg vs opening_ply- as rating increases, so does the tendency to play more book moves
* rating_floor/avg vs opening_eco- as rating increases, players explore less popular ECO codes
* captures_per_ply vs turns- as game length increases, captures become more spread out 
* opening eco is defined by a set series of opening ply

That wraps up the initial phase. Before we go further, we will scale our features since we use some scale-sensitive tools.

In [None]:
# Standardise every clustering feature before the scale-sensitive steps below.
scaler = StandardScaler()

features = ['turns','op_name','op_eco','opening_ply','captures_per_ply','rating_avg','rating_floor','winner_diff','time_control']
X = scaler.fit_transform(cluster[features])

# Rebuild a labelled frame from the scaled values. Reusing cluster's index keeps
# every scaled row tied to its game, so the result can be joined back to
# `cluster` by index without misalignment.
X_processed = pd.DataFrame(X, columns=features, index=cluster.index)

Let's see if we can find a natural number of groups to which we can assign our games. We will use the Yellowbrick elbow visualizer.

https://www.scikit-yb.org/en/latest/api/cluster/elbow.html

In [None]:
from yellowbrick.cluster import KElbowVisualizer
   
# n_init is pinned because its default differs between scikit-learn versions;
# 10 restarts with a fixed random_state keeps the elbow curve reproducible.
kmeans = KMeans(n_init=10, random_state=0)
# Fit K-Means once per candidate number of clusters and record the score of each fit.
# A 2-tuple for k is a half-open range, so this tries k = 2 through 11.
visualizer = KElbowVisualizer(kmeans, k=(2,12))
visualizer.fit(X_processed)        # Fit the data to the visualizer
visualizer.show()

There are no clear elbows in the chart, but 6 groups seems like a good enough cutoff point. It is very clear that anything more than 7-8 groups is diminishing returns as distortion is not being meaningfully reduced.

So far so good. Now we perform PCA and K-Means clustering to view and find groups within our games. We plot 3 principal components.

In [None]:
# Project the scaled features onto three principal components.
pca = PCA(n_components=3,random_state=0)
pca_df = pd.DataFrame(pca.fit_transform(X_processed), columns = ['p1','p2','p3'])

# n_init is pinned because its default differs between scikit-learn versions;
# the group descriptions further down depend on getting the same labels each run.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)

# Compute cluster centers and predict cluster indices.
# Note that the clustering runs on the three PCA components, not on the full
# scaled feature set.

X_clustered = kmeans.fit_predict(pca_df)


In [None]:
# Sanity check: list the distinct cluster labels that K-Means assigned
# (six are expected, since n_clusters=6 above).

np.unique(X_clustered)

In [None]:
def plotly_scatter3d(data, feat1, feat2, feat3, color):
    """Show an interactive 3D scatter plot of three columns of `data`.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
        Source of the plotted values.
    feat1, feat2, feat3 : str
        Column names plotted on the x, y and z axes; also used for the axis
        titles and the figure title.
    color : passed straight to the marker `color` setting.

    Returns
    -------
    None. The figure is displayed with `iplot`.
    """
    points = go.Scatter3d(
        x=data[feat1],
        y=data[feat2],
        z=data[feat3],
        mode='markers',
        marker={'size': 5, 'color': color, 'colorscale': 'Viridis', 'opacity': 0.8},
    )

    # Initial camera position and orientation for the 3D scene.
    viewpoint = {
        'up': {'x': 0, 'y': 0, 'z': 1},
        'center': {'x': 0, 'y': 0, 'z': 0.0},
        'eye': {'x': 2.5, 'y': 0.1, 'z': 0.8},
    }
    scene = {
        'camera': viewpoint,
        'xaxis': {'title': feat1},
        'yaxis': {'title': feat2},
        'zaxis': {'title': feat3},
    }

    heading = feat3 + " as function of " + feat1 + " and " + feat2
    layout = go.Layout(
        title=heading,
        autosize=False,
        width=700,
        height=600,
        margin={'l': 15, 'r': 25, 'b': 15, 't': 30},
        scene=scene,
    )

    iplot(go.Figure(data=[points], layout=layout))

# One fixed hex colour per K-Means label (labels 0 through 5).
LABEL_COLOR_MAP = dict(enumerate(
    ['#fab3a9', '#c6ad94', '#7fb285', '#463239', '#764248', '#ed6b86']
))
label_color = [LABEL_COLOR_MAP[label] for label in X_clustered]

# Plot the games in principal-component space, coloured by cluster.
plotly_scatter3d(pca_df, 'p1', 'p2', 'p3', label_color)


Looks like scoops of ice-cream. Mint chocolate chip, anyone?

Unfortunately, it looks like our clusters are melted together. This may be due to the fact that 3 principal components do not sufficiently explain the variance in the original feature space.


In [None]:
# Report how many principal components were kept and the share of variance
# they explain together, rounded to five decimals.
explained_total = round(pca.explained_variance_ratio_.sum(), 5)
print("Components = ", pca.n_components_, "\nTotal explained variance = ", explained_total)

That being said we should look at each cluster closer. We will describe the game characteristics of each group. Before that, let's first review the statistics of the entire dataset so that we have a frame of reference.

In [None]:
# Summary statistics for the whole dataset, one row per numeric feature.
cluster.describe().T

In [None]:
# Attach each game's K-Means label as a `group` column, then show how many
# games fall into each group.
cluster = cluster.assign(group=X_clustered)
cluster['group'].value_counts()

In [None]:
# One sub-frame per cluster label (0 through 5), bound to the names the
# following cells use.
cluster0, cluster1, cluster2, cluster3, cluster4, cluster5 = (
    cluster[cluster['group'] == label] for label in range(6)
)

In [None]:
# Summary statistics for group 0, one row per numeric feature.
cluster0.describe().T

Group 0 has high-rated players. Games remain in the opening for longer, with higher opening_ply, as players may have more knowledge of opening theory. Op_name is higher than in the other groups, which shows a preference for more popular openings.

In [None]:
# Summary statistics for group 1, one row per numeric feature.
cluster1.describe().T

Group 1 had lower ratings, with games moving quickly away from known openings. The op_eco value is very high. Exploring this further, we can see that this group is characterized by unconventional openings.

In [None]:
# Five most frequent ECO codes in group 1, most common first.
cluster1['opening_eco'].value_counts().head(5)

The most represented ECO code here is A00, which corresponds to unknown/unanalyzed opening moves like 1. g4, a3, h3, etc.

https://www.chessgames.com/perl/chessopening?eco=a00

In [None]:
# Summary statistics for group 2, one row per numeric feature.
cluster2.describe().T

Group 2 is the underdog group. Winner_diff is negative, meaning lower-rated players are defeating higher-rated players. These games have the longest time_control by far, with games averaging more than 30 minutes each.

In [None]:
# Summary statistics for group 3, one row per numeric feature.
cluster3.describe().T

Group 3 games also have high ECO averages. Reviewing the counts, many of these games are French Defense (C00) and unusual openings (A00).

In [None]:
# Five most frequent ECO codes in group 3, most common first.
cluster3['opening_eco'].value_counts().head(5)

In [None]:
# Summary statistics for group 4, one row per numeric feature.
cluster4.describe().T

Group 4 is the most similar to the general population across the board.

In [None]:
# Summary statistics for group 5, one row per numeric feature.
cluster5.describe().T

Group 5 has high-rated players similar to group 0. However, the number of turns per game is larger, and winner_diff is lower. More games are drawn, as draws appear at a higher frequency as rating increases.

That's all, thanks for reading! I think it could be interesting to repeat this analysis for just the rated games, which is a feature I did not bring into account. Another cool experiment could be to cluster very high-rated games (2400+) to try and find opening patterns among players that are serious about opening theory.