In [None]:
# HTML/display live in IPython.display; importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Notebook-wide stylesheet: base reset, heading font, and the coloured
# ".alert" boxes (simple / success / danger / warning) used in markdown cells.
styles = '''@import url('https://fonts.googleapis.com/css?family=Quicksand&display=swap');
 * {
	 margin: 0;
	 padding: 0;
	 box-sizing: border-box;
}
 h3 {
	 font-family: Comic Sans MS;
}
 .alert {
	 width: 80%;
	 margin: 20px auto;
	 padding: 30px;
	 position: relative;
	 border-radius: 5px;
	 box-shadow: 0 0 15px 5px #ccc;
}
 .close {
	 position: absolute;
	 width: 30px;
	 height: 30px;
	 opacity: 0.5;
	 border-width: 1px;
	 border-style: solid;
	 border-radius: 50%;
	 right: 15px;
	 top: 25px;
	 text-align: center;
	 font-size: 1.6em;
	 cursor: pointer;
}
 .simple-alert {
	 background-color: #aed6e5;
	 border-left: 5px solid #245b70;
}
 .simple-alert .close {
	 border-color: #245b70;
	 color: #245b70;
}
 .success-alert {
	 background-color: #aee5c0;
	 border-left: 5px solid #24703d;
}
 .success-alert .close {
	 border-color: #24703d;
	 color: #24703d;
}
 .danger-alert {
	 background-color: #e5aeae;
	 border-left: 5px solid #702424;
}
 .danger-alert .close {
	 border-color: #702424;
	 color: #702424;
}
 .warning-alert {
	 background-color: #ffe6a9;
	 border-left: 5px solid #a97800;
}
 .warning-alert .close {
	 border-color: #a97800;
	 color: #a97800;
}
'''
# display() injects the stylesheet even when this statement is not the last
# expression of its cell (a bare HTML(...) would be silently discarded then).
display(HTML("<style>"+styles+"</style>"))

!pip install rich
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from rich.console import Console
from rich.theme import Theme
import matplotlib.gridspec as gridspec
import matplotlib_venn as vplt

# Use seaborn's "notebook" plotting context with fonts scaled by 1.5 and a line width of 2.5.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

def custom_palette(custom_colors):
    """Make ``custom_colors`` the active seaborn palette and draw a preview strip.

    Args:
        custom_colors: Any color specification accepted by
            ``sns.color_palette`` (the notebook passes a list of hex strings).

    Returns:
        None. The function is called for its side effects only: it changes
        the active seaborn palette and draws the swatches with ``sns.palplot``.
    """
    colors = sns.color_palette(custom_colors)
    # set_palette returns None, so there is nothing worth keeping from the call.
    sns.set_palette(colors)
    sns.palplot(colors, size=0.8)
    # Hide tick labels and tick marks on the swatch figure.
    plt.tick_params(axis='both', labelsize=0, length=0)

# Five hex colors used for the individual plots throughout the notebook.
palette = ["#7209B7","#3F88C5","#136F63","#F72585","#FFBA08"]
# 20-step diverging palette between hues 120 and 220.
palette2 = sns.diverging_palette(120, 220, n=20)
# Activate `palette` as the seaborn default and draw its preview strip.
custom_palette(palette)

# Named text styles for rich console output.
custom_theme = Theme({
    "info" : "italic bold cyan",
    "warning": "italic bold magenta",
    "danger": "bold blue"
})

console = Console(theme=custom_theme)
import warnings
# Suppress every Python warning from here on.
# NOTE(review): this also hides deprecation warnings raised by pandas/seaborn calls below.
warnings.filterwarnings('ignore')

![](https://kgcorner.com/wp-content/uploads/2021/05/fq4cqqdmz4jv9agitg72.jpeg)
# Exploratory Data Analysis

In [None]:
# Read df_clean.csv from the notebook's input directory and preview the first two rows.
df = pd.read_csv('../input/those-features-won-t-engineer-themselves/df_clean.csv')
df.head(2)

In [None]:
# Collect every distinct username that appears on either side of the board.
white_players = set(df['white_username'].unique())
black_players = set(df['black_username'])
no_of_unique_users = white_players.union(black_players)
print(f'No of unique users in the dataset: {len(no_of_unique_users)}')

In [None]:
# Density of the rating gap between the two players.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df['rating_difference'], color=palette[0], fill=True)
ax = plt.gca()
ax.set_title('Rating_Difference', font="Serif")
# The x axis is hidden, so only the shape of the curve is shown.
ax.xaxis.set_visible(False)

The rating_difference is almost normally distributed. This makes sense: in regular games the site sets up the match for you, and you are usually paired with players close to your own rating. In tournaments, however, this is not the case; pairing is based not on rating but on your position in the leaderboard.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
 
fig = plt.figure(figsize=(15, 10))

# 3x8 grid: two large density panels on top, a small count plot bottom-right.
gs = gridspec.GridSpec(3, 8)
gs.update(wspace=1.5, hspace=0.3)

# Split the games on the tournament flag.
df_tournament = df[df['Is_tournament'] == True]
df_tournanot = df[df['Is_tournament'] == False]

ax0 = plt.subplot(gs[0:2, 0:3])
ax1 = plt.subplot(gs[0:2, 3:6], sharey=ax0)
# ax1 shares its y axis with ax0, so its tick labels would only repeat ax0's.
plt.setp(ax1.get_yticklabels(), visible=False)
ax3 = plt.subplot(gs[2, 6:])

plt.suptitle('Rating_Difference')

# (axes, games, title, density color, quantile-line color) for each density panel.
density_panels = [
    (ax0, df_tournanot, "Not Tournament", palette[4], palette[0]),
    (ax1, df_tournament, "Tournament", palette[3], palette[1]),
]
for axis, games, title, fill_color, line_color in density_panels:
    # `fill=True` is the supported spelling of the deprecated `shade=True`.
    sns.kdeplot(games['rating_difference'], color=fill_color, fill=True, ax=axis)
    axis.set_title(title, font="Serif")
    # Dotted vertical lines at the 5% and 95% quantiles of the rating gap.
    for quantile in (0.05, 0.95):
        axis.axvline(games['rating_difference'].quantile(quantile),
                     color=line_color, linestyle=':', linewidth=2)

# Pass the vector by keyword: newer seaborn treats the first positional argument as `data`.
sns.countplot(x=df['Is_tournament'], ax=ax3)
plt.show()

The dotted lines you see in the plots are the 0.95 and 0.05 quantiles.<br>
This supports our hypothesis above: tournament games more often pair players with a large imbalance in rating.

![](https://www.researchgate.net/publication/344404859/figure/fig1/AS:963465715937314@1606719535864/Heavy-fat-and-long-versus-thin-and-short-tails-The-value-of-the-random-variable.png)

In [None]:
# Number of games per time control. The vector is passed by keyword because
# newer seaborn versions treat the first positional argument as `data`.
sns.countplot(x=df['time_class'])

**bullet:-** here players have 1 or 3 minutes in their clock, and so they have to play really fast, it is the shortest time format.<br>
**blitz:-** here players have 3 to 10 minutes in their clock, and so they have to play fast, they still get time to think, but not enough to think long sequences.<br>
**rapid:-** here players have anywhere from 10 minutes to unlimited time on their clock, so they can play really carefully and think many steps ahead (Sherlock style!!!).<br>
**daily:-** in this type of game, play stretches over days, with each player getting 1 day or more to make a move. A player can analyze many moves ahead and play really carefully.

In case of time controls with more time, like daily and rapid, people have more time to use chess engines i.e. to cheat. So number of upsets must be more in rapid and daily time controls.<br>
Although with more time, player with the higher rating should be able to make less mistake, and with his superior play win more.<br>
The higher rated player would in shorter time formats like bullet and blitz make more mistakes, and lose more to lower rated player due to silly mistakes.<br>

To check this, we first need to define an upset.<br>
According to chess statistics, if the rating difference is 400 points, the higher rated player has about a 90% chance of winning the game.<br>
So let's say that when the lower rated player wins despite a rating difference of more than 400 points, it is an upset.

In [None]:
# Label the lower-rated side of every game.
# NOTE(review): 'White' is chosen when rating_difference > 0, which presumes
# rating_difference = black_rating - white_rating — confirm against how the column was built.
df['lower_rated'] = np.where(df['rating_difference'] > 0, 'White', 'Black')

# An upset: the rating gap exceeds 400 points and the lower-rated side is the recorded Result.
is_upset = (df['rating_difference'].abs() > 400) & (df['Result'] == df['lower_rated'])
upsets_time_class = dict(df.loc[is_upset, 'time_class'].value_counts())
whole_time_class = dict(df['time_class'].value_counts())

In [None]:
# Share of all games in each time class that ended in an upset.
for time_class in ('bullet', 'blitz', 'rapid', 'daily'):
    # A time class without a single upset is absent from upsets_time_class; count it as zero
    # instead of failing with a KeyError.
    upset_count = upsets_time_class.get(time_class, 0)
    print(f"Percentage of upsets in {time_class}: {(upset_count/whole_time_class[time_class])*100:.3f}")

Does the increase in the number of upsets in daily mean something?<br>
Does it indicate that people tend to cheat in daily, or does it mean that lower rated players, when given a lot of time to play a move, tend to perform better?<br>
Upsets, and particularly games where a player cheated can be bad for our model.

In [None]:
# Both ingredients of the upset mask are independent of the threshold, so compute them once.
abs_rating_gap = df['rating_difference'].abs()
lower_rated_won = df['Result'] == df['lower_rated']

# Number of games won by the lower-rated side with a rating gap above each threshold.
upset_thresholds = (400, 300, 200, 100, 50, 25)
upset_counts = {
    threshold: int((lower_rated_won & (abs_rating_gap > threshold)).sum())
    for threshold in upset_thresholds
}

# Percentages are relative to all games in the dataset.
for threshold in upset_thresholds:
    print(f"Percentage of {threshold} rating difference upsets: {(upset_counts[threshold]/len(df))*100:.3f}")

In [None]:
# Side-by-side rating densities for White and Black on shared axes.
fig, ax = plt.subplots(1, 2, figsize=(20, 10), sharex='all', sharey='all')
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df['white_rating'], color=palette[4], fill=True, ax=ax[0])
sns.kdeplot(df['black_rating'], color=palette[3], fill=True, ax=ax[1])

White and Black ratings have identical distribution. The graph is skewed to the left.

In [None]:
# Number of rated vs. unrated games. The vector is passed by keyword because
# newer seaborn versions treat the first positional argument as `data`.
sns.countplot(x=df['rated'])

In [None]:
# Horizontal bars: number of games per result_type.
sns.countplot(data=df, y='result_type', palette='tab10')

Most of the chess.com users like to play rated.<br>
Because of this, finding an unrated game takes longer, which in turn pushes most players to play only rated games.<br>
When the games are unrated, people tend to resign in losing positions more often, as they won't lose any ELO points.<br>

In [None]:
# Proportion of each result_type within every value of `rated`.
temp = df[['result_type','rated']].copy()
temp = pd.DataFrame(temp.groupby('rated')['result_type'].value_counts(normalize=True))
fig, ax = plt.subplots(1,2,figsize=(20,10),sharex='all',sharey='all')
# temp is indexed by (rated, result_type). Transposing, selecting one `rated` key and
# transposing back leaves a single column of proportions indexed by result_type.
# NOTE(review): keys 1 and 0 presumably select rated == True / False — confirm the dtype of `rated`.
sns.heatmap(temp.T[1].T,ax=ax[0],cmap='BuPu',fmt='.2g',annot=True)
sns.heatmap(temp.T[0].T,ax=ax[1],cmap='BuPu',fmt='.2g',annot=True)
ax[0].set_title('Rated',font="Serif")
ax[0].xaxis.grid(False)
# Each heatmap has a single column, so its x axis is hidden.
ax[0].xaxis.set_visible(False)
ax[1].set_title('Unrated',font="Serif")
ax[1].xaxis.set_visible(False)
# Hide the y axis of the second panel; the panels were created with sharey='all'.
ax[1].yaxis.set_visible(False)

As we thought, When we move from Rated games to Unrated, we see an increase in the percentage of resignations.

In [None]:
import ast
import math
# Moves arrives as text holding a Python list literal; turn it back into a real list.
# Values that are no longer strings are left alone, so re-running this cell does not
# fail on rows that were already parsed.
df['Moves'] = df['Moves'].apply(
    lambda moves: ast.literal_eval(moves) if isinstance(moves, str) else moves
)
# Every entry is a single ply (one side's move); two plies make one full move.
df['no_of_moves'] = df['Moves'].apply(lambda moves: len(moves) // 2)
sns.histplot(x=df['no_of_moves'], bins=50)
print('median is', df['no_of_moves'].median())

The median number of moves played is 29, with a left skewness.

In [None]:
# Extract the first three plies of every game; None when the game ended before that ply.
for column, ply in (('White_first_move', 0), ('Black_first_move', 1), ('White_second_move', 2)):
    df[column] = df['Moves'].apply(
        lambda moves, ply=ply: moves[ply] if len(moves) > ply else None
    )

fig, ax = plt.subplots(1, 2, figsize=(20, 10))

# Five most frequent first moves for each side, one panel per side.
for axis, column in zip(ax, ('White_first_move', 'Black_first_move')):
    # Plot straight from the value_counts Series: the name of its counts column differs
    # between pandas versions, so it must not be looked up by name.
    top_moves = df[column].value_counts().nlargest(5)
    sns.barplot(y=top_moves.index, x=top_moves, ax=axis)
    axis.set_title(column, font="Serif")
    axis.xaxis.set_visible(False)
ax[0].xaxis.grid(False)

The most common first moves by White are e4 and d4.<br>
e5(Open Game) and c5(Sicilian Defence) is usually played as a response to e4.<br> d5 is usually played against d4.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

# Five most frequent Black replies to each of White's two main first moves.
for axis, white_move in zip(ax, ('e4', 'd4')):
    # Plot straight from the value_counts Series: the name of its counts column differs
    # between pandas versions, so it must not be looked up by name.
    replies = (
        df.loc[df['White_first_move'] == white_move, 'Black_first_move']
        .value_counts()
        .nlargest(5)
    )
    sns.barplot(y=replies.index, x=replies, ax=axis)
    axis.set_title(f'Black_moves_against_{white_move}', font="Serif")
    axis.xaxis.set_visible(False)
ax[0].xaxis.grid(False)

This represents the same as I stated above<br>
* e5(Open Games) and c5(Sicilian Defence) against e4
* d5(Closed Games) and Nf6(Modern Openings like Kings Indian Defence) against d4
<br>
These are the most common choice of black against the move White plays

In [None]:
from collections import Counter

# Count identical move sequences. Lists are unhashable, so each game is counted as a
# tuple; this also avoids relying on how value_counts()/reset_index() name their
# columns, which differs between pandas versions.
sequence_counts = Counter(map(tuple, df['Moves'])).most_common()  # most frequent first

# One row per distinct sequence: the moves (as a list again) and how often they occurred.
temp_df = pd.DataFrame({
    'Moves': [list(sequence) for sequence, _ in sequence_counts],
    'no_of_occurences': [count for _, count in sequence_counts],
})
print('No. of games played',len(df))
print('No. of unique positions reached',len(temp_df))

In [None]:
def is_checkmate(Moves_list):
    """Tell whether a game's final move is written as a checkmate.

    Args:
        Moves_list: Sequence of move strings for one game, in playing order.

    Returns:
        True when the last move ends with '#', otherwise False. An empty
        sequence, or an empty last move string, yields False.
    """
    if len(Moves_list) == 0:
        return False
    # endswith() copes with an empty move string, where indexing [-1] would raise IndexError.
    return Moves_list[-1].endswith('#')
# Flag the move sequences whose final move ends with '#'.
temp_df['Was_checkmate'] = temp_df['Moves'].apply(is_checkmate)

In [None]:
# Print the first ten move sequences flagged as checkmate, one game per line.
checkmate_sequences = temp_df.loc[temp_df['Was_checkmate'], 'Moves'].head(10)
for sequence in checkmate_sequences:
    print('  '.join(sequence))

These are the most common mistakes a player can make that result in a quick checkmate. <br>
As you can see most of these end with Q takes on f7.<br>
![f7 chess](https://www.thechesswebsite.com/wp-content/uploads/2012/07/attackf7-big.jpg)<br>
Every other pawn is defended twice at the starting of the game except the f7 pawn, and so White players usually try to take advantage of this fact and checkmate you on f7.
<div class="alert simple-alert">
So next time you play chess take special care to protect your f7 pawn.
</div>



In [None]:
# Ten most frequent EcoName values; count once and reuse the result for both axes.
top_openings = df.EcoName.value_counts().nlargest(10)
sns.barplot(x=top_openings, y=top_openings.index)

In [None]:
# Readable opening names for the ECO codes mapped in this notebook.
Opening_codes_to_names = {
    'A00': 'Irregular Openings',
    'B01': 'Scandinavian Defence',
    'A40': "Queen's Pawn Game(Atypical Defenses)",
    'D00': "Queen's Pawn Game(1.d4 1.d5)",
    'C00': 'French Defence',
    'B00': "King's Pawn Opening(Atypical Defenses)",
    'C20': "King's Pawn Game(1.e4 1.e5)",
    'B20': 'Sicilian Defence',
    'C41': 'Philidor Defence',
    'B07': 'Pirc Defence',
}

In [None]:
# Translate ECO codes to readable names; codes missing from the mapping become NaN
# and are ignored by value_counts().
# The result is kept in its own Series: assigning to a new attribute such as
# df.Eco_Temp sets a plain Python attribute on the frame, it does not create a column.
eco_names = df.Eco.map(Opening_codes_to_names)
top_named_openings = eco_names.value_counts().nlargest(10)
sns.barplot(x=top_named_openings, y=top_named_openings.index)

Here you can see, 
* Irregular Openings takes first place, which makes sense. As a lot of new players don't know the concept of openings and play unusual moves.
* A lot of people play Scandinavian Defence in our Dataset. I have faced a few but not a lot of them. Instead Sicilian Defence and French Defence are more common in my experience.

In [None]:
# Number of games per Result value. The vector is passed by keyword because
# newer seaborn versions treat the first positional argument as `data`.
sns.countplot(x=df.Result)

Finally the results (our target variable).<br>
**Chess is a balanced game**<br>
There is a lot of symmetry, and so the person who moves first (White) usually has an advantage. That's why a Black player who wants to win usually tries to play an asymmetrical opening (like Sicilian Defence) and tries to create an imbalance in the game to create some winning chances.

In [None]:
# Write the frame, including the columns added in this notebook, to output_EDA.csv without the index.
df.to_csv('output_EDA.csv',index=False)

# Do upvote if you found this to be a good read!! This is still a work in progress
**Regarding the design, a lot of inspiration (especially the color palette) has been taken from Ruchi Bhatia and Andrada Olteanu. So special thanks to them!!**<br>
You can check these notebooks of theirs
* [Commonlit readability prize eda baseline](https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline)
* [G2net searching the sky pytorch effnet w meta](https://www.kaggle.com/adityajha1504/g2net-searching-the-sky-pytorch-effnet-w-meta/edit)