In [None]:
# import utility modules
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy import stats

In [None]:
# Load the Niemann games dataset; the first CSV column is used as the index.
games_csv = '../data/10480_games_with_centipawn_metrics.csv'
df = pd.read_csv(games_csv, index_col=0)

Minor data cleaning

In [None]:
# Remove every row whose 'White Name' is missing (dropped by index label).
rows_without_white = df.index[df['White Name'].isna()]
df = df.drop(index=rows_without_white)

# Uncomment to inspect the remaining missing values per column:
# df.isna().sum()

In [None]:
# Discard the columns treated as unnecessary for this analysis.
unused_columns = ['PGN', 'White Expected Rating by ACPL', 'Black Expected Rating by ACPL']
df = df.drop(unused_columns, axis=1)
# df.head()

Data Analysis

In [None]:
# First look: summary statistics for the numerical columns of df.
numeric_summary = df.describe()
numeric_summary

In [None]:
# Line plot of Black ELO (y) against White ELO (x).
sns.lineplot(x="White ELO", y="Black ELO", data=df)

In [None]:
# Number of moves against rating, one line per colour on a shared axis.
# Each line gets a label so the two can be told apart in the legend, and the
# x-axis gets a neutral name because it carries both White and Black ratings
# (otherwise it would only show the column name of the last call).
ax = sns.lineplot(data=df, x="White ELO", y="Moves", label="White")
sns.lineplot(data=df, x="Black ELO", y="Moves", alpha=0.4, label="Black", ax=ax)
ax.set(xlabel="ELO", ylabel="Moves")
ax.legend(title="Player");

In [None]:
# Rating distributions of both colours overlaid on one axis.
# Labels plus a legend identify which histogram is which; the x-axis is named
# "ELO" because it carries both the White and the Black ratings.
ax = sns.histplot(df['White ELO'], label='White')
sns.histplot(df['Black ELO'], alpha=0.4, label='Black', ax=ax)
ax.set(xlabel='ELO')
ax.legend(title='Player');

In [None]:
# Keep only the games in which BOTH players are rated above the threshold.
# The copy is taken after filtering so that df_23 is an independent frame:
# columns are added to it further down, and assigning into a filtered slice
# can raise pandas' SettingWithCopyWarning.
MIN_ELO = 2300
both_above_min = (df['White ELO'] > MIN_ELO) & (df['Black ELO'] > MIN_ELO)
df_23 = df[both_above_min].copy()

In [None]:
# Interquartile range of White ELO in df_23: 75th percentile minus 25th percentile.
q1, q3 = np.percentile(df_23['White ELO'], [25, 75])
iqr = q3 - q1
iqr

In [None]:
# Freedman-Diaconis style bin width: 2 * IQR / n^(1/3).
# The bin count is the White ELO range divided by that width, truncated to int.
white_elo = df_23['White ELO']
bin_width = 2 * iqr / len(white_elo) ** (1/3)
bin_number = (white_elo.max() - white_elo.min()) / bin_width
print (bin_number, bin_width)
# plt.hist draws the histogram and returns (counts, bin edges, patches);
# the bin edges are available afterwards as bins[1].
bins = plt.hist(white_elo, bins=int(bin_number))

In [None]:
def _bin_midpoint(ratings, edges):
    """Return, for each rating, the midpoint of the bin (given by `edges`) it falls into.

    The result is a float Series on the index of `ratings`. Ratings outside
    [edges[0], edges[-1]] come back as NaN.
    """
    edges = np.asarray(edges, dtype=float)
    midpoints = (edges[:-1] + edges[1:]) / 2
    # Passing the midpoints as labels avoids mapping a lambda over Interval
    # categories; the cast turns the categorical result into plain floats.
    return pd.cut(x=ratings, bins=edges, include_lowest=True, labels=midpoints).astype(float)


# Bin the ratings of the filtered frame itself (df_23) rather than relying on
# index alignment against the unfiltered df. The edges in bins[1] were derived
# from White ELO, so a Black ELO outside that range yields NaN in Bin_Black.
df_23['Bin_White'] = _bin_midpoint(df_23['White ELO'], bins[1])
df_23['Bin_Black'] = _bin_midpoint(df_23['Black ELO'], bins[1])

In [None]:
# Regression of 'White Av CP Loss' on the White rating bin; x_estimator=np.mean
# collapses the observations of each bin to their mean. y-axis limited to 0-60.
g = sns.lmplot(
    data=df_23,
    x='Bin_White',
    y='White Av CP Loss',
    x_estimator=np.mean,
    scatter_kws={'s': 20},
)
g.set(ylim=(0, 60))

In [None]:
# Regression of 'Black Av CP Loss' on the Black rating bin; x_estimator=np.mean
# collapses the observations of each bin to their mean. y-axis limited to 0-60.
g = sns.lmplot(
    data=df_23,
    x='Bin_Black',
    y='Black Av CP Loss',
    x_estimator=np.mean,
    scatter_kws={'s': 20},
)
g.set(ylim=(0, 60))

Calculate the linear correlation (Pearson's r) between binned ELO and ACPL

In [None]:
def _pearson_complete_cases(x, y):
    """Return Pearson's (r, p-value) for x and y, using only rows where both are present.

    Both inputs are cast to float first, so a categorical column with numeric
    categories is accepted as well. Rows with a NaN on either side are dropped
    before the correlation is computed; otherwise a single NaN would propagate
    into the result.
    """
    pairs = pd.DataFrame({'x': x.astype(float), 'y': y.astype(float)}).dropna()
    return stats.pearsonr(pairs['x'], pairs['y'])


# r1/p1: Black rating bin vs. Black ACPL; r2/p2: White rating bin vs. White ACPL.
r1, p1 = _pearson_complete_cases(df_23['Bin_Black'], df_23['Black Av CP Loss'])
r2, p2 = _pearson_complete_cases(df_23['Bin_White'], df_23['White Av CP Loss'])
r1, r2, p1, p2

In [None]:
# Signed rating gap: positive when White is the higher-rated player.
df['ELO_diff'] = df['White ELO'] - df['Black ELO']

# Split each game's two ACPL values into "higher rated" and "lower rated"
# without a Python-level row loop. Ties (gap == 0) count White as the higher
# rated side. Rows whose gap is NaN get NaN on both sides instead of being
# skipped, so both lists always hold exactly one entry per row of df and can
# be assigned back as columns.
white_is_higher = df['ELO_diff'] >= 0
gap_known = df['ELO_diff'].notna()
higher_acpl = df['White Av CP Loss'].where(white_is_higher, df['Black Av CP Loss']).where(gap_known)
lower_acpl = df['Black Av CP Loss'].where(white_is_higher, df['White Av CP Loss']).where(gap_known)

acpl_higher = higher_acpl.tolist()
acpl_lower = lower_acpl.tolist()

# Series.mean() skips NaN, so a missing value does not turn the summary into NaN.
print(f'The average CP loss of the higher rated player was {np.round(higher_acpl.mean(),2)}, while the lower rated player had a CP loss of {np.round(lower_acpl.mean(),2)}')

In [None]:
# Attach the per-game ACPL of the lower- and higher-rated player to df.
for column_name, column_values in (('ACPL Lower', acpl_lower), ('ACPL Higher', acpl_higher)):
    df[column_name] = column_values

In [None]:
# 'ACPL Higher' against the signed rating gap; x limited to -500..500, y to 0..100.
g = sns.lineplot(x='ELO_diff', y='ACPL Higher', data=df)
g.set(xlim=(-500, 500), ylim=(0, 100))
g.set_title('Higher Rated Player')

In [None]:
# 'ACPL Lower' against the signed rating gap; x limited to -500..500, y to 0..100.
g = sns.lineplot(x='ELO_diff', y='ACPL Lower', data=df)
g.set(xlim=(-500, 500), ylim=(0, 100))
g.set_title('Lower Rated Player')