In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<br><h1 style="font-family:COMIC SANS;text-align:center;background-color:lightblue" ><b>Table of Contents</b></h1><br>

We will try to answer the following questions:

1. Preprocess Year column to make it integer (remove “Jan”)
2. Who had the highest ELO? In which year?
3. What is the top 20 average ELO? What is the time trend for the average ELO?
4. Are the players getting stronger, weaker or there is no significant difference?
5. What is the minimum ELO of a player who ever appeared in top 20? Who is this?
6. What is the top 20 average age? What is the time trend for the average age?
7. Are the players getting older, younger or there is no significant difference?
8. Who appeared most times in the top 20?
9. What is the average age of peak performance of top 10 players?
10. The probability of winning for player A is P(A) = 1/(1+10^m) where m is the rating difference (rating(B)-rating(A)) divided by 400. If a chess engine has ELO 3100, what is the chance of a win for “best ever” Magnus Carlsen? Currently, the Stockfish engine has ELO 3512; what is the chance of a win for Magnus Carlsen? What is the chance of drawing?
11. Create a model which uses age as a predictor of ELO. Try to predict Magnus Carlsen’s next year rating!

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in the cells below.
plt.style.use('ggplot')


## Import the data

In [None]:
# Read the ratings CSV from the Kaggle input directory and preview its first rows.
CHESS_CSV_PATH = '/kaggle/input/top-20-chess-ratings-20002021/Chess.csv'
df = pd.read_csv(CHESS_CSV_PATH)
df.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Pairwise correlation between the numeric columns only.
# At this point the frame still holds text columns (`Name`, and `Date`, which is
# only converted to an integer in a later cell). pandas >= 2.0 raises on such
# columns instead of silently dropping them, so select the numeric dtypes
# explicitly; this form also works on older pandas versions.
df.select_dtypes(include='number').corr()

## Preprocess Year column to make it integer (remove “Jan”)

In [None]:
# Strip the " Jan" text from every Date entry so the column can be stored as an integer year.
year_text = df['Date'].replace(regex=' Jan', value='')
df['Date'] = year_text.astype(int)
df.info()

## Who had the highest ELO? In which year?

In [None]:
# Row with the single highest ELO in the data (the first such row if there is a tie).
df.nlargest(n=1, columns='ELO', keep='first')

## What is the top 20 average ELO?

In [None]:
# The dataset (per its name and the questions above) lists the top-20 players
# for each year, so the top-20 average ELO is the mean over all rows.
# `nlargest(20, 'ELO')` instead averaged only the 20 highest individual ratings
# in the whole file, regardless of year, which overstates the figure.
average_20_elo = df['ELO'].mean()
print('The average ELO of the top 20 position is: %s'%average_20_elo)

##  What is the time trend for the average ELO?

In [None]:
# Mean ELO of all listed players per year, drawn as a line over time.
yearly_mean_elo = df.groupby('Date')['ELO'].mean()
plt.figure(figsize=(18, 6))
plt.plot(yearly_mean_elo.index, yearly_mean_elo.values)
plt.xlabel('Date')
plt.ylabel('Elo')
plt.title('Time Trend For Average ELO');

## Are the players getting stronger, weaker or there is no significant difference?

In [None]:
# Singular player trend over the years
name_list = list(df['Name'].unique())
def trend_play(name):
    """Plot one player's ELO against the year on a new 18x6 figure.

    Parameters
    ----------
    name : str
        Player name exactly as it appears in the ``Name`` column of ``df``.

    Returns
    -------
    tuple
        ``(lines, xlabel_text, title_text)`` as returned by ``plt.plot``,
        ``plt.xlabel`` and ``plt.title``.

    Raises
    ------
    ValueError
        If ``name`` does not occur in ``df['Name']``.
    """
    # Select the player's rows directly instead of scanning every unique name.
    df_name = df.loc[df['Name'] == name, ['Name', 'Date', 'ELO']].sort_values('Date')
    if df_name.empty:
        # An unknown name used to leave `df_name` unassigned and fail with an
        # UnboundLocalError after an empty figure had already been opened.
        raise ValueError(f'No rows found for player {name!r}')
    plt.figure(figsize=(18, 6))
    lines = plt.plot(df_name['Date'], df_name['ELO'])
    xlabel_text = plt.xlabel('Years')
    plt.ylabel('ELO')
    title_text = plt.title(f'{name} trend')
    return lines, xlabel_text, title_text

# Draw the trend for the first ten distinct players. The return value is not
# printed: it only contains matplotlib artist objects whose reprs are noise.
for i in name_list[:10]:
    trend_play(i)


In [None]:
# Analysing the trend of all the players: one bar per year showing the mean ELO.
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# kept as-is here so the cell behaves the same on the installed version.
fig, ax = plt.subplots(figsize=(18, 10))
sns.barplot(data=df, x='Date', y='ELO', ci=False, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Average ELO trend over years')
plt.show()

We can see that there is a slight positive trend over the years.

## What is the minimum ELO of a player who ever appeared in top 20? Who is he?

In [None]:
# Lowest-rated entry among everyone listed in the data.
# `DataFrame.min()` works column by column, so the name, year, ELO and age it
# reported came from different rows, and `nlargest(20, 'ELO')` restricted the
# search to the 20 highest ratings. `nsmallest` returns the complete row of the
# entry with the minimum ELO, which answers both "what" and "who".
df.nsmallest(1, 'ELO')

## What is the top 20 average age?

In [None]:
# Mean age over every listed top-20 entry. `nlargest(20, 'ELO')` only looked at
# the ages attached to the 20 highest ratings in the whole file, which is not
# the average age of the top-20 players.
top_20_average_age = df['Age'].mean()
print(f'Average age of top 20 ELO players: {top_20_average_age:.2f}')

## What is the time trend for the average age?

In [None]:
# Mean age of the listed players for each year, shown as a line chart.
mean_age = df.groupby('Date')['Age'].mean()
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(mean_age)
ax.set_xlabel('Date')
ax.set_ylabel('Age')
ax.set_title('Average age time trend')

## Are the players getting older, younger or there is no significant difference?

In [None]:
# One bar per year showing the mean age of the listed players.
fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(data=df, x='Date', y='Age', ci=False, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Age distribution over the years')

## Who appeared most times in the top 20?

In [None]:
# Number of rows (yearly top-20 listings) per player, most frequent first.
# Counting inside `nlargest(20, 'ELO')` ranked players by how many of the 20
# highest ratings they hold, not by how often they appear in the top 20.
df['Name'].value_counts().head(10)

## What is the average age of peak performance of top 10 players?

In [None]:
# Peak performance = the row in which each player reached their own highest ELO.
# `nlargest(10, 'ELO')` on the raw frame picked the ten highest rows, which can
# be the same player in several years, so it did not describe ten players.
peak_rows = df.loc[df.groupby('Name')['ELO'].idxmax()]
# Keep the ten players whose peak rating is highest and average their peak ages.
top_10_peaks = peak_rows.nlargest(10, 'ELO')
average_age_top_10 = top_10_peaks['Age'].mean()
print(f'The average age of top 10 players is: {average_age_top_10:.2f}')

In [None]:
# Age at which each of the ten highest-peaking players reached their best ELO.
# Grouping `nlargest(10, 'ELO')` by name averaged the ages of whichever rows
# made the ten highest ratings, so a player with several such rows was counted
# repeatedly and fewer than ten players could appear.
(
    df.loc[df.groupby('Name')['ELO'].idxmax()]
    .nlargest(10, 'ELO')
    .set_index('Name')['Age']
)

## The probability of winning for player A is P(A) = 1/(1+10^m) where m is the rating difference (rating(B)-rating(A)) divided by 400. If a chess engine has ELO 3100, what is the chance of a win for "best ever" Magnus Carlsen? Currently the Stockfish engine has ELO 3512; what is the chance of a win for Magnus Carlsen? What is the chance of drawing?

In [None]:
# Highest ELO recorded for Carlsen anywhere in the data.
best_ever_carlsen = df.loc[df['Name'] == 'Carlsen', 'ELO'].max()
def prob_win(name_1, name_2, rating_b, ratings=None):
    """Describe the Elo win probability of a player's best-ever rating against an opponent.

    The probability is ``1 / (1 + 10**m)`` with
    ``m = (rating_b - rating_a) / 400``, where ``rating_a`` is the highest
    ELO recorded for ``name_1``.

    Parameters
    ----------
    name_1 : str
        Player to look up in the ``Name`` column.
    name_2 : str
        Opponent label; only used in the returned text.
    rating_b : float
        Rating of the opponent.
    ratings : pandas.DataFrame, optional
        Frame with ``Name`` and ``ELO`` columns to look the player up in.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    str
        Sentence giving the win probability as a percentage rounded to two
        decimals.

    Raises
    ------
    ValueError
        If no ELO value is found for ``name_1`` (previously this produced a
        sentence containing "nan%").
    """
    source = df if ratings is None else ratings
    rating_a = source.loc[source['Name'] == name_1, 'ELO'].max()
    if pd.isna(rating_a):
        raise ValueError(f'No ELO rating found for player {name_1!r}')
    m = (rating_b - rating_a) / 400
    prob = round(float(1 / (1 + 10**m) * 100), 2)
    # The earlier text reported `prob / 2` as the draw probability. That number
    # has no basis in the formula, and win + draw exceeded 100% whenever the
    # win probability was above two thirds, so no draw figure is stated.
    return (
        f'The probability of winning for "best ever" {name_1} against {name_2} is {prob}%.'
        ' The probability of draw cannot be determined from the ELO formula alone.'
    )

# Report Carlsen's chances against a 3100-rated engine and against Stockfish (3512).
for engine_name, engine_elo in (('Undetermined Engine', 3100), ('Stockfish', 3512)):
    print(prob_win('Carlsen', engine_name, engine_elo))

## Create a model which uses age as a predictor of ELO. Try to predict Magnus Carlsen's next year rating!

First, we are going to find the outliers in the age column. To decide which method of finding outliers we should use, we must plot the histogram of the variable and look at its distribution.

In [None]:
# Rebind `df` to a copy of itself before the cells below add new columns to it.
# NOTE(review): the same name is reused, so this cell keeps no separate
# reference to the frame as it was before the copy.
df = df.copy()

In [None]:
# Distribution of the Age column in 50 bins.
df['Age'].plot(kind='hist', bins=50, title='Histogram of the Age variable')

In [None]:
# Distribution of the ELO column in 50 bins.
df['ELO'].plot(kind='hist', bins=50, title='Histogram of the ELO variable')

### Z-score

It looks a little bit like a Gaussian distribution, so we will use the z-score. The z-score is the difference between the value and the sample mean expressed as the number of standard deviations. To check what percentage of values covers a range of z-score values we should take a look at a [z-table](http://www.z-table.com/).

In [None]:
from scipy.stats import zscore
# Standardise Age, then flag rows lying 2.5 or more standard deviations from the mean.
df["Age_zscore"] = zscore(df["Age"])
df["is_outlier"] = df["Age_zscore"].abs().ge(2.5)
df.loc[df["is_outlier"]]

In [None]:
# Standardise ELO, then flag rows lying 2.5 or more standard deviations from the mean.
df["ELO_zscore"] = zscore(df["ELO"])
df["elo_is_outlier"] = df["ELO_zscore"].abs().ge(2.5)
df.loc[df["elo_is_outlier"]]

We can now proceed to the prediction

In [None]:
# Carlsen's (Age, ELO) pairs ordered from youngest to oldest, plus the same
# values as 2-D single-column numpy arrays.
carlsen_mask = df['Name'] == 'Carlsen'
data = (
    df[['Age', 'ELO']]
    .where(carlsen_mask)
    .dropna()
    .sort_values('Age', ascending=True)
)
X_age = data[['Age']].to_numpy()
Y_elo = data[['ELO']].to_numpy()
display(data, X_age.shape, Y_elo.shape)

In [None]:
# Hold out the last four rows of the predictor and target arrays for testing;
# everything before them is used for training.
N_TEST = 4
X_age_train, X_age_test = X_age[:-N_TEST], X_age[-N_TEST:]
Y_elo_train, Y_elo_test = Y_elo[:-N_TEST], Y_elo[-N_TEST:]

# Linear regression estimator, not fitted yet
regr = linear_model.LinearRegression()

In [None]:
# Train the model using the training set
# (Age is the single predictor, ELO the target; the held-out rows are not seen here).
regr.fit(X_age_train, Y_elo_train)

In [None]:
# Make predictions using the testing set (used for the error metrics below)
Y_elo_pred = regr.predict(X_age_test)

# Forecast for next year. `Y_elo_pred[0]` is the fitted value for the first
# held-out age, i.e. an age already present in the data, so it is not a
# next-year prediction (and it is a 1-element array rather than a scalar).
# Refit on every observation and evaluate one year past the largest age seen.
# Assumes one row per year, so that "next year" means max age + 1.
next_age = X_age.max() + 1
next_year_model = linear_model.LinearRegression().fit(X_age, Y_elo)
next_year_elo = next_year_model.predict([[next_age]]).item()
print('Prediction for next year Carlsen\'s ELO is: %.02f'%next_year_elo)
# The coefficients (of the model evaluated on the hold-out rows)
print('Coefficients:', regr.coef_)
# The mean squared error
print('Mean squared error : %.2f'%mean_squared_error(Y_elo_test, Y_elo_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'%r2_score(Y_elo_test, Y_elo_pred))

In [None]:
# Plot outputs: held-out observations as black points, the model's predictions as a blue line.
fig, ax = plt.subplots(figsize=(18, 6))
ax.scatter(X_age_test, Y_elo_test, color='black')
ax.plot(X_age_test, Y_elo_pred, color='blue', linewidth=3)
ax.set_xlabel('Age')
ax.set_ylabel('ELO')
ax.set_title('Linear regression study Age/ELO - Magnus Carlsen')

plt.show()

Thank you!!