In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn .preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

Hello and welcome to my notebook :) Here I'm going to inspect and visualize the **Top Women Chess Data**. Please note that this notebook will be updated frequently. I hope you will find some interesting content here. If you like this notebook, you can let me know in the comments or by upvoting.
Enjoy!

## Loading Data

In [None]:
# Location of the dataset CSV (relative path inside the notebook's input directory).
path = "../input/top-women-chess-players/top_women_chess_players_aug_2020.csv"

In [None]:
# Load the CSV at `path` into the main DataFrame used throughout the notebook.
data = pd.read_csv(filepath_or_buffer=path)

## Basic Information

In [None]:
# Peek at ten randomly chosen rows.
data.sample(n=10)

In [None]:
# Summary statistics of the columns (pandas describes numeric columns by default).
data.describe()

## Data Analysing

First, let's see how many players are active.

In [None]:
# Count the rows whose Inactive_flag is anything other than "wi".
(data["Inactive_flag"] != "wi").sum()

Let's analyse only active players

In [None]:
# Keep only the rows whose Inactive_flag is not "wi".
# .copy() makes the result an independent frame, so adding columns to it later
# (e.g. "Age") does not raise SettingWithCopyWarning or write into a slice.
data = data.loc[data["Inactive_flag"] != "wi"].copy()

### Year of birth

In [None]:
# Histogram of birth years, 20 bins, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(13, 6))
data["Year_of_birth"].hist(bins=20, ax=ax)

In [None]:
# Reference year used to derive ages. Despite its name, this variable holds a
# year, not an age.
current_age = 2020
data["Age"] = current_age - data["Year_of_birth"]

# Box plot of the derived Age column.
fig, ax = plt.subplots(figsize=(13, 6))
data["Age"].plot.box(ax=ax)

We can spot some outliers in the age distribution.

In [None]:
# Report the median of the Age column.
median_age = data["Age"].median()
print("Median of age: ", median_age)

## Federation 

In [None]:
# Number of players per federation, keeping the 20 largest federations.
# The sort has to happen BEFORE head(20): groupby returns federations in
# alphabetical order, so taking head(20) first would keep the first 20
# federation codes alphabetically rather than the 20 with the most players.
federation_data = (
    data.groupby("Federation")
    .size()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Order federations from most to fewest players so the bar chart reads left to right.
federation_data = federation_data.sort_values(ascending=False)

In [None]:
# Bar chart of player counts per federation.
fig, ax = plt.subplots(figsize=(13, 6))
ax.set_title("Active players by federation")
ax.set_ylabel("Count")
federation_data.plot.bar(ax=ax)

## Gender


Sure there is no need to analyse gender data :)

## Title

In chess we have four main classic titles:
#### Grand Master
The usual way to obtain the title is to achieve the required title norms over 27 or more games and a FIDE rating of 2500 or more.
#### International Master
The usual way to obtain the title is to achieve the required title norms over 27 or more games and a FIDE rating of 2400 or more.
#### FIDE Master
The usual way for a player to qualify for the FIDE Master title is by achieving an Elo rating of 2300 or more.
#### Candidate Master
The usual way for a player to qualify for the Candidate Master title is by achieving an Elo rating of 2200 or more.

In [None]:
# List the distinct values that occur in the Title column.
data["Title"].unique()

In [None]:
# Group players by title; reused below for the pie chart and the per-title rating plot.
title_data = data.groupby(by="Title")

In [None]:
# Pie chart showing how many players hold each title.
fig, ax = plt.subplots(figsize=(9, 9))
title_counts = title_data.size()
title_counts.plot.pie(ax=ax)

## Ratings!


Now we are going to analyse what I think is the most interesting data: the ratings. To begin, let's look at the standard rating!

In [None]:
# Distribution of standard ratings.
fig, ax = plt.subplots(figsize=(13, 6))
ax.set_title("Standard rating hist")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
data["Standard_Rating"].hist(ax=ax)

In [None]:
# Distribution of blitz ratings.
fig, ax = plt.subplots(figsize=(13, 6))
ax.set_title("Blitz rating hist")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
data["Blitz_rating"].hist(ax=ax)

In [None]:
# Distribution of rapid ratings.
fig, ax = plt.subplots(figsize=(13, 6))
ax.set_title("Rapid rating hist")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
data["Rapid_rating"].hist(ax=ax)

In [None]:
# Mean standard rating per federation; keep the 20 federations with the highest mean.
srt_by_federation = (
    data.groupby("Federation")["Standard_Rating"]
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the mean standard rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
srt_by_federation.plot.bar(ax=ax)
ax.set_title("Standard rating mean")
ax.set_ylabel("Rating")
# Zoom the y-axis so differences between federations are visible. Only the
# y-limits are set: the previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(2000, 2200);

In [None]:
# Mean blitz rating per federation; keep the 20 federations with the highest mean.
brt_by_federation = (
    data.groupby("Federation")["Blitz_rating"]
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the mean blitz rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
brt_by_federation.plot.bar(ax=ax)
ax.set_title("Blitz rating mean")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(1800, 2200);

In [None]:
# Mean rapid rating per federation; keep the 20 federations with the highest mean.
rrt_by_federation = (
    data.groupby("Federation")["Rapid_rating"]
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the mean rapid rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
rrt_by_federation.plot.bar(ax=ax)
ax.set_title("Rapid rating mean")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(1900, 2200);

In [None]:
# Highest standard rating per federation; keep the 20 federations with the highest maximum.
srt_max_federation = (
    data.groupby("Federation")["Standard_Rating"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the highest standard rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
srt_max_federation.plot.bar(ax=ax)
ax.set_title("Standard rating max value")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(2400, 2700);

In [None]:
# Highest blitz rating per federation; keep the 20 federations with the highest maximum.
brt_max_federation = (
    data.groupby("Federation")["Blitz_rating"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the highest blitz rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
brt_max_federation.plot.bar(ax=ax)
ax.set_title("Blitz rating max value")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(2300, 2700);

In [None]:
# Highest rapid rating per federation; keep the 20 federations with the highest maximum.
rrt_max_federation = (
    data.groupby("Federation")["Rapid_rating"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Bar chart of the highest rapid rating per federation.
fig, ax = plt.subplots(figsize=(13, 6))
rrt_max_federation.plot.bar(ax=ax)
ax.set_title("Rapid rating max value")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(2300, 2700);

In [None]:
# Frame holding only the three rating columns, used for the relation plots below.
rating_data = data.loc[:, ["Standard_Rating", "Blitz_rating", "Rapid_rating"]]

Now let's see how the ratings are related to each other.

In [None]:
# Blitz rating (x) against standard rating (y).
rating_data.plot(
    kind="scatter",
    x="Blitz_rating",
    y="Standard_Rating",
    figsize=(13, 6),
    title="Standard and blitz rating relation",
)

In [None]:
# Rapid rating (x) against standard rating (y).
rating_data.plot(
    kind="scatter",
    x="Rapid_rating",
    y="Standard_Rating",
    figsize=(13, 6),
    title="Standard and rapid rating relation",
)

In [None]:
# Rapid rating (x) against blitz rating (y).
rating_data.plot(
    kind="scatter",
    x="Rapid_rating",
    y="Blitz_rating",
    figsize=(13, 6),
    title="Blitz and rapid rating relation",
)

In [None]:
# 3D scatter of all three ratings: blitz on x, rapid on y, standard on z.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
ax.scatter3D(
    data["Blitz_rating"],
    data["Rapid_rating"],
    data["Standard_Rating"],
    zdir="z",
)
ax.set(xlabel="Blitz Rating", ylabel="Rapid Rating", zlabel="Standard Rating");

In [None]:
# Heatmap of the pairwise correlations between the three rating columns.
cm = sns.light_palette("seagreen", as_cmap=True)
rating_corr = rating_data.corr()
sns.heatmap(rating_corr, cmap=cm)

So, as we can see, the ratings are strongly correlated.

Now let's see what the mean rating is for each title.

In [None]:
# Bar chart of the mean standard rating per title.
fig, ax = plt.subplots(figsize=(13, 6))
title_data.Standard_Rating.mean().plot.bar(ax=ax)
ax.set_title("Title standard rating")
ax.set_ylabel("Rating")
# Zoom the y-axis only. The previous plt.axis([0, 0, ...]) also requested a
# degenerate x-range (xmin == xmax), which matplotlib warns about.
ax.set_ylim(1900, 2500);

Here we can spot that the mean values for each title are lower than the thresholds listed earlier. Interesting; this needs investigation later. If you know why that is, please share it with me in the comments :)

# Title prediction

Finally, let's try to train a model to predict a player's title based on the given data. For simplification, let's predict only 3 classes: Grandmasters, International Masters, and all other classes labeled as Other. As we can expect, not all values are useful. We can certainly keep the ratings and drop the id, names, activity flag and gender. But what about age and nationality? Let's check that!

First, let's drop the columns we definitely don't need.

In [None]:
# Shuffle the rows and rebuild a clean 0..n-1 index.
# A fixed random_state keeps the row order, and therefore the cross-validation
# folds and scores further down, reproducible across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Columns that are not used as model inputs.
unused_columns = ["Fide id", "Name", "Gender", "Inactive_flag", "Year_of_birth"]
data.drop(columns=unused_columns, inplace=True)
data.head()

Now let's take care of the NaN values.

In [None]:
# Show column dtypes and non-null counts to see where values are missing.
data.info()

In [None]:
# Impute missing blitz/rapid ratings with the respective column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data.<column>`: that form is chained in-place
# assignment, which recent pandas versions warn about and which does not
# modify `data` under copy-on-write. In that case the dropna below would
# silently discard every row with a missing blitz or rapid rating.
data["Blitz_rating"] = data["Blitz_rating"].fillna(data["Blitz_rating"].mean())
data["Rapid_rating"] = data["Rapid_rating"].fillna(data["Rapid_rating"].mean())

# Drop the rows that still contain a NaN in any remaining column.
data.dropna(axis=0, inplace=True)

Now that we have a clean dataset, we can reduce the Title values to only three classes.

In [None]:
# Keep "GM" and "IM" as they are and relabel every other title as "Other".
is_gm_or_im = data["Title"].isin(["GM", "IM"])
data["Title"] = data["Title"].where(is_gm_or_im, "Other")

And encode the categorical columns, Title and Federation.

In [None]:
# Title becomes an ordinal code via an explicit mapping (Other < IM < GM).
title_map = {"GM": 2, "IM": 1, "Other": 0}
data["Title"] = data["Title"].map(title_map)

# Federation is replaced by the integer codes produced by LabelEncoder.
lb = LabelEncoder()
data["Federation"] = lb.fit_transform(data["Federation"])

In [None]:
# Pairwise Pearson correlation between the encoded federation, the title and the age.
data.loc[:, ["Federation", "Title", "Age"]].corr(method="pearson")

There are a few more techniques to identify feature importance, but as we can see in the correlation matrix, federation and age do not impact the title. As a result, we can easily drop them from the dataset.

In [None]:
# Remove the two columns that are not used as model inputs.
data.drop(columns=["Federation", "Age"], inplace=True)

It is time to create the training set and separate the labels.

In [None]:
# Target: the encoded title. Features: every remaining column except the title.
y = data["Title"]
X = data.drop(columns="Title")

Some scaling

In [None]:
# Standardise the features with StandardScaler, fitted on the full feature matrix.
sc = StandardScaler()
X = sc.fit(X).transform(X)

And we can start training our model. I chose a simple but really strong model: Logistic Regression. As we don't have a big dataset after cleaning, we use the cross-validation score for scoring.

In [None]:
# Logistic regression with default settings (the variable is named `sgd`, but the
# estimator is LogisticRegression), scored with 5-fold cross-validation.
sgd = LogisticRegression()
cross_val_score(estimator=sgd, X=X, y=y, cv=5)

As we can see, the results seem good, which is what we want.

Thanks for spending some time with me and the top women chess players. Stay tuned for some improvements to this notebook soon :)