In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score , confusion_matrix
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the games table and preview its first five rows.
games_csv = '/kaggle/input/chess/games.csv'
df = pd.read_csv(games_csv)
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
def plot_pie(column, title="All"):
    """Show a pie chart of the value frequencies of one column of the global ``df``.

    column: name of the ``df`` column to count.
    title:  text placed above the chart.

    Each slice is labelled with its value and its share formatted to two decimals.
    """
    counts = df[column].value_counts()
    _, ax = plt.subplots(1, 1)
    ax.pie(counts, labels=counts.index, autopct='%1.2f%%')
    ax.set_title(title)
    plt.show()
    
def plot_hist(column, title="all"):
    """Show a density-normalised histogram of one column of the global ``df``.

    column: name of the ``df`` column to plot; also used as the x-axis label.
    title:  text placed above the chart.
    """
    # Draw on a dedicated figure so the plot never lands on a leftover one.
    fig, ax = plt.subplots()
    ax.hist(df[column], density=True)
    # Label both axes so the figure can be read without the surrounding cell.
    ax.set(title=title, xlabel=column, ylabel='density')
    plt.show()

def plot_bar(column, sort=False, title="all"):
    """Show a bar chart of the value frequencies of one column of the global ``df``.

    column: name of the ``df`` column to count; also used as the x-axis label.
    sort:   when True, order the bars by value (index order) instead of the
            order ``value_counts`` returns.
    title:  text placed above the chart.
    """
    counts = df[column].value_counts()
    if sort:
        counts = counts.sort_index()

    # Draw on a dedicated figure so the plot never lands on a leftover one.
    fig, ax = plt.subplots()
    ax.bar(counts.index, counts)
    # Label both axes so the figure can be read without the surrounding cell.
    ax.set(title=title, xlabel=column, ylabel='count')
    plt.show()

# rated

In [None]:
plot_pie('rated')

In [None]:
df['rated'].value_counts()

# turns

In [None]:
df['turns'].describe()

In [None]:
plot_hist('turns')

# victory_status

In [None]:
plot_bar('victory_status')

In [None]:
plot_pie('victory_status')

# winner

In [None]:
plot_bar('winner')

In [None]:
plot_pie('winner')

# white_id

In [None]:
df['white_id'].value_counts().head(10)

# white_rating

In [None]:
df['white_rating'].describe()

In [None]:
plot_hist('white_rating')

# black_id

In [None]:
df['black_id'].value_counts().head(10)

# black_rating

In [None]:
df['black_rating'].describe()

In [None]:
plot_hist('black_rating')

# opening_eco

In [None]:
df['opening_eco'].value_counts().head(10)

# opening_name

In [None]:
df['opening_name'].value_counts().head(10)

# opening_ply

In [None]:
df['opening_ply'].describe()

In [None]:
plot_hist('opening_ply')

# all ratings (white_rating and black_rating combined)

In [None]:
# Stack the white and black ratings into one column with a fresh 0..n-1 index.
# pd.concat is used because Series.append no longer exists in pandas 2.x.
data = pd.concat(
    [df['white_rating'], df['black_rating']],
    ignore_index=True,
).to_frame(name='rating')
data.head(10)

In [None]:
data['rating'].describe()

I'll divide the ratings into 3 levels: 0 (low), 1 (middle), 2 (high)

low rating : rating < 1394
middle rating : rating >= 1394 and rating < 1788
high rating : rating >= 1788

# white_rating_level

In [None]:
# Bucket each white rating: 0 below 1394, 1 from 1394 up to (not including) 1788,
# 2 for everything else. np.select takes the first condition that holds.
white_rating = df['white_rating']
df['white_rating_level'] = np.select(
    [white_rating < 1394, white_rating < 1788],
    [0, 1],
    default=2,
)
df['white_rating_level'].value_counts()

In [None]:
plot_bar('white_rating_level')

# black_rating_level

In [None]:
# Bucket each black rating: 0 below 1394, 1 from 1394 up to (not including) 1788,
# 2 for everything else. np.select takes the first condition that holds.
black_rating = df['black_rating']
df['black_rating_level'] = np.select(
    [black_rating < 1394, black_rating < 1788],
    [0, 1],
    default=2,
)
df['black_rating_level'].value_counts()

In [None]:
plot_bar('black_rating_level')

# favorite opening

### favorite opening by white player who has rating level 2

In [None]:
# Keep only the games whose white player is at rating level 2.
white_is_level_2 = df['white_rating_level'] == 2
data = df.loc[white_is_level_2]
data.shape

In [None]:
data['opening_eco'].value_counts().head(10)

In [None]:
data['opening_name'].value_counts().head(10)

### favorite opening by black player who has rating level 2

In [None]:
# Keep only the games whose black player is at rating level 2.
black_is_level_2 = df['black_rating_level'] == 2
data = df.loc[black_is_level_2]
data.shape

In [None]:
data['opening_eco'].value_counts().head(10)

In [None]:
data['opening_name'].value_counts().head(10)

### favorite opening when white won and white has rating level 2

In [None]:
# Games won by white in which the white player is at rating level 2.
white_won = df['winner'] == 'white'
white_is_level_2 = df['white_rating_level'] == 2
data = df[white_won & white_is_level_2]

In [None]:
data['opening_eco'].value_counts().head(10)

In [None]:
data['opening_name'].value_counts().head(10)

### favorite opening when black won and black has rating level 2

In [None]:
# Games won by black in which the black player is at rating level 2.
black_won = df['winner'] == 'black'
black_is_level_2 = df['black_rating_level'] == 2
data = df[black_won & black_is_level_2]

In [None]:
data['opening_eco'].value_counts().head(10)

In [None]:
data['opening_name'].value_counts().head(10)

# Highest Rated Player

### White Highest Rated Player

In [None]:
# Highest rating recorded for each player when playing white, then the top five.
data = df.groupby('white_id')[['white_rating']].max()
data.sort_values(by='white_rating', ascending=False).head(5)

### Black Highest Rated Player

In [None]:
# Highest rating recorded for each player when playing black, then the top five.
data = df.groupby('black_id')[['black_rating']].max()
data.sort_values(by='black_rating', ascending=False).head(5)

# Data Preprocessing

In [None]:
# The target is the 'winner' column. The features are whatever is left after
# removing the target and the columns listed here (presumably identifiers,
# timestamps and details only known after a game ends - confirm).
non_feature_columns = [
    'winner', "id", 'rated', 'created_at', 'last_move_at', 'turns',
    'victory_status', 'increment_code', 'white_id', 'black_id', 'moves',
]

# drop() returns a new frame, so the original df is left untouched.
X = df.drop(columns=non_feature_columns)
y = df['winner'].copy()

In [None]:
# Encode the winner labels as integers and keep them as a one-column frame
# named 'winner'. `le` is kept so the codes can be mapped back to labels.
le = LabelEncoder()
encoded_winner = le.fit_transform(y)
y = pd.DataFrame({'winner': encoded_winner})

In [None]:
y

In [None]:
X.columns

In [None]:
# One-hot encode the two opening columns; drop_first removes one indicator
# column per encoded feature.
categorical_columns = ['opening_eco', 'opening_name']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
X.columns

In [None]:
#Split to data train and test
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed. Stratifying on the label keeps the winner-class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234, stratify=y
)

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is left as it is.
sm = SMOTE(random_state=1234)

X_smote, y_smote = sm.fit_resample(X_train, y_train)

# Report X_train (what SMOTE actually received), not the full X, so the
# before/after shapes are comparable.
print(f'''Shape of X before SMOTE: {X_train.shape}
Shape of X after SMOTE: {X_smote.shape}''')

print('\nBalance of positive and negative classes (%):')
y_smote['winner'].value_counts(normalize=True) * 100

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training data only, then
# apply the same transformation to both the training and the test features.
# NOTE: this cell overwrites X_smote and X_test, so running it twice rescales
# already-scaled data.
sc = StandardScaler().fit(X_smote)

X_smote = sc.transform(X_smote)
X_test = sc.transform(X_test)

In [None]:
# Import ML Libraries
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Seed given to every estimator that draws random numbers while fitting, so
# repeated runs give the same scores. Same value as the split and SMOTE seeds.
MODEL_SEED = 1234

# Each entry is [estimator, display name]; consumers read the pair by position.
classifiers = [
    [CatBoostClassifier(verbose=0, random_seed=MODEL_SEED), 'CatBoost Classifier'],
    [XGBClassifier(random_state=MODEL_SEED), 'XGB Classifier'],
    [RandomForestClassifier(random_state=MODEL_SEED), 'Random Forest'],
    [KNeighborsClassifier(), 'K-Nearest Neighbours'],
    [SGDClassifier(random_state=MODEL_SEED), 'SGD Classifier'],
    [SVC(), 'SVC'],
    [LGBMClassifier(random_state=MODEL_SEED), 'LGBM Classifier'],
    [GaussianNB(), 'GaussianNB'],
    [DecisionTreeClassifier(random_state=MODEL_SEED), 'Decision Tree Classifier'],
    [LogisticRegression(), 'Logistic Regression'],
]

In [None]:
def predict(X_smote, y_smote,X_test,y_test):
    """Fit every model in the module-level ``classifiers`` list and print its test metrics.

    Parameters
    ----------
    X_smote, y_smote : training features and labels handed to each model's ``fit``.
    X_test, y_test   : held-out features and labels used for scoring.

    For each model this prints its display name, the confusion matrix, and the
    accuracy, macro-averaged recall and macro-averaged precision as percentages.
    Returns None.
    """
    # Labels may arrive as a one-column DataFrame; estimators and metrics
    # expect a flat 1-D array.
    y_fit = np.ravel(y_smote)
    y_true = np.ravel(y_test)

    for model, name in classifiers:
        model.fit(X_smote, y_fit)

        # Flatten in case an estimator returns its predictions as a column vector.
        y_pred = np.ravel(model.predict(X_test))

        print(name)
        print ('Confusion Matrix:')
        print(confusion_matrix(y_true, y_pred))
        print("Accuracy : ", accuracy_score(y_true, y_pred) *  100)
        # average='macro' is needed because the default (binary) averaging
        # raises an error when the target has more than two classes.
        print("Recall : ", recall_score(y_true, y_pred, average='macro') *  100)
        print("Precision : ", precision_score(y_true, y_pred, average='macro') *  100)

In [None]:
predict(X_smote, y_smote,X_test,y_test)