# Using Chess Data to develop search functions

Inspired by Hikaru's and Gotham Chess' videos on opening tier lists, I want to create my own search function with the data available from Lichess. 
* Initially I will provide a projection across ranges of rating to determine the best and most popular openings. 
* The search function will allow a user to input their rating and thus find the best and most popular opening at around their rating. 
* Another search function will allow a user to reverse search an opening and find the win percentages for black and white across all ratings or a rating they enter.

*Note: I will be using **black rating** as the standard for rating for sake of simplicity*

## Other interesting graphs

* How many turns does the average player make based on rating and opening used? (DONE)

In [None]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
import plotly.express as px
plt.style.use('seaborn-whitegrid')
from mpl_toolkits.mplot3d import Axes3D

#Preprocessing
from sklearn.preprocessing import StandardScaler
import os

#SQL
import sqlite3, csv
from sqlalchemy import create_engine

print("Setup Complete")

# Ignoring warning 
import warnings
warnings.filterwarnings('ignore')

In [None]:
chess_filepath = "../input/chess/games.csv"
games_df = pd.read_csv(chess_filepath)
print("Import Complete")

# Early Exploratory Analysis

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a grid of per-column distribution charts for low-cardinality columns.

    Only columns with more than 1 and fewer than 50 unique values are drawn.
    A column whose first value is not a NumPy/Python number gets a bar chart of
    its value counts; any other column gets a histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots per grid row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When nothing can be plotted a
        message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 2 and 49 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nPlots = min(len(columnNames), nGraphShown)
    if nPlots <= 0:
        # A zero-row grid would give the figure a zero height, so stop early.
        print('No distribution plots shown: nothing to plot (no eligible columns or nGraphShown < 1)')
        return
    # Integer ceiling division: plt.subplot needs whole numbers of rows/columns.
    # The grid is sized by the plots actually drawn, so no empty rows are added.
    nGraphRow = (nPlots + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i, columnName in enumerate(columnNames[:nPlots]):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if not np.issubdtype(type(columnDf.iloc[0]), np.number):
            columnDf.value_counts().plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnName} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Show a heat-map of the pairwise correlations between numeric columns.

    Columns containing NaN, non-numeric columns and constant columns are
    dropped before the correlation is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. If it carries a ``dataframeName`` attribute, that
        value is used in the plot title; otherwise a generic label is used.
    graphWidth : float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
        Prints a message and returns without plotting when fewer than two
        usable columns remain.
    """
    # Read the label first: the attribute is not carried over to derived frames.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    df = df.dropna(axis = 'columns') # drop columns with NaN (axis must be passed by keyword in pandas 2.x)
    # Keep numeric and boolean columns only, so corr() never sees strings.
    df = df.select_dtypes(include = ['number', 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals and annotate correlations.

    Only numeric columns without NaN and with more than one unique value are
    used, capped at the first 10 such columns. Each upper-triangle panel is
    annotated with the correlation coefficient of its column pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to plot.
    plotSize : float
        Width and height of the square figure, in inches.
    textSize : float
        Font size of the correlation annotations.

    Returns
    -------
    None
        Prints a message and returns without plotting when no usable column
        remains.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove columns that would lead to df being singular
    df = df.dropna(axis = 'columns') # axis must be passed by keyword in pandas 2.x
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    if not columnNames:
        print('No scatter plots shown: no numeric, non-NaN, non-constant columns to plot')
        return
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly rather than the incidental `plt.np` alias.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate(f'Corr. coef = {corrs[i, j]:.3f}', (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
nRowsRead = 10000 # Preview roughly half of the file
# games.csv has 20058 rows in reality, but we are only loading/previewing the first nRowsRead (10000) rows
df1 = pd.read_csv('../input/chess/games.csv', delimiter=',', nrows = nRowsRead)
# Label read by plotCorrelationMatrix for its plot title
df1.dataframeName = 'games.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Viewing first 5 rows
df1.head()

### Column Distribution

In [None]:
plotPerColumnDistribution(df1, 10, 5)

### Correlation Matrix

In [None]:
plotCorrelationMatrix(df1, 8)

### Scatter and Density Plots

In [None]:
plotScatterMatrix(df1, 18, 10)

In [None]:
print("Games has {} rows".format(len(games_df)))

## Data Description

* Game ID
* Rated (T/F): Is the game rated?
* Start Time
* End Time
* Number of Turns
* Game Status: How did the game end?
* Winner: What side won?
* Time Increment
* White Player ID
* White Player Rating
* Black Player ID
* Black Player Rating
* All Moves in Standard Chess Notation
* Opening Eco: Standardised Code for any given opening
* Opening Name
* Opening Ply: Number of moves in the opening phase

## Missing Values

In [None]:
games_df.isnull().sum()

## Preparing two dataframes for analysis

* Discretised continuous variables (continuous variables that have been sorted into categories)
* Continuous variables

In [None]:
df_bin = pd.DataFrame() # for discretised continuous variables
df_con = pd.DataFrame() # for continuous variables
print("Variables have been set")

In [None]:
# Data types in the dataframe
games_df.dtypes

In [None]:
games_df.head()

### Feature: victory_status

This feature will be one hot encoded later on for simpler coding.

In [None]:
print("There are {} unique variables in victory_status".format(len(games_df.victory_status.unique())))

In [None]:
#Adding to subset
df_bin['victory_status'] = games_df['victory_status']
df_con['victory_status'] = games_df['victory_status']

df_bin['opening_eco'] = games_df['opening_eco']
df_con['opening_eco'] = games_df['opening_eco']

df_bin['opening_name'] = games_df['opening_name']
df_con['opening_name'] = games_df['opening_name']
print("Added into subset")

In [None]:
# Adding to subset
df_bin['winner'] = games_df['winner']
df_con['winner'] = games_df['winner']
print("Added into subset")

In [None]:
# Bar chart for game results

plt.figure(figsize=(10,6))

sns.countplot(x = 'victory_status', hue = 'winner', data = df_con)
plt.legend()


### Feature: Rating (Black and White)

In [None]:
# Bin white_rating into 8 equal-width intervals and store it in both subsets.
# NOTE(review): df_con is declared as the frame for continuous variables, yet
# it receives the same pd.cut() bins as df_bin. The countplot on
# df_con['white_rating'] appears to rely on the binned values - confirm
# before switching df_con to the raw rating.
df_con['white_rating'] = pd.cut(games_df['white_rating'], 8)
df_bin['white_rating'] = pd.cut(games_df['white_rating'], 8)
print("Added to subset")

In [None]:
# Bin black_rating into 8 equal-width intervals and store it in both subsets.
# NOTE(review): df_con is declared as the frame for continuous variables, yet
# it receives the same pd.cut() bins as df_bin. The countplot on
# df_con['black_rating'] appears to rely on the binned values - confirm
# before switching df_con to the raw rating.
df_con['black_rating'] = pd.cut(games_df['black_rating'], 8)
df_bin['black_rating'] = pd.cut(games_df['black_rating'], 8)

print("Added to subset")

In [None]:
df_bin.head()

In [None]:
plt.figure(figsize=(20,5))

sns.countplot(x = 'white_rating', hue = 'winner', data = df_con)
plt.legend()
plt.title("Games result vs White rating")

In [None]:
plt.figure(figsize=(20,5))
sns.countplot(x = 'black_rating', hue = 'winner', data = df_con)
plt.legend()
plt.title("Games result vs Black rating")

The graphs show that players are more likely to lose to the other side when they are lower rated; their win percentage increases with a higher rating. This observation assumes that players are playing others around their rating and does not account for outliers such as a novice challenging an IM. As matchmaking AI accounts for this in ranked games, I will drop unranked games as they will likely be outliers.

## How long do games last across player ratings?

In [None]:
# Plotting jointplot comparing white rating with number of turns.
# `fill` is the current name of the KDE shading option; `shade` is deprecated in seaborn.
sns.jointplot(x = games_df['turns'], y = games_df['white_rating'], fill = True, kind = 'kde')


In [None]:
# Plotting jointplot with black rating against the number of turns.
# `fill` is the current name of the KDE shading option; `shade` is deprecated in seaborn.
sns.jointplot(x = games_df['turns'], y = games_df['black_rating'], fill = True, kind = 'kde')


The joint distribution plots show that the average game lasts about 50 turns. The shortest games are concentrated among the lower-rated players (<= 1000), while the highest number of turns occurs in games with an average player rating of 1500 to 1800. 

The plots also show that the average rating in the database is ~1500, with surprisingly fewer players rated < 1000 than there are players rated > 2000.

### Uploading CSV file onto SQL

In [None]:
# Converting CSV using pandas.DataFrame.to_sql
# 'sqlite://' with no file path asks SQLAlchemy for an in-memory SQLite database,
# so the tables below exist only for the lifetime of this engine.
engine = create_engine('sqlite://',  echo = False) 
# Write the games frame as table 'games_sql' (the DataFrame index is written too, pandas' default).
games_df.to_sql('games_sql', con = engine) 

In [None]:
# List the tables currently present in the SQLite database.
# Engine.execute() was removed in SQLAlchemy 2.0, so the query goes through
# pd.read_sql_query like every other query in this notebook.
sql_query = '''SELECT name FROM sqlite_master WHERE type='table';'''
print(pd.read_sql_query(sql_query, con = engine)['name'].tolist())

In [None]:
# Creating a column for average rating and ensuring that games evaluated are close in rating (within 200)
# NOTE: if both rating columns are stored as integers, SQLite's `/` truncates,
# so average_rating is a whole number.
# The filter repeats the ABS(...) expression instead of using the SELECT alias,
# because referencing an alias in WHERE is a SQLite extension.
sql_query = ("""
            SELECT *, (white_rating + black_rating)/2 AS average_rating, 
                ABS(black_rating - white_rating) AS rating_diff
            from games_sql
            WHERE ABS(black_rating - white_rating) <= 200
            """)
adj_rating = pd.read_sql_query(sql_query, con = engine)

# index = False: do not write the DataFrame's own RangeIndex as an extra column.
# if_exists = 'replace': lets this cell be re-run without a "table already exists" error.
adj_rating.to_sql('adj_rating_', con = engine, index = False, if_exists = 'replace')


### SQL code for most popular openings

In [None]:
def _popular_openings(min_rating = None, max_rating = None, limit = 10, min_ply = 2):
    """Return the most frequently played openings for one average-rating tier.

    The tier is the half-open interval ``[min_rating, max_rating)`` on
    ``average_rating``; ``None`` leaves that side unbounded. Half-open bounds
    guarantee that adjacent tiers neither overlap nor leave a gap (the earlier
    ``< 1099`` / ``BETWEEN 1100 ...`` pair excluded an average of exactly 1099).

    Parameters
    ----------
    min_rating, max_rating : int or None
        Inclusive lower and exclusive upper bound on ``average_rating``.
    limit : int
        Number of openings to return.
    min_ply : int
        Minimum ``opening_ply`` a game needs to be counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``opening_name``, ``opening_ply``, ``white_rating``,
        ``black_rating`` and ``opening_freq``, most frequent opening first.
        ``opening_ply`` and the two rating columns are not aggregated: SQLite
        returns them from a single row of each group, so treat them as samples.
    """
    # int() keeps the interpolated values strictly numeric.
    conditions = [f"opening_ply >= {int(min_ply)}"]
    if min_rating is not None:
        conditions.append(f"average_rating >= {int(min_rating)}")
    if max_rating is not None:
        conditions.append(f"average_rating < {int(max_rating)}")
    where_clause = " AND ".join(conditions)
    # GROUP BY already yields one row per opening, so DISTINCT is not needed.
    # The secondary sort key makes the order of tied openings deterministic.
    sql_query = f"""
            SELECT opening_name, opening_ply,
                white_rating, black_rating, COUNT(opening_name)
                AS opening_freq
            FROM adj_rating_
            WHERE {where_clause}
            GROUP BY opening_name
            ORDER BY opening_freq DESC, opening_name
            LIMIT {int(limit)};
           """
    return pd.read_sql_query(sql_query, con = engine)


# Tier boundaries match the ones used by rating_tool: 1100 / 1600 / 2100.
beginner = _popular_openings(max_rating = 1100)
intermediate = _popular_openings(min_rating = 1100, max_rating = 1600)
advanced = _popular_openings(min_rating = 1600, max_rating = 2100)
master = _popular_openings(min_rating = 2100)

### SQL code for the winningest openings

In [None]:
def _winning_openings(winner, min_rating = None, max_rating = None, limit = 10, min_ply = 2):
    """Return the openings with the most wins for one side in one rating tier.

    The tier is the half-open interval ``[min_rating, max_rating)`` on
    ``black_rating``; ``None`` leaves that side unbounded. Every tier uses the
    same rating column and the same ``opening_ply`` threshold, so the results
    are comparable across tiers and sides.

    Parameters
    ----------
    winner : {'black', 'white'}
        Side whose wins are counted.
    min_rating, max_rating : int or None
        Inclusive lower and exclusive upper bound on ``black_rating``.
    limit : int
        Number of openings to return.
    min_ply : int
        Minimum ``opening_ply`` a game needs to be counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``opening_name``, ``opening_ply``, ``winner``,
        ``white_rating``, ``black_rating`` and ``winner_freq``, highest win
        count first. ``opening_ply`` and the rating columns are not
        aggregated: SQLite returns them from a single row of each group.

    Raises
    ------
    ValueError
        If ``winner`` is not ``'black'`` or ``'white'``.
    """
    if winner not in ('black', 'white'):
        raise ValueError("winner must be 'black' or 'white', got {!r}".format(winner))
    # `winner` is validated above and the bounds go through int(), so the
    # interpolated SQL contains only known-safe values.
    conditions = [f"opening_ply >= {int(min_ply)}", f"winner LIKE '%{winner}%'"]
    if min_rating is not None:
        conditions.append(f"black_rating >= {int(min_rating)}")
    if max_rating is not None:
        conditions.append(f"black_rating < {int(max_rating)}")
    where_clause = " AND ".join(conditions)
    # The secondary sort key makes the order of tied openings deterministic.
    sql_query = f"""
            SELECT opening_name, opening_ply, winner,
                white_rating, black_rating, COUNT(winner)
                AS winner_freq
            FROM adj_rating_
            WHERE {where_clause}
            GROUP BY opening_name
            ORDER BY winner_freq DESC, opening_name
            LIMIT {int(limit)};
           """
    return pd.read_sql_query(sql_query, con = engine)


# Black winner
beginner_b = _winning_openings('black', max_rating = 1100)
intermediate_b = _winning_openings('black', min_rating = 1100, max_rating = 1600)
advanced_b = _winning_openings('black', min_rating = 1600, max_rating = 2100)
master_b = _winning_openings('black', min_rating = 2100)

#----------------------------------------------------
# White winner
beginner_w = _winning_openings('white', max_rating = 1100)
intermediate_w = _winning_openings('white', min_rating = 1100, max_rating = 1600)
advanced_w = _winning_openings('white', min_rating = 1600, max_rating = 2100)
master_w = _winning_openings('white', min_rating = 2100)

In [None]:
intermediate_b.head()

## Search Function

Once the above two steps are done, import the SQL tables into Python and plot graphs for the most popular openings where opening_ply >= 3.

In [None]:
# Testing desired barplot
sns.barplot(x = 'winner_freq', y = 'opening_name', data = beginner_b)

In [None]:
# Creating function to get the name of the dataframe
def get_df_name(data):
    """Return the name of a module-level variable that refers to ``data``.

    Matching is by identity (``is``), not equality. When several globals refer
    to the same object, the first name that does not start with an underscore
    is preferred, so IPython output-cache aliases such as ``_`` or ``_12`` do
    not shadow the name the user chose. If only underscore names match, the
    first of those is returned.

    Parameters
    ----------
    data : object
        The object (typically a DataFrame) to look up.

    Returns
    -------
    str
        Name of a global variable bound to ``data``.

    Raises
    ------
    IndexError
        If no global variable refers to ``data``.
    """
    # Snapshot the namespace so it cannot change size while being scanned.
    matches = [name for name, value in list(globals().items()) if value is data]
    if not matches:
        raise IndexError("no global variable refers to the given object")
    public = [name for name in matches if not name.startswith('_')]
    return (public or matches)[0]

In [None]:
#Creating auto bar plot function
def auto_plot(data_1,data_2, data_3, figsize = (20, 20)):
    """
    Draw three stacked horizontal bar charts for one rating tier.

    ::param_data_1:: = dataframe with 'opening_name' and 'opening_freq' columns
    ::param_data_2:: = dataframe with 'opening_name' and 'winner_freq' columns (black wins)
    ::param_data_3:: = dataframe with 'opening_name' and 'winner_freq' columns (white wins)
    ::param_figsize:: = size of figure (width, height)

    Each title embeds the global variable name of its dataframe, as reported
    by get_df_name.
    """
    # One entry per panel: (dataframe, x column, x-axis label, title template)
    panels = (
        (data_1, 'opening_freq', 'Opening Frequency',
         'Frequency of openings for {} tier'),
        (data_2, 'winner_freq', 'Black Win Frequency',
         'Frequency of black winner openings for {} tier'),
        (data_3, 'winner_freq', 'White Win Frequency',
         'Frequency of white openings for {} tier'),
    )
    plt.figure(figsize=figsize)
    for position, (frame, value_column, x_label, title_template) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        sns.barplot(x = value_column, y = 'opening_name', data = frame)
        plt.xlabel(x_label)
        plt.ylabel('Opening Name')
        plt.title(title_template.format(get_df_name(frame)))

# Popular Opening Search Tool

This tool will show the top 10 most popular openings based on the rating you specify.

*Please note that black rating and black wins have been used as the standard for this function.*

In [None]:
# Creating a search function
def _confirm_tier(rating, tier_label):
    """Ask the user to confirm the tier derived from ``rating``; return True on yes."""
    print("You have entered {} making you a {}, is this correct? (Enter Y or N)".format(rating, tier_label))
    answer = input()
    # casefold() makes the comparison case-insensitive; strip() tolerates stray spaces.
    if answer.strip().casefold() in ('y', 'yes'):
        return True
    print("Please re-enter your correct rating in the function")
    return False


def rating_tool(rating):
    """
    Plot the top 10 most popular and most winning openings for the rating
    tier that ``rating`` falls into, after asking the user to confirm the tier.

    Tiers: Beginner (< 1100), Intermediate (1100-1599), Advanced (1600-2099),
    Master (>= 2100).

    ::param_rating = User chess rating (a real number)

    Returns None. A non-numeric (or NaN) rating prints a hint and returns
    without prompting, instead of failing on the first comparison.
    """
    import numbers  # local import: used only for the validation below

    # `rating != rating` is true only for NaN.
    if not isinstance(rating, numbers.Real) or rating != rating:
        print("That's not a number my friend! Try again...")
        print("""Enter 'rating_tool(chess rating)' your chess rating must be an integer """)
        return

    # The dataframes are referenced only inside the chosen branch, so a tier's
    # tables are needed only when that tier is actually requested.
    if rating < 1100:
        if _confirm_tier(rating, "Beginner (<1100)"):
            auto_plot(beginner, beginner_b, beginner_w)
    elif rating < 1600:
        if _confirm_tier(rating, "Intermediate (Between 1100 and 1599)"):
            auto_plot(intermediate, intermediate_b, intermediate_w)
    elif rating < 2100:
        if _confirm_tier(rating, "Advanced (Between 1600 and 2099)"):
            auto_plot(advanced, advanced_b, advanced_w)
    else:
        if _confirm_tier(rating, "Master (>=2100)"):
            # Previously plotted advanced_w here; the master tier must use master_w.
            auto_plot(master, master_b, master_w)
        
    
   

Feel free to play around with the function below to manually explore opening trends in the database :)

In [None]:
# Enter "rating_tool(chess rating goes here)"
rating_tool(2100)