In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px

from sklearn.model_selection import train_test_split as tts

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the games table from the CSV in the Kaggle input directory
# (the path is relative to the notebook's working directory).
data = pd.read_csv('../input/chess/games.csv')

In [None]:
# Display the loaded DataFrame for a first look at its contents.
data

# Preprocessing

In [None]:
# List the distinct values of the 'winner' column (the target is derived from it below).
data['winner'].unique()

In [None]:
# Move strings of every game that did not end in a draw, as a NumPy array.
moves = np.array(data.query("winner != 'draw'")['moves'])

In [None]:
# Binary target for the non-drawn games: 1 when white won, 0 otherwise.
labels = data.query("winner != 'draw'")['winner'].eq('white').astype('int64')

In [None]:
# Collect every distinct move token across all games (tokens are space-separated).
all_moves = {move for move_list in moves for move in move_list.split(" ")}

# Vocabulary size, used below as the Tokenizer's `num_words` and the Embedding's
# `input_dim`. Keras numbers words from 1 (0 is the padding value) and `num_words`
# only keeps indices strictly below it, so one extra slot is reserved to make
# sure every distinct move can be represented.
max_vocab = len(all_moves) + 1

In [None]:
# Show the vocabulary size computed in the previous cell.
max_vocab

### Length of the longest move sequence

In [None]:
# Length, in space-separated move tokens, of the longest game
# (0 when there are no games at all).
max_len = max((len(move_list.split(" ")) for move_list in moves), default=0)

In [None]:
# Show the length of the longest move sequence.
max_len

## Create input vectors

In [None]:
# Map every move token to an integer index.
#
# The Tokenizer defaults (lower=True and a punctuation filter) are meant for
# natural-language text and corrupt chess notation: lower-casing merges distinct
# moves such as 'Bxc4' and 'bxc4', and the filter strips '+', '#', '=' and '-',
# which drops check/mate markers and splits 'O-O' or 'e8=Q' into several tokens.
# Both are disabled so that tokens are exactly the space-separated moves.
# NOTE(review): assumes the `moves` column holds standard algebraic notation —
# confirm against the data.
#
# Only word indices strictly below `num_words` are kept by texts_to_sequences,
# so `max_vocab` has to be at least (number of distinct moves + 1) for every
# move to survive; anything at or above it is silently dropped.
tokenizer = Tokenizer(num_words=max_vocab, filters='', lower=False, split=' ')
tokenizer.fit_on_texts(moves)

# One list of integer move indices per game.
sequences = tokenizer.texts_to_sequences(moves)

# Move token -> integer index lookup learned by the tokenizer.
word_index = tokenizer.word_index


In [None]:
# Bring every sequence to exactly max_len tokens. The 'pre' modes (the
# pad_sequences defaults, spelled out here) add zeros at the front of short
# sequences and cut from the front of any sequence longer than max_len.
model_inputs = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

In [None]:
# Sanity check: shape of the padded input matrix.
model_inputs.shape

In [None]:
# Sanity check: number of labels (should match the number of input rows).
labels.shape

# Training

In [None]:
# Hold out 30% of the games for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
train_inputs, test_inputs, train_labels, test_labels = tts(
    model_inputs,
    labels,
    train_size=0.7,
    random_state=24,
)

In [None]:
# Size of each move embedding; also reused as the number of GRU units.
embedding_dim = 256

# One padded move sequence per sample. `shape` is given as a tuple: a bare int
# is not a valid shape and is rejected by newer Keras versions.
inputs = tf.keras.Input(shape=(max_len,))

# Learn a dense vector for every move index. `input_dim` must be larger than the
# highest index produced by the tokenizer. The sequence length is already fixed
# by `inputs`, so the redundant (and, in newer Keras, deprecated) `input_length`
# argument is not passed.
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab,
    output_dim=embedding_dim,
)(inputs)

# Summarise the whole move sequence into a single vector.
gru = tf.keras.layers.GRU(units=embedding_dim)(embedding)

# Single sigmoid unit: predicted probability of label 1.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(gru)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Binary classification set-up: Adam optimiser, cross-entropy loss, and both
# accuracy and ROC AUC (reported under the name 'auc') tracked during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)



In [None]:
# Training hyper-parameters passed to model.fit below.
batch_size = 32  # samples per gradient update
epochs = 5       # full passes over the training data

In [None]:
# Train the model, holding out the last 20% of the training rows for validation.
#
# ReduceLROnPlateau defaults to patience=10, i.e. it waits for ten epochs
# without improvement before lowering the learning rate. With a run of only a
# few epochs it could never trigger, so the patience is set to 1 epoch here.
history = model.fit(
    train_inputs,
    train_labels,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=1)
    ],
)

In [None]:
# Training vs. validation loss per epoch.
# `history.history` is plotted as wide-form data, so Plotly Express names the
# x axis "index", the y axis "value" and the legend "variable"; the `labels`
# mapping has to use those keys for the titles to take effect.
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={
        "index": 'Epochs',
        "value": 'Loss',
        "variable": 'Metric',
    },
    title="Loss Over Time",
)
fig.show()

In [None]:
# Training vs. validation AUC per epoch.
# `history.history` is plotted as wide-form data, so Plotly Express names the
# x axis "index", the y axis "value" and the legend "variable"; the `labels`
# mapping has to use those keys for the titles to take effect. The y axis is
# labelled 'AUC' to match the metric being plotted.
fig = px.line(
    history.history,
    y=['auc', 'val_auc'],
    labels={
        "index": 'Epochs',
        "value": 'AUC',
        "variable": 'Metric',
    },
    title="AUC Over Time",
)
fig.show()

In [None]:
# Final check on the held-out test split; returns the loss followed by the
# compiled metrics.
model.evaluate(x=test_inputs, y=test_labels)