In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, we train a CNN to detect the configuration of pieces on a chess board. The dataset consists of 80000 400x400 images of a chess board in one of 28 styles, with 5-15 pieces on it drawn from one of 32 piece styles. The filename describes the layout in standard FEN (Forsyth–Edwards Notation, called `PEN` in the code below): the pieces on each row are listed in order, lower case is black, upper case is white, and a number indicates that many consecutive empty squares. We could train a CNN on the whole images, but it is probably more efficient to train a network on individual squares/pieces.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from glob import glob

# View the data

List of files:

In [None]:
# glob returns names in arbitrary (filesystem) order; sort them so that files[0]
# and the "first N" images used for training are the same on every run.
files=sorted(glob("*.jpeg",root_dir='/kaggle/input/chess-positions/train/'))

In [None]:
# Read the first training board and display it (the array has shape 400x400x3).
first_path = '/kaggle/input/chess-positions/train/' + files[0]
img = mpimg.imread(first_path)
plt.imshow(img);

In [None]:
# Crop a single square: row/col index the 8x8 grid of 50-pixel squares.
row = 4
col = 3
square_rows = slice(row * 50, (row + 1) * 50)
square_cols = slice(col * 50, (col + 1) * 50)
plt.imshow(img[square_rows, square_cols, :]);

# Parse the file names

In [None]:
# Strip the 5-character '.jpeg' extension, then split the name into its
# dash-separated rows.
layout_name = files[0][:-5]
PEN = layout_name.split('-')
print(PEN)

Let's explicitly write 0 for each empty square. This will make it easier to identify the position of the pieces.

In [None]:
# Digit n -> a run of n zeros, so that every empty square gets its own character.
d = {str(n): '0' * n for n in range(1, 9)}
trans_table = str.maketrans(d)
s = [row_code.translate(trans_table) for row_code in PEN]
print(s)

And we invert the transformation as follows

In [None]:
# Undo the expansion: collapse runs of zeros back into digits. The longest run
# is replaced first so that eight zeros become '8' rather than several shorter runs.
s1 = s
for digit, zeros in reversed(d.items()):
    s1 = [row_code.replace(zeros, digit) for row_code in s1]
s1

We map pieces to ints by

In [None]:
# Class index for every square symbol: lower case = black piece,
# upper case = white piece, '0' = empty square (index 12).
map_to_ints = {symbol: index for index, symbol in enumerate('nNbBrRpPkKqQ0')}
labels = [map_to_ints[symbol] for symbol in s[0]]
labels

Define the reverse map from ints to characters

In [None]:

# Inverse lookup: class index -> square symbol.
map_from_ints = {index: symbol for symbol, index in map_to_ints.items()}
x = [map_from_ints[index] for index in labels]
print(x)

# Define a model for classifying individual squares

We use a topology similar to the one used for the MNIST digits problem, i.e. 2 convolutional layers and 2 fully connected layers.

In [None]:
class Net(nn.Module):
    """Small CNN that classifies a single 3x50x50 board square.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max-pooling, feed two
    fully connected layers. ``forward`` returns raw logits (no softmax), which
    is the input ``nn.CrossEntropyLoss`` expects.

    Args:
        num_classes: number of output logits. Defaults to 13, i.e. the 12
            piece symbols plus the empty square.
    """

    # Flattened feature size after the conv stack for a 50x50 input:
    # 64 channels of 9x9 (= 5184).
    _FLAT_FEATURES = 64 * 9 * 9

    def __init__(self, num_classes=13):
        super().__init__()

        # First 2D convolutional layer: 3 input channels -> 32 feature maps, 5x5 kernel.
        self.conv1 = nn.Conv2d(3, 32, 5)

        # Second 2D convolutional layer: 32 -> 64 feature maps, 5x5 kernel.
        self.conv2 = nn.Conv2d(32, 64, 5)

        # Dropout2d zeroes whole feature maps (channels) with probability 0.25;
        # the plain Dropout zeroes individual activations with probability 0.5.
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout(0.5)

        # First fully connected layer.
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 128)
        # Second fully connected layer: one logit per class.
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch of squares (N x 3 x 50 x 50) to logits (N x num_classes)."""
        x = self.conv1(x)               # Nx3x50x50  -> Nx32x46x46
        x = F.relu(x)
        x = F.max_pool2d(x, 2)          # Nx32x46x46 -> Nx32x23x23

        x = self.conv2(x)               # Nx32x23x23 -> Nx64x19x19
        x = F.relu(x)
        x = F.max_pool2d(x, 2)          # Nx64x19x19 -> Nx64x9x9

        x = self.dropout1(x)
        x = torch.flatten(x, 1)         # Nx64x9x9   -> Nx5184
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)

        # No (log-)softmax here: nn.CrossEntropyLoss takes the logits directly.
        return self.fc2(x)

my_nn = Net()
print(my_nn)

Test the model first

In [None]:
# Smoke test: push one 50x50 square through the untrained network to check
# that the tensor shapes line up. (`sample` rather than `input`, which would
# shadow the Python builtin for the rest of the notebook.)
sample=torch.tensor(img[0:50,0:50,:].transpose(2,0,1),dtype=torch.float32)/255# HWC uint8 -> CHW floats in [0,1]
sample=torch.unsqueeze(sample,0)#batch dimension
print(sample.shape)
with torch.no_grad():  # no gradients are needed for a shape check
    result = my_nn(sample)
assert result.shape == (1, 13), result.shape
print(result)

Set the optimizer

In [None]:
import torch.optim as optim

# Cross-entropy over the raw logits returned by the network.
criterion = nn.CrossEntropyLoss()
# Plain SGD with momentum over all trainable parameters of the model.
optimizer = optim.SGD(
    my_nn.parameters(),
    lr=0.001,
    momentum=0.9,
)

# Train the model

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)
my_nn.to(device)

In [None]:
n_epochs = 30
# Per-epoch training metrics, one slot per epoch, filled in by the training loop.
loss_train = np.zeros(n_epochs)
accuracy_train = np.zeros(n_epochs)

In [None]:
map_to_ints={'n':0,'N':1,'b':2,'B':3,'r':4,'R':5,'p':6,'P':7,'k':8,'K':9,'q':10,'Q':11,'0':12}

N_samples=3000   # number of training boards used; each board is one batch of 64 squares
TRAIN_DIR='/kaggle/input/chess-positions/train/'
LOG_EVERY=1000   # print the running training loss every LOG_EVERY boards

all_predictions=np.ndarray((N_samples*64,),dtype='int64')
all_labels=np.ndarray((N_samples*64,),dtype='int64')


def _board_to_squares(board):
    """Split a 400x400x3 board image into a (64, 3, 50, 50) array of squares.

    Squares are ordered row by row (index = 8*row + col), matching the order of
    the characters in the expanded file name, and each square is channel-first.
    """
    # (400,400,3) -> (row, y, col, x, c) -> (row, col, c, y, x) -> (64, 3, 50, 50)
    return board.reshape(8, 50, 8, 50, 3).transpose(0, 2, 4, 1, 3).reshape(64, 3, 50, 50)


def _load_board(filename):
    """Read one training image; return (squares, labels) tensors on `device`."""
    board=mpimg.imread(TRAIN_DIR+filename)
    squares=torch.tensor(_board_to_squares(board),dtype=torch.float32).to(device)/255
    # File name without '.jpeg' and dashes, digits expanded to runs of '0':
    # exactly one character per square.
    layout=filename[:-5].replace('-','').translate(trans_table)
    targets=torch.tensor([map_to_ints[ch] for ch in layout],dtype=torch.long).to(device)
    return squares, targets


for epoch in range(n_epochs):  # loop over the dataset multiple times

    my_nn.train()  # enable dropout for the optimisation pass
    running_loss = 0.0
    for i in range(N_samples):
        batch, labels = _load_board(files[i])

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = my_nn(batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics: mean loss over the last LOG_EVERY boards, then reset
        running_loss += loss.item()
        if (i + 1) % LOG_EVERY == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / LOG_EVERY:.3f}')
            running_loss = 0.0

    # After each epoch, re-evaluate on the training data. eval() switches the
    # dropout layers off; without it the reported loss/accuracy are computed on
    # randomly masked activations.
    my_nn.eval()
    epoch_loss = 0.0
    with torch.no_grad():
        for i in range(N_samples):
            batch, labels = _load_board(files[i])
            outputs = my_nn(batch)
            epoch_loss += criterion(outputs, labels).item()
            all_labels[64*i:64*(i+1)] = labels.cpu().numpy()
            all_predictions[64*i:64*(i+1)] = outputs.argmax(dim=1).cpu().numpy()

    accuracy_train[epoch] = (all_labels == all_predictions).mean()
    loss_train[epoch] = epoch_loss  # sum of the per-board mean losses

    print(f'[{epoch + 1}, {N_samples:5d}] training loss: {loss_train[epoch]}, training accuracy: {accuracy_train[epoch]}')

print('Finished Training')

In [None]:
import seaborn as sns

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
epochs = range(n_epochs)

# Left panel: training loss per epoch on a logarithmic y-axis.
loss_frame = pd.DataFrame({'loss': loss_train}, index=epochs)
sns.lineplot(data=loss_frame, ax=ax[0])
ax[0].set_yscale('log')
ax[0].set_ylabel('loss')
ax[0].set_xlabel('epochs')

# Right panel: training accuracy per epoch, zoomed in on the 0.98-1 range.
accuracy_frame = pd.DataFrame({'training': accuracy_train}, index=epochs)
sns.lineplot(data=accuracy_frame, ax=ax[1])
ax[1].set_ylim(0.98, 1)
ax[1].set_ylabel('accuracy')
ax[1].set_xlabel('epochs');

In [None]:
#save the model
PATH = '/kaggle/working/my_nn.pth'
torch.save(my_nn.state_dict(), PATH)

In [None]:
#load the model
PATH = '/kaggle/working/my_nn.pth'
my_nn = Net()
my_nn.load_state_dict(torch.load(PATH))

# Run the model on the test data

Let's rerun on the test data

In [None]:
# Sorted so that the "first N_test" images are the same on every run
# (glob itself returns names in arbitrary order).
test_files=sorted(glob("*.jpeg",root_dir='/kaggle/input/chess-positions/test/'))

In [None]:
N_test=1000 # first N_test images

# results[j] is True when the layout predicted for board j matches its file name exactly
results=np.zeros((N_test,),dtype=np.bool_)

# Inference set-up: the model must sit on the same device as the batches, and
# eval() disables dropout so that predictions are deterministic.
my_nn.to(device)
my_nn.eval()

for j in range(N_test):
    file=test_files[j]
    img=mpimg.imread('/kaggle/input/chess-positions/test/'+file)
    # Cut the 400x400x3 board into 64 channel-first 50x50 squares, row by row:
    # (400,400,3) -> (row, y, col, x, c) -> (row, col, c, y, x) -> (64, 3, 50, 50)
    x_test=img.reshape(8,50,8,50,3).transpose(0,2,4,1,3).reshape(64,3,50,50)

    with torch.no_grad():
        batch=torch.tensor(x_test,dtype=torch.float32).to(device)/255
        outputs = my_nn(batch)
    batch_predictions=outputs.argmax(dim=1).cpu().tolist()

    # Rebuild the file-name notation: one string per row, rows joined by '-',
    # then runs of zeros collapsed back into digits (longest run first).
    p=[map_from_ints[k] for k in batch_predictions]
    s=[''.join(p[8*r:8*(r+1)]) for r in range(8)]
    PEN='-'.join(s)
    for k, v in reversed(list(d.items())):
        PEN=PEN.replace(v, k)

    results[j]=file[:-5]==PEN
    if not results[j]:
        # show expected vs. predicted layout for every misclassified board
        print(file[:-5],PEN)

In [None]:
# Fraction of test boards whose predicted layout matches the file name exactly.
results.mean()

This is consistent with the single-square training accuracy: $0.9997^{64} \approx 0.98$. If we train on all 80000 training images, the accuracy may improve further, but a more complex model may also be required.

**Important lesson**

Since most of the chess board is empty, the empty squares dominate the dataset. One may think that it would be better to choose a balanced dataset on which to train. However, while this still leads to high test (and validation) accuracy on individual squares (as obtained here), the test accuracy on whole boards is worse than expected (e.g. 90% rather than 98%). On inspection, the majority of the errors are due to the misclassification of empty squares and, in particular, of empty squares that have high variance due to a non-uniform board pattern. It therefore appears that the model has not seen enough empty squares to reach the high test accuracy required. Since the test data is inherently unbalanced, it makes sense to also train on entire boards, as we have done here. The test accuracy obtained on boards is then consistent with the prediction test_accuracy**64.