In [1]:
# Requirements

# To easily load the sudoku dataset in a form usable with PyTorch
!pip -q install datasets

# To optimize hyperparameter tuning
!pip -q install optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
try:
  from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, AdamW, get_scheduler
except:
  !pip -q install transformers
  from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, AdamW, get_scheduler
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
import optuna
import pandas as pd
import numpy as np
import os

In [3]:
# Mount Google Drive under /content/drive (Colab-only API). A later cell
# writes the zipped model to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [4]:
# Uploading Kaggle Json required to download sudoku dataset
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Move kaggle.json into the folder where the API expects to find it
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


In [5]:
# download and unzip sudoku dataset
!kaggle datasets download -d rohanrao/sudoku
!unzip sudoku.zip

Downloading sudoku.zip to /content
 98% 609M/620M [00:10<00:00, 103MB/s] 
100% 620M/620M [00:10<00:00, 62.9MB/s]
Archive:  sudoku.zip
  inflating: sudoku.csv              


In [6]:
# Read the first 300,000 rows, keep a seeded 80% sample for training (`data`)
# and the remaining rows for testing (`test`), then preview one test row.
path = "/content/sudoku.csv"
df = pd.read_csv(path, nrows=300000)
data = df.sample(frac=0.8, random_state=200)
test = df.drop(index=data.index)
test.head(1)

Unnamed: 0,puzzle,solution
0,0700000430400096108006349000940520003584600200...,6795182435437296188216349577943521863584617292...


In [7]:
def display_sudoku(sudoku_str):
  """Print a row-major 9x9 sudoku string as a grid split into 3x3 boxes.

  Cells equal to "0" are shown as blanks. Columns are separated by "| "
  after every third cell and a dashed rule is printed before the first
  row and after every third row.
  """
  rule = "-" * 21
  print(rule)
  for row in range(9):
    for col in range(9):
      cell = sudoku_str[row*9 + col]
      # Empty cells keep the same two-character width as filled ones.
      print("  " if cell == "0" else cell + " ", end="")
      if col in (2, 5):
        print("| ", end="")
    print("")
    if row % 3 == 2:
      print(rule)

In [8]:
# Show the first held-out puzzle next to its reference solution.
# .iloc[0] selects the first row by position; a plain [0] is a label lookup
# and only works when the row labelled 0 happens to be in the test split.
print("Puzzle")
display_sudoku(test['puzzle'].iloc[0])
print("Solution")
display_sudoku(test['solution'].iloc[0])

Puzzle
---------------------
  7   |       |   4 3 
  4   |     9 | 6 1   
8     | 6 3 4 | 9     
---------------------
  9 4 |   5 2 |       
3 5 8 | 4 6   |   2   
      | 8     | 5 3   
---------------------
  8   |   7   |   9 1 
9   2 | 1     |     5 
    7 |   4   | 8   2 
---------------------
Solution
---------------------
6 7 9 | 5 1 8 | 2 4 3 
5 4 3 | 7 2 9 | 6 1 8 
8 2 1 | 6 3 4 | 9 5 7 
---------------------
7 9 4 | 3 5 2 | 1 8 6 
3 5 8 | 4 6 1 | 7 2 9 
2 1 6 | 8 9 7 | 5 3 4 
---------------------
4 8 5 | 2 7 6 | 3 9 1 
9 6 2 | 1 8 3 | 4 7 5 
1 3 7 | 9 4 5 | 8 6 2 
---------------------


In [9]:
# Write the tokenizer vocabulary, one token per line: the five special
# tokens first, then the words "one" .. "nine" (14 entries in total).
vocab = (
    "[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n"
    "one\ntwo\nthree\nfour\nfive\nsix\nseven\neight\nnine"
)
os.makedirs("/content/tokenizer", exist_ok=True)
with open("/content/tokenizer/vocab.txt", "w") as vocab_file:
  vocab_file.write(vocab)

In [10]:
# Word used for each puzzle character: index 0 (an empty cell) maps to
# "[MASK]", indices 1-9 map to the matching number word.
tokens = [
    "[MASK]",
    "one", "two", "three",
    "four", "five", "six",
    "seven", "eight", "nine",
]

# helper Functions
def sudoku_to_tokens(sudoku_str):
  """Translate each digit character of sudoku_str into its vocabulary word."""
  words = []
  for digit in sudoku_str:
    words.append(tokens[int(digit)])
  return words

def tokens_to_sudoku(token_list):
  """Translate vocabulary words back into a digit string ("[MASK]" -> "0").

  Raises ValueError (from list.index) for a word that is not in `tokens`.
  """
  digits = [str(tokens.index(word)) for word in token_list]
  return "".join(digits)

def fill_masks(sudoku_str, mask_str):
  """Return sudoku_str with each empty ("0") cell replaced from mask_str.

  Non-zero cells are copied unchanged; the k-th empty cell receives the
  k-th element of mask_str. Elements of mask_str beyond the number of empty
  cells are ignored.
  """
  replacements = iter(mask_str)
  # A list comprehension (not a generator) keeps exceptions from next()
  # propagating unchanged.
  filled = [cell if int(cell) != 0 else next(replacements) for cell in sudoku_str]
  return "".join(filled)

def check_correctness(sudoku_str):
  """Return True if sudoku_str is a completely and correctly solved sudoku.

  Args:
    sudoku_str: the grid in row-major order, one symbol per cell.

  Returns:
    True only when the grid has exactly 81 cells and every row, column and
    3x3 box contains each of the digits 1-9 exactly once; False otherwise
    (including grids of the wrong length or with empty "0" cells).
  """
  # A grid of the wrong size can never be a valid solution.
  if len(sudoku_str) != 81:
    return False

  required = set("123456789")
  horizontals = [[sudoku_str[r*9 + c] for c in range(9)] for r in range(9)]
  verticals = [[sudoku_str[r*9 + c] for r in range(9)] for c in range(9)]
  boxes = [[sudoku_str[(box_r*3 + y)*9 + box_c*3 + x] for y in range(3) for x in range(3)]
           for box_r in range(3) for box_c in range(3)]

  # Compare against the digit set itself: counting distinct symbols alone
  # would accept a unit made of "0" plus eight distinct digits.
  return all(set(map(str, unit)) == required
             for unit in horizontals + verticals + boxes)

In [11]:
# creating custom dataset for sudoku
def custom_dataset(puzzle_list, solution_list):
    """Tokenize puzzles and solutions into a torch-formatted `Dataset`.

    Both arguments are lists of word lists (as produced by sudoku_to_tokens).
    The encoded puzzles provide the model inputs; the input ids of the
    encoded solutions are attached under the 'labels' key.
    """
    tokenizer = BertTokenizerFast.from_pretrained('/content/tokenizer')
    encode_options = dict(return_tensors="pt", is_split_into_words=True)

    encoded_puzzles = tokenizer.batch_encode_plus(puzzle_list, **encode_options)
    encoded_solutions = tokenizer.batch_encode_plus(solution_list, **encode_options)
    encoded_puzzles['labels'] = encoded_solutions['input_ids']

    torch_dataset = Dataset.from_dict(encoded_puzzles)
    torch_dataset.set_format("torch")
    return torch_dataset


In [12]:
# BERT model
class BERT:
  """A small BERT masked-language model for sudoku plus its training loop.

  Attributes:
    model: the BertForMaskedLM built in __init__ and updated by train().
  """

  def __init__(self, hid_size, hid_layers):
    """Build an untrained BertForMaskedLM.

    Args:
      hid_size: hidden size of the encoder. The number of attention heads
        is fixed at 10 below, so this is presumably expected to be a
        multiple of 10.
      hid_layers: number of hidden (transformer) layers.
    """
    config = BertConfig(
        vocab_size = 14,  # we align this to the tokenizer vocab_size
        max_position_embeddings = 83,  # presumably 81 cells plus [CLS] and [SEP]
        hidden_size = hid_size,
        num_attention_heads = 10,
        num_hidden_layers = hid_layers,
        type_vocab_size = 1
        )
    self.model = BertForMaskedLM(config)

  def train(self, custom_dataset, epochs = 1, scheduler = None, best_loss = 0):
    """Train self.model on custom_dataset.

    Args:
      custom_dataset: dataset whose batches are passed to the model as
        keyword arguments (so it must include the labels).
      epochs: number of passes over the dataset.
      scheduler: if truthy, a linear learning-rate schedule without warm-up
        is stepped after every optimizer step.
      best_loss: checkpoint threshold. When non-zero, the model is saved to
        './sudoku-bert' every time a batch loss falls below the best value
        seen so far (starting from this threshold). 0 disables saving.

    Returns:
      The loss tensor of the last processed batch.
    """
    train_dataloader = DataLoader(custom_dataset, shuffle=True, batch_size=32)

    # Use the GPU when one is present, otherwise fall back to the CPU
    # instead of failing on a hard-coded 'cuda:0'.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.model.to(device)
    self.model.train()

    # initialize optimizer
    # NOTE(review): transformers.AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; it is kept here so the optimizer defaults stay
    # unchanged.
    optim = AdamW(self.model.parameters(), lr=1e-4)

    # scheduler (only created on request; None means "no schedule")
    lr_scheduler = None
    if scheduler:
      num_training_steps = epochs * len(train_dataloader)
      lr_scheduler = get_scheduler(
          "linear",
          optimizer=optim,
          num_warmup_steps=0,
          num_training_steps=num_training_steps,
      )

    #  train model
    for epoch in range(epochs):
      print("Epoch:",epoch)
      for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = self.model(**batch)
        loss = outputs.loss

        # Checkpoint before the update, i.e. the weights that produced
        # this batch loss.
        if best_loss and loss.item() < best_loss:
          self.model.save_pretrained('./sudoku-bert')
          best_loss = loss.item()

        loss.backward()
        optim.step()
        if lr_scheduler is not None:
          lr_scheduler.step()
        optim.zero_grad()
    return loss

In [13]:
# hyperparameter Tuning using Optuna Framework
def objective(trial):
  """Optuna objective: train a BERT of the suggested size and report its loss.

  Uses the first 10,000 rows of the training frame `data` (column 0 =
  puzzles, column 1 = solutions), trains for the default single epoch and
  returns the loss of the last training batch.

  Args:
    trial: the Optuna trial used to sample "hidden" (20-400, step 20) and
      "hid_layers" (2-12, step 2).

  Returns:
    The final batch loss as a plain Python float.
  """
  x1 = data.iloc[:10000, 0].values
  y1 = data.iloc[:10000, 1].values
  puzzle_list = list(map(sudoku_to_tokens, x1))
  solution_list = list(map(sudoku_to_tokens, y1))

  # create sudoku dataset in torch format
  dataset = custom_dataset(puzzle_list, solution_list)

  # trial parameters for BERT
  hidden_size = trial.suggest_int("hidden", 20, 400, step = 20, log=False )
  hidden_layers = trial.suggest_int("hid_layers", 2, 12, step = 2, log=False)

  # create BERT model for trial
  bert = BERT(hidden_size, hidden_layers)

  #  train BERT model
  loss = bert.train(dataset)

  # Hand Optuna a float rather than a tensor, so the study does not depend
  # on an implicit tensor-to-float conversion.
  return float(loss)

In [None]:
# Run a 20-trial Optuna search that minimises the value returned by objective().
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

In [None]:
# Train BERT model using optimal parameters
# Tokenise the whole training frame: column 0 feeds the puzzles, column 1 the solutions.
puzzle_list = list(map(sudoku_to_tokens, data.iloc[:, 0].values))
solution_list = list(map(sudoku_to_tokens, data.iloc[:, 1].values))
dataset = custom_dataset(puzzle_list, solution_list)

# The values found by the Optuna study are not used here (line kept for
# reference); the model size is hard-coded instead.
# bert = BERT(study.best_params['hidden'], study.best_params['hid_layers'])
bert = BERT(400, 12)
# Two epochs with the linear LR schedule enabled. A checkpoint is written to
# ./sudoku-bert whenever a batch loss drops below the best seen so far,
# starting from a threshold of 2.0.
bert.train(dataset, epochs = 2, scheduler = True, best_loss = 2.0)

# Compress the saved model and move to mydrive
!zip -r /content/drive/MyDrive/sudoku-bert1.zip /content/sudoku-bert

In [15]:
!unzip /content/drive/MyDrive/sudoku-bert1.zip

Archive:  /content/drive/MyDrive/sudoku-bert1.zip
   creating: content/sudoku-bert/
  inflating: content/sudoku-bert/config.json  
  inflating: content/sudoku-bert/model.safetensors  
  inflating: content/sudoku-bert/generation_config.json  


In [18]:
# evaluate model
def evaluation(test_model, tokenizer, test_puzzle_list, test_solution_list):
  """Measure, per puzzle, the fraction of masked cells predicted correctly.

  Args:
    test_model: masked-LM model; called with the encoded puzzle and expected
      to return an object with a `logits` attribute.
    tokenizer: tokenizer providing `encode_plus` and `mask_token_id`.
    test_puzzle_list: puzzles as lists of words (empty cells as "[MASK]").
    test_solution_list: matching solutions as lists of words.

  Returns:
    A list with one float per puzzle: correctly predicted masked cells
    divided by the number of masked cells. A puzzle without any masked cell
    has nothing to predict and is scored 1.0.
  """
  # Make sure dropout is disabled so the predictions are deterministic.
  test_model.eval()

  val_summary = []
  for puzzle_words, solution_words in zip(test_puzzle_list, test_solution_list):
    new_inputs = tokenizer.encode_plus(puzzle_words, return_tensors="pt",is_split_into_words=True)
    new_labels = tokenizer.encode_plus(solution_words, return_tensors="pt",is_split_into_words=True)

    # retrieve index of [MASK]
    mask_token_index = (new_inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    if mask_token_index.numel() == 0:
      # Nothing to predict; avoid dividing by zero below.
      val_summary.append(1.0)
      continue

    # Only the logits are needed, so no labels are passed and no loss is
    # computed.
    with torch.no_grad():
      logits = test_model(**new_inputs).logits

    predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)
    correct_token_id = new_labels["input_ids"][0, mask_token_index]
    num_matching_masks = (correct_token_id == predicted_token_id).sum().item()
    val_summary.append(num_matching_masks / len(predicted_token_id))
  return val_summary

In [21]:
# load test dataset
# Only the first 100 rows of the held-out frame are evaluated.
test_puzzle_list = list(map(sudoku_to_tokens, test.iloc[:100, 0].values))
test_solution_list = list(map(sudoku_to_tokens, test.iloc[:100, 1].values))
# test_dataset = custom_dataset(test_puzzle_list, test_solution_list)
# test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=32)

# load pretrained model and tokenizer
# NOTE: 'content/sudoku-bert' is a relative path (no leading slash), so it is
# resolved against the current working directory, while the tokenizer path
# is absolute.
test_model = BertForMaskedLM.from_pretrained('content/sudoku-bert')
tokenizer = BertTokenizerFast.from_pretrained('/content/tokenizer')

# evaluate the model on test dataset
# test_summary holds one match fraction (0-1) per evaluated puzzle.
test_summary = evaluation(test_model, tokenizer, test_puzzle_list, test_solution_list)

In [26]:
# Overall Results: best, worst and average per-puzzle match (fractions in 0-1)
from statistics import mean
best_match = max(test_summary)
print("Maximum % match"  ,best_match)
worst_match = min(test_summary)
print("Minimum % match:" ,worst_match)
average_match = mean(test_summary)
print("Overall % match:" ,average_match)

Maximum % match 0.8484848484848485
Minimum % match: 0.38
Overall % match: 0.6180411391881409


In [38]:
def test_sudoku_sample(puzzle_str):
  """Solve one puzzle with the saved model, print the grid and its validity.

  Args:
    puzzle_str: the puzzle as a string of digits, "0" marking empty cells.

  The model's prediction for every empty cell is written into the puzzle,
  the completed grid is printed with display_sudoku, and the result of
  check_correctness is printed after it.
  """
  # load pretrained model and tokenizer
  test_model = BertForMaskedLM.from_pretrained('content/sudoku-bert')
  tokenizer = BertTokenizerFast.from_pretrained('/content/tokenizer')

  # format puzzle string into tokens
  puzzle_token = sudoku_to_tokens(puzzle_str)
  new_inputs = tokenizer.encode_plus(puzzle_token, return_tensors="pt",is_split_into_words=True)

  # predict the solution
  with torch.no_grad():
    logits = test_model(**new_inputs).logits

  # retrieve index of [MASK]
  mask_token_index = (new_inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
  predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)

  # form sudoku solution from masks
  # Map ids straight to vocabulary words. Decoding to one string and
  # splitting on spaces yields [""] when there is no mask at all, which then
  # fails the word lookup.
  mask_tokens = tokenizer.convert_ids_to_tokens(predicted_token_id.tolist())
  mask_str = tokens_to_sudoku(mask_tokens)
  solution = fill_masks(tokens_to_sudoku(puzzle_token), mask_str)
  display_sudoku(solution)
  print("Correct?:", check_correctness(solution))


In [39]:
# Solve the first held-out puzzle. .iloc[0] selects it by position instead of
# relying on the row labelled 0 being part of the test split.
test_sudoku_sample(test['puzzle'].iloc[0])

---------------------
2 7 9 | 7 1 8 | 2 4 3 
2 4 9 | 7 1 9 | 6 1 8 
8 2 9 | 6 3 4 | 9 5 7 
---------------------
7 9 4 | 3 5 2 | 8 8 6 
3 5 8 | 4 6 1 | 1 2 9 
2 2 6 | 8 9 7 | 5 3 4 
---------------------
4 8 5 | 2 7 6 | 4 9 1 
9 3 2 | 1 8 6 | 7 6 5 
4 1 7 | 9 4 5 | 8 6 2 
---------------------
Correct?: False
