In [None]:
try:
  from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM
except:
  !pip -q install transformers
  from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, AdamW, get_scheduler
import torch
import pandas as pd
from torch.utils.data import DataLoader
import os

[K     |████████████████████████████████| 4.7 MB 33.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 57.0 MB/s 
[K     |████████████████████████████████| 120 kB 55.7 MB/s 
[?25h

In [None]:
!pip -q install datasets
from datasets import Dataset

[K     |████████████████████████████████| 365 kB 24.0 MB/s 
[K     |████████████████████████████████| 212 kB 20.6 MB/s 
[K     |████████████████████████████████| 115 kB 60.9 MB/s 
[K     |████████████████████████████████| 127 kB 61.2 MB/s 
[?25h

In [None]:
!pip -q install optuna
import optuna

[K     |████████████████████████████████| 348 kB 31.7 MB/s 
[K     |████████████████████████████████| 209 kB 70.2 MB/s 
[K     |████████████████████████████████| 81 kB 11.4 MB/s 
[K     |████████████████████████████████| 78 kB 8.6 MB/s 
[K     |████████████████████████████████| 112 kB 27.5 MB/s 
[K     |████████████████████████████████| 49 kB 6.2 MB/s 
[K     |████████████████████████████████| 147 kB 75.4 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [None]:
# Word for each sudoku digit; position 0 (a blank cell) maps to the [MASK] token.
tokens = "[MASK] one two three four five six seven eight nine".split()

## Helper Functions

def sudoku_to_tokens(sudoku_str):
  """Translate a string of digits into the matching list of word tokens.

  Each character is converted with int() and used as an index into `tokens`,
  so "0" becomes "[MASK]" and "1".."9" become "one".."nine".
  """
  words = []
  for digit in sudoku_str:
    words.append(tokens[int(digit)])
  return words

def tokens_to_sudoku(token_list):
  """Inverse of sudoku_to_tokens: turn word tokens back into a digit string.

  Raises ValueError (from list.index) for a token that is not in `tokens`.
  """
  digits = [str(tokens.index(word)) for word in token_list]
  return "".join(digits)

def display_sudoku(sudoku_str):
  """Print an 81-character sudoku string as a 9x9 grid.

  Cells equal to "0" are shown as blanks. A "| " separator follows the 3rd and
  6th column, and a 21-dash rule is printed before the grid and after every
  third row.
  """
  rule = "-" * 21
  print(rule)
  for row in range(9):
    for col in range(9):
      cell = sudoku_str[row * 9 + col]
      print("  " if cell == "0" else cell + " ", end="")
      if col in (2, 5):
        print("| ", end="")
    print("")
    # Rows 2, 5 and 8 close a band of boxes.
    if row % 3 == 2:
      print(rule)

def fill_masks(sudoku_str, mask_str):
  """Fill the blank cells of a puzzle with successive items of mask_str.

  Cells whose int() value is non-zero are copied through unchanged; each zero
  cell consumes the next item from mask_str, in order. Surplus items in
  mask_str are ignored; running out of items raises StopIteration.
  """
  replacements = iter(mask_str)
  filled = []
  for cell in sudoku_str:
    is_blank = int(cell) == 0
    filled.append(next(replacements) if is_blank else cell)
  return "".join(filled)

def check_correctness(sudoku_str):
  """Return True only if sudoku_str is a completely and validly solved sudoku.

  The grid is read row by row (81 cells). Every row, column and 3x3 box must
  contain exactly the digits 1-9. Counting distinct symbols alone is not
  enough: a grid that still contains "0" (blank) cells could otherwise pass.

  Args:
    sudoku_str: sequence of 81 cells (normally a string of digits).

  Returns:
    bool: False for grids of the wrong length, with blanks, or with a
    repeated digit in any row, column or box; True otherwise.
  """
  cells = [str(c) for c in sudoku_str]
  if len(cells) != 81:
    return False

  digits = set("123456789")
  rows = [cells[r * 9:(r + 1) * 9] for r in range(9)]
  cols = [cells[c::9] for c in range(9)]
  # Box (br, bc) starts at row br*3, column bc*3.
  boxes = [[cells[(br * 3 + y) * 9 + bc * 3 + x] for y in range(3) for x in range(3)]
           for br in range(3) for bc in range(3)]
  # Each unit has 9 cells, so equality with `digits` means a permutation of 1-9.
  return all(set(unit) == digits for unit in rows + cols + boxes)

In [None]:
# Mount Google Drive under /content/drive (Colab only); a later cell writes the
# zipped model checkpoint to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
# Uploading Kaggle Json
# Opens the Colab upload dialog; `uploaded` is indexed by filename and the
# loop below echoes each name with the length of its content as confirmation.
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
# Then move kaggle.json into the folder where the API expects to find it.
# chmod 600 limits the credentials file to owner read/write.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 62 bytes


In [None]:
# Downloading sudoku dataset from kaggle
# Presumably authenticates with the kaggle.json placed in ~/.kaggle/ by the
# previous cell - verify if the download is rejected.
!kaggle datasets download -d rohanrao/sudoku
# Extracts sudoku.csv into the current working directory (see cell output).
!unzip sudoku.zip

Downloading sudoku.zip to /content
 98% 611M/620M [00:05<00:00, 156MB/s]
100% 620M/620M [00:05<00:00, 127MB/s]
Archive:  sudoku.zip
  inflating: sudoku.csv              


In [None]:
# Alternative source kept for reference: "/content/drive/MyDrive/sudoku2.csv"
path = "/content/sudoku.csv"
# Only the first 300000 rows of the CSV are loaded.
df = pd.read_csv(path, nrows=300000)
# 80% of the rows (sampled with a fixed seed) become the training split ...
data = df.sample(frac=0.8, random_state=200)
# ... and the rows that were not sampled form the held-out split.
test = df.drop(index=data.index)

In [None]:
# Vocabulary file for the tokenizer: 5 special tokens followed by the nine
# digit words, one entry per line (14 entries in total).
vocab = "[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\none\ntwo\nthree\nfour\nfive\nsix\nseven\neight\nnine"
os.makedirs("/content/tokenizer", exist_ok=True)
with open("/content/tokenizer/vocab.txt", "w") as f:
  f.write(vocab)


# Column 0 is used as the puzzle and column 1 as its solution; both are
# converted from digit strings to word-token lists for the tokenizer.
q = data.iloc[:, 0].values
s = data.iloc[:, 1].values
x1 = [sudoku_to_tokens(puzzle) for puzzle in q]
y1 = [sudoku_to_tokens(solved) for solved in s]

In [None]:
# Hyperparameter Tuning using Optuna Framework

def objective(trial):
  """Optuna objective: train a freshly initialised BERT for one pass and score it.

  The trial chooses the hidden size (20..400, step 20) and the number of
  encoder layers (2..12, step 2). The model is trained once over the first
  50000 rows of the training split with AdamW (lr=1e-4, batch size 32).

  Args:
    trial: the optuna trial used to sample the hyperparameters.

  Returns:
    float: masked-LM loss of the last training batch (lower is better).
    NOTE(review): a single-batch loss on a shuffled loader is a noisy score;
    consider averaging over the final batches if trials rank inconsistently.
  """
  # Fixed subset of the training split keeps every trial comparable and cheap.
  q1 = data.iloc[:50000, 0].values
  s1 = data.iloc[:50000, 1].values
  x2 = list(map(sudoku_to_tokens, q1))
  y2 = list(map(sudoku_to_tokens, s1))

  tokenizer = BertTokenizerFast.from_pretrained('/content/tokenizer')
  hidden_size = trial.suggest_int("hidden", 20, 400, step = 20, log=False)
  hidden_layers = trial.suggest_int("hid_layers", 2, 12, step = 2, log=False)
  config = BertConfig(
      vocab_size=14,               # number of entries written to vocab.txt
      max_position_embeddings=83,  # encoded sequences are 83 tokens long
      hidden_size=hidden_size,     # always a multiple of 20, hence of the 10 heads
      num_attention_heads=10,
      num_hidden_layers=hidden_layers,
      type_vocab_size=1
      )
  param_model = BertForMaskedLM(config)

  # Inputs carry [MASK] for blank cells; labels are the fully solved grids.
  inputs = tokenizer.batch_encode_plus(x2, return_tensors="pt",is_split_into_words=True)
  labels = tokenizer.batch_encode_plus(y2, return_tensors="pt",is_split_into_words=True)
  inputs['labels']=labels['input_ids']

  dataset = Dataset.from_dict(inputs)
  dataset.set_format("torch")
  train_dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

  # Use the GPU when there is one, but do not crash on a CPU-only runtime.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  param_model.to(device)

  # activate training mode and initialize the optimizer
  param_model.train()
  optim = AdamW(param_model.parameters(), lr=1e-4)

  for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = param_model(**batch)
      loss = outputs.loss
      loss.backward()
      optim.step()
      optim.zero_grad()
  # Return a plain float so the study does not hold on to a device tensor
  # (and its autograd graph) from this trial.
  return loss.item()

In [None]:
# Run 20 trials of `objective` and keep the hyperparameters with the lowest
# returned value; the study lives in memory only (see the log line below).
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

[32m[I 2022-09-08 11:34:35,150][0m A new study created in memory with name: no-name-dd3c318a-f94a-4ab1-9afa-3a3cad2d98a9[0m
[32m[I 2022-09-08 11:35:44,029][0m Trial 0 finished with value: 1.0670064687728882 and parameters: {'hidden': 100, 'hid_layers': 2}. Best is trial 0 with value: 1.0670064687728882.[0m
[32m[I 2022-09-08 11:37:55,306][0m Trial 1 finished with value: 1.0747731924057007 and parameters: {'hidden': 180, 'hid_layers': 6}. Best is trial 0 with value: 1.0670064687728882.[0m
[32m[I 2022-09-08 11:39:32,040][0m Trial 2 finished with value: 0.3555595874786377 and parameters: {'hidden': 340, 'hid_layers': 2}. Best is trial 2 with value: 0.3555595874786377.[0m
[32m[I 2022-09-08 11:43:58,421][0m Trial 3 finished with value: 1.0928794145584106 and parameters: {'hidden': 220, 'hid_layers': 12}. Best is trial 2 with value: 0.3555595874786377.[0m
[32m[I 2022-09-08 11:46:53,216][0m Trial 4 finished with value: 0.3889857828617096 and parameters: {'hidden': 400, 'hid_la

In [None]:
# Build the final model with the best hyperparameters found by the study.
# Requires `study` from the tuning cell to have finished.
tokenizer = BertTokenizerFast.from_pretrained('/content/tokenizer')
config = BertConfig(
    vocab_size=14,  # we align this to the tokenizer vocab_size
    # 83 matches the encoded sequence length seen in the batch-shape cell.
    max_position_embeddings=83,
    hidden_size=study.best_params['hidden'],
    num_attention_heads=10,
    num_hidden_layers=study.best_params['hid_layers'],
    type_vocab_size=1    
    )
# Randomly initialised model (built from the config, not from a checkpoint).
model = BertForMaskedLM(config)

In [None]:
# activate training model
model.train()
# initialize optimizer
# Same learning rate (1e-4) as the one used inside the Optuna objective.
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
# Training using optimal parameters

# x1 holds the puzzles (blank cells become [MASK]) and y1 the solved grids;
# the solution token ids are attached as the masked-LM labels.
inputs = tokenizer.batch_encode_plus(x1, return_tensors="pt",is_split_into_words=True)
labels = tokenizer.batch_encode_plus(y1, return_tensors="pt",is_split_into_words=True)
inputs['labels']=labels['input_ids']
dataset = Dataset.from_dict(inputs)
# Make the dataset return torch tensors so DataLoader batches are tensors.
dataset.set_format("torch")
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

In [None]:
# Schedule length = one scheduler step per batch over all epochs
# (the training loop calls lr_scheduler.step() once per batch).
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps, attached to the AdamW optimizer.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# print(num_training_steps)

In [None]:
# Sanity check: take the first batch only (the loop exits immediately) and
# display the shape of every tensor in it.
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([32, 83]),
 'token_type_ids': torch.Size([32, 83]),
 'attention_mask': torch.Size([32, 83]),
 'labels': torch.Size([32, 83])}

In [None]:
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
# Select the GPU when one is available and fall back to the CPU otherwise, and
# report the device that is actually used instead of a hard-coded 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device:', device)
model.to(device)

Torch 1.12.1+cu113 CUDA 11.3
Device: cuda:0


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(14, 400, padding_idx=0)
      (position_embeddings): Embedding(83, 400)
      (token_type_embeddings): Embedding(1, 400)
      (LayerNorm): LayerNorm((400,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=400, out_features=400, bias=True)
              (key): Linear(in_features=400, out_features=400, bias=True)
              (value): Linear(in_features=400, out_features=400, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=400, out_features=400, bias=True)
              (LayerNorm): LayerNorm((400,), eps=1e-12, elementwise_affine=True)


In [None]:
# Training Loop
# Trains `model` for num_epochs passes over train_dataloader, checkpointing
# to ./sudoku-bert whenever a batch reaches a new lowest loss.

# Global batch counter, only used to throttle the progress printout.
i=0
model.train()
# Starting threshold: nothing is saved until a batch loss drops below 2.0.
best_loss = 2.0
for epoch in range(num_epochs):
  print("Epoch {}: ".format(epoch))
  for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      # The checkpoint is written BEFORE this batch's optimizer step, so the
      # saved weights are the ones that produced the recorded loss.
      # NOTE(review): best_loss tracks a single-batch loss, which is noisy; the
      # saved model is the one with the luckiest batch, not a validated best.
      if(loss.item() < best_loss):
        model.save_pretrained('./sudoku-bert')
        best_loss = loss.item()
      loss.backward()
      
      i=i+1
      # Report the current batch loss every 400 batches.
      if(i % 400 == 0):
        print("Loss: {}  ".format(loss))
      optim.step()
      lr_scheduler.step()
      optim.zero_grad()

Epoch 0: 
Loss: 1.120271921157837  
Loss: 1.066070795059204  
Loss: 1.0977455377578735  
Loss: 0.4316365420818329  
Loss: 0.32883527874946594  
Loss: 0.27505841851234436  
Loss: 0.33567774295806885  
Loss: 0.3259488642215729  
Loss: 0.2550203204154968  
Loss: 0.2629614770412445  
Loss: 0.2479988932609558  
Loss: 0.2497366964817047  
Loss: 0.26964271068573  
Loss: 0.2511368989944458  
Loss: 0.23546437919139862  
Loss: 0.21125775575637817  
Loss: 0.26414594054222107  
Loss: 0.21616625785827637  
Epoch 1: 
Loss: 0.25771161913871765  
Loss: 0.1707964688539505  
Loss: 0.288177490234375  
Loss: 0.26257002353668213  
Loss: 0.20742423832416534  
Loss: 0.20823460817337036  
Loss: 0.2602976858615875  
Loss: 0.1903720647096634  
Loss: 0.16187891364097595  
Loss: 0.14471352100372314  
Loss: 0.17658931016921997  
Loss: 0.20096434652805328  
Loss: 0.17462414503097534  
Loss: 0.1692923903465271  
Loss: 0.14717702567577362  
Loss: 0.1549849957227707  
Loss: 0.18263743817806244  
Loss: 0.13180172443389

In [None]:
# Lowest single-batch training loss seen, i.e. the loss of the last checkpoint save.
print(best_loss)

0.029785988852381706


In [None]:
# Zip the saved checkpoint directory into Google Drive. Entries are stored
# under content/sudoku-bert/ inside the archive (see the cell output).
!zip -r /content/drive/MyDrive/sudoku-bert1.zip /content/sudoku-bert

  adding: content/sudoku-bert/ (stored 0%)
  adding: content/sudoku-bert/pytorch_model.bin (deflated 7%)
  adding: content/sudoku-bert/config.json (deflated 47%)


In [None]:
# Held-out evaluation data: `test` contains the rows not sampled into `data`.
# Column 0 is used as the puzzle, column 1 as the solution.
puzz = test.iloc[:, 0].values
soln = test.iloc[:, 1].values
x2 = list(map(sudoku_to_tokens, puzz))
y2 = list(map(sudoku_to_tokens, soln))

# Encode puzzles as inputs and solved grids as labels, as done for training.
test_inputs = tokenizer.batch_encode_plus(x2, return_tensors="pt",is_split_into_words=True)
test_labels = tokenizer.batch_encode_plus(y2, return_tensors="pt",is_split_into_words=True)
test_inputs['labels'] = test_labels['input_ids']
test_dataset = Dataset.from_dict(test_inputs)
test_dataset.set_format("torch")
# NOTE(review): the evaluation cells visible in this notebook encode puzzles
# one at a time from x2/y2 and do not consume test_dataloader.
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=32)

In [None]:
# Load the trained checkpoint for evaluation. The notebook itself only writes
# the checkpoint to ./sudoku-bert (training loop) and a ZIP of it to Drive, so
# the unpacked Drive directory exists only if it was extracted by hand. Prefer
# it when present, otherwise fall back to the directory written by training.
checkpoint_dir = '/content/drive/MyDrive/sudoku-bert'
if not os.path.isdir(checkpoint_dir):
  checkpoint_dir = './sudoku-bert'
test_model = BertForMaskedLM.from_pretrained(checkpoint_dir)

In [None]:
# Encode one held-out puzzle (index 3300) and its solution for a manual spot check.
new_inputs = tokenizer.encode_plus(x2[3300], return_tensors="pt",is_split_into_words=True)
new_labels = tokenizer.encode_plus(y2[3300], return_tensors="pt",is_split_into_words=True)

In [None]:
# Forward pass without gradient tracking; passing labels makes the model also
# return the loss, which is printed below as output[0].
with torch.no_grad():
    output = test_model(**new_inputs, labels=new_labels['input_ids'])
    logits = output.logits

# retrieve index of [MASK]
mask_token_index = (new_inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Take the argmax over the digit tokens only. An argmax over the whole
# vocabulary can select [PAD]/[UNK]/[CLS]/[SEP]/[MASK], which are never valid
# cell values. The digit words are the last nine vocabulary entries, starting
# at the id of "one".
first_digit_id = tokenizer.convert_tokens_to_ids("one")
digit_logits = logits[0, mask_token_index][:, first_digit_id:]
predicted_token_id = digit_logits.argmax(dim=-1) + first_digit_id
print(output[0])
print(predicted_token_id)

tensor(0.0017)
tensor([ 8,  6, 11, 13, 12,  9, 13,  6,  5,  9,  8, 12,  8, 13, 11,  6,  6, 10,
         7, 11, 12, 13, 10,  7,  5,  7,  9,  6, 10, 12,  8,  5,  6,  6,  5,  9,
        10, 11,  8,  9, 12, 13, 10,  5])


In [None]:
# Decode the predicted ids into space-separated digit words.
mask_tokens = tokenizer.decode(predicted_token_id)
print(mask_tokens)
# why is there a [CLS] in there?
# -> decode() yields [CLS]/[SEP] whenever predicted_token_id contains a
#    special-token id; an argmax over the whole vocabulary does not exclude
#    them. The two replacements below only paper over that case by
#    substituting an arbitrary digit.
mask_tokens = mask_tokens.replace("[CLS]", "one") # just for testing
mask_tokens = mask_tokens.replace("[SEP]", "one") # just for testing
# Convert the words back to digits, one per masked cell, in order.
mask_str = tokens_to_sudoku(mask_tokens.split(" "))
print(mask_str)

# Put the predicted digits into the blank cells of puzzle 3300 and validate.
solution = fill_masks(tokens_to_sudoku(x2[3300]), mask_str)
display_sudoku(solution)
print("Correct?:", check_correctness(solution))

four two seven nine eight five nine two one five four eight four nine seven two two six three seven eight nine six three one three five two six eight four one two two one five six seven four five eight nine six one
42798592154849722637896313526841221567458961
---------------------
3 4 6 | 2 7 9 | 1 8 5 
7 9 2 | 1 5 8 | 6 4 3 
5 1 8 | 3 4 6 | 9 7 2 
---------------------
2 6 3 | 7 1 5 | 4 9 8 
8 5 9 | 6 2 4 | 3 1 7 
1 7 4 | 9 8 3 | 5 2 6 
---------------------
6 8 7 | 4 3 1 | 2 5 9 
9 2 1 | 5 6 7 | 8 3 4 
4 3 5 | 8 9 2 | 7 6 1 
---------------------
Correct?: True


In [None]:
# Show puzzle 3300 next to its reference solution for visual comparison.
given = x2[3300]
# Named given_str rather than `input` so the builtin input() is not shadowed
# for the rest of the session.
given_str = tokens_to_sudoku(given)
token_sudoku = y2[3300]
answer = tokens_to_sudoku(token_sudoku)
print(answer)
display_sudoku(given_str)
display_sudoku(answer)

346279185792158643518346972263715498859624317174983526687431259921567834435892761
---------------------
3   6 |       | 1     
7     |     8 | 6   3 
5 1   | 3   6 |       
---------------------
      |   1 5 | 4 9 8 
  5   |   2 4 |   1 7 
  7 4 | 9 8   |       
---------------------
6   7 |   3   |   5 9 
9     |       | 8 3 4 
  3   |     2 | 7     
---------------------
---------------------
3 4 6 | 2 7 9 | 1 8 5 
7 9 2 | 1 5 8 | 6 4 3 
5 1 8 | 3 4 6 | 9 7 2 
---------------------
2 6 3 | 7 1 5 | 4 9 8 
8 5 9 | 6 2 4 | 3 1 7 
1 7 4 | 9 8 3 | 5 2 6 
---------------------
6 8 7 | 4 3 1 | 2 5 9 
9 2 1 | 5 6 7 | 8 3 4 
4 3 5 | 8 9 2 | 7 6 1 
---------------------


In [None]:
import numpy as np

## Validate model
# For each held-out puzzle, compute the fraction of masked cells predicted
# correctly and keep a running mean of that per-puzzle accuracy.
# The accumulator is called accuracy_total so the builtin sum() is not shadowed.
accuracy_total = 0
elements = 0
for i, (x,y) in enumerate(zip(x2, y2)):
  new_inputs = tokenizer.encode_plus(x, return_tensors="pt",is_split_into_words=True)
  new_labels = tokenizer.encode_plus(y, return_tensors="pt",is_split_into_words=True)
  with torch.no_grad():
      output = test_model(**new_inputs, labels=new_labels['input_ids'])
      logits = output.logits

  # retrieve index of [MASK]
  mask_token_index = (new_inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
  if len(mask_token_index) == 0:
    # No blank cells: nothing to score, and the ratio below would divide by zero.
    continue
  predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
  correct_token_id = new_labels["input_ids"][0, mask_token_index]
  # (The predictions are compared as token ids; decoding them to text is not
  # needed for the metric, so the unused per-iteration decode was dropped.)
  num_matching_masks = np.count_nonzero(correct_token_id==predicted_token_id)
  percent_correct = num_matching_masks / len(predicted_token_id)
  accuracy_total += percent_correct
  elements += 1

  # Progress line every 100 puzzles: this puzzle's score and the running mean.
  if i % 100 == 0:
    print(f"{i:04}: {num_matching_masks} / {len(predicted_token_id)} = {percent_correct:03f} | average: {accuracy_total / elements:03f}")

0000: 43 / 44 = 0.977273 | average: 0.977273
0100: 44 / 45 = 0.977778 | average: 0.944623
0200: 41 / 42 = 0.976190 | average: 0.943196
0300: 40 / 43 = 0.930233 | average: 0.940204
0400: 37 / 37 = 1.000000 | average: 0.935534
0500: 39 / 39 = 1.000000 | average: 0.936724
0600: 36 / 49 = 0.734694 | average: 0.938336
0700: 40 / 47 = 0.851064 | average: 0.937918
0800: 40 / 40 = 1.000000 | average: 0.938179
0900: 44 / 44 = 1.000000 | average: 0.938619
1000: 32 / 32 = 1.000000 | average: 0.937994
1100: 45 / 45 = 1.000000 | average: 0.937159
1200: 39 / 39 = 1.000000 | average: 0.937630
1300: 37 / 39 = 0.948718 | average: 0.937413
1400: 47 / 47 = 1.000000 | average: 0.936751
1500: 39 / 39 = 1.000000 | average: 0.935934
1600: 26 / 26 = 1.000000 | average: 0.936240
1700: 37 / 37 = 1.000000 | average: 0.936696
1800: 43 / 43 = 1.000000 | average: 0.937327
1900: 43 / 43 = 1.000000 | average: 0.937654
2000: 30 / 30 = 1.000000 | average: 0.937600
2100: 45 / 47 = 0.957447 | average: 0.937985
2200: 41 /

KeyboardInterrupt: ignored