In [1]:
from mingpt.model import GPT
# Architecture of the move-sequence transformer, applied on top of minGPT's defaults.
model_config = GPT.get_default_config()
architecture = {
    # NOTE(review): presumably cleared so the explicit sizes below are used
    # instead of a named preset — confirm against mingpt.
    "model_type": None,
    "vocab_size": 61,  # token ids 0..60 occur in the move arrays
    "block_size": 60,  # context length; the inputs fed later are 59 tokens long
    "n_layer": 8,
    "n_embd": 512,
    "n_head": 8,
}
for field, value in architecture.items():
    setattr(model_config, field, value)
model = GPT(model_config)

number of parameters: 25.28M


In [2]:
from mingpt.model import GPT

In [3]:
# Element count summed over every parameter tensor registered on the model.
sum(weights.numel() for weights in model.parameters())

25313280

In [4]:
# Show the module tree (layer types and sizes) as the cell output.
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(61, 512)
    (wpe): Embedding(60, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=True)
          (c_proj): Linear(in_features=512, out_features=512, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_feat

In [5]:
from torch.utils.data import Dataset
import numpy as np
import torch as t
class MyDataset(Dataset):
    """Next-token-prediction dataset over a 2-D array of token ids stored in a ``.npy`` file.

    The array is read from disk once, at construction time, and kept in memory
    as a ``LongTensor``. Reloading the file on every ``__getitem__``/``__len__``
    call (as was done before) repeats the same disk read for every sample of
    every batch.

    Args:
        path: location of the ``.npy`` file; defaults to ``'moves.npy'`` so that
            ``MyDataset()`` keeps working unchanged.

    Raises:
        Whatever ``np.load`` raises (e.g. ``FileNotFoundError``) — now at
        construction time rather than on first access.
    """

    def __init__(self, path='moves.npy'):
        # Kept under its original attribute name for any code that reads it.
        self.files = path
        self.moves = t.from_numpy(np.load(path)).long()

    def __getitem__(self, index):
        """Return ``(inputs, targets)``: the row without its last token, and the
        same row shifted left by one token.

        Both tensors are views into the shared in-memory array, so callers
        should not modify them in place.
        """
        return self.moves[index, :-1], self.moves[index, 1:]

    def __len__(self):
        """Number of rows (sequences) in the array."""
        return len(self.moves)

In [6]:
# Build the training set and show how many sequences it contains.
data = MyDataset()
len(data)

100000

In [7]:
# Train `model` on the move dataset with minGPT's Trainer, then save the weights.
# Each dataset item is a pair of LongTensors (inputs, targets shifted by one token).
if __name__ == "__main__":
    from mingpt.trainer import Trainer

    train_dataset = data

    # Start from the Trainer defaults; the device setting is not overridden.
    train_config = Trainer.get_default_config()
    train_config.learning_rate = 5e-4
    train_config.max_iters = 10000
    train_config.batch_size = 32
    train_config.num_workers = 0
    trainer = Trainer(train_config, model, train_dataset)

    def batch_end_callback(trainer):
        """Print iteration time and training loss on every 10th iteration."""
        if trainer.iter_num % 10 != 0:
            return
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

    trainer.set_callback('on_batch_end', batch_end_callback)
    trainer.run()

    # Reached only once trainer.run() returns.
    t.save(model.state_dict(), 'model_state_dict.pth')

running on device cuda
iter_dt 0.00ms; iter 0: train loss 4.20526
iter_dt 979.02ms; iter 10: train loss 3.96961
iter_dt 1028.27ms; iter 20: train loss 3.89949
iter_dt 1086.61ms; iter 30: train loss 3.88097
iter_dt 932.04ms; iter 40: train loss 3.85949
iter_dt 982.88ms; iter 50: train loss 3.87000
iter_dt 942.01ms; iter 60: train loss 3.86847
iter_dt 1060.16ms; iter 70: train loss 3.83650
iter_dt 997.47ms; iter 80: train loss 3.84214
iter_dt 911.08ms; iter 90: train loss 3.79401
iter_dt 901.61ms; iter 100: train loss 3.78211
iter_dt 892.14ms; iter 110: train loss 3.76217
iter_dt 891.11ms; iter 120: train loss 3.73387
iter_dt 906.74ms; iter 130: train loss 3.67425


In [201]:
# Manual checkpoint of the current weights (same file the training cell writes).
checkpoint = model.state_dict()
t.save(checkpoint, 'model_state_dict.pth')

In [5]:
# Restore the saved weights into `model`, then move it to the GPU.
# The last expression returns the module, so its tree is shown as the cell output.
saved_weights = t.load('model_state_dict.pth')
model.load_state_dict(saved_weights)
model.to('cuda')

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(61, 512)
    (wpe): Embedding(60, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=True)
          (c_proj): Linear(in_features=512, out_features=512, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_feat

In [9]:
# Second GPT built from the same config; the next cell fills it from the saved checkpoint.
loaded_model = GPT(model_config)

number of parameters: 25.28M


In [10]:
# Fill `loaded_model` from the checkpoint, move it to the GPU and switch both
# models to evaluation mode.
loaded_model.load_state_dict(t.load('model_state_dict.pth'))
loaded_model.to('cuda').eval()
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(61, 512)
    (wpe): Embedding(60, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=512, out_features=1536, bias=True)
          (c_proj): Linear(in_features=512, out_features=512, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_feat

In [204]:
# Forward pass on the first training pair as a batch of one; called with targets,
# the model returns a (logits, loss) tuple.
sample_inputs, sample_targets = data[0]
loaded_model(sample_inputs.unsqueeze(dim=0).to(device = 'cuda'), sample_targets.unsqueeze(dim=0).to(device = 'cuda'))

(tensor([[[-0.0926, -1.9478, -1.6760,  ..., -0.9477, -0.9677,  0.6183],
          [-1.2568, -0.9349, -1.1919,  ..., -2.2407, -0.4905, -0.8273],
          [-1.1815, -1.6986, -0.9385,  ..., -1.3924, -0.6063, -1.5884],
          ...,
          [ 1.2539, -2.4169, -1.0164,  ...,  0.5840, -0.0607,  3.3853],
          [ 2.2742, -0.7422, -1.3572,  ..., -1.4860, -0.0146,  5.3410],
          [ 0.9980, -0.4837, -1.2252,  ..., -1.5212, -1.3533,  6.2726]]],
        device='cuda:0', grad_fn=<UnsafeViewBackward0>),
 tensor(2.8353, device='cuda:0', grad_fn=<NllLossBackward0>))

In [12]:
# Held-out token sequences as a LongTensor; show the first row.
test_moves = np.load('test_moves.npy')
test_data = t.tensor(test_moves, dtype=t.long)
test_data[0]

tensor([40, 41, 26, 20, 42, 48, 33, 25, 55, 43, 17,  8, 32, 39, 31, 56,  9, 29,
        28, 54, 13, 23, 29, 18, 50, 30, 57, 47, 53, 49, 34, 51, 58, 10, 46, 45,
         2, 12, 15, 37, 38, 14, 16, 24, 36, 11,  1, 52,  3, 35, 21,  4,  7, 22,
        59,  6,  0, 44,  5, 60])

In [13]:
# Mean loss over the first 100 held-out sequences, evaluated one sequence at a time.
total = 0
# Evaluation only: no_grad avoids recording an autograd graph for every forward
# pass. The forward values, and therefore the reported loss, are unchanged.
with t.no_grad():
    for i in range(100):
        inputs = test_data[i,:-1].unsqueeze(dim=0).to(device = 'cuda')
        targets = test_data[i,1:].unsqueeze(dim=0).to(device = 'cuda')
        # Index 1 of the model's return value is the loss.
        total += loaded_model(inputs, targets)[1].item()

print(f'test loss = {total/100}')

test loss = 2.7630322861671446


In [11]:
# Shape of the logits for held-out sequence 1, fed as a batch of one.
game_inputs = test_data[1,:-1].unsqueeze(dim=0).to(device = 'cuda')
game_targets = test_data[1,1:].unsqueeze(dim=0).to(device = 'cuda')
loaded_model(game_inputs, game_targets)[0].shape

torch.Size([1, 59, 61])

In [12]:
# Shape of the token-0 logit taken at every position of held-out sequence 1.
game_inputs = test_data[1,:-1].unsqueeze(dim=0).to(device = 'cuda')
game_targets = test_data[1,1:].unsqueeze(dim=0).to(device = 'cuda')
loaded_model(game_inputs, game_targets)[0][0,:,0].shape

torch.Size([59])

In [210]:
# Logits over all tokens at the first position of held-out sequence 0.
game_inputs = test_data[0,:-1].unsqueeze(dim=0).to(device = 'cuda')
game_targets = test_data[0,1:].unsqueeze(dim=0).to(device = 'cuda')
loaded_model(game_inputs, game_targets)[0][0,0,:]

tensor([-1.4936, -1.4131, -1.4290, -0.5553, -1.4629, -0.9871, -0.4276, -1.1516,
        -1.6386, -0.5513, -1.6766,  0.5470, -0.8714,  0.9708, -1.0177, -0.6782,
        -1.0333, -1.8114,  0.0361, -0.4909,  1.2373, -0.4617, -1.6881, -0.9667,
        -0.4150,  1.1750, -1.0983, -7.9347,  0.1663,  7.6877, -1.9586, -1.2159,
         1.4354,  0.7997, -1.1937, -1.9261, -1.4702,  0.8504,  0.4353,  8.1636,
        -1.1502,  7.6675, -1.7634, -1.7804, -0.8411, -1.8221, -0.4208,  2.5605,
         1.0385,  0.4574, -0.0823, -1.8840, -0.9458, -0.5064, -1.1361, -0.1512,
        -1.0553, -0.6979, -1.2378, -0.8959,  0.3069], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [208]:
def tokens_to_ij(input):
    """Lay a 61-element vector out on an 8x8 grid.

    Entries 0-59 fill the 64 flat cells in order, skipping cells 27, 28, 35 and
    36, which stay zero. The last entry (index 60) is dropped.

    Args:
        input: tensor of shape ``(61,)``.

    Returns:
        An ``(8, 8)`` tensor built from ``t.zeros``; values are copied into it,
        so it does not live on ``input``'s device.

    Raises:
        AssertionError: if ``input`` does not have shape ``(61,)``.
    """
    assert input.shape == (61,)
    # (destination cells, source entries) for the three contiguous runs.
    segments = (
        (slice(0, 27), slice(0, 27)),
        (slice(29, 35), slice(27, 33)),
        (slice(37, 64), slice(33, 60)),
    )
    grid = t.zeros(64)
    for cells, entries in segments:
        # Slice assignment copies across devices and dtypes.
        grid[cells] = input[entries]
    return grid.reshape(8, 8)

# Heat-map of the logits at the first position of held-out sequence 0, laid out
# on the 8x8 grid.
# plotly is imported here because the notebook's only other import of `px` sits
# in its final cell, so a top-to-bottom run would otherwise fail with NameError.
import plotly.express as px

first_inputs = test_data[0,:-1].unsqueeze(dim=0).to(device = 'cuda')
first_targets = test_data[0,1:].unsqueeze(dim=0).to(device = 'cuda')
x = tokens_to_ij(loaded_model(first_inputs, first_targets)[0][0,0,:])

fig = px.imshow(x.detach().numpy())

fig.show()

In [230]:
# Replay the first 59 input tokens of training pair 3.
# NOTE: board_state is defined in a later cell, which must be run first.
replay_tokens = data[3][0]
board_state(replay_tokens, 59)

tensor(33)
tensor(29)
tensor(19)
tensor(41)
tensor(42)
tensor(43)
tensor(22)
tensor(10)
tensor(50)
tensor(49)
tensor(11)
tensor(58)
tensor(20)
tensor(18)
tensor(40)
tensor(4)
tensor(32)
tensor(39)
tensor(46)
tensor(38)
tensor(59)
tensor(25)
tensor(2)
tensor(13)
tensor(34)
tensor(53)
tensor(37)
tensor(35)
tensor(55)
tensor(3)
tensor(51)
tensor(21)
tensor(48)
tensor(23)
tensor(28)
tensor(47)
tensor(15)
tensor(44)
tensor(31)
tensor(29)
tensor(1)
tensor(17)
tensor(36)
tensor(26)
tensor(6)
tensor(30)
tensor(5)
tensor(54)
tensor(24)
tensor(16)
tensor(52)
tensor(56)
tensor(14)
tensor(57)
tensor(45)
tensor(12)
tensor(9)
tensor(7)
tensor(8)


In [239]:
# Shape probe on a one-token prompt (not the last expression, so nothing is displayed).
loaded_model(t.tensor([[26]]).to('cuda'))[0][0,0,:].shape

# Heat-map of the logits at position 1 of the three-token prompt [26, 20, 41].
opening = t.tensor([[26,20,41]]).to('cuda')
position_logits = loaded_model(opening)[0][0,1,:]
fig = px.imshow(tokens_to_ij(position_logits).detach().numpy())
fig.show()

In [249]:
# First training pair: the second tensor is the first shifted left by one token.
data[0]

(tensor([26, 32, 41, 18,  9, 21, 37, 38, 31, 36, 46, 55, 54, 47, 44, 53, 14,  7,
         39, 30, 27, 28, 56,  0, 24, 25, 11, 50, 20, 57, 29, 40, 59, 16, 48, 42,
         45, 13, 19, 35, 49, 10, 12, 52, 34, 58,  5, 33,  1, 23, 51, 43, 17,  8,
          4,  3,  2, 22, 15]),
 tensor([32, 41, 18,  9, 21, 37, 38, 31, 36, 46, 55, 54, 47, 44, 53, 14,  7, 39,
         30, 27, 28, 56,  0, 24, 25, 11, 50, 20, 57, 29, 40, 59, 16, 48, 42, 45,
         13, 19, 35, 49, 10, 12, 52, 34, 58,  5, 33,  1, 23, 51, 43, 17,  8,  4,
          3,  2, 22, 15,  6]))

In [250]:
from othello import Othello



def token_to_ij(token):
    """Convert a token id into an ``(i, j)`` pair on an 8-wide grid.

    The id is shifted by 2 if it exceeds 26, and by a further 2 if the shifted
    value exceeds 34. The result is then split into ``(value // 8, value % 8)``.

    Args:
        token: a value exposing ``.copy()`` (e.g. a 0-d NumPy array); it is
            copied first, so the caller's object is left untouched.

    Returns:
        Tuple ``(i, j)``.
    """
    cell = token.copy()
    # Two successive gaps, each two cells wide.
    for last_before_gap in (26, 34):
        if cell > last_before_gap:
            cell += 2
    i, j = cell // 8, cell % 8
    return (i, j)


def board_state(tokens, turn):
    """Replay the first ``turn`` tokens on a new ``Othello`` game and display it.

    Each token is printed before it is played. Tokens are converted with
    ``token_to_ij`` and passed to ``game.move``.

    Args:
        tokens: indexable sequence whose items expose ``.numpy()`` (e.g. a 1-D
            torch tensor).
        turn: number of leading tokens to play.

    Returns:
        None; the result is whatever ``game.display()`` shows.
    """
    board = Othello()
    for step in range(turn):
        move_token = tokens[step]
        print(move_token)
        i, j = token_to_ij(move_token.numpy())
        board.move(i, j)
    board.display()
    
# For each turn, show the board after `turn` moves of the first training game,
# followed by a heat-map of the model's logits for the next token.
# plotly is imported here because the notebook's only other import of `px` sits
# in its final cell.
import plotly.express as px

# Not used by the loop as written. To inspect a held-out game instead, set
# `moves = test_data[run, :-1]`.
run = 7
moves = data[0][0]

# One forward pass yields the logits at every position, so it is computed once
# rather than once per turn. no_grad: this is inference only.
with t.no_grad():
    all_logits = loaded_model(moves.unsqueeze(dim=0).to(device = 'cuda'))[0][0]

# Start at turn 1. After `turn` moves the matching logits are at position
# turn - 1; at turn 0 that index is -1, which would plot the last position's
# logits next to the empty board.
for turn in range(1, 10):

    board_state(moves, turn)

    fig = px.imshow(tokens_to_ij(all_logits[turn - 1,:]).detach().numpy())

    fig.update_layout(width=400, height=400)

    fig.update_layout(
    margin=dict(l=0,r=0,b=0,t=0)
        )

    fig.show()

tensor(26)


tensor(26)
tensor(32)


tensor(26)
tensor(32)
tensor(41)


tensor(26)
tensor(32)
tensor(41)
tensor(18)


tensor(26)
tensor(32)
tensor(41)
tensor(18)
tensor(9)


tensor(26)
tensor(32)
tensor(41)
tensor(18)
tensor(9)
tensor(21)


tensor(26)
tensor(32)
tensor(41)
tensor(18)
tensor(9)
tensor(21)
tensor(37)


tensor(26)
tensor(32)
tensor(41)
tensor(18)
tensor(9)
tensor(21)
tensor(37)
tensor(38)


tensor(26)
tensor(32)
tensor(41)
tensor(18)
tensor(9)
tensor(21)
tensor(37)
tensor(38)
tensor(31)


In [130]:
# New game with a single move at (3, 2) — the pair token_to_ij returns for token 26.
game = Othello()
game.move(3,2)
game.display()

In [140]:
# Display the held-out token tensor loaded from test_moves.npy.
test_data

tensor([[40, 41, 26,  ..., 44,  5, 60],
        [33, 41, 26,  ..., 45, 52, 36],
        [26, 20, 21,  ...,  3, 56, 44],
        ...,
        [33, 39, 18,  ..., 29,  2, 23],
        [26, 18, 10,  ...,  5, 43, 53],
        [33, 41, 26,  ..., 52, 57, 60]])

In [30]:
import plotly.express as px

# Plot the 8x8 grid stored in `x` (built in the tokens_to_ij cell).
heatmap = x.detach().numpy()
fig = px.imshow(heatmap)
fig.show()