In [36]:
import numpy as np
import torch
from torch.autograd import Variable
from torch import nn
import loader
from threes import *
import time

# Action identifiers. train() appends each one as an extra column to the
# successor-state features before querying the Q-network.
MOVES = [0, 1, 2, 3]
# When True, train() moves the network to the GPU via model.network.cuda().
CUDA = False

# Path passed to model.save_parameters() when train() finishes or is interrupted.
FILENAME = "saved_parameters"
# Number of input features of the first Linear layer of the Q-network.
INPUT_SIZE = 20
# Width of the single hidden layer of the Q-network.
HIDDEN_SIZE = 256
# Discount factor multiplying the best successor Q-value in the training target.
GAMMA = 0.8


def _bellman_targets(model, batch):
    """Compute the Q-learning regression targets for one batch of transitions.

    Column layout of ``batch`` (as used by the training loop):
    ``[:INPUT_SIZE]`` network input, ``[INPUT_SIZE]`` reward,
    ``[INPUT_SIZE + 1:]`` successor-state features.

    Returns a 1-D numpy array ``reward + GAMMA * max_a Q(successor, a)``.
    """
    reward_col = INPUT_SIZE
    rewards = batch[:, reward_col]
    future = batch[:, reward_col + 1:]
    n_rows = batch.shape[0]

    # Q-value of every action in every successor state.
    future_scores = np.zeros((n_rows, len(MOVES)))
    for i, move in enumerate(MOVES):
        candidates = np.hstack((future, np.full((n_rows, 1), move)))
        future_scores[:, i] = model.Q(candidates).ravel()

    for i, row in enumerate(future):
        game = Threes(save_game=False, data=row.tolist())
        # Illegal moves must never win the max below.
        # NOTE(review): assumes MoveEnum iterates in the same order as MOVES -- confirm.
        for j, move in enumerate(MoveEnum):
            if not game.canMove(move):
                future_scores[i, j] = float('-inf')
        if not game.getPossibleMoves():
            # Terminal successor: every entry is -inf at this point, so fall
            # back to the row's own reward. Only row i may be touched; the
            # previous slice `i:` also overwrote all later rows of the batch.
            # NOTE(review): a terminal state conventionally contributes 0
            # future value; the original fallback value is kept -- confirm.
            future_scores[i, :] = rewards[i]

    return rewards + GAMMA * np.max(future_scores, axis=1)


def train(model, data_loaders, optimizer, num_epochs=500, log_every=100, verbose=True,
          batch_size=30):
    """Fit ``model`` with Q-learning on batches drawn from ``data_loaders``.

    Args:
        model: object exposing ``network``, ``Q``, ``loss`` and ``save_parameters``.
        data_loaders: object whose ``get(model, batch_size)`` returns a 2-D
            numpy array of transitions (layout described in ``_bellman_targets``).
        optimizer: torch optimizer over the network parameters.
        num_epochs: number of minibatch updates to perform.
        log_every: print the loss every this many updates.
        verbose: enable progress output.
        batch_size: number of transitions requested per update.

    A KeyboardInterrupt stops training early; in either case the current
    parameters are saved to ``FILENAME``.
    """
    if CUDA:
        model.network.cuda()

    if verbose:
        print(u'Training the model!')
        print(u'Interrupt at any time to get current model')
    try:
        iter_ = 0
        while iter_ < num_epochs:
            iter_ += 1
            x = data_loaders.get(model, batch_size)
            y = _bellman_targets(model, x)
            x = x[:, :INPUT_SIZE]
            # x and y are numpy arrays here and have no .cuda(); tensors are
            # only created inside model.Q / model.loss, so any device transfer
            # has to happen there.

            optimizer.zero_grad()
            out = model.Q(x, as_variable=True)
            loss = model.loss(out, y)
            loss.backward()
            optimizer.step()

            if iter_ % log_every == 0 and verbose:
                print(u"Minibatch {0: >6}  | loss {1: >5.2f} ".format(iter_, loss.data[0]))

    except KeyboardInterrupt:
        # Deliberate: interrupting keeps the partially trained model.
        pass
    model.save_parameters(FILENAME)


class QLearningNet(object):
    """Thin wrapper pairing a torch network with its loss criterion.

    Accepts array-like inputs (numpy arrays or nested lists), converts them to
    float tensors and places them on the same device as the network.
    """

    def __init__(self, network, criterion):
        self.network = network
        self.criterion = criterion

    def _on_cuda(self):
        """Return True when the network's parameters live on the GPU."""
        for param in self.network.parameters():
            return param.is_cuda
        # A network without parameters is treated as CPU-resident.
        return False

    def Q(self, batch, as_variable=False):
        """Evaluate the network on ``batch``.

        Args:
            batch: array-like of input rows.
            as_variable: when True return the raw network output (keeps the
                autograd graph, used for training); otherwise return a numpy
                array.
        """
        batch = Variable(torch.FloatTensor(batch), requires_grad=False)
        if self._on_cuda():
            batch = batch.cuda()
        # Call the module rather than .forward() so registered hooks run.
        out = self.network(batch)
        if as_variable:
            return out
        return out.data.cpu().numpy()

    def loss(self, out, y):
        """Apply the criterion to network output ``out`` and array-like target ``y``."""
        target = torch.FloatTensor(y)
        # The network emits (N, 1) while targets usually arrive as (N,).
        # Without reshaping, newer PyTorch broadcasts the pair to (N, N) and
        # silently computes the wrong loss.
        if target.size() != out.size() and target.numel() == out.data.numel():
            target = target.view(out.size())
        if out.is_cuda:
            target = target.cuda()
        return self.criterion(out, Variable(target, requires_grad=False))

    def save_parameters(self, filename):
        """Write the network's state_dict to ``filename``."""
        torch.save(self.network.state_dict(), filename)

    def load_parameters(self, filename):
        """Restore the network's state_dict from ``filename``."""
        self.network.load_state_dict(torch.load(filename))


# Q-network: one tanh hidden layer mapping an INPUT_SIZE-wide row to a single Q-value.
network = nn.Sequential(
    nn.Linear(INPUT_SIZE, HIDDEN_SIZE),
    nn.Tanh(),
    nn.Linear(HIDDEN_SIZE, 1),
)
criterion = nn.MSELoss()
q_learning_net = QLearningNet(network, criterion)

# Explicitly mark every weight as trainable before handing them to the optimizer.
for p in network.parameters():
    p.requires_grad = True

optimizer = torch.optim.Adam(network.parameters(), lr=0.001)
data_loader = loader.Loader()

# Runs the full training loop; parameters are saved when it ends or is interrupted.
train(q_learning_net, data_loader, optimizer, num_epochs=5000)


Training the model!
Interrupt at any time to get current model
Minibatch    100  | loss 338.63 
Minibatch    200  | loss 275.48 
Minibatch    300  | loss 81.77 
Minibatch    400  | loss 308.51 
Minibatch    500  | loss 38.72 
Minibatch    600  | loss 76.97 
Minibatch    700  | loss 97.23 
Minibatch    800  | loss 86.62 
Minibatch    900  | loss 101.59 
Minibatch   1000  | loss 201.03 
Minibatch   1100  | loss 16.60 
Minibatch   1200  | loss 11.04 
Minibatch   1300  | loss 121.75 
Minibatch   1400  | loss 307.54 
Minibatch   1500  | loss 23.98 
Minibatch   1600  | loss 30.97 
Minibatch   1700  | loss 77.81 
Minibatch   1800  | loss 2152.51 
Minibatch   1900  | loss 59.92 
Minibatch   2000  | loss 39.02 
Minibatch   2100  | loss 232.08 
Minibatch   2200  | loss 241.06 
Minibatch   2300  | loss 30.20 
Minibatch   2400  | loss 246.46 
Minibatch   2500  | loss 75.93 
Minibatch   2600  | loss 78.67 
Minibatch   2700  | loss 246.60 
Minibatch   2800  | loss 118.58 
Minibatch   2900  | loss 28

In [37]:
def printer(curr_game):
    board = curr_game.stateInfo().board
    for ys in board:
        for el in ys:
            x = u"."
            if el != 0:
                x = el
            print u"{:>4}".format(x),
        print u""

if __name__ == u'__main__':
    # Let the trained Q-network play one game greedily, printing the board
    # and the Q-value of every legal move after each step.
    seed = int(time.time())
    random.seed(seed)
    game = Threes(save_game=False)
    file = u''  # placeholder; replaced by an open handle only when saving
    if game.save_game:
        filename = getFilename()
        file = open(filename, u'a+')
        file.write(unicode(seed) + u'\n')
        file.flush()
    try:
        printer(game)
        moves_dict = {u"w": MoveEnum.Up,
                      u"a": MoveEnum.Left,
                      u"s": MoveEnum.Down,
                      u"d": MoveEnum.Right}
        while True:
            # Game over once no direction is playable.
            if not any(game.canMove(m) for m in moves_dict.values()):
                break
            move = None
            best_result = float('-inf')
            for pos_move in game.getPossibleMoves():
                # Network input is the board features followed by the move id.
                res = q_learning_net.Q(np.array([np.append(game.data(), [pos_move.value])]))[0]
                print("Gra z Q{}".format(res))
                # `>=` keeps the last of equally scored moves.
                if res >= best_result:
                    best_result = res
                    move = pos_move
            m = move
            if game.canMove(m):
                if game.save_game:
                    saveState(game, m, file)
                game.makeMove(m)
            else:
                print(u"THE MOVE IS NOT VALID!")
                # The board did not change and move selection is deterministic,
                # so retrying would pick the same move forever.
                break
            print("")
            printer(game)
    finally:
        # Release the save file if one was opened; the u'' placeholder has no close().
        if hasattr(file, 'close'):
            file.close()

   1    .    3    2 
   3    1    .    . 
   2    3    2    1 
   .    .    .    . 
Gra z Q[ 4.30305672]
Gra z Q[ 4.6611743]
Gra z Q[ 4.99688387]
Gra z Q[ 5.30932999]

   .    2    .    . 
   1    .    3    2 
   3    1    .    . 
   2    3    2    1 
Gra z Q[ 4.04160023]
Gra z Q[ 4.24496841]
Gra z Q[ 4.52504492]
Gra z Q[ 4.86699677]

   .    .    1    . 
   1    2    .    . 
   3    1    3    2 
   2    3    2    1 
Gra z Q[ 2.97743607]
Gra z Q[ 3.24030399]
Gra z Q[ 3.51753998]
Gra z Q[ 3.79852581]

   .    .    .    3 
   1    .    1    . 
   3    3    3    . 
   2    3    2    3 
Gra z Q[ 5.20824099]
Gra z Q[ 5.36664867]
Gra z Q[ 5.51828289]
Gra z Q[ 5.68182611]

   .    .    .    1 
   1    .    1    3 
   3    .    3    . 
   2    6    2    3 
Gra z Q[ 5.49327707]
Gra z Q[ 5.58418751]
Gra z Q[ 5.74870253]
Gra z Q[ 5.96386147]

   .    .    .    1 
   1    .    1    1 
   3    .    3    3 
   2    6    2    3 
Gra z Q[ 5.10374308]
Gra z Q[ 5.14557314]
Gra z Q[ 5.22334909]
Gra z Q[ 

In [28]:
# Dump every parameter tensor of the Q-network for inspection.
for param in q_learning_net.network.parameters():
    print(param)

Parameter containing:

Columns 0 to 9 
-0.1849  0.0213 -0.1573 -0.0685 -0.1207  0.1017 -0.2361  0.0731 -0.2836  0.0781
 0.0960  0.1488 -0.1029  0.1888  0.1495 -0.0223 -0.1353  0.0044 -0.1556  0.1147
-0.2490 -0.1687 -0.2042 -0.0916 -0.2106  0.0158 -0.0681 -0.0254 -0.0871  0.1691

Columns 10 to 19 
 0.1223 -0.0231 -0.0666 -0.1115  0.0852  0.0037 -0.0201 -0.0836 -0.0796 -0.0603
-0.2443 -0.0347  0.0673 -0.0638 -0.2140 -0.2142 -0.1477  0.0585  0.1363 -0.1596
-0.0830  0.0658 -0.2568 -0.2638  0.1615 -0.1041  0.1587  0.0059 -0.0007  0.0746
[torch.FloatTensor of size 3x20]

Parameter containing:
-0.0179
-0.2151
-0.2373
[torch.FloatTensor of size 3]

Parameter containing:
-0.2833 -0.3474 -0.4299
[torch.FloatTensor of size 1x3]

Parameter containing:
1.00000e-02 *
 -2.6857
[torch.FloatTensor of size 1]



In [31]:
# Probe the network with one random 20-feature input scaled up to the range [0, 1e6).
r = np.random.rand(20) * 1000000
print(q_learning_net.Q(r))

[ 353753.65625]
