In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch as T
import imageio
from tqdm import tqdm
import cv2
from IPython.display import HTML
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
boardSize = (12,15)   # Legt die Größe des Feldes fest
safeDist = 3

In [46]:
def showhist_map(hist):
        for i in range(len(hist)):
            if all(elm < boardSize[0] for elm in hist[int(i)]):
                h = hist[i].astype(np.int)
                
                if np.max(hist[i]) == 1: 
                    plt.imshow(hist[i].reshape(boardSize[0], boardSize[1]), cmap='hot', interpolation='nearest')
                else:
                    plt.imshow(np.zeros((boardSize[0], boardSize[1])))
                plt.show()
                
                
            
def showReward(h,elemMax=True):
    
    m = np.zeros(boardSize)
    for i in range(boardSize[0]):
        for j in range(boardSize[1]):
            m[i,j] , _= env.getRewardForField(i,j)
            if(m[i,j] > 30):
                if elemMax:
                    m[i,j] = 10
    
    return m.astype(np.int)


def fig2data ( fig ):
    fig.canvas.draw ( )

    w,h = fig.canvas.get_width_height()

    buf = np.frombuffer ( fig.canvas.tostring_rgb(), dtype=np.uint8 )
    buf.shape = ( h, w,3 )
 
    buf = np.roll ( buf, 3, axis = 2 )
    plt.close()
    return buf

def showhist_ani(hist):
    frame_array = []
    for i in range(len(hist)):
        if all(elm < boardSize[0] for elm in hist[int(i)]):
            h = hist[i].astype(np.int)
            fig = plt.figure()
            if np.max(hist[i]) == 1: 
                plt.imshow(hist[i].reshape(boardSize[0], boardSize[1]), cmap='hot', interpolation='nearest')
            else:
                plt.imshow(np.zeros((boardSize[0], boardSize[1])))
            frame_array.append(fig2data ( fig ))
    #

    w = imageio.get_writer('output.mp4', fps = 6,quality=6)
    for i in range(len(frame_array)):
        w.append_data(frame_array[i] )
    w.close()
    #return frame_array

def playVideo(path):
    before = """<video width="864" height="576" controls><source src="""
    end = """ type="video/mp4"></video>"""
    return HTML(before + path + end)


In [47]:
class ReplayBuffer():
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                    dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                        dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal

In [48]:
class DuelingLinearDeepQNetwork(nn.Module):
    def __init__(self, ALPHA, n_actions, name, input_dims, chkpt_dir=''):
        super(DuelingLinearDeepQNetwork, self).__init__()

        self.fc1 = nn.Linear(*input_dims, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 256)
        
        self.preV = nn.Linear(256, 128)
        self.V = nn.Linear(128, 1)
        
        self.preA = nn.Linear(256,128)
        self.A = nn.Linear(128, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=ALPHA)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_dqn')

    def forward(self, state):
        l1 = F.relu(self.fc1(state))
        l2 = F.relu(self.fc2(l1))
        l3 = F.relu(self.fc3(l2))
        prV = F.relu(self.preV(l3))
        V = self.V(prV)
        prA = F.relu(self.preA(l3))
        A = self.A(prA)

        return V, A

    def save_checkpoint(self):
        #print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        #print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file))

In [49]:
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir=''):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0
        print(mem_size)
        print(input_dims)
        print(n_actions)
        self.memory = ReplayBuffer(mem_size, input_dims)

        self.q_eval = DuelingLinearDeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name='model',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DuelingLinearDeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name='model_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],dtype=T.float).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            action = T.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                        if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_, A_s_ = self.q_next.forward(states_)

        V_s_eval, A_s_eval = self.q_eval.forward(states_)

        q_pred = T.add(V_s,
                        (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions]
        q_next = T.add(V_s_,
                        (A_s_ - A_s_.mean(dim=1, keepdim=True)))

        q_eval = T.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1,keepdim=True)))

        max_actions = T.argmax(q_eval, dim=1)

        q_next[dones] = 0.0
        q_target = rewards + self.gamma*q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

In [80]:
class game:
    def __init__(self, startPos, aimPos ,obstacles ,boardSize = boardSize ):
        self.startPos = np.copy(startPos)
        self.playerPos = startPos
        self.aim = aim(aimPos[0],aimPos[1])
        self.boardSize = boardSize
        self.obstacles = obstacles
        
        
    def reset(self):
        self.playerPos = self.startPos
        self.reward = 0

        return self.createMap().reshape(boardSize[0]*boardSize[1])   
        
    def step(self, action):
        done = False
        
        if(action == 0):
            self.playerPos[0] +=1
        if(action == 1):
            self.playerPos[0] -=1
        if(action == 2):
            self.playerPos[1] +=1
        if(action == 3):
            self.playerPos[1] -=1
        
        
        observ = self.createMap().reshape(boardSize[0]*boardSize[1])
        
        reward, done = self.getRewardForField(self.playerPos[0],self.playerPos[1])
        
        return observ, reward, done
            
    def createMap(self,plot=False):
        
        m = np.zeros(self.boardSize)
        for ob in self.obstacles:
            m[ob.x,ob.y] = 0.3
        m[self.aim.x,self.aim.y] = 0.9
        
        if(self.playerPos[0]>0 and self.playerPos[0]< boardSize[0] and
          self.playerPos[1]>0 and self.playerPos[1]< boardSize[1]):
            m[self.playerPos[0],self.playerPos[1]] = 1

        if plot:
            plt.imshow(m, cmap='hot', interpolation='nearest')
            plt.show()
        return m
    
    def checkBounds(self,p):
        if(p[0]<0):
            return -10000, True
        if(p[1]<0):
            return -10000, True
        if(p[0]>=boardSize[0]):
            return -10000, True
        if(p[1]>=boardSize[1]):
            return -10000, True
        return 0, False
    
    def distance(self, a, b,special=False):
        if(special):
            return np.sqrt(np.square(a[0]-b.x)+np.square(a[1]-b.y))
        else:
            return np.sqrt(np.square(a.x-b.x)+np.square(a.y-b.y))
        
    def getRewardForField(self, x, y): 
        done = False
        pos = [x,y]
        reward = 0
        reward -= 100*(self.distance(pos,self.aim,special=True))/(boardSize[1]+boardSize[0])**2
        rew , done = self.checkBounds(pos)
        reward += rew
        for ob in self.obstacles:
            reward -= 1000*np.exp(-(self.distance(pos,ob,special=True)*safeDist))
        
        edgeControl = 600
        if(pos[0]==0 or pos[0]== boardSize[0]-1):
            reward -= edgeControl
        if(pos[1]==0 or pos[1]== boardSize[1]-1):
            reward -= edgeControl
        if(x == self.aim.x and y == self.aim.y):
            reward =  MAX_REWARD
            done = True
        return reward, done

class aim:
    def __init__(self,x,y):
        self.x = x
        self.y = y        

        
class obstacle:
    def __init__(self,x,y):
        self.x = x
        self.y = y 

In [82]:
##### Here

MAX_REWARD = 20000
#gameStates = np.zeros((timeSteps, boardSize[0],boardSize[1] ))

agent = Agent(gamma=0.99, epsilon=1.0, lr=1*5e-3, n_actions=4, input_dims=[boardSize[0]*boardSize[1]], 
              mem_size=100000, batch_size=64, eps_min=0.01, eps_dec=5*1e-5, replace=100)
load_checkpoint = False

if load_checkpoint:
        agent.load_models()
        
scores, eps_hist = [], []
n_games = 10000

num_obstaces = np.random.randint(15,25)
obstacles = []
for i in range(num_obstaces):
    obstacles.append(obstacle(np.random.randint(1,boardSize[0]),np.random.randint(1,boardSize[1])))
    
score_saver = []
avg_score_saver = []
ddqn_scores = []
eps_history = []
savedGames = []
MAX_ITER = 60
prec = 40
reached = 0
reached_last_100 = 0
for i in tqdm(range(n_games)):
    score = 0
    done = False
    aimPos = np.array([np.random.randint(6,boardSize[0]-2), np.random.randint(int(boardSize[1]/2+2),boardSize[1]-2)])
    playerpos = np.array([np.random.randint(2,4),np.random.randint(2,5)])            
    env = game(playerpos,aimPos,obstacles)
    observation = env.reset()
    

    game_sav = []
    iteration = 0
    while not done:
        iteration +=1

        action = agent.choose_action(observation)
        observation_, reward, done = env.step(action)
        if reward == MAX_REWARD:
            reached += 1
            if i > (n_games -100):
                reached_last_100 += 1
                
        score += reward
        agent.store_transition(observation, action,
                                    reward, observation_, int(done))
        
        agent.learn()
        observation = observation_

        game_sav.append(observation_)
        eps_history.append(agent.epsilon)

        ddqn_scores.append(score)
        if(i > 20):
            avg_score = np.mean(ddqn_scores[-10])
        
        if iteration == MAX_ITER:
            done = True

        if i % 10 == 0 and i > 0:
            agent.save_models()
    score_saver.append(score)
    if(i > 20):
        avg_score_saver.append(avg_score)
        if i % int(n_games/prec) == int(n_games/prec)-1:
            print('episode: ', i,'score: %.2f' % score,
                  ' average score %.2f' % avg_score,
                  'Epsilon %.3f' % agent.epsilon,
                  'Erreicht: ' +str(reached)) 
    savedGames.append(game_sav)
    
   
print("")
print(str(n_games) + " Spieldurchläufe: "  +str(reached) + " mal Ziel erreicht, Quote = " + str(reached/n_games))
print("Quote der letzten 100 Durchläufe " + str(reached_last_100/100) )
plt.plot(score_saver)
plt.show()
plt.plot(avg_score_saver)
plt.show()

  0%|          | 0/10000 [00:00<?, ?it/s]

100000
[180]
4


  2%|▎         | 250/10000 [01:32<1:46:58,  1.52it/s]

episode:  249 score: -6788.02  average score -6760.62 Epsilon 0.525 Erreicht: 22


  4%|▎         | 368/10000 [02:34<1:07:16,  2.39it/s]


KeyboardInterrupt: 

In [None]:
 plt.plot(score_saver[-300:])

In [None]:
(showReward(savedGames[-1][0],elemMax=False)).astype(np.int)

In [None]:
plt.imshow(showReward(savedGames[-2][0]).reshape(boardSize[0], boardSize[1]), cmap='hot', interpolation='nearest')

In [None]:
frame_array = showhist_ani(savedGames[-np.random.randint(1,10)])
playVideo('output.mp4')