# DQN and DDQN methods

In [1]:
import math
import copy
import time
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from modelsummary import summary


from qugames import snake

class MemoryBuffer:
    """ Fixed-capacity ring buffer of transitions (s0, a0, s1, r1, done).

    All storage is preallocated as torch tensors. Once `capacity` transitions
    have been added, every new transition overwrites the oldest slot.
    """
    def __init__(self, capacity, state_shape):
        """
        capacity    - maximum number of stored transitions (must be positive)
        state_shape - shape of a single state tensor (any sequence of ints)
        """
        if capacity <= 0:
            # add() computes count % capacity, so a non-positive capacity can never work
            raise ValueError(f"capacity must be positive, got {capacity}")
        state_shape   = tuple(state_shape)   # also accept list / torch.Size
        self.capacity = capacity  # memory capacity (number of examples)
        self.count    = 0         # number of examples added (keeps growing past capacity)
        self.S0 = torch.empty( (capacity, ) + state_shape, dtype=torch.float32)
        self.S1 = torch.empty( (capacity, ) + state_shape, dtype=torch.float32)
        self.A0 = torch.empty( (capacity, 1),              dtype=torch.int64)
        self.R1 = torch.empty( (capacity, 1),              dtype=torch.float32)
        self.Dn = torch.empty( (capacity, 1),              dtype=torch.float32)

    def add(self, s0, a0, s1, r1, done):
        """ Store one transition (s0,a0,s1,r1,done), overwriting the oldest one when full """
        idx = self.count % self.capacity     # ring-buffer write position
        self.S0[idx] = s0
        self.S1[idx] = s1
        self.A0[idx] = a0
        self.R1[idx] = r1
        self.Dn[idx] = done
        self.count += 1

    def get(self, count):
        """ Return up to `count` random transitions as (S0, A0, S1, R1, Dn).

        Indices are drawn uniformly with replacement from the filled part of the
        buffer. An empty buffer (or count <= 0) yields tensors with zero rows.
        """
        stored = min(self.count, self.capacity)   # number of valid slots
        num    = min(count, stored)
        if num <= 0:
            # torch.randint rejects high=0, so build the empty index explicitly
            ids = torch.empty(0, dtype=torch.int64)
        else:
            ids = torch.randint(high = stored, size = (num,) )
        return self.S0[ids], self.A0[ids], self.S1[ids], self.R1[ids], self.Dn[ids]

#========================================================================================

class AgentModel(nn.Module):
    """ Convolutional neural network for Q(s,a) """
    def __init__(self, state_shape, nA = 5,
                 channels=(12,24,64), kernels=(8,3,3), strides=(8,1,1), paddings=(0,1,1),
                 pools=(2,2,2), dropout=0.2,  hidden = 128):
        """
        state_shape - (input channels, image_width, image_height), e.g. (3 * n_frames, w, h)
        nA          - number of actions = number of network outputs (Snake = 5)
        channels    - output channels of each conv layer
        kernels, strides, paddings - per-layer conv parameters (indexed like channels)
        pools       - per-layer max-pool size; values <= 1 mean "no pooling" for that layer
        dropout     - dropout probability, applied only after layers that have pooling
        hidden      - width of the fully connected hidden layer

        self.features holds the flattened size fed into the first Linear layer.
        """
        super().__init__()

        # list() lets callers pass tuples and keeps the argument itself untouched
        channels = [ state_shape[0] ] + list(channels)
        w, h     = state_shape[1], state_shape[2]
        layers   = []
        for i, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:])):
            k, s, p, pool = kernels[i], strides[i], paddings[i], pools[i]
            layers += [
                nn.Conv2d(c_in, c_out, kernel_size=k, stride=s, padding=p),
                nn.ReLU()]
            # spatial size after the convolution
            w = (w + 2*p - k) // s + 1
            h = (h + 2*p - k) // s + 1
            if pool > 1:
                layers += [
                    nn.MaxPool2d(kernel_size=pool, stride=pool),
                    nn.Dropout(p=dropout) ]
                # spatial size after pooling (tracked only when a pool layer exists)
                w = (w - pool) // pool + 1
                h = (h - pool) // pool + 1

        self.features = channels[-1] * w * h
        layers += [
            nn.Flatten(1),
            nn.Linear(self.features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, nA) ]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """ Return Q-values of shape (batch, nA) for a batch of states x """
        return self.model(x)

#========================================================================================    
    
class DQN:
    """ DQN / DDQN method for discrete actions.

    The agent state is two consecutive observation frames stacked along the
    channel axis (see set_state). Usage: DQN(env), optionally replace `params`,
    call init() to build the networks, then learn().
    """
    def __init__(self, env):
        """ Store the environment and set the default hyper-parameters """
        self.env  = env                         # environment we work with
        self.nA   =  self.env.action_space.n    # number of discrete actions
        shape     = env.observation_space.shape
        # two frames stacked on the channel axis: (2*shape[2], shape[0], shape[1])
        self.state_shape = (2*shape[2], shape[0], shape[1])
        print("state_shape", self.state_shape)

        self.params = {                   # default parameters
            'env'      : "Environment",
            'ticks'    : 1000,
            'timeout'  : True,            # whether to consider reaching ticks as a terminal state
            'method'   : "DQN",           # kind of the method (DQN, DDQN)
            'gamma'    : 0.99,            # discount factor
            'eps1'     : 1.0,             # initial value epsilon
            'eps2'     : 0.001,           # final value   epsilon
            'decays'   : 1000,            # number of episodes to decay eps1 - > eps2
            'update'   : 10,              # target model update rate (in frames = time steps)
            'batch'    : 100,             # batch size for training
            'capacity' : 100000,          # memory size
            'channels' : [2,48,92],       # conv channels
            'kernels'  : [8,3,3],         # conv kernels
            'strides'  : [8,1,1],         # conv strides
            'paddings' : [0,1,1],         # conv paddings
            'pools'    : [2,2,2],         # = pool_strides
            'dropout'  : 0.2,             # after cnn,reLU,pool
            'hidden'  :  128,             # hidden layers
            'loss'     : 'mse',           # loss function (mse, huber)
            'optimizer': 'adam',          # optimizer (sgd, adam)
            'lr'       : 0.0001,          # learning rate
        }
        self.last_loss = 0.               # last loss
        self.history   = []               # rows: [episode, mean reward (stat1), mean reward (stat2), mean length]
        self.maxQ      = []               # replaced by a tensor on the first learn_model() call

    #------------------------------------------------------------------------------------

    def _make_net(self):
        """ Build one Q-network from self.params on the selected device """
        p = self.params
        kwargs = dict(channels=p['channels'], kernels=p['kernels'], paddings=p['paddings'],
                      pools=p['pools'], dropout=p['dropout'], hidden=p['hidden'])
        if 'strides' in p:                # fall back to the AgentModel default when absent
            kwargs['strides'] = p['strides']
        return AgentModel(self.state_shape, self.nA, **kwargs).to(self.gpu)

    #------------------------------------------------------------------------------------

    def init(self):
        """ Create the neural networks, loss, optimizer and replay memory

        Raises ValueError for an unknown 'loss' or 'optimizer' parameter.
        """
        self.gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("device:", self.gpu)

        self.model      = self._make_net()                     # current Q
        self.target     = self._make_net()                     # target  Q
        self.best_model = self._make_net()                     # best net
        self.target.load_state_dict( self.model.state_dict() ) # start with identical weights
        self.best_reward = -100000                             # best reward

        if   self.params['loss'] == 'mse':
            self.loss = nn.MSELoss()
        elif self.params['loss'] == 'huber':
            self.loss = nn.HuberLoss()
        else:
            raise ValueError(f"Unknown loss function: {self.params['loss']!r}")

        if   self.params['optimizer'] == 'sgd':
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.params['lr'], momentum=0.8)
        elif self.params['optimizer'] == 'adam':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.params['lr'])
        else:
            raise ValueError(f"Unknown optimizer: {self.params['optimizer']!r}")

        self.memo = MemoryBuffer(self.params['capacity'], self.state_shape)

        self.frame       = 1                          # time-step counter (reset by learn())
        self.epsilon     = self.params['eps1']        # start value in epsilon greedy strategy
        # multiplicative per-episode factor so that eps1 * rate**decays == eps2
        self.decay_rate  = math.exp(math.log(self.params['eps2']/self.params['eps1'])/self.params['decays'])

        print(f"decay_rate: {self.decay_rate:.4f}")
        print("features:", self.model.features)
        print(self.model)

        # the summary is computed on the CPU, then the model is moved back to the device
        summary(self.model.to("cpu"), torch.zeros((1, )+self.state_shape), show_input=False)
        self.model.to(self.gpu)

    #------------------------------------------------------------------------------------

    def policy(self, state):
        """ Return action according to epsilon greedy strategy """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.nA)    # random action

        x = state.unsqueeze(dim=0).to(self.gpu)  # add the batch dimension
        with torch.no_grad():
            y = self.model(x).detach().to('cpu').numpy()
        return np.argmax(y)                      # best action

    #------------------------------------------------------------------------------------

    def set_state(self, s1,s2):
        """ Build a state from two frames: channels first, scaled by 1/255, concatenated on axis 0 """
        def prepare(frame):
            return torch.as_tensor(frame, dtype=torch.float32).permute(2, 0, 1)/255.
        return torch.cat([prepare(s1), prepare(s2)], 0)

    #------------------------------------------------------------------------------------

    def run_episode(self, ticks = 200):
        """ Run one episode, storing every transition in memory and training on each step

        Returns (total reward, number of steps taken).
        """
        rew = 0                                  # total reward
        t   = 0                                  # stays 0 when ticks < 1
        oi = self.env.reset()                    # initial observation
        o0, _, _, _ = self.env.step(0)           # action 0 ("do nothing") to get a second frame
        s0 = self.set_state(oi,o0)
        a0 = self.policy(s0)                     # get action
        for t in range(1, ticks+1):
            o1, r1, done, _ = self.env.step(a0)
            s1 = self.set_state(o0,o1)
            a1 = self.policy(s1)

            # with timeout=False a `done` on the very last tick is not stored as terminal
            dn = done and (self.params['timeout'] or t < ticks)
            self.memo.add(s0, a0, s1, r1, float(dn) )

            if self.frame % self.params['update'] == 0:  # copy model to target
                self.target.load_state_dict( self.model.state_dict() )

            if self.memo.count >= self.params['batch']:
                self.learn_model()

            rew += r1
            self.frame += 1

            if done:
                break

            s0, a0, o0 = s1, a1, o1
        return rew, t

    #------------------------------------------------------------------------------------

    def learn(self, episodes = 100000, stat1 = 10, stat2 = 100, plots = 1000, rews_range=None):
        """ Run `episodes` training episodes

        stat1      - every stat1 episodes a row is appended to self.history
        stat2      - every stat2 episodes statistics are printed and the best model is kept
        plots      - every plots episodes the training curves are drawn
        rews_range - [min, max] of the reward axis, None = automatic
        """
        self.frame = 1
        self.max_lens = 0
        rews, lens, beg = [], [], time.process_time()
        for episode in range(1, episodes+1):
            rew, t = self.run_episode( self.params['ticks'] )
            rews.append( rew )
            lens.append(t)

            self.epsilon *= self.decay_rate                # epsilon-decay
            if self.epsilon < self.params['eps2']:
                self.epsilon = 0.

            if episode % stat1 == 0:
                self.history.append([episode, np.mean(rews[-stat1:]), np.mean(rews[-stat2:]), np.mean(lens[-stat2:])])

            if  episode % stat2 == 0:
                mean, std = np.mean(rews[-stat2:]), np.std(rews[-stat2:])
                lensM     = np.mean(lens[-stat2:])
                if mean > self.best_reward:
                    self.best_reward = mean
                    self.best_model.load_state_dict( self.model.state_dict() )
                if self.max_lens < lensM:
                    self.max_lens = lensM

                # maxQ is still the initial list if no training step has run yet
                if torch.is_tensor(self.maxQ) and self.maxQ.numel() > 0:
                    maxQ = self.maxQ.detach().to('cpu')
                    q_mean, q_std = maxQ.mean().item(), maxQ.std().item()
                else:
                    q_mean = q_std = float('nan')
                print(f"{episode:6d} rew:{mean:7.1f} ± {std/stat2**0.5:3.1f}  ({self.best_reward:7.2f}), ticks:{lensM:3.0f} ({self.max_lens:3.0f}), eps:{self.epsilon:.3f}, Q:{q_mean:8.2f} ±{q_std:6.2f}, loss:{self.last_loss:7.3f}, {(time.process_time() - beg):3.0f}s")
                beg = time.process_time()

            if  episode % plots == 0:
                self.plot(f"{self.params['env']}  Episode: {episode}  best: {self.best_reward:7.1f}", rews_range)

    #------------------------------------------------------------------------------------

    def learn_model(self):
        """ One training step on a random batch from memory

        Updates self.maxQ (bootstrap values for the next states) and self.last_loss.
        Raises ValueError for an unknown 'method' parameter.
        """
        batch = self.params['batch']

        S0, A0, S1, R1, Done = self.memo.get(batch)
        S0 = S0.to(self.gpu); A0 = A0.to(self.gpu)
        S1 = S1.to(self.gpu); R1 = R1.to(self.gpu);  Done = Done.to(self.gpu)

        # NOTE(review): no eval() call is visible in this class, so Dropout presumably stays
        # active while the targets are computed - confirm this is intended.
        method = self.params['method']
        with torch.no_grad():                  # targets never need gradients
            if method == 'DQN':
                self.maxQ, _ = torch.max(self.target(S1), 1)   # maximum Q values for S1
            elif method == 'DDQN':
                a = torch.argmax(self.model(S1), 1).view(-1,1) # a = arg max Q(s1,a; theta)
                self.maxQ = self.target(S1).gather(1, a)       # Q(s1, a; theta')
            else:
                raise ValueError(f"Unknown method: {method!r}")

        a0       = A0.view(-1,1)
        r1, done = R1.view(-1,1), Done.view(-1,1)
        q1       = self.maxQ.view(-1,1)

        yb = r1 + self.params['gamma']*q1*(1.0 - done)   # TD target, no bootstrap on terminal states

        y = self.model(S0).gather(1, a0)       # Q(s0, a0) of the actions actually taken
        L = self.loss(y, yb)

        self.optimizer.zero_grad()     # reset the gradients
        L.backward()                   # calculate gradients
        self.optimizer.step()          # adjusting parameters

        self.last_loss = L.detach().item()

    #------------------------------------------------------------------------------------

    def plot(self, text, rews_range):
        """ Plot the histogram of stored actions and the reward / episode-length history

        text       - figure title
        rews_range - [min, max] of the reward axis, or None for automatic limits
        """
        edges  = np.linspace(-0.5, self.nA-0.5, self.nA+1)
        filled = min(self.memo.count, self.memo.capacity)   # skip slots that were never written
        if filled > 0:
            actions = self.memo.A0[:filled].reshape(-1).numpy()
            hist_A, bins_A = np.histogram(actions, bins=edges, density=True)
        else:
            hist_A, bins_A = np.zeros(self.nA), edges

        fig, ax = plt.subplots(1, 2, figsize=(16,6), gridspec_kw={'width_ratios': [1, 5]})
        plt.suptitle(text, fontsize=18)

        ax[0].set_xlim(min(bins_A), max(bins_A))    # histogram for A
        ax[0].grid(axis='x', alpha=0.75); ax[0].grid(axis='y', alpha=0.75)
        ax[0].set_xlabel('actions', fontsize=16)
        ax[0].set_xticks(np.arange(self.nA))
        centers = (bins_A[:-1] + bins_A[1:]) / 2
        ax[0].bar(centers, hist_A, width=0.5, color='blue')

        history = np.array(self.history, dtype=float).reshape(-1, 4)   # reward history
        ax[1].plot(history[:,0], history[:,1], linewidth=1)
        ax[1].plot(history[:,0], history[:,2], linewidth=2)
        if rews_range is not None:
            ax[1].set_ylim(rews_range[0], rews_range[1])
        ax[1].set_ylabel('reward', fontsize=16)
        ax[1].set_xlabel('episode', fontsize=16)
        ax[1].grid(axis='x', alpha=0.75); ax[1].grid(axis='y', alpha=0.75)

        if len(history) > 0:
            # parameter listing anchored at the lower-left corner of the reward plot
            params = [ f"{k:9s}: {v}\n" for k,v in self.params.items()]
            y_text = rews_range[0] if rews_range is not None else ax[1].get_ylim()[0]
            ax[1].text(history[0,0], y_text, "".join(params), {'fontsize':12, 'fontname':'monospace'})

        ax2=ax[1].twinx()
        ax2.plot(history[:,0], history[:,3], color="blue")
        ax2.set_ylabel("episode length",color="blue", fontsize=16)

        plt.show()

    #------------------------------------------------------------------------------------

    def test(self, episodes = 1000, ticks = 1000, render = False):
        """ Q-Function Testing

        Plays `episodes` episodes of at most `ticks` steps with the current policy
        (self.epsilon is used as is). States are built as in run_episode: two
        consecutive frames, the first pair coming from reset() and one step(0).
        Returns the mean total reward (0.0 when no episode was played).
        """
        rews = []
        for episode in range(1, episodes+1):
            tot = 0
            prev = self.env.reset()
            obs, _, _, _ = self.env.step(0)      # second frame, reward not counted
            for _ in range(ticks):
                action = self.policy( self.set_state(prev, obs) )
                nxt, rew, done, _ = self.env.step(action)
                tot += rew
                if render:
                    self.env.render()
                if done:
                    break
                prev, obs = obs, nxt
            rews.append(tot)
            if episode % 100 == 0 or episode == episodes:   # progress line
                print(f"\r {episode:4d}: Reward: {np.mean(rews):7.3f} ± {np.std(rews)/len(rews)**0.5:.3f}", end="")
        print()
        return float(np.mean(rews)) if rews else 0.0
        

pygame 2.1.0 (SDL 2.0.16, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


## Snake

In [2]:
# Build the Snake environment and a DQN agent for it.
env_name = "Snake"  # (nS=(), nA=5)
env = snake.Snake()
dqn = DQN(env)

# Replace the default hyper-parameters for this experiment.
dqn.params = dict(
    env       = env_name,
    ticks     = 200,          # maximum number of steps per episode
    timeout   = True,         # whether to consider reaching ticks as a terminal state
    method    = "DQN",        # kind of the method (DQN, DDQN)
    gamma     = 0.99,         # discount factor
    eps1      = 1.0,          # initial value of epsilon
    eps2      = 0.001,        # final value of epsilon
    decays    = 1,            # number of episodes to decay eps1 -> eps2
    update    = 100,          # target model update rate (in frames = time steps)
    batch     = 512,          # batch size for training
    capacity  = 10000,        # memory size
    channels  = [2,48,92],    # conv channels
    kernels   = [8,3,3],      # conv kernels
    strides   = [8,1,1],      # conv strides
    paddings  = [0,1,1],      # conv paddings
    pools     = [1,2,2],      # = pool_strides
    dropout   = 0.2,          # after cnn, ReLU, pool
    hidden    = 128,          # hidden layer width
    loss      = 'mse',        # loss function (mse, huber)
    optimizer = 'adam',       # optimizer (sgd, adam)
    lr        = 0.0001,       # learning rate
)

# Create the networks, optimizer and replay memory from the parameters above.
dqn.init()

state_shape (6, 128, 128)
device: cuda:0
decay_rate: 0.0010
features: 1472
features: 1472
-----------------------------------------------------------------------
             Layer (type)               Output Shape         Param #
                 Conv2d-1            [-1, 2, 16, 16]             770
                   ReLU-2            [-1, 2, 16, 16]               0
                 Conv2d-3           [-1, 48, 16, 16]             912
                   ReLU-4           [-1, 48, 16, 16]               0
              MaxPool2d-5             [-1, 48, 8, 8]               0
                Dropout-6             [-1, 48, 8, 8]               0
                 Conv2d-7             [-1, 92, 8, 8]          39,836
                   ReLU-8             [-1, 92, 8, 8]               0
              MaxPool2d-9             [-1, 92, 4, 4]               0
               Dropout-10             [-1, 92, 4, 4]               0
               Flatten-11                 [-1, 1472]               0
          

In [3]:
# Echo the active hyper-parameters, then start training.
print(dqn.params)
dqn.learn(rews_range=[-100, 100], episodes=100000)

{'env': 'Snake', 'ticks': 200, 'timeout': True, 'method': 'DQN', 'gamma': 0.99, 'eps1': 1.0, 'eps2': 0.001, 'decays': 1, 'update': 100, 'batch': 512, 'capacity': 10000, 'channels': [2, 48, 92], 'kernels': [8, 3, 3], 'strides': [8, 1, 1], 'paddings': [0, 1, 1], 'pools': [1, 2, 2], 'dropout': 0.2, 'hidden': 128, 'loss': 'mse', 'optimizer': 'adam', 'lr': 0.0001}


RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

## Test best model

In [None]:
# Copy the best weights found during training into the acting network,
# play one rendered episode with them, then release the environment.
best_weights = dqn.best_model.state_dict()
dqn.model.load_state_dict(best_weights)
dqn.test(episodes=1, ticks=500, render=True)
env.close()

## Save model

In [None]:
import datetime

# The parameter key is 'hidden' (a single int in this notebook); a list or tuple
# of layer sizes is accepted as well so the file-name pattern keeps working.
hidden = dqn.params['hidden']
hidden_sizes = hidden if isinstance(hidden, (list, tuple)) else [hidden]
hidden_tag = '_'.join(str(x) for x in hidden_sizes)

# NOTE(review): the reward value 286 is hard-coded in both the info string and the
# file name - update it by hand (or derive it from dqn.best_reward) before saving.
state = {'info':      f"{env_name}: Q-function, Reward:  286",
         'date':      datetime.datetime.now(),
         'model':     str(dqn.best_model),            # textual description of the architecture
         'state' :    dqn.best_model.state_dict(),    # weights of the best network
        }
print(hidden)
torch.save(state, f"{env_name}_{hidden_tag}.286.pt")
print(state['model'])