# Policy Gradient method
```
!pip3 install box2d-py
!pip3 install gym[box2d]
```

In [None]:
import math
import time
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

#========================================================================================

class AgentModel(nn.Module):
    """Fully connected network that outputs one value per action.

    ``sizes`` lists the layer widths from input to output.  Every
    ``nn.Linear`` is followed by an activation module: ``hidden`` after the
    inner layers and ``output`` after the last one.
    """

    def __init__(self, sizes, hidden=nn.ReLU, output=nn.Identity):
        super().__init__()
        last = len(sizes) - 2                     # index of the final Linear layer
        stack = []
        for i, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            act = output if i >= last else hidden
            stack.append(nn.Linear(n_in, n_out))
            stack.append(act())
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack."""
        return self.model(x)
    

class GradientPolicy:
    def __init__(self, env):
        self.env  = env                         # environment we work with
        self.low  = env.observation_space.low   # minimum observation values
        self.high = env.observation_space.high  # maximum observation values
        self.nA   =  self.env.action_space.n    # number of discrete actions
        self.nS   =  self.env.observation_space.shape[0] # number of state components
        self.action_space = np.arange(env.action_space.n)    

        self.config = {             # default parameters
            'method'   : "GradPolicy",# kind of the method (GradPolicy)     
            'gamma'    : 0.99,      # discount factor
            'eps1'     : 1.0,       # initial value epsilon
            'eps2'     : 0.001,     # final value   epsilon
            'decays'   : 1000,      # number of episodes to decay eps1 - > eps2
            'update'   : 10,        # target model update rate (in frames = time steps)         
            'batch'    : 100,       # batch size for training
            'capacity' : 100000,    # memory size
            'rewrite'  : 1.0,       # rewrite memory (if < 1 - sorted)
            'hiddens'  : [256,128], # hidden layers
            'scale'    : True,      # scale or not observe to [-1...1]
            'loss'     : 'huber',     # loss function (mse, huber)
            'optimizer': 'sgd',     # optimizer (sgd, adam)
            'lr'       : 0.001,     # learning rate           
        }
        self.last_loss = 0.         # last loss

        print("low :   ", self.low)
        print("high:   ", self.high)        
                
    #------------------------------------------------------------------------------------

    def init(self):
        """ Create a neural network and optimizer """

        #self.device = "cpu"
        self.device =torch.device("cuda:0" if torch.cuda.is_available() else "cpu")        
        print("device:", self.device)

        self.model  = AgentModel([self.nS]+self.config['hiddens']+[self.nA]).to(self.device)      # current Q        

        self.best_model  = AgentModel([self.nS]+self.config['hiddens']+[self.nA]).to(self.device) # best net
        self.best_reward = -100000                                                           # best reward

        if   self.config['loss'] == 'mse':
             self.loss  = nn.MSELoss()
        elif self.config['loss'] == 'huber':
             self.loss = nn.HuberLoss()
        else:
            print("ERROR: Unknown loss function!!!")
        
        if   self.config['optimizer'] == 'sgd':
             self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config['lr'], momentum=0.8)
        elif self.config['optimizer'] == 'adam':
             self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config['lr'])
        else:
            print("ERROR: Unknown optimizer!!!")        
        
        self.epsilon     = self.config['eps1']        # start value in epsilon greedy strategy
        self.decay_rate  = math.exp(math.log(self.config['eps2']/self.config['eps1'])/self.config['decays'])

        print(f"decay_rate: {self.decay_rate:.4f}")
        print(self.model)                    
        
    #------------------------------------------------------------------------------------

    def scale(self, obs):
        """ to [-1...1] """
        if self.config['scale']:
            return -1. + 2.*(obs - self.low)/(self.high-self.low)
        else:
            return obs
        
    #------------------------------------------------------------------------------------

    def policy(self, state):
        """ Return action according to epsilon greedy strategy """
        if np.random.random() < self.epsilon:                        
            return np.random.randint(self.nA)    # random action            

        x = torch.tensor(state, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            y = self.model(x).detach().to('cpu')
        probs = torch.softmax(y, 0).numpy()      
        return np.random.choice(self.action_space, p=probs)         # action by probability
    
    #------------------------------------------------------------------------------------
    
    def create_memory(self, capacity):                
        """ Сreate a memory for states, actions and rewards """
        self.capacity = capacity 
        self.count    = 0                
        self.memo_S   = [None]*capacity 
        self.memo_A   = [None]*capacity
        self.memo_R   = [None]*capacity
    
    #------------------------------------------------------------------------------------
    
    def append_memory(self, S, A, R):
        """ Add to memory data lists """
        for i in range(len(S)):
            index = self.count % self.capacity
            self.memo_S[index] = S[i]
            self.memo_A[index] = A[i]
            self.memo_R[index] = R[i]
            self.count += 1

    #------------------------------------------------------------------------------------
    
    def discount_rewards(self, rewards):
        """ """        
        gamma = self.config['gamma']                
        if self.config.get('discount') == 1:            
            res = np.zeros( (len(rewards),) )
            cum = 0
            for t in range(0, len(rewards), -1):
                cum = rewards[t] + cum*gamma
                res[t] = cum                         
            return res #- np.mean(res)
        elif self.config.get('discount') == -1:
            r = np.array([gamma**i * rewards[i] for i in range(len(rewards))])
            r = r[::-1].cumsum()[::-1]
            return r - r.mean()  
        else:
            return [np.sum(rewards)]*len(rewards)    
                
    #------------------------------------------------------------------------------------
    
    def run_episode(self, ticks):
        """ Complete one episode """
        states, actions, rewards = [], [], []
        state, tot_rew = self.env.reset(), 0
        for t in range(1, ticks+1):                
            action = self.policy(state)
            
            actions.append( action )
            states. append( state.tolist() )                
            
            state, rew, done, _ = self.env.step(action)               
            tot_rew += rew
            rewards.append(rew)                
                
            if done:        
                break                                                             
        
        self.append_memory( states, actions, self.discount_rewards(rewards) )
        
        return tot_rew, t 
            
    #------------------------------------------------------------------------------------
    
    def learn(self, episodes=1000, stat1 = 10, stat2 = 100, plots = 1000, rews_range = [-1000, 1000]):                                
        self.create_memory(self.config['capacity'])
        self.history  = [] 
        self.beg_time = time.process_time()
        rews, lens, beg  = [], [], time.process_time()
        for episode in range(1, episodes+1):                       
            rew, t = self.run_episode( self.config['ticks'] )
            rews.append(rew)
            lens.append(t)

            self.epsilon *= self.decay_rate                # epsilon-decay
            if self.epsilon < self.config['eps2']:
                self.epsilon = 0.                
            
            if self.count >= self.capacity:                # start learning
                self.learn_model()
                if self.config['clear']:                   
                    self.count = 0                         # wait new data until full capacity

            if episode % stat1 == 0:
                self.history.append([episode, np.mean(rews[-stat1:]), np.mean(rews[-stat2:])])      
                                
            if  episode % stat2 == 0:                               
                mean, std    = np.mean(rews[-stat2:]), np.std(rews[-stat2:])    
                lensM, lensS = np.mean(lens[-stat2:]), np.std(lens[-stat2:])                    
                if mean > self.best_reward:
                    self.best_reward = mean
                    self.best_model.load_state_dict( self.model.state_dict() )                     
                
                print(f"{episode:6d} rew:{mean:7.1f} ± {std/stat2**0.5:3.1f}, best:{self.best_reward:7.2f}, ticks:{lensM:3.0f}, eps:{self.epsilon:.3f},  loss:{self.last_loss:7.3f}, {(time.process_time() - beg):3.0f}s")
                beg = time.process_time()
                
            if  episode % plots == 0:                   
                self.plot(f"{self.config['env']}  Episode: {episode}  best: {self.best_reward:7.1f}  time: {time.process_time()-self.beg_time:.0f}s", rews_range)     
                          
        print(f"time = {time.process_time()-self.beg_time: .0f}s")
    
    #------------------------------------------------------------------------------------
    
    def learn_model(self):
        """ Model Training """
        num = min(self.capacity, self.count)
        if num == 0:
            return        
       
        S = torch.FloatTensor(self.memo_S[:num]).to(self.device)
        W = torch.FloatTensor(self.memo_R[:num]).to(self.device)                    
        A = torch.LongTensor (self.memo_A[:num]).to(self.device) # Actions are used as indices, must be LongTensor        
        
        batch = min(num, self.config['batch'])
        for epoch in range(1, self.config['epochs']+1):
            idx = torch.randperm( len(S) ).to(self.device)                       
            S, W, A = S[idx], W[idx], A[idx]
        
            tot_L,  num_B = 0, int( len(S)/batch ) 
            for i in range(0, num_B*batch, batch):          
                sb, wb, ab = S[i: i+batch], W[i: i+batch], A[i: i+batch]                         
            
                probs = torch.softmax( self.model(sb), 1 )          # !!!
                logprob = (probs+1.e-8).log()                     
                logprob = torch.gather(logprob, 1, ab.view(-1,1)).squeeze()     
                #wb = F.softmax(wb, 0)                       
                loss = -(wb * logprob).mean()                       # Calculate loss                    

                self.optimizer.zero_grad()
                loss.backward()                                     # Calculate gradients                    
                self.optimizer.step()                               # Apply gradients

                tot_L += loss.item()                
                
        self.last_loss = tot_L / num_B

    #------------------------------------------------------------------------------------
    
    def plot(self, text, rews_range):
        """ Plot histogram for states and actions """        
        num = min(self.count, self.capacity)
        if num == 0:
            return
                
        hist_S, bins_S = np.histogram(self.memo_S, bins=np.linspace(0, math.sqrt(self.nS), 101), density=True)        
        hist_A, bins_A = np.histogram(self.memo_A, bins=np.linspace(-0.5, self.nA-0.5, self.nA+1), density=True)    

        fig, ax = plt.subplots(1, 3, figsize=(16,6), gridspec_kw={'width_ratios': [2, 1, 5]})        
        plt.suptitle(text, fontsize=18)
                                
        ax[0].set_xlim(min(bins_S), max(bins_S))    # histogram for S1
        ax[0].grid(axis='x', alpha=0.75); ax[0].grid(axis='y', alpha=0.75)
        ax[0].set_xlabel('|s1|', fontsize=16)
        bins = [ (bins_S[i]+bins_S[i+1])/2 for i in range(len(bins_S)-1)]
        ax[0].bar(bins, hist_S, width=0.5, color='blue')
                        
        ax[1].set_xlim(min(bins_A), max(bins_A))    # histogram for A
        ax[1].grid(axis='x', alpha=0.75); ax[1].grid(axis='y', alpha=0.75)
        ax[1].set_xlabel('actions', fontsize=16)
        ax[1].set_xticks(np.arange(self.nA));
        bins = [ (bins_A[i]+bins_A[i+1])/2 for i in range(len(bins_A)-1)]        
        ax[1].bar(bins, hist_A, width=0.5, color='blue')

        history = np.array(self.history)            # loss history
        ax[2].plot(history[:,0], history[:,1], linewidth=1)
        ax[2].plot(history[:,0], history[:,2], linewidth=2)
        ax[2].set_ylim(rews_range[0], rews_range[1]);
        ax[2].set_xlabel('episode', fontsize=16)        
        ax[2].grid(axis='x', alpha=0.75); ax[2].grid(axis='y', alpha=0.75)
        params = [ f"{k:9s}: {v}\n" for k,v in self.config.items()]
        ax[2].text(history[0,0], rews_range[0], "".join(params), {'fontsize':12, 'fontname':'monospace'})

        plt.show()

## CartPole

In [None]:
# CartPole-v0: nS=4 observation components, nA=2 actions.
env_name = "CartPole-v0"
ml = GradientPolicy(gym.make(env_name))

ml.config = dict(
    env       = env_name,      # shown in the plot title
    ticks     = 200,           # step limit passed to run_episode()
    method    = "GradPolicy",  # label of the method
    eps1      = 1.0,           # initial epsilon
    eps2      = 0.001,         # final epsilon
    decays    = 500,           # number of episodes to decay eps1 -> eps2
    discount  = 0,             # 0: every step is weighted by the total episode reward
    gamma     = 0.99,          # discount factor
    clear     = False,         # do not reset the sample counter after a training pass
    epochs    = 2,             # passes over the memory per training call
    batch     = 1000,          # mini-batch size
    capacity  = 2000,          # memory size in samples
    hiddens   = [32],          # hidden layer widths
    scale     = False,         # do not scale observations to [-1...1]
    loss      = 'mse',         # loss criterion name (mse, huber)
    optimizer = 'adam',        # optimizer name (sgd, adam)
    lr        = 0.001,         # learning rate
)

ml.init()
ml.learn(episodes=3000, rews_range=[0, 210])

## LunarLander-v2

In [None]:
# LunarLander-v2: nS=8 observation components, nA=4 actions.
env_name = "LunarLander-v2"
ml = GradientPolicy( gym.make(env_name) )

ml.config = {
    'env'      : env_name,    # shown in the plot title
    'ticks'    : 500,         # step limit passed to run_episode()
    'method'   : "GradPolicy",# label of the method
    'discount' : 0,           # 0: every step is weighted by the total episode reward
    'gamma'    : 1,           # discount factor
    'eps1'     : 1.0,         # initial epsilon
    'eps2'     : 0.001,       # final epsilon
    'decays'   : 500,         # number of episodes to decay eps1 -> eps2
    # 'clear' is read by learn() as soon as the memory is full; it was missing
    # here, which raised KeyError after the first 1000 stored samples.
    'clear'    : False,       # do not reset the sample counter after a training pass
    'epochs'   : 1,           # passes over the memory per training call
    'batch'    : 1000,        # mini-batch size
    'capacity' : 1000,        # memory size in samples
    'hiddens'  : [32,32],     # hidden layer widths
    'scale'    : False,       # do not scale observations to [-1...1]
    'loss'     : 'mse',       # loss criterion name (mse, huber)
    'optimizer': 'adam',      # optimizer name (sgd, adam)
    'lr'       : 0.001,       # learning rate
}

ml.init()
# NOTE(review): episode rewards in this environment can be negative, so the
# plot range [0, 510] may hide part of the curve -- confirm the intended range.
ml.learn(episodes=3000,  rews_range = [0, 510])

## MountainCar-v0

In [None]:
# MountainCar-v0: nS=2 observation components (position, velocity), nA=3 actions.
env_name = "MountainCar-v0"
ml = GradientPolicy(gym.make(env_name))

ml.config = dict(
    env       = env_name,      # shown in the plot title
    ticks     = 200,           # step limit passed to run_episode()
    method    = "GradPolicy",  # label of the method
    eps1      = 1.0,           # initial epsilon
    eps2      = 0.9,           # final epsilon; once epsilon drops below it, learn() sets it to 0
    decays    = 1000,          # number of episodes to decay eps1 -> eps2
    discount  = 0,             # 0: every step is weighted by the total episode reward
    gamma     = 1,             # discount factor
    clear     = False,         # do not reset the sample counter after a training pass
    epochs    = 20,            # passes over the memory per training call
    batch     = 100,           # mini-batch size
    capacity  = 2000,          # memory size in samples
    hiddens   = [256, 128],    # hidden layer widths
    scale     = True,          # flag for scaling observations to [-1...1]
    loss      = 'mse',         # loss criterion name (mse, huber)
    optimizer = 'adam',        # optimizer name (sgd, adam)
    lr        = 0.01,          # learning rate
)

ml.init()
ml.learn(episodes=3000, rews_range=[-200, -80])

In [None]:
# Demo of torch.gather along dim 1: from each row of `t`, pick the column
# given by the matching row of `a`.
t = torch.arange(1, 10).reshape(3, 3)          # [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
a = torch.tensor([0, 2, 1]).unsqueeze(1)       # column indices, one per row
torch.gather(t, 1, a)

In [None]:
# Discretisation of a 2-component observation into an 11 x 11 grid.
bins = (11, 11)                      # number of intervals along each axis
low  = np.array([-1.2, -0.07])       # minimum observation values
high = np.array([ 0.6,  0.07])       # maximum observation values
bin  = (high-low)/bins               # interval widths (note: shadows the builtin `bin`)

def index(state):
    """Convert a real-valued state into a tuple of grid indices.

    Positions outside the grid are clamped to the first/last cell.
    """
    cell = (state - low) / bin
    top = np.array(bins) - 1
    cell = np.clip(cell, np.zeros_like(low), top)
    return tuple(cell.astype(int))

index(np.array([-1,0.05]))