# Libraries

In [1]:
import matplotlib

matplotlib.use("TkAgg")
import gym
import gridworld
from gym import wrappers, logger
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

np.random.seed(3)

In [2]:
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Utils

In [72]:

###################     Highly abstract policy class! #######################
class Policy(object):
    
    def __init__(self):
        """ Nothing is required for to construct an abstract policy class """
        pass
    
    def get_action_value(self):
        "Must be redefined"
        raise NotImplementedError
        
############################################################################


###################                Uniform            #######################
class Uniform_Policy(Policy):

    def get_action_value(self , Q_state ):
        """ Q_values numpy array for a given state  size ( number of actions ) """
        action = np.random.randint (Q_state.size)
        return action , Q_state[action]

############################################################################

###################                Greedy            #######################
class Greedy_Policy(Policy):

    
    def get_action_value(self , Q_state):
        action = Q_state.argmax()
        return action , Q_state[action]

############################################################################

###################             ε-Greedy            #######################
class Epsilon_Greedy_Policy(Policy):
    
    def __init__(self , eps=0.1 , decay=1.0 ):  
        self.eps      = eps
        self.decay = decay
                    
    def get_action_value(self , Q_state ):
        self.eps = self.eps*self.decay
        if ( np.random.rand() <= (self.eps) ):
            action = np.random.choice ( Q_state.size )
        else :
            action = Q_state.argmax()
    
        return action , Q_state[action]
    
    def set_epsilon(self,eps):
        """
        when we want to perform a cutomized decay manually
        
        """
        self.eps = eps

############################################################################

In [73]:
def make_policy ( name="Epsilon_Greedy" , params={ "eps":0.1 , "decay":1.0 } ) :
    
    if   ( name=="Uniform" ): return Uniform_Policy()
    elif ( name=="Greedy" ) : return Greedy_Policy()
    elif ( name=="Epsilon_Greedy" ) : return Epsilon_Greedy_Policy(**params)
    else : 
        raise Exception("Unknown policy")

In [74]:
p = make_policy( name="Epsilon_Greedy" )

In [75]:
# class DQN(nn.Module):
#     def __init__(self, in_channels=4, num_actions=2):
#         """
#         DQN
#         """
#         super(DQN, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
#         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
#         self.fc4 = nn.Linear(7 * 7 * 64, 512)
#         self.fc5 = nn.Linear(512, num_actions)

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = F.relu(self.conv2(x))
#         x = F.relu(self.conv3(x))
#         x = F.relu(self.fc4(x.view(x.size(0), -1)))
#         return self.fc5(x)

## Algorithms

<img src="./algo_imgs/DQN.png">

## DQN ( Experience replay + target Network )

In [76]:
class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity   = capacity
        self.curr_state = torch.Tensor().type(torch.float64)
        self.next_state = torch.Tensor().type(torch.float64)
        self.action     = torch.Tensor().type(torch.uint8)
        self.reward     = torch.Tensor().type(torch.float64)
        self.final_state_mask = []
        
        self.position = 0
        
    def push(self, Phi_S, action, Phi_next_S, reward, done):
        """Saves a transition."""
        
        act = torch.tensor([[action]],dtype=torch.uint8)
        
        if len(self) < self.capacity:
            
            self.curr_state = torch.cat( [self.curr_state,Phi_S.view(1,-1)] ,0)
            self.next_state = torch.cat( [self.next_state,Phi_next_S.view(1,-1)] ,0)
            self.action     = torch.cat([self.action,act], 0)
            self.reward     = torch.cat([self.reward,torch.tensor([reward],dtype=torch.float64).view(1,-1)], 0)
            self.final_state_mask.append(not done)
        else :
            self.curr_state[self.position]   = Phi_S
            self.next_state[self.position]   = Phi_next_S
            self.action[self.position]       = act
            self.reward[self.position]       = reward
            self.final_state_mask[self.position] = done
            
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        indexes = np.random.choice(len(self),size=batch_size)
        return self.curr_state[indexes], self.action[indexes],\
               self.next_state[indexes], self.reward[indexes],\
               torch.tensor( self.final_state_mask ) [indexes] 

    def __len__(self):
        return len(self.curr_state)

In [77]:
class DQN_Q_Estimator(nn.Module):
    """
        Simple Neural Network Q(state) ==> [ Q[state,action1], Q[state,action2], ... ]
    """
    
    def __init__(self , in_size=4 , num_actions=2 , hidden_size=[] ):
        super(DQN_Q_Estimator, self).__init__()
        self.layers = nn.ModuleList([])
        for x in hidden_size:
            self.layers.append(nn.Linear(in_size, x))
            in_size = x
            self.layers.append(nn.Linear(in_size, num_actions))
        
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
            
    def forward(self, x):
        x = self.layers[0](x)
        for i in range(1, len(self.layers)):
            x = torch.nn.functional.relu(x)
            x = self.layers[i](x)
        return x

In [78]:
class CartPoleAgent(object):
    """The world's simplest agent!"""

    def __init__(self, action_space, behavior_policy, memory_capacity=1000, hidden_size=[50],hiddevice=device):
        
        self.action_space      = action_space
        self.replay_memory     = ReplayMemory(memory_capacity)
        self.Q_estimator_policy= DQN_Q_Estimator(in_size=4,num_actions=action_space.n,hidden_size=hidden_size)\
                                 .double().to(device)
        
        # copy from policy net
        self.Q_estimator_target= DQN_Q_Estimator(in_size=4,num_actions=action_space.n,hidden_size=hidden_size)\
                                 .double().to(device)
        
        self.behavior_policy = behavior_policy
        
        self.update_target_network()
        
    def act(self, observation):
        output = self.Q_estimator_policy(observation.to(device)).detach().cpu().numpy()
        action, _= self.behavior_policy.get_action_value(output)
        return action


    def update_target_network(self):
        "copy policy network parameters to target network parameters"
        self.Q_estimator_target.load_state_dict(self.Q_estimator_policy.state_dict())
        

In [79]:
p = make_policy ( "Epsilon_Greedy" , params={ "eps" : EPS } )
agent = CartPoleAgent( envm.action_space, behavior_policy=p , memory_capacity=MEM_CAPACITY,
                       hidden_size=HIDDEN_SIZE )

In [82]:
env = gym.make('CartPole-v1')

cartpole_nbr_actions = []

outdir = 'cartpole-v0/CartPol-agent-results'
envm = wrappers.Monitor(env, directory=outdir, force=True, video_callable=False)
env.seed(0)

#####################  hyper params #####################
episode_count = 500
env.verbose = True
np.random.seed(0)

rsum = 0
BATCH_SIZE = 32
LR = 0.001
EPS = 0.2
GAMMA = 0.99
TARGET_UPDATE = 100
MEM_CAPACITY  = 1000
HIDDEN_SIZE = [128]

log_interval = 10
#########################################################

################## AGENT ################## 
p = make_policy ( "Epsilon_Greedy" , params={ "eps" : EPS, "decay":0.9999 } )
agent = CartPoleAgent( envm.action_space, behavior_policy=p , memory_capacity=MEM_CAPACITY,
                       hidden_size=HIDDEN_SIZE )
###########################################

################# LOSS + optimizer ################
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam ( agent.Q_estimator_policy.parameters() , lr=LR ) 
###################################################

nbr_steps = 0

for i in range(episode_count):
    obs = envm.reset()
    env.verbose = (i % 100 == 0 and i > 0)  # afficher 1 episode sur 100
    if env.verbose:
        env.render()
    j = 0
    rsum = 0
    
    Phi_S = torch.from_numpy(obs)

    while True:
        action =  agent.act(Phi_S)
        j+=1
        obs, reward, done, _ = envm.step(action)
        Phi_next_S =  torch.from_numpy(obs)
        
        if done:
            reward = -1
        # Store the transition in memory
        agent.replay_memory.push(Phi_S, action, Phi_next_S, reward, done)
        
        
        if ( len(agent.replay_memory) >= MEM_CAPACITY ):
                    
            # get X_batch 
            state_batch, action_batch, next_state_batch, reward_batch, not_final_state_mask = \
            agent.replay_memory.sample(BATCH_SIZE)
            

            
            # create Y_batch
            expected_Q = reward_batch.to(device).reshape(-1)
            next_Q = agent.Q_estimator_target(next_state_batch.to(device)).max(1)[0]
            expected_Q += not_final_state_mask* GAMMA * next_Q
            

            # Forward pass
            current_Q = agent.Q_estimator_policy(state_batch.to(device))
            current_Q = current_Q[ list(range(BATCH_SIZE)) , action_batch.reshape(-1).tolist() ]
            loss = criterion(current_Q, expected_Q)

            # Backward a
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            nbr_steps += 1 
         
            if( nbr_steps == TARGET_UPDATE ) :
                print(nbr_steps)
                agent.update_target_network()
                print("update")
            
        if(done):
            if i % log_interval == 0:
                print('episode {}| nbr_actions: {}| epsilon: {:.2f}'.format(i, j, agent.behavior_policy.eps))
            break
           

            
#         # logs
#         rsum += reward
#         j += 1
#         if env.verbose:
#             env.render()
#         if done:
#             cartpole_nbr_actions.append(j)
#             print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " + str(j) + " actions")
#             break
        

print("done")
env.close()

episode 0| nbr_actions: 10| epsilon: 0.20
episode 10| nbr_actions: 10| epsilon: 0.20
episode 20| nbr_actions: 10| epsilon: 0.20
episode 30| nbr_actions: 10| epsilon: 0.19
episode 40| nbr_actions: 9| epsilon: 0.19
episode 50| nbr_actions: 14| epsilon: 0.19
episode 60| nbr_actions: 9| epsilon: 0.19
episode 70| nbr_actions: 8| epsilon: 0.19
episode 80| nbr_actions: 10| epsilon: 0.18
episode 90| nbr_actions: 12| epsilon: 0.18
episode 100| nbr_actions: 10| epsilon: 0.18
100
update
episode 110| nbr_actions: 12| epsilon: 0.18
episode 120| nbr_actions: 11| epsilon: 0.18
episode 130| nbr_actions: 11| epsilon: 0.17
episode 140| nbr_actions: 9| epsilon: 0.17
episode 150| nbr_actions: 12| epsilon: 0.17
episode 160| nbr_actions: 9| epsilon: 0.17
episode 170| nbr_actions: 10| epsilon: 0.17
episode 180| nbr_actions: 9| epsilon: 0.16
episode 190| nbr_actions: 11| epsilon: 0.16
episode 200| nbr_actions: 12| epsilon: 0.16
episode 210| nbr_actions: 12| epsilon: 0.16
episode 220| nbr_actions: 9| epsilon: 

In [None]:
env.close()

In [83]:
agent.Q_estimator_policy

DQN_Q_Estimator(
  (layers): ModuleList(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=2, bias=True)
  )
)