In [None]:
from torch import randint
from torch import nn, optim
import torch 
import gym
import numpy as np

from collections import deque
import random

from scores.score_logger import ScoreLogger

In [374]:
# --- Environment -----------------------------------------------------------
ENV_NAME = "CartPole-v1"

# --- Learning --------------------------------------------------------------
LEARNING_RATE = 0.01   # step size handed to the Adam optimizer
GAMMA = 0.95           # discount applied to the bootstrapped next-state value

# --- Replay buffer ---------------------------------------------------------
BATCH_SIZE = 50        # transitions sampled per experience-replay call
MEMORY = 1_000_000     # maximum number of transitions kept in the buffer

# --- Epsilon-greedy exploration --------------------------------------------
EXPLORATION_MAX = 1.0        # starting probability of taking a random action
EXPLORATION_MIN = 0.01       # floor the exploration probability never goes below
EXPLORATION_DECAY = 0.99999  # multiplier applied to the rate after each replay


In [375]:
class DQN:
    """Epsilon-greedy deep Q-learning agent with an experience-replay buffer.

    The Q-function is a small fully connected network mapping an observation
    to one value per action. Transitions are stored with ``remember`` and the
    network is trained on random samples of them in ``experience_replay``.

    Args:
        observation_space: Space object; ``observation_space.shape[0]`` is
            used as the network input width.
        action_space: Space object; ``action_space.n`` is used as the number
            of network outputs and ``action_space.sample()`` supplies random
            exploratory actions.
    """

    def __init__(self, observation_space, action_space):
        self.model = nn.Sequential(
            nn.Linear(observation_space.shape[0], 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, action_space.n)
        )
        self.observation_space = observation_space
        self.action_space = action_space

        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.loss_fn = nn.MSELoss()
        self.exploration_rate = EXPLORATION_MAX
        self.discount = GAMMA
        self.memory = deque(maxlen=MEMORY)

        # Keep the remaining hyperparameters on the instance, the same way
        # `discount` and `exploration_rate` are, so the methods below do not
        # reach for module-level globals.
        self.batch_size = BATCH_SIZE
        self.exploration_decay = EXPLORATION_DECAY
        self.exploration_min = EXPLORATION_MIN

    @staticmethod
    def _to_tensor(state):
        """Convert an observation to a float32 tensor (the dtype of the network weights)."""
        return torch.as_tensor(state, dtype=torch.float32)

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to the buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``exploration_rate`` a random action is sampled from
        the action space; otherwise the index of the largest predicted
        Q-value is returned as a Python int.
        """
        if np.random.rand() < self.exploration_rate:
            return self.action_space.sample()
        # Pure inference: no autograd graph is needed to pick an action.
        with torch.no_grad():
            return self.model(self._to_tensor(state)).argmax().item()

    def get_q_next(self, next_state):
        """Return ``discount * max_a Q(next_state, a)`` as a gradient-free scalar tensor.

        The value is used as part of a regression target, so it is computed
        under ``torch.no_grad()``; the target must be treated as a constant
        when the loss is differentiated.
        """
        with torch.no_grad():
            return self.discount * self.model(self._to_tensor(next_state)).max()

    def experience_replay(self):
        """Train on a random sample of stored transitions, then decay exploration.

        Does nothing (and does not decay the exploration rate) while the
        buffer holds fewer than ``batch_size`` transitions. Otherwise each
        sampled transition produces one optimizer step, after which the
        exploration rate is multiplied by ``exploration_decay`` and clamped
        to ``exploration_min``.
        """
        # Don't replay if we don't have enough memory
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, terminal in batch:
            # Bellman-style target: the reward alone for terminal transitions,
            # otherwise reward plus the discounted best next-state value.
            target_value = torch.tensor(float(reward), dtype=torch.float32)
            if not terminal:
                target_value = target_value + self.get_q_next(next_state)

            predicted = self.model(self._to_tensor(state))

            # The target is a detached copy of the prediction with only the
            # taken action's entry replaced. Detaching is essential: if the
            # target stayed attached to the graph, backward() would also push
            # gradient through the target side of the loss.
            target = predicted.detach().clone()
            target[..., action] = target_value

            loss = self.loss_fn(predicted, target)

            # One optimizer step per sampled transition (batches of one).
            self.optimizer.zero_grad()
            # Every sample builds a fresh graph, so nothing needs retaining.
            loss.backward()
            self.optimizer.step()

        # Decay the exploration rate, never dropping below the floor.
        self.exploration_rate = max(
            self.exploration_min, self.exploration_rate * self.exploration_decay
        )

    

In [376]:
# Train the DQN agent on the configured environment, logging each episode's score.
SEED = 46            # shared seed for every source of randomness used below
NUM_EPISODES = 10000  # number of episodes ("runs") to train for

# Seed every RNG involved, not just the environment: `random` drives replay
# sampling, NumPy drives the epsilon-greedy coin flip, torch drives weight
# initialisation, and the action space has its own sampler.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create environment and a way to track the score
env = gym.make(ENV_NAME)
env.action_space.seed(SEED)
score_logger = ScoreLogger(ENV_NAME)

# Reset once with a seed so the environment's own RNG is deterministic
state, info = env.reset(seed=SEED, return_info=True)

# Create the agent
DQN_AGENT = DQN(env.observation_space, env.action_space)

observation_size = env.observation_space.shape[0]

for run in range(1, NUM_EPISODES + 1):  # run is the episode number, starting at 1
    # The agent expects observations shaped (1, observation_size)
    state = np.reshape(env.reset(), [1, observation_size])
    step = 0
    while True:
        step += 1

        # Predict action then take action in environment
        action = DQN_AGENT.act(state)
        state_next, reward, terminal, info = env.step(action)

        # An episode can end because the agent failed or because the time
        # limit was reached. Only a real failure should be penalised and
        # stored as terminal; a time-limit cutoff is not a bad outcome and
        # the next state still has value to bootstrap from.
        failed = terminal and not info.get("TimeLimit.truncated", False)
        reward = -reward if failed else reward
        state_next = np.reshape(state_next, [1, observation_size])

        # Store experience in memory
        DQN_AGENT.remember(state, action, reward, state_next, failed)
        state = state_next

        if terminal:
            print(f"Run: {run}, exploration: {DQN_AGENT.exploration_rate}, score: {step}")
            score_logger.add_score(step, run)
            break

        # Experience replay - train model
        DQN_AGENT.experience_replay()
        


Run: 1, exploration: 1.0, score: 33
Scores: (min: 33, avg: 33, max: 33)

Run: 2, exploration: 1.0, score: 16
Scores: (min: 16, avg: 24.5, max: 33)



  self._save_png(input_path=SCORES_CSV_PATH,


Run: 3, exploration: 0.9998800065997806, score: 13
Scores: (min: 13, avg: 20.666666666666668, max: 33)

Run: 4, exploration: 0.9997400324974013, score: 15
Scores: (min: 13, avg: 19.25, max: 33)

Run: 5, exploration: 0.999650059493457, score: 10
Scores: (min: 10, avg: 17.4, max: 33)

Run: 6, exploration: 0.9995700902876621, score: 9
Scores: (min: 9, avg: 16, max: 33)

Run: 7, exploration: 0.9994801325779049, score: 10
Scores: (min: 9, avg: 15.142857142857142, max: 33)

Run: 8, exploration: 0.9993502079563297, score: 14
Scores: (min: 9, avg: 15, max: 33)

Run: 9, exploration: 0.9992502774324905, score: 11
Scores: (min: 9, avg: 14.555555555555555, max: 33)

Run: 10, exploration: 0.9990904093785458, score: 17
Scores: (min: 9, avg: 14.8, max: 33)

Run: 11, exploration: 0.9989705251231978, score: 13
Scores: (min: 9, avg: 14.636363636363637, max: 33)

Run: 12, exploration: 0.9987707499974752, score: 21
Scores: (min: 9, avg: 15.166666666666666, max: 33)

Run: 13, exploration: 0.998571014822982