In [None]:
from torch import randint
from torch import nn, optim
import torch 
import gym
import numpy as np

from collections import deque
import random

from scores.score_logger import ScoreLogger

In [None]:
ENV_NAME = "CartPole-v1"
GAMMA = 0.95
MEMORY = 1000000
BATCH_SIZE = 20
LEARNING_RATE = 0.01
EXPLORATION_DECAY = 0.995
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01


In [282]:
class DQN:
    def __init__(self, observation_space, action_space):
        self.model = nn.Sequential(
            nn.Linear(observation_space.shape[0], 6),
            nn.ReLU(),
            nn.Linear(6, 6),
            nn.ReLU(),
            nn.Linear(6, action_space.n)
        )
        self.observation_space = observation_space
        self.action_space = action_space

        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.loss_fn = nn.MSELoss()
        self.exploration_rate = EXPLORATION_MAX
        self.discount = GAMMA
        self.memory = deque(maxlen=MEMORY)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # Sometimes act randomly. Do so less and less as the exploration rate decays.
    def act(self, state):
        if (np.random.rand() < self.exploration_rate):
            return self.action_space.sample()
        # print(self.model(torch.from_numpy(state)))
        # print(self.model(torch.from_numpy(state)).argmax().item())
        return self.model(torch.from_numpy(state)).argmax().item()
    
    def get_q_next(self, next_state):
        return self.discount * self.model(torch.from_numpy(next_state)).max()

    def experience_replay(self):
        # Don't replay if we don't have enough memory
        print(len(self.memory))
        if len(self.memory) < BATCH_SIZE:
            return
            
        batch = random.sample(self.memory, BATCH_SIZE)
        # self.optimizer.zero_grad()
        for state, action, reward, next_state, terminal in batch:    
            q_update = reward
            # Update the q value for the action we took
            # Bellman inspired update
            # Current state rewards plus next state rewards discounted by gamma
            if not terminal:
                q_update = reward + self.get_q_next(next_state)
            else: 
                # create long tensor
                q_update = torch.tensor(q_update, dtype=torch.float32)
            
            ## Get the q_values for the current state
            q_values = self.model(torch.from_numpy(state))
            prediction, _ = torch.max(q_values, axis=1)  

            ## Update the q_value for the action we took
            loss = self.loss_fn(prediction, q_update.reshape(1))

            # We reset the optimizer each time because we are training in batches of one
            self.optimizer.zero_grad()

            # Back propagate the loss
            loss.backward(retain_graph=True)

            # Update the weights
            self.optimizer.step()   
            
        # Decay the exploration rate
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)     

    

In [283]:
# Create environment and a way to track the score
env = gym.make(ENV_NAME)
score_logger = ScoreLogger(ENV_NAME)

# Reset the environment and get the first state
state, info = env.reset(seed=42, return_info=True)

# Create the agent
DQN_AGENT = DQN(env.observation_space, env.action_space)

run = 0 # run is the number of episodes
while run < 100:
    run += 1
    state = env.reset()
    state = np.reshape(state, [1, env.observation_space.shape[0]])
    step = 0
    while(True): 
        step += 1

        # Predict action then take action in environment
        action = DQN_AGENT.act(state)
        state_next, reward, terminal, info = env.step(action)

        # Get set reward negative if game over
        reward = reward if not terminal else -reward
        state_next = np.reshape(state_next, [1, env.observation_space.shape[0]])

        # Store experience in memory
        DQN_AGENT.remember(state, action, reward, state_next, terminal)
        state = state_next

        if terminal:
            print("Run: " + str(run) + ", exploration: " + str(DQN_AGENT.exploration_rate) + ", score: " + str(step))
            score_logger.add_score(step, run)
            break
        
        # Experience replay - train model
        DQN_AGENT.experience_replay()
        


  deprecation(
  deprecation(


1
2
3
4
5
6
7
8
9
10
11
12
Run: 1, exploration: 1.0, score: 13
Scores: (min: 13, avg: 13, max: 13)

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
Run: 2, exploration: 0.8475428503023453, score: 40


  self._save_png(input_path=SCORES_CSV_PATH,


Scores: (min: 13, avg: 26.5, max: 40)

54
55
56
57
58
59
60
61
62
63
64
Run: 3, exploration: 0.8020760579717637, score: 12
Scores: (min: 12, avg: 21.666666666666668, max: 40)

66
67
68
69
70
71
72
73
74
75
76
77
78
Run: 4, exploration: 0.7514768435208588, score: 14
Scores: (min: 12, avg: 19.75, max: 40)

80
81
82
83
84
85
86
Run: 5, exploration: 0.7255664080186093, score: 8
Scores: (min: 8, avg: 17.4, max: 40)

88
89
90
91
92
93
94
95
96
97
Run: 6, exploration: 0.6900935609921609, score: 11
Scores: (min: 8, avg: 16.333333333333332, max: 40)

99
100
101
102
103
104
105
106
107
Run: 7, exploration: 0.6596532430440636, score: 10
Scores: (min: 8, avg: 15.428571428571429, max: 40)

109
110
111
112
113
114
115
116
117
118
119
120
Run: 8, exploration: 0.6211445383053219, score: 13
Scores: (min: 8, avg: 15.125, max: 40)

122
123
124
125
126
127
128
129
130
131
Run: 9, exploration: 0.5907768628656763, score: 11
Scores: (min: 8, avg: 14.666666666666666, max: 40)

133
134
135
136
137
138
139
140
