In [1]:
import torch
import numpy
import matplotlib.pyplot as plt

from environment import Environment
from agent import Agent
from experiencereplay import ExperienceReplay

vs_environment = Environment(grid_size = 10, render_on = False)

vs_environment.reset()

vs_agent = Agent(grid_size = 10)

experience_replay = ExperienceReplay(capacity=10000, batch_size=32)


In [2]:
episodes = 5000
max_steps = 200


optimizer = torch.optim.SGD(vs_agent.model.parameters(), lr=0.001, momentum=0.9)


for episode in range(episodes):

    state = vs_environment.reset()
    for step in range(max_steps):
        
        print('Episode: ', episode)
        print('Step: ', step)
        print('Epsilon: ', vs_agent.epsilon)

        
        action = vs_agent.get_action(state)
        stored_state = state.clone()
        next_state, reward, done = vs_environment.step(action)
        experience_replay.add_experience(stored_state, action.clone(), reward.clone(), next_state.clone(), done.clone())

        if experience_replay.can_provide_sample():
            optimizer.zero_grad()
            
            experiences = experience_replay.sample_batch()
            states = torch.stack([experience.state for experience in experiences])
    
            actions = torch.stack([experience.action for experience in experiences])
        
            rewards = torch.stack([experience.reward for experience in experiences])
        
            next_states = torch.stack([experience.next_state for experience in experiences])
        
            dones = torch.stack([experience.done for experience in experiences])
        
            current_q_values = vs_agent.model(states)
        
            next_q_values = vs_agent.model(next_states)
        
            target_q_values = current_q_values.clone()
        
            for i in range(len(experiences)):
                if dones[i]:
                    target_q_values[i, actions[i]] = rewards[i]
                else:
                    target_q_values[i, actions[i]] = rewards[i]

            
            loss = torch.mean((current_q_values - target_q_values) ** 2)
            loss.backward()
            optimizer.step()
            
        
        if done:
            break
        state = next_state    
            

Episode:  0
Step:  0
Epsilon:  1
Episode:  0
Step:  1
Epsilon:  0.998
Episode:  0
Step:  2
Epsilon:  0.996004
Episode:  0
Step:  3
Epsilon:  0.994011992
Episode:  0
Step:  4
Epsilon:  0.992023968016
Episode:  0
Step:  5
Epsilon:  0.990039920079968
Episode:  0
Step:  6
Epsilon:  0.9880598402398081
Episode:  0
Step:  7
Epsilon:  0.9860837205593285
Episode:  0
Step:  8
Epsilon:  0.9841115531182099
Episode:  0
Step:  9
Epsilon:  0.9821433300119734
Episode:  0
Step:  10
Epsilon:  0.9801790433519495
Episode:  0
Step:  11
Epsilon:  0.9782186852652456
Episode:  0
Step:  12
Epsilon:  0.976262247894715
Episode:  0
Step:  13
Epsilon:  0.9743097233989256
Episode:  0
Step:  14
Epsilon:  0.9723611039521278
Episode:  0
Step:  15
Epsilon:  0.9704163817442236
Episode:  0
Step:  16
Epsilon:  0.9684755489807352
Episode:  0
Step:  17
Epsilon:  0.9665385978827737
Episode:  0
Step:  18
Epsilon:  0.9646055206870082
Episode:  0
Step:  19
Epsilon:  0.9626763096456342
Episode:  0
Step:  20
Epsilon:  0.960750957

KeyboardInterrupt: 

In [3]:
current_q_values

tensor([[64.2211, 63.5185, 67.9057, 65.0593],
        [64.5635, 63.9652, 68.3738, 65.4466],
        [67.9939, 66.1991, 69.5468, 67.8212],
        [66.8491, 66.2162, 70.9120, 67.7715],
        [69.6798, 66.1129, 65.6895, 67.2937],
        [66.8491, 66.2162, 70.9120, 67.7715],
        [66.4420, 65.1804, 69.0565, 66.7044],
        [62.5944, 62.0197, 66.4191, 63.4846],
        [66.4884, 64.8463, 67.9404, 66.3588],
        [68.3389, 64.8257, 64.4318, 65.9423],
        [64.2546, 63.7383, 68.1770, 65.2020],
        [66.1750, 64.6281, 68.2870, 66.2350],
        [68.7280, 66.1105, 67.6889, 67.5076],
        [68.1466, 66.6181, 70.4059, 68.2378],
        [67.1429, 65.4236, 69.1545, 67.1530],
        [64.2211, 63.5185, 67.9057, 65.0593],
        [68.1749, 66.4248, 69.1966, 67.8912],
        [69.4067, 66.0076, 65.8603, 67.1888],
        [66.8632, 65.3377, 69.3815, 67.0886],
        [69.0324, 65.8613, 66.4456, 67.1495],
        [65.2438, 64.5380, 68.9952, 66.0373],
        [69.6320, 65.7209, 64.9877

In [4]:
next_q_values

tensor([[64.2211, 63.5185, 67.9057, 65.0593],
        [64.5635, 63.9652, 68.3738, 65.4466],
        [70.0847, 66.0149, 65.5499, 67.4206],
        [66.8491, 66.2162, 70.9120, 67.7715],
        [69.2455, 66.6861, 68.1949, 68.0747],
        [66.8491, 66.2162, 70.9120, 67.7715],
        [66.4420, 65.1804, 69.0565, 66.7044],
        [62.5944, 62.0197, 66.4191, 63.4846],
        [66.6085, 64.7608, 67.5554, 66.2437],
        [66.6744, 64.7326, 67.3084, 66.1476],
        [64.2546, 63.7383, 68.1770, 65.2020],
        [66.1750, 64.6281, 68.2870, 66.2350],
        [68.7484, 65.3608, 65.3483, 66.6214],
        [68.1466, 66.6181, 70.4059, 68.2378],
        [67.1429, 65.4236, 69.1545, 67.1530],
        [64.2211, 63.5185, 67.9057, 65.0593],
        [67.6493, 65.8253, 68.2154, 67.2001],
        [69.3891, 66.4978, 67.7005, 67.9039],
        [66.8632, 65.3377, 69.3815, 67.0886],
        [68.8973, 65.0100, 64.1237, 66.1659],
        [65.2438, 64.5380, 68.9952, 66.0373],
        [69.1888, 66.5122, 67.9722

In [5]:
states = [experience.state for experience in experience_replay.memory]

In [6]:
next_states = [experience.next_state for experience in experience_replay.memory]

In [7]:
actions = [experience.action for experience in experience_replay.memory]

In [8]:
rewards = [experience.reward for experience in experience_replay.memory]

In [21]:
actions[3]

tensor(0)

In [22]:
states[3].view(10,10)

tensor([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [23]:
next_states[3].view(10,10)

tensor([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [24]:
rewards[3]

tensor(-3)

tensor(3)

In [10]:
vs_agent.model(torch.stack(states))

tensor([[0.1495, 0.0890, 0.1665, 0.5950],
        [0.1495, 0.0890, 0.1665, 0.5950],
        [0.1500, 0.0878, 0.1595, 0.6027],
        ...,
        [0.1409, 0.0745, 0.1471, 0.6375],
        [0.1409, 0.0745, 0.1471, 0.6375],
        [0.1409, 0.0745, 0.1471, 0.6375]], grad_fn=<SoftmaxBackward0>)

In [17]:
next_states[5].view(10,10)

tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [8]:
actions = experience

tensor([2])

In [None]:
def learn(self, experiences):
        states = torch.tensor([experience.state for experience in experiences])
    
        actions = torch.tensor([experience.action for experience in experineces])
    
        rewards = torch.tensor([experience.reward for experience in experiences])
    
        next_states = torch.tensor([experience.next_state for experience in experiences])
    
        dones = torch.tensor([experience.done for experience in experiences])
    
        current_q_values = self.model(states)
    
        next_q_values = self.model(next_states)
    
        target_q_values = current_q_values.copy()
    
        for i in range(len(experiences)):
            if dones[i]:
                target_q_values[i, actions[i]] = rewards[i]
            else:
                target_q_values[i, actions[i]] = rewards[i] + self.gamma * np.max(next_q_values[i])