In [1]:
# from unityagents import UnityEnvironment
from collections import deque

import numpy as np
import progressbar as pb

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import mujoco_py
import gym

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
from go1_env import G1Env

In [4]:
env = gym.make('G1-v0')

Creating window glfw


In [5]:
# reset the environment
env_info = env.reset()

# size of each action
action_size = env.action_space.shape[0]
print('Size of each action:', action_size)

state_size = env.observation_space.shape[0]
print('Observation size: {}'.format(state_size))
print('The state for the agent looks like:', env_info)

Size of each action: 12
Observation size: 119
The state for the agent looks like: [ 0.40092975  0.99203035  0.03629254  0.10068183 -0.06649659 -0.05594706
  0.08929206  0.00699006  0.08145568  0.05251449 -0.09577245 -0.01940435
 -0.00632346 -0.03831936  0.03891466 -0.08892761  0.02021591 -0.10142359
 -0.0222417  -0.15413703  0.0424993   0.05078041 -0.05221894 -0.02373102
  0.0548006  -0.04078534 -0.13746454 -0.06021428 -0.03895167 -0.04067696
  0.03710263 -0.03437089  0.03418381 -0.22265368 -0.01516714  0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.         -1.
 -1.          1.          1.          1.          1.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          1.
 -1.         -1.         -1.     

In [6]:
NUM_AGENTS = 1

def interact(action):
    action = action.reshape(NUM_AGENTS, action_size)
    next_state, reward, done, info = env.step(action)
    # next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done
    return next_state.reshape(NUM_AGENTS, -1), np.array(reward).reshape(NUM_AGENTS, -1), np.array(done).reshape(NUM_AGENTS, -1)

def reset():
    state = env.reset().reshape(NUM_AGENTS, -1)
    return state

In [7]:
NET_SIZE = 512

class Policy(nn.Module):
    
    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        super(Policy, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.bn1 = nn.BatchNorm1d(fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.bn2 = nn.BatchNorm1d(fc1_size)
        self.fc3_mu = nn.Linear(fc2_size, action_size)
        self.fc3_std = nn.Linear(fc2_size, action_size)

    def forward(self, state, log_std_min=-20, log_std_max=2):
        x = self.bn0(state)
        x = torch.relu(self.bn1(self.fc1(state)))
        x = torch.relu(self.bn2(self.fc2(x)))

        mean = self.fc3_mu(x)
        std = self.fc3_std(x)
        std = torch.clamp(std, log_std_min, log_std_max).exp()

        return mean, std
    
class Value(nn.Module):
    
    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        
        super(Value, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)
        
    def forward(self, x):
        x = self.bn0(x)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
class Q(nn.Module):
    
    def __init__(self, state_size, action_size, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        
        super(Q, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size+action_size)
        self.fc1 = nn.Linear(state_size + action_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)
        
    def forward(self, s, a):
        x = torch.cat([s, a], 1)
        x = self.bn0(x)
        x = torch.relu(self.fc1(x)) 
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [8]:
state_size

119

In [9]:
from algorithms.sac import Agent

agent = Agent(
    state_size=state_size, 
    action_size=action_size,
    policy_network=Policy,
    value_network=Value,
    q_network=Q,
    n_agents=NUM_AGENTS, 
    device=device,
)

In [10]:
# agent.policy_network.load_state_dict(torch.load("./trained_models/sac-reacher/POLICY.pth"))
# agent.value_network_local.load_state_dict(torch.load("./trained_models/sac-reacher/VALUE_LOCAL.pth"))
# agent.value_network_target.load_state_dict(torch.load("./trained_models/sac-reacher/VALUE_LOCAL.pth"))
# agent.q_network_1.load_state_dict(torch.load("./trained_models/sac-reacher/Q_1.pth"))
# agent.q_network_2.load_state_dict(torch.load("./trained_models/sac-reacher/Q_2.pth"))

In [11]:
def run(n_episodes, t_max, print_every):
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last 100 scores
    widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
    timer = pb.ProgressBar(widgets=widget, maxval=n_episodes).start()
    
    frame_counter = 0
    
    for i_episode in range(1, n_episodes+1):
        states = reset()
        score = 0
        for t in range(t_max):
            frame_counter += 1
            actions = agent.act(states)
            next_states, rewards, dones = interact(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards.mean()
            if np.any(dones):
                break 
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        agent.writer.add_scalar('score/mean', score, i_episode)
        if i_episode % print_every == 0:
            print('\rEpisode {}\tScore Mean: {:.2f}\tScore STD: {:.2f}'.format(i_episode, np.mean(scores_window), np.std(scores_window)))
        
        # if np.mean(scores_window) > 300:
        #     print(scores_window)
        #     break
            
        timer.update(i_episode)
    return scores

In [12]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f5af7261790>

In [13]:
%time scores = run(t_max=int(10000), n_episodes=int(10000), print_every=100)

training loop:   0% |                                          | ETA:  --:--:--

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
torch.save(agent.policy_network.state_dict(), "./trained_models/sac-crawler/POLICY.pth")
torch.save(agent.value_network_target.state_dict(), "./trained_models/sac-crawler/VALUE_TARGET.pth")
torch.save(agent.q_network_1.state_dict(), "./trained_models/sac-crawler/Q_1.pth")
torch.save(agent.q_network_2.state_dict(), "./trained_models/sac-crawler/Q_2.pth")

In [None]:
env.close()