In [37]:
import torch
from torch import nn
from torch.optim import Adam
import matplotlib.pyplot as plt
import numpy as np 
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from Buffers.ExperienceReplayBuffer import ExperienceReplay

<center> <h1> Constants 

In [38]:
SEED = 13 
GAMMA = 0.995
LR = 0.01
STEPS = 1000
BATCH_SIZE = 64
EPISODES = 1000

In [39]:
env = gym.make("Walker2d-v5",render_mode = "rgb_array")
RecordVideo(env,"../Results/PolicyBased",lambda x: x%25 == 0 and x != 0 , fps=15 )
actionNum = env.action_space.shape[0] 
stateNum = env.observation_space.shape[0]
print((actionNum,stateNum))

(6, 17)


In [40]:
import time
env.reset()
for steps in range(1):
    action = env.action_space.sample()
    _,rewards,terminated,truncated,_=env.step(action)
    print(rewards)
    time.sleep(0.1)
env.close()

0.8442168068813372


In [41]:
class Critic(nn.Module):
    def __init__(self,stateNum,h1,h2,h3,actionNum):
        super().__init__()
        self.fc1 = nn.Linear(stateNum + actionNum,h1)
        self.fc2 = nn.Linear(h1,h2)
        self.fc3 = nn.Linear(h2,h3)
        self.fc4 = nn.Linear(h3,1)
    def forward(self,state,action):
        x = torch.hstack((state,action))
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        Q = torch.relu(self.fc4(x))
        return Q

In [42]:
class Actor(nn.Module):
    def __init__(self,stateNum,h1,h2,h3,actionNum):
        super().__init__()
        self.fc1 = nn.Linear(stateNum,h1)
        self.fc2 = nn.Linear(h1,h2)
        self.fc3 = nn.Linear(h2,h3)
        self.fc4 = nn.Linear(h3,actionNum)

    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        A = torch.tanh(self.fc4(x))
        return A

In [43]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch.manual_seed(SEED)
critic        = Critic(stateNum,400,200,100,actionNum).to(device)
actor         = Actor(stateNum,400,200,100,actionNum).to(device)
critic_target = Critic(stateNum,400,200,100,actionNum).to(device)
actor_target  = Actor(stateNum,400,200,100,actionNum).to(device)
critic_target.load_state_dict(critic.state_dict())
actor_target.load_state_dict(actor.state_dict())

critic_criterion = nn.MSELoss()
# actor_criterion = 
critic_optim =  Adam(critic.parameters(),LR)
actor_optim= Adam(actor.parameters(),LR)

buffer = ExperienceReplay(BATCH_SIZE,device)



In [44]:
critic.state_dict()

OrderedDict([('fc1.weight',
              tensor([[-0.1702, -0.0086,  0.1295,  ...,  0.1957, -0.0211, -0.0404],
                      [ 0.1335,  0.1998, -0.0718,  ..., -0.0370,  0.1102,  0.1365],
                      [ 0.1876, -0.0603, -0.1387,  ..., -0.0537,  0.0341,  0.1460],
                      ...,
                      [ 0.2035,  0.0252,  0.0173,  ..., -0.1964,  0.1715, -0.1880],
                      [-0.1487,  0.1465,  0.0671,  ..., -0.0907, -0.0044, -0.1642],
                      [-0.0393,  0.1688, -0.1711,  ...,  0.0992, -0.1128, -0.0826]],
                     device='cuda:0')),
             ('fc1.bias',
              tensor([-6.3814e-02, -1.3460e-01, -1.7037e-01, -1.3998e-01,  1.5878e-01,
                       1.3988e-01,  1.8016e-01,  1.3112e-01,  5.7631e-02, -1.1514e-01,
                      -1.9691e-01,  1.3262e-01,  7.6445e-03,  1.1214e-01,  7.2446e-03,
                       1.4982e-01, -1.7051e-01,  3.7485e-02, -1.8250e-01, -1.6072e-01,
                      -4.0

In [45]:
s =np.random.randn(6)
s

array([-0.12043589, -0.27729373, -0.20291126, -1.47787215, -1.71265115,
        1.17501457])

In [46]:
s.clip(-0.2,0.2)

array([-0.12043589, -0.2       , -0.2       , -0.2       , -0.2       ,
        0.2       ])

In [47]:
def soft_update(online,target,tau):
    online_dict = online.state_dict()
    target_dict = target.state_dict()
    for key in online_dict.keys():
        target_dict[key]= tau*online_dict[key] + (1-tau) * target_dict[key]
    target.load_state_dict(target_dict)

In [None]:
tau = 0.005
rewards = []
for episode in range(EPISODES):
    old_observation,info =env.reset(seed=SEED)
    cumulative_reward = 0
    for step in range(STEPS):
        old_observation=torch.Tensor(old_observation).reshape(1,-1).to(device)
        action = actor(old_observation).cpu().detach().numpy().squeeze()
        exploration_noise = np.random.randn(action.shape[0])*0.1
        exploration_noise=exploration_noise.clip(-0.3,0.3)
        old_observation=old_observation.cpu().detach().numpy().squeeze()
        new_observation,reward,terminated,truncated,info = env.step(action+exploration_noise)
        cumulative_reward+=reward
        done = terminated or truncated
        buffer.append(old_observation,action,reward,new_observation,done)
        rewards.append(reward)
        old_observation = new_observation
        if(buffer.size() >= BATCH_SIZE):
            old_state,old_action,reward,new_state,done=buffer.sample(BATCH_SIZE)
            reward = reward.reshape(-1,1)
            done = done.reshape(-1,1)   
            y_hat = critic(old_state,actor(old_state))
            with torch.no_grad():
                y = reward + GAMMA * critic_target(new_state,actor_target(new_state)) * (1 - done)
            
            critic_loss =critic_criterion(y,y_hat)    
            critic_optim.zero_grad()
            critic_loss.backward()
            critic_optim.step()
            actor_optim.zero_grad()
            actor_loss = -critic(old_state,actor(old_state)).mean()
            actor_loss.backward()
            actor_optim.step()
            with torch.no_grad():
                soft_update(critic,critic_target,tau)
                soft_update(actor,actor_target,tau)
        if(truncated or terminated):
            break;
    print(f"Episode: {episode} | Reward: {cumulative_reward}")    

Episode: 0 | Reward: -17.875495222663634
Episode: 1 | Reward: -82.92580356312604
Episode: 2 | Reward: -70.63997108529537
Episode: 3 | Reward: -83.34905884382066
Episode: 4 | Reward: -83.75135154104781
Episode: 5 | Reward: -70.31006791801812
Episode: 6 | Reward: -74.07208112865709
Episode: 7 | Reward: -83.87745477775897
Episode: 8 | Reward: -85.83852711349287
Episode: 9 | Reward: -77.79021846725604
Episode: 10 | Reward: -71.98237357733184
Episode: 11 | Reward: -77.47686053588518
Episode: 12 | Reward: -84.68253058023357
Episode: 13 | Reward: -83.7435708141066
Episode: 14 | Reward: -86.19307015612222
Episode: 15 | Reward: -81.16401628219874
Episode: 16 | Reward: -82.53479874381728
Episode: 17 | Reward: -81.84203693634754
Episode: 18 | Reward: -84.73082353046517


KeyboardInterrupt: 