In [None]:
import gymnasium as gym
import math
from tqdm import tqdm

env = gym.make('CartPole-v1')

def sigmoid(x):
    return 1/(1+math.exp(-x))

import random
class Bernulli():
    """ 
    Logistic regression, manual implementation
    """
    def __init__(self, size=4):
        self.w = [random.random()-0.5 for _ in range(size)]

    def p(self, state):
        weighted_sum = sum([self.w[i]*state[i] for i in range(len(state))])
        return sigmoid(weighted_sum)


    def sample(self, state):
        return self.p(state) > random.random()
    

    def log_derivative(self, index, state): # chaining log, sigmoid, linear_layer
        p = self.p(state)
        return [(index - p) * s_i for s_i in state]

    def update(self, alpha, grad):
        self.w = [self.w[i] + alpha*grad[i] for i in range(len(grad))]

    def policy_gradient(self, actions, rewards, states):
        grad = [0. for _ in range(len(self.w))]
        discount = 1.
        for i in range(len(actions)-1, -1, -1):
            cumulative_reward = 0.
            for j in range(len(rewards)-1, i-1, -1):
                cumulative_reward += (discount**(j-i))*rewards[j]

            derivatives = self.log_derivative(int(actions[i]), states[i])
            grad = [grad[i] + derivatives[i]*cumulative_reward for i in range(len(self.w))]
        return grad           


policy = Bernulli()

for _ in tqdm(range(1000)):
    observation, info = env.reset()


    episode_over = False

    rewards = []
    actions = []
    observations = [observation]
    while not episode_over:

        #action = env.action_space.sample()  # agent policy that uses the observation and info
        action = policy.sample(observation)
        actions.append(action)


        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)

        episode_over = terminated or truncated
        if not episode_over:
            observations.append(observation)

    gradients = policy.policy_gradient(actions, rewards, observations)
    policy.update(0.001, gradients)


env.close()



In [3]:
class MLPPol(torch.nn.Module):

    def __init__(self, net):
        super().__init__()
        self.net = net

    def forward(self, state):
        return self.net(torch.tensor(state, dtype=torch.float32))
    
    def sample(self, state):
        with torch.no_grad():
            try:
                return self(state).multinomial(num_samples=1, replacement=True).item() 
            except:
                print(self(state))
                raise Exception()

    def sample_training(self, state):
        probs = self(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)
    
    def reward(self, log_probs, rewards, gamma=0.99):
        loss = 0
        G = 0
        for t in reversed(range(len(rewards))):
            G = rewards[t] + gamma * G
            loss -= log_probs[t] * G  # REINFORCE
        return loss
            
    def sample_best(self,state):
        with torch.no_grad():
            try:
                # print(self(state))
                return torch.argmax(self(state)).item()
            except:
                print(self(state))
                raise Exception()

In [None]:
import torch
from tqdm import tqdm
import gymnasium as gym
# torch based implementation

input_size = 4
hidden_size = 4
output_size = 2



env = gym.make('CartPole-v1')



network = torch.nn.Sequential(
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Softmax(dim=-1)
)

policy = MLPPol(network)

optimizer = torch.optim.Adam(policy.parameters(), 0.01)

for _ in tqdm(range(1000)):
    observation, info = env.reset()


    episode_over = False

    rewards = []
    actions = []
    observations = [observation]
    log_probs = []
    
    counter = 0
    while not episode_over:
        action, log_prob = policy.sample_training(observation)
        actions.append(action)
        log_probs.append(log_prob)

        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)

        counter += 1
        episode_over = terminated or truncated
        if not episode_over:
            observations.append(observation)

    loss = policy.reward(log_probs, rewards)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()


env.close()



In [None]:

env = gym.make('CartPole-v1', render_mode="human")

for _ in range(10):
    observation, info = env.reset()


    episode_over = False

    rewards = []
    actions = []
    observations = [observation]

    counter = 0
    while not episode_over:

        #action = env.action_space.sample()  # agent policy that uses the observation and info
        action = policy.sample_best(observation)
        actions.append(action)


        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        # print(counter)
        counter +=1
        episode_over = counter > 100 #terminated or truncated#counter > 100 #truncated #terminated # or truncated
        if not episode_over:
            observations.append(observation)

    #gradients = policy.policy_gradient(actions, rewards, observations)
    #policy.update(0.01, gradients)


env.close()



In [189]:


class GridEnv(gym.Env):
    def __init__(self, size=5):
        super().__init__()
        self.size = size
        self.action_space = gym.spaces.Discrete(4)
        self.observation_space = gym.spaces.Discrete(size*size)
        self.state = None
        self.reset()
        self.obstacles = []
        self.max_steps = 10
        self.current_step = 0

    def reset(self, state=None): # observation, info
        # self.state = (0, 0)
        # random initial state 
        self.state = (torch.randint(self.size-1, (1,)).item(), torch.randint(self.size-1, (1,)).item())
        if state is not None:
            self.state = state
        self.current_step = 0
        return self.state, {}

    def set_obstacles(self, obstacles):
        self.obstacles = obstacles
        for obs in obstacles:
            if obs[0] < 0 or obs[0] >= self.size or obs[1] < 0 or obs[1] >= self.size:
                raise ValueError("Obstacle coordinates out of bounds.")
            if obs == (self.size - 1, self.size - 1):
                raise ValueError("Obstacle cannot be at the goal position.")

    def render(self, mode='human'):
        grid = [[' ' for _ in range(self.size)] for _ in range(self.size)]
        # if in bounds
        if not(self.state[0] < 0 or self.state[0] >= self.size or self.state[1] < 0 or self.state[1] >= self.size):
            grid[self.state[0]][self.state[1]] = 'A'
        for obs in self.obstacles:
            grid[obs[0]][obs[1]] = 'X'
        grid[self.size - 1][self.size - 1] = 'G'
        if mode == 'human':
            print('\n'.join(['.'.join(row) for row in grid]))
            print()
        return grid

    def translate_action_to_human(self, action):
        action_dict = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        return action_dict.get(action, "Invalid action")

    def step(self, action):
        x, y = self.state
        if action == 0:  # up
            x = x - 1
        elif action == 1:  # down
            x = x + 1
        elif action == 2:  # left
            y = y - 1
        elif action == 3:  # right
            y = y + 1

        self.state = (x, y)

        self.current_step += 1
        if self.current_step >= self.max_steps:
            return self.state, -0., False, True, {}

        if (x, y) == (self.size - 1, self.size - 1): # goal
            # print("goal")
            return self.state, 1000.0, True, False, {}
        else:
            for obs in self.obstacles: # wall
                if (x, y) == obs:
                    # print("obstacle")
                    return self.state, -1.0*(self.max_steps-self.current_step), True, False, {}
            # if our of bounds
            if x < 0 or x >= self.size or y < 0 or y >= self.size:
                # print("out of bounds")
                return self.state, -1.0*(self.max_steps-self.current_step), True, False, {}
        # If the agent moves to a valid position
        return self.state, -0.01, False, False, {}

    

env = GridEnv(size=5)
env.set_obstacles([(1, 1), (2, 2), (3, 3)])
env.render(mode='human')
env.close()

 . . . . 
 .X. . . 
 . .X. . 
 . .A.X. 
 . . . .G



In [None]:
import torch
from tqdm import tqdm

input_size = 2
hidden_size = 4
output_size = 4

network = torch.nn.Sequential(
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Softmax(dim=-1)
)

# policy = MLPPol(network)
optimizer = torch.optim.Adam(policy.parameters(), 0.01)


for _ in tqdm(range(100000)):
    observation, info = env.reset()

    episode_over = False

    rewards = []
    actions = []
    observations = [observation]
    log_probs = []
    
    counter = 0
    while not episode_over:
        action, log_prob = policy.sample_training(observation)
        actions.append(action)
        log_probs.append(log_prob)

        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        # print(observation, action, reward)

        counter += 1
        episode_over = terminated or truncated
        if not episode_over:
            observations.append(observation)

    # if sum(rewards) > 0:
    #     print(sum(rewards), len(rewards))

    loss = policy.reward(log_probs, rewards)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

env.close()


  0%|          | 0/100000 [00:00<?, ?it/s]




AttributeError: 'MLPAactuator' object has no attribute 'sample_training'

In [None]:
env = GridEnv(size=5)
env.set_obstacles([(1, 1), (2, 2), (3, 3)])

for _ in range(1):

    observation, info = env.reset(state=(0, 0))


    episode_over = False

    rewards = []
    actions = []
    observations = [observation]

    counter = 0
    while not episode_over:
        env.render()

        #action = env.action_space.sample()  # agent policy that uses the observation and info
        action = policy.sample_best(observation)
        actions.append(action)
        print(env.translate_action_to_human(action))


        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        print(sum(rewards))
        counter +=1
        episode_over = terminated or truncated 
        if not episode_over:
            observations.append(observation)

env.render()
env.close()

NameError: name 'GridEnv' is not defined

In [5]:
%load_ext tensorboard

import torch
from tqdm import tqdm
import gymnasium as gym
from gymnasium.wrappers import RecordEpisodeStatistics
from torch.utils.tensorboard import SummaryWriter
from copy import deepcopy

import datetime
log_dir = 'logs/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

env = gym.make('InvertedPendulum-v5', reset_noise_scale=0.1)

def soft_update(target, source, tau=0.001):
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)


class MLPAactuator(torch.nn.Module):

    def __init__(self, q_net, p_net, device='cpu'):
        super().__init__()
        self.q_net = q_net.to(device)
        self.p_net = p_net.to(device)
        self.targ_p_net = deepcopy(p_net)
        self.targ_q_net = deepcopy(q_net)
        self.device = device
    
    def q(self, state, action):
        sa = torch.cat([state, action], dim=1)
        return self.q_net(sa)
    
    def tq(self, state, action):
        sa = torch.cat([state, action], dim=1)
        return self.targ_q_net(sa)
    
    def p(self, state):
        return self.p_net(state)
    
    def tp(self, state):
        return self.targ_p_net(state)

    def _sample(self, state):
        return self.p_net(state)

    def sample(self, state, training=False):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        if training:
            return self._sample(state)
        with torch.no_grad():
            return self._sample(state)

    def q_loss(self, state, action, new_state, reward, is_terminal, gamma=0.99): # reward is computed after taking the action
        return (self.q(state, action) - (reward + (1-is_terminal)*gamma*self.tq(new_state, self.tp(new_state))))**2
    
    def p_loss(self, state, action, new_state, reward, is_terminal, gamma=0.99):
        return -self.q(state, self.p(state))
    
    def freeze_q(self):
        # freeze all parameters of a neural network
        for param in self.q_net.parameters():
            param.requires_grad = False
        for param in self.targ_p_net.parameters():
            param.requires_grad = False
        for param in self.targ_q_net.parameters():
            param.requires_grad = False

    def unfreeze_q(self):
        # unfreeze all parameters of a neural network
        for param in self.q_net.parameters():
            param.requires_grad = True
        for param in self.targ_p_net.parameters():
            param.requires_grad = True
        for param in self.targ_q_net.parameters():
            param.requires_grad = True
    
    def freeze_p(self):
        # freeze all parameters of a neural network
        for param in self.p_net.parameters():
            param.requires_grad = False

    def unfreeze_p(self):
        # unfreeze all parameters of a neural network
        for param in self.p_net.parameters():
            param.requires_grad = True

    def soft_update(self):
        soft_update(self.targ_p_net, self.p_net)
        soft_update(self.targ_q_net, self.q_net)

    

import random
class ReplayBuffer():

    def __init__(self):
        self.events = []
        self.sweeping_index = 0

    def add(self, states, actions, rewards):
            if len(self.events) < 100000:
                for i in range(len(states)):
                    if i == len(states)-1: # terminal
                        self.events.append([ # state, action,rewads, 
                            states[i],
                            actions[i],
                            states[i],
                            rewards[i],
                            float((i == len(states)-1)) # reaching terminal ?
                        ])
                    else:
                        # print(states[i].shape, len(actions), i)
                        self.events.append([ # state, action,rewads, 
                            states[i],
                            actions[i],
                            states[i+1],
                            rewards[i],
                            False # reaching terminal ?
                        ])
            else: # replace older samples
                for i in range(len(states)):
                    if i == len(states)-1: # terminal
                        self.events[self.sweeping_index] = [ # state, action,rewads, 
                            states[i],
                            actions[i],
                            states[i],
                            rewards[i],
                            float((i == len(states)-1)) # reaching terminal ?
                        ]
                    else:
                        # print(states[i].shape, len(actions), i)
                        self.events[self.sweeping_index] = [ # state, action,rewads, 
                            states[i],
                            actions[i],
                            states[i+1],
                            rewards[i],
                            False # reaching terminal ?
                        ]
                self.sweeping_index += 1
                if self.sweeping_index == len(self.events):
                    self.sweeping_index = 0

    def sample(self, n):
        return random.choices(self.events, k=n) 


state_size = 4
action_size = 1
hidden_size_p = 2
hidden_size_q = 16
output_size = 1

p_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size_p),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size_p, output_size),
    # torch.nn.Softmax(dim=-1)
)
q_net = torch.nn.Sequential(
    torch.nn.Linear(state_size+action_size, hidden_size_q),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size_q, output_size)
    # torch.nn.Softplus()
)

policy = MLPAactuator(q_net, p_net, device='cuda')

optimizer = torch.optim.Adam(policy.parameters(), 0.0001)
writer = SummaryWriter(log_dir)

n_episodes = 100000
n_rollouts = 5
batch_size = 256
training_iters = 5

rb = ReplayBuffer()  


for episode in tqdm(range(n_episodes)):

    for rollout in range(n_rollouts):


        observation, info = env.reset()
        episode_over = False

        rewards = []
        actions = []
        observations = [observation]
        log_probs = []
        
        while not episode_over: # unroll
            action = policy.sample(observation, training=True)

            # clip in -3,+3
            noise = torch.normal(mean=0, std=1., size=action.shape).to(action.device)
            action = torch.clip(action+noise, -3., 3.) 
            

            actions.append(action)

            observation, reward, terminated, truncated, info = env.step((action.cpu().detach().squeeze(0),))
            rewards.append(reward)

            episode_over = terminated or truncated
            if not episode_over:
                observations.append(observation)

        writer.add_scalar("reward", sum(rewards), episode*n_rollouts + rollout)
        #rewards = torch.flip(torch.cumsum(torch.flip(torch.tensor(rewards), (0,)), 0), (0, ))
        # print(rewards)
        
        rb.add(observations, actions, rewards)

    for training_iter in range(training_iters):
        batch = rb.sample(batch_size)

        bstate = torch.tensor([b[0] for b in batch], dtype=torch.float32).to(policy.device)
        baction = torch.tensor([b[1] for b in batch], dtype=torch.float32).to(policy.device).unsqueeze(1)
        bnew_state = torch.tensor([b[2] for b in batch], dtype=torch.float32).to(policy.device)
        breward = torch.tensor([b[3] for b in batch], dtype=torch.float32).to(policy.device)
        bis_terminal = torch.tensor([b[4] for b in batch], dtype=torch.float32).to(policy.device)

        optimizer.zero_grad()

        policy.freeze_p()
        q_loss = policy.q_loss(bstate, baction, bnew_state, breward, bis_terminal).sum()/batch_size
        q_loss.backward()
        policy.unfreeze_p()
        
        policy.freeze_q()
        p_loss = policy.p_loss(bstate, baction, bnew_state, breward, bis_terminal).sum()/batch_size
        p_loss.backward()
        policy.unfreeze_q()

        torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=0.5) # grad clipping

        optimizer.step()

        writer.add_scalar("Ploss", -p_loss.item(), episode*training_iters + training_iter)
        writer.add_scalar("Qloss", q_loss.item(), episode*training_iters + training_iter)

        policy.soft_update()


env.close()



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


 25%|██▌       | 25386/100000 [39:52<1:57:11, 10.61it/s]


KeyboardInterrupt: 

In [8]:

env = gym.make('InvertedPendulum-v5', render_mode="human", reset_noise_scale=0.01)

for _ in range(10):
    observation, info = env.reset()


    episode_over = False

    rewards = []
    actions = []
    observations = [observation]

    counter = 0
    while not episode_over:

        #action = env.action_space.sample()  # agent policy that uses the observation and info
        action = policy.sample(observation)
        action = torch.clip(action, -3., 3.)
        actions.append(action)
        
        observation, reward, terminated, truncated, info = env.step((action[0].cpu(),))
        rewards.append(reward)
        # print(counter)
        counter +=1
        episode_over = counter > 100 #terminated or truncated#counter > 100 #truncated #terminated # or truncated
        if not episode_over:
            observations.append(observation)

env.close()



Exception ignored in: <function WindowViewer.__del__ at 0x7fb2f3715c60>
Traceback (most recent call last):
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 359, in __del__
    self.free()
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py", line 352, in free
    if glfw.get_current_context() == self.window:
       ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/glfw/__init__.py", line 2264, in get_current_context
    return _glfw.glfwGetCurrentContext()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/glfw/__init__.py", line 628, in errcheck
    _reraise(exc[1], exc[2])
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/glfw/__init__.py", line 52, in _reraise
    raise exception.with_traceback(traceback)
  File "/local0/scra

GLFWError: (65537) b'The GLFW library is not initialized'

### PPO