### Solving the CartPole problem with Logistic regression

Here we show an implementation of the REINFORCE algorithm to learn the parameters for solving the CartPole problem on a finite time horizon. 
The policy is a Bernoulli distribution over the two actions: we need to learn its parameter $p$, which is parametrized as $p = \sigma(Xw)$, where $X$ is the state
of the CartPole system and $w$ is the weight vector being learned.

In [4]:
import gymnasium as gym
import math
from tqdm import tqdm
import torch

# Environment stepped by the training loop below (created without a render mode).
env = gym.make('CartPole-v1')

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of a scalar ``x``.

    The two branches are algebraically identical. Each one only ever
    exponentiates a non-positive number, so ``math.exp`` cannot overflow;
    the single-formula version raises ``OverflowError`` once ``x`` drops
    below roughly -709.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)  # underflows harmlessly to 0.0 for very negative x
    return exp_x / (1 + exp_x)

import random
class Bernulli():
    """
    Logistic regression, manual implementation.

    Models a Bernoulli policy over two actions: the probability of the
    positive action in a state is ``sigmoid(w . state)``. The weights ``w``
    are kept as a plain list of floats and are trained with the REINFORCE
    policy gradient.
    """

    def __init__(self, size=4):
        """Draw ``size`` initial weights uniformly from [-0.5, 0.5)."""
        self.w = [random.random() - 0.5 for _ in range(size)]

    def p(self, state):
        """Forward pass: probability of the positive action in ``state``."""
        # Indexing ``self.w`` by position (rather than zipping) keeps the
        # IndexError when ``state`` has more entries than there are weights.
        weighted_sum = sum(self.w[k] * s_k for k, s_k in enumerate(state))
        return sigmoid(weighted_sum)

    def sample(self, state):
        """Randomly sample an action: true with probability ``p(state)``."""
        return self.p(state) > random.random()

    def sample_best(self, state):
        """Greedy (deterministic) action: true iff ``p(state)`` exceeds 0.5."""
        return self.p(state) > 0.5

    def log_derivative(self, index, state):
        """Gradient of the log-probability of action ``index`` w.r.t. ``w``.

        ``index`` is the action taken (0 or 1). Chaining log, sigmoid and the
        linear layer collapses to ``(index - p) * state``.
        """
        p = self.p(state)
        return [(index - p) * s_k for s_k in state]

    def update(self, alpha, grad):
        """Gradient-ascent step: ``w <- w + alpha * grad``."""
        self.w = [self.w[k] + alpha * g_k for k, g_k in enumerate(grad)]

    def policy_gradient(self, actions, rewards, states, discount=1.):
        """Compute the REINFORCE gradient estimate for one episode.

        Args:
            actions: action taken at every step (anything ``int()`` accepts).
            rewards: reward received after every step.
            states: ``states[t]`` is the state in which ``actions[t]`` was taken.
            discount: discount factor applied to future rewards. The default
                of 1 means the undiscounted reward-to-go.

        Returns:
            A list with one entry per weight: the sum over steps of
            ``log_derivative(action_t, state_t)`` scaled by the (discounted)
            sum of rewards from step ``t`` onwards.
        """
        # Rewards-to-go in one backward pass instead of re-summing the tail of
        # the episode for every step:
        #   returns[t] = rewards[t] + discount * returns[t + 1]
        returns = [0.] * len(rewards)
        running = 0.
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + discount * running
            returns[t] = running

        grad = [0.] * len(self.w)
        for t in range(len(actions) - 1, -1, -1):
            # A step with no recorded reward contributes a zero return.
            reward_to_go = returns[t] if t < len(returns) else 0.
            derivatives = self.log_derivative(int(actions[t]), states[t])
            grad = [g_k + derivatives[k] * reward_to_go
                    for k, g_k in enumerate(grad)]
        return grad

# Training

NUM_EPISODES = 1000    # one REINFORCE update is performed per episode
LEARNING_RATE = 0.001  # step size of the gradient-ascent update

policy = Bernulli() # create the policy


### -- training loop

for _ in tqdm(range(NUM_EPISODES)):
    observation, info = env.reset()

    episode_over = False

    # Trajectory of the current episode: observations[t] is the state in which
    # actions[t] was chosen, rewards[t] is the reward that action earned.
    rewards = []
    actions = []
    observations = [observation]
    while not episode_over:
        # The policy returns a bool; CartPole has a Discrete(2) action space,
        # so hand the environment an explicit 0/1 integer.
        action = int(policy.sample(observation))
        actions.append(action)

        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)

        episode_over = terminated or truncated
        if not episode_over:
            # No action is taken from the final observation, so it is left
            # out to keep the three lists aligned.
            observations.append(observation)

    # One gradient-ascent step on the finished episode.
    gradients = policy.policy_gradient(actions, rewards, observations)
    policy.update(LEARNING_RATE, gradients)

env.close()

100%|██████████| 1000/1000 [00:16<00:00, 61.92it/s]


### Visualizer

In [5]:

env = gym.make('CartPole-v1', render_mode="human")

try:
    for _ in range(10):
        observation, info = env.reset()

        episode_over = False

        # Trajectory of the rendered episode; recorded only, no learning
        # happens in this cell.
        rewards = []
        actions = []
        observations = [observation]

        counter = 0
        while not episode_over:
            # Greedy action, cast to the 0/1 integer a Discrete(2) action
            # space expects.
            action = int(policy.sample_best(observation))
            actions.append(action)

            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)
            counter += 1

            # Stop as soon as the environment reports the end of the episode
            # (stepping past that point is not supported without a reset),
            # and otherwise cap the rendering at the same step limit as before.
            episode_over = terminated or truncated or counter > 100
            if not episode_over:
                observations.append(observation)
finally:
    # Always release the render window, even if an episode raises.
    env.close()

