In [1]:
import os
# Set in the very first cell, i.e. before numpy/torch are imported in the next one.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that
# can occur when several libraries each bundle their own OpenMP — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [2]:
from typing import List


import gym
import numpy
import torch
import torch.nn
import torch.optim
import torch.distributions

In [3]:
# REINFORCE (Monte-Carlo policy gradient), as described in
# "Foundations of Deep Reinforcement Learning", chapter 2.
seed = 827
# Seed PyTorch too: the policy's weight initialisation and its action sampling
# both draw from torch's global RNG, so seeding only the environment is not
# enough to make a run repeatable.
torch.manual_seed(seed)


class REINFORCEPolicy(torch.nn.Module):
    """Small MLP policy trained with the REINFORCE policy-gradient algorithm.

    The network maps an observation to one logit per discrete action. During an
    episode, ``action()`` records the log-probability of every sampled action in
    ``self.log_probs`` and the caller appends the matching reward to
    ``self.rewards``. ``train_episode()`` then performs one optimizer step from
    those two buffers.

    Args:
        in_dim: Size of the observation vector.
        out_dim: Number of discrete actions.
        gamma: Discount factor applied to future rewards.
    """

    def __init__(self, in_dim: int, out_dim: int, gamma: float):
        super().__init__()
        # Per-episode buffers; log_probs holds 0-dim tensors that are still
        # attached to the autograd graph, rewards holds plain numbers.
        self.log_probs: List[torch.Tensor] = []
        self.rewards: List[float] = []
        self.gamma = gamma
        self.model = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, out_dim)
        )
        self.train()

    def reset_episode(self):
        """Clear the per-episode log-probability and reward buffers."""
        self.log_probs = []
        self.rewards = []

    def forward(self, x: torch.Tensor):
        """Return the action logits for observation tensor ``x``."""
        return self.model(x)

    def action(self, env_state) -> int:
        """Sample an action for ``env_state`` and remember its log-probability.

        Args:
            env_state: Observation as a numpy array or any array-like
                (list, tuple) of numbers.

        Returns:
            The sampled action index as a Python int.
        """
        # numpy.array always copies, so the tensor never aliases a buffer owned
        # by the environment, and non-ndarray observations are accepted too.
        x = torch.from_numpy(numpy.array(env_state, dtype=numpy.float32))
        prob_dist_param = self.forward(x)
        prob_dist = torch.distributions.Categorical(logits=prob_dist_param)
        action = prob_dist.sample()  # a ~ pi(a|s)
        log_prob = prob_dist.log_prob(action)
        self.log_probs.append(log_prob)  # store for post-episode training

        return action.item()

    def _discounted_returns(self) -> numpy.ndarray:
        """Return R_t = r_t + gamma * R_{t+1} for every step of the stored episode."""
        num_rewards = len(self.rewards)
        returns = numpy.empty(num_rewards, dtype=numpy.float32)
        future_return = 0.0

        # Walk backwards so each return reuses the one computed for the next step.
        for t in reversed(range(num_rewards)):
            future_return = self.rewards[t] + self.gamma * future_return
            returns[t] = future_return

        return returns

    def train_episode(self, optimizer):
        """Run one REINFORCE update from the rewards of the current episode.

        Args:
            optimizer: Optimizer built over this module's parameters.

        Returns:
            The episode loss as a detached 0-dim tensor. An episode with no
            steps yields a zero loss and leaves the weights untouched.

        Raises:
            ValueError: If the number of stored log-probabilities differs from
                the number of stored rewards.
        """
        num_rewards = len(self.rewards)
        if len(self.log_probs) != num_rewards:
            # Without this check a single log-prob would silently broadcast
            # against a longer return vector and produce a wrong loss.
            raise ValueError(
                f'log_probs ({len(self.log_probs)}) and rewards ({num_rewards}) '
                'must have the same length'
            )
        if num_rewards == 0:
            # torch.stack rejects an empty list; there is nothing to learn from.
            return torch.zeros(())

        returns = self._discounted_returns()

        # Minimising -sum(log pi(a_t|s_t) * R_t) with a descent optimizer is
        # gradient ascent on the expected return.
        loss = torch.sum(-torch.stack(self.log_probs) * torch.tensor(returns))
        optimizer.zero_grad()
        loss.backward()  # backpropagate
        optimizer.step()  # update weights
        # Detach so the caller does not keep the autograd graph alive.
        return loss.detach()

    
def train(env, num_episodes: int, max_steps_per_episode: int, min_solves: int, gamma: float,
          learning_rate: float = 0.001) -> bool:
    """Train a fresh REINFORCEPolicy on ``env``, one update per episode.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``reset()`` returning ``(state, info)`` and ``step()`` returning
            ``(state, reward, terminated, truncated, info)``.
        num_episodes: Maximum number of episodes to run.
        max_steps_per_episode: Step cap for a single episode.
        min_solves: Number of episodes (not necessarily consecutive) whose total
            reward must exceed the environment's reward threshold.
        gamma: Discount factor handed to the policy.
        learning_rate: Adam learning rate.

    Returns:
        True as soon as ``min_solves`` episodes have beaten the threshold,
        False if ``num_episodes`` run out first.
    """
    policy = REINFORCEPolicy(in_dim=env.observation_space.shape[0], out_dim=env.action_space.n, gamma=gamma)
    optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)

    # The spec or its threshold may be missing; resolve it once and treat
    # "no threshold" as "never solved" instead of crashing on the comparison.
    reward_threshold = getattr(getattr(env, 'spec', None), 'reward_threshold', None)
    solves_remaining = min_solves

    for ep in range(num_episodes):
        state, _info = env.reset()
        policy.reset_episode()

        for _step in range(max_steps_per_episode):
            action = policy.action(state)
            state, reward, terminated, truncated, _info = env.step(action)
            policy.rewards.append(reward)

            if terminated or truncated:
                break

        # train once per episode
        loss = policy.train_episode(optimizer)
        episode_rewards = sum(policy.rewards)

        print(f'Episode {ep:03}: Loss={float(loss):10.03f}, Rewards={episode_rewards}/{reward_threshold}')

        if reward_threshold is not None and episode_rewards > reward_threshold:
            print('Solved!')
            solves_remaining -= 1

        if solves_remaining <= 0:
            return True

    return False


In [4]:
# Create the CartPole-v1 environment in "human" render mode.
env = gym.make("CartPole-v1", render_mode="human")
# NOTE(review): presumably raises the render frame rate so the window does not
# slow training down — confirm against the installed gym version.
env.metadata["render_fps"] = 300
# Seed the action-space sampler and the environment itself with the notebook-wide seed.
env.action_space.seed(seed)
_ = env.reset(seed=seed)  # assigned to `_` so the cell shows no output

In [5]:
# Run up to 2000 episodes of at most 500 steps each; train() returns True once
# 50 episodes have scored above the environment's reward threshold.
train(env, num_episodes=2000, max_steps_per_episode=500, min_solves=50, gamma=0.99)

Episode 000: Loss=    62.603, Rewards=14.0/475.0
Episode 001: Loss=    72.637, Rewards=15.0/475.0
Episode 002: Loss=   409.788, Rewards=35.0/475.0
Episode 003: Loss=   132.416, Rewards=19.0/475.0
Episode 004: Loss=   332.350, Rewards=32.0/475.0
Episode 005: Loss=    70.363, Rewards=15.0/475.0
Episode 006: Loss=   186.964, Rewards=24.0/475.0
Episode 007: Loss=   419.581, Rewards=36.0/475.0
Episode 008: Loss=    75.672, Rewards=15.0/475.0
Episode 009: Loss=    47.176, Rewards=12.0/475.0
Episode 010: Loss=   109.137, Rewards=18.0/475.0
Episode 011: Loss=    42.120, Rewards=10.0/475.0
Episode 012: Loss=   256.234, Rewards=28.0/475.0
Episode 013: Loss=   660.743, Rewards=46.0/475.0
Episode 014: Loss=    96.832, Rewards=17.0/475.0
Episode 015: Loss=    96.759, Rewards=17.0/475.0
Episode 016: Loss=    31.828, Rewards=10.0/475.0
Episode 017: Loss=    65.027, Rewards=14.0/475.0
Episode 018: Loss=    40.887, Rewards=11.0/475.0
Episode 019: Loss=   172.140, Rewards=23.0/475.0
Episode 020: Loss=  

Episode 168: Loss=   687.156, Rewards=50.0/475.0
Episode 169: Loss=   822.670, Rewards=55.0/475.0
Episode 170: Loss=    55.597, Rewards=12.0/475.0
Episode 171: Loss=   123.863, Rewards=19.0/475.0
Episode 172: Loss=  4775.161, Rewards=151.0/475.0
Episode 173: Loss=   351.969, Rewards=34.0/475.0
Episode 174: Loss=   239.735, Rewards=28.0/475.0
Episode 175: Loss=   134.682, Rewards=20.0/475.0
Episode 176: Loss=   938.250, Rewards=60.0/475.0
Episode 177: Loss=   254.523, Rewards=29.0/475.0
Episode 178: Loss=   419.964, Rewards=38.0/475.0
Episode 179: Loss=  1829.737, Rewards=86.0/475.0
Episode 180: Loss=   243.546, Rewards=28.0/475.0
Episode 181: Loss=   602.112, Rewards=47.0/475.0
Episode 182: Loss=   652.357, Rewards=48.0/475.0
Episode 183: Loss=   597.538, Rewards=45.0/475.0
Episode 184: Loss=  4092.723, Rewards=138.0/475.0
Episode 185: Loss=    68.902, Rewards=14.0/475.0
Episode 186: Loss=   328.439, Rewards=33.0/475.0
Episode 187: Loss=   235.292, Rewards=27.0/475.0
Episode 188: Loss=

Episode 334: Loss= 11709.357, Rewards=298.0/475.0
Episode 335: Loss=  9073.451, Rewards=252.0/475.0
Episode 336: Loss= 12361.185, Rewards=302.0/475.0
Episode 337: Loss=  6685.950, Rewards=205.0/475.0
Episode 338: Loss=  1180.666, Rewards=68.0/475.0
Episode 339: Loss=  4297.975, Rewards=153.0/475.0
Episode 340: Loss=  4568.886, Rewards=161.0/475.0
Episode 341: Loss=  4638.196, Rewards=162.0/475.0
Episode 342: Loss=  2787.562, Rewards=111.0/475.0
Episode 343: Loss=  9084.835, Rewards=248.0/475.0
Episode 344: Loss=  6597.555, Rewards=199.0/475.0
Episode 345: Loss=  4914.833, Rewards=164.0/475.0
Episode 346: Loss=  4625.227, Rewards=165.0/475.0
Episode 347: Loss=  9315.501, Rewards=255.0/475.0
Episode 348: Loss=  4009.093, Rewards=146.0/475.0
Episode 349: Loss=  4051.135, Rewards=146.0/475.0
Episode 350: Loss=  4029.096, Rewards=143.0/475.0
Episode 351: Loss=  7168.767, Rewards=208.0/475.0
Episode 352: Loss=  7988.683, Rewards=222.0/475.0
Episode 353: Loss=  5976.335, Rewards=190.0/475.0
E

Episode 498: Loss=  5900.879, Rewards=183.0/475.0
Episode 499: Loss=  2548.760, Rewards=108.0/475.0
Episode 500: Loss= 20147.072, Rewards=458.0/475.0
Episode 501: Loss= 22799.438, Rewards=500.0/475.0
Solved!
Episode 502: Loss= 22710.582, Rewards=500.0/475.0
Solved!
Episode 503: Loss=  7695.150, Rewards=214.0/475.0
Episode 504: Loss=  6088.067, Rewards=183.0/475.0
Episode 505: Loss=  7041.023, Rewards=206.0/475.0
Episode 506: Loss=  8523.514, Rewards=236.0/475.0
Episode 507: Loss= 11803.944, Rewards=300.0/475.0
Episode 508: Loss=  8488.762, Rewards=237.0/475.0
Episode 509: Loss= 11862.623, Rewards=301.0/475.0
Episode 510: Loss= 11106.145, Rewards=288.0/475.0
Episode 511: Loss= 14962.786, Rewards=356.0/475.0
Episode 512: Loss= 20914.402, Rewards=461.0/475.0
Episode 513: Loss=  4166.455, Rewards=151.0/475.0
Episode 514: Loss=  6895.925, Rewards=206.0/475.0
Episode 515: Loss= 11801.699, Rewards=305.0/475.0
Episode 516: Loss=  2874.917, Rewards=117.0/475.0
Episode 517: Loss=  2094.505, Rewa

True

In [6]:
# NOTE(review): the reset right before close() looks unnecessary — confirm before removing.
env.reset()
# Shut the environment down.
env.close()