In [129]:
import torch
import torch.nn as nn
from collections import namedtuple
import gym
import numpy as np
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

In [130]:
# Tunable settings consumed by the training script further down.
HIDDEN_SIZE = 128       # passed to Net as the hidden layer width
BATCH_SIZE = 16         # number of episodes sampled per training iteration
PERCENTILE = 70         # reward percentile used as the cut-off in filter_batch
MAX_ITER = 10_000_000   # upper bound on the number of training iterations

In [131]:
class Net(nn.Module):
    """Two-layer fully connected network with a ReLU in between.

    The output of ``forward`` is the raw result of the second linear layer;
    no softmax is applied inside the module.
    """

    def __init__(self, num_features:int, hidden_size:int, num_actions:int):
        """Create the layers.

        Args:
            num_features: size of the input vector (``in_features`` of the
                first linear layer).
            hidden_size: width of the hidden layer.
            num_actions: size of the output vector (``out_features`` of the
                second linear layer).
        """
        super().__init__()
        # Layers are registered as linear1 -> linear2 -> relu.
        self.linear1 = nn.Linear(num_features, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply linear1, then ReLU, then linear2 to ``x`` and return the result."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

In [132]:
# One finished rollout: the summed reward and the list of recorded steps.
Episode = namedtuple('Episode', ['total_rewards', 'steps'])
# One recorded step: the state the agent saw and the action it then took.
episodeStep = namedtuple('episodeStep', ['state', 'action'])

In [133]:
def create_episode(env:gym.Env, net:Net):
    """Roll out one episode in ``env``, sampling actions from ``net``.

    The network output for the current state is turned into a probability
    distribution with a softmax, and the action is drawn from it with
    ``np.random.choice``.

    Args:
        env: environment using the 5-tuple ``step`` API
            (``obs, reward, terminated, truncated, info``) and a ``reset``
            whose first returned element is the initial observation.
        net: policy network mapping a state tensor to one score per action.

    Returns:
        An ``Episode`` whose ``total_rewards`` is the sum of all step rewards
        and whose ``steps`` is a list of ``episodeStep(state, action)`` where
        ``state`` is the observation the action was chosen from.

    Side effects:
        Puts ``net`` into eval mode (``net.eval()``); callers that train
        afterwards must switch back with ``net.train()``.
    """
    net.eval()
    total_rewards = 0
    steps = []
    sm = nn.Softmax(dim=0)

    # reset() returns a tuple; its first element is the initial observation.
    current_state = env.reset()[0]

    while True:
        # Pure inference: skip building an autograd graph for the rollout.
        with torch.no_grad():
            current_state_tensor = torch.as_tensor(current_state, dtype=torch.float32)
            action_prob = sm(net(current_state_tensor)).numpy()
        action = np.random.choice(env.action_space.n, p=action_prob)

        next_state, reward, terminated, truncated, info = env.step(action=action)
        total_rewards += reward
        steps.append(episodeStep(state=current_state, action=action))

        # The episode is over both when the environment reports termination
        # and when it reports truncation (e.g. a step limit). Ignoring the
        # truncation flag would keep stepping an already finished episode.
        if terminated or truncated:
            return Episode(total_rewards=total_rewards, steps=steps)

        current_state = next_state
    

In [134]:
def create_batch(env:gym.Env, net:Net, batch_size:int):
    """Collect ``batch_size`` episodes by calling ``create_episode`` repeatedly.

    Args:
        env: environment handed through to ``create_episode``.
        net: policy network handed through to ``create_episode``.
        batch_size: number of episodes to generate.

    Returns:
        A list with one ``create_episode`` result per requested episode, in
        the order they were generated.
    """
    return [create_episode(env, net) for _ in range(batch_size)]
    

In [135]:
def filter_batch(batch, percentile):
    """Keep the steps of the episodes whose reward reaches the given percentile.

    Args:
        batch: iterable of episode records exposing ``total_rewards`` and
            ``steps``; every step exposes ``state`` and ``action``.
        percentile: percentile (as accepted by ``np.percentile``) of the
            batch rewards used as the cut-off.

    Returns:
        A 4-tuple ``(training_states, training_actions, reward_percentile,
        mean_reward)``: the flattened states and actions of every episode
        that is not below the cut-off, the cut-off itself, and the mean
        reward over the whole (unfiltered) batch.
    """
    episode_rewards = [episode.total_rewards for episode in batch]
    reward_bound = np.percentile(episode_rewards, percentile)
    reward_mean = np.mean(episode_rewards)

    # Episodes strictly below the bound are discarded; ties with the bound stay.
    kept_steps = [
        step
        for episode in batch
        if not (episode.total_rewards < reward_bound)
        for step in episode.steps
    ]
    kept_states = [step.state for step in kept_steps]
    kept_actions = [step.action for step in kept_steps]

    return kept_states, kept_actions, reward_bound, reward_mean
        
        
        

In [136]:
if __name__=="__main__":
    # Training script: repeatedly sample a batch of episodes with the current
    # policy, keep the ones at or above the PERCENTILE reward bound, and fit
    # the network to reproduce the actions taken in those episodes.
    env = gym.make("CartPole-v0")

    # Size the network from the environment's observation and action spaces.
    num_features = env.observation_space.shape[0]
    num_actions = env.action_space.n
    net = Net(num_features=num_features, hidden_size=HIDDEN_SIZE, num_actions=num_actions)

    # CrossEntropyLoss takes the raw network outputs plus integer action labels.
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(params = net.parameters(), lr = 0.01)

    writer = SummaryWriter(log_dir='cart_pole_tb')
    try:
        for i in range(MAX_ITER):
            batch = create_batch(env, net, BATCH_SIZE)
            training_states, training_actions, reward_percentile, mean_reward = filter_batch(batch=batch, percentile=PERCENTILE)

            # Stack the per-step observations into one array before building
            # the tensor; converting a Python list of ndarrays directly is
            # the slow path in PyTorch.
            states_tensor = torch.as_tensor(np.asarray(training_states), dtype=torch.float32)
            actions_tensor = torch.as_tensor(np.asarray(training_actions), dtype=torch.long)

            # create_episode leaves the network in eval mode; switch back.
            net.train()
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = net(states_tensor)
            loss = criterion(outputs, actions_tensor)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            # Print statistics
            print('Current iteration: {}, mean reward: {}'.format(i, mean_reward))

            writer.add_scalar("loss", loss.item(), i)
            writer.add_scalar("reward_bound", reward_percentile, i)
            writer.add_scalar("mean_reward", mean_reward, i)

            # Stop once the batch's mean reward exceeds 199.
            if mean_reward > 199:
                break
    finally:
        # Always flush the TensorBoard log and release the environment, even
        # when training is interrupted or raises.
        writer.close()
        env.close()
        

Current iteration: 0, mean reward: 30.5
Current iteration: 1, mean reward: 28.0
Current iteration: 2, mean reward: 41.0
Current iteration: 3, mean reward: 29.0
Current iteration: 4, mean reward: 30.5
Current iteration: 5, mean reward: 49.5
Current iteration: 6, mean reward: 40.5
Current iteration: 7, mean reward: 55.0
Current iteration: 8, mean reward: 61.5
Current iteration: 9, mean reward: 68.0
Current iteration: 10, mean reward: 80.0
Current iteration: 11, mean reward: 60.5
Current iteration: 12, mean reward: 81.5
Current iteration: 13, mean reward: 89.0
Current iteration: 14, mean reward: 97.5
Current iteration: 15, mean reward: 93.0
Current iteration: 16, mean reward: 104.0
Current iteration: 17, mean reward: 155.0
Current iteration: 18, mean reward: 117.0
Current iteration: 19, mean reward: 105.0
Current iteration: 20, mean reward: 145.5
Current iteration: 21, mean reward: 103.5
Current iteration: 22, mean reward: 104.0
Current iteration: 23, mean reward: 156.0
Current iteration:

In [137]:
# env = gym.make("CartPole-v0")
# current_state = env.reset()[0]
# current_state = torch.FloatTensor(current_state)

In [138]:
# num_features = env.observation_space.shape[0]
# num_actions = env.action_space.n
# net = Net(num_features=num_features, hidden_size=HIDDEN_SIZE, num_actions=num_actions)