In [4]:
import gymnasium as gym
import cookiedisaster
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
print(torch.__version__)

SEED=2
np.random.seed(SEED)

env = gym.make('cookiedisaster-v1')

2.2.2


In [5]:
env.reset(seed=SEED) 
# env.render()
DUMMY_STATE=env.step(0)
MAX_TIME=5*25 # 5 cookies elapse time
PUNISHMENT=-1 # may have to be adjusted

def normalize(value, min_value, max_value, scale_min=-1, scale_max=1):
    return ((value - min_value) / (max_value - min_value)) * (scale_max - scale_min) + scale_min

def preprocess_state(state):
    # Assuming state is a dictionary like:
    # {'robot': {'pos': x, 'vel': y}, 'cookie': {'pos': z, 'time': w}}
    robot_pos = normalize(state[0]['agent']['pos'], 0, 10)
    robot_vel = normalize(state[0]['agent']['vel'], -7, 7)
    cookie_pos = normalize(state[0]['cookie']['pos'], 0, 10)
    cookie_time = normalize(state[0]['cookie']['time'], 0, 5)
    distance = robot_pos - cookie_pos
    direction = 1 if distance > 0 else -1
    
    # Return the normalized state as a numpy array
    return np.array([robot_pos, robot_vel, cookie_pos, cookie_time,distance, direction])


class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, state):
        return self.network(state)

class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    
    def forward(self, state):
        return self.network(state)

input_dim = 6  # From preprocess_state function
output_dim = env.action_space.n

actor = Actor(input_dim, output_dim)
critic = Critic(input_dim)


actor_optimizer = optim.Adam(actor.parameters(), lr=0.001)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.001)

num_episodes = 1000
gamma = 0.99  # Discount factor for future rewards

episode_rewards = []  # Track rewards per episode
action_counts = np.zeros(output_dim)
def train_actor_critic(env, num_episodes=10, gamma=0.99, actor_lr=0.001, critic_lr=0.001):
    # Environment and model parameters
    input_dim = 6  # From preprocess_state function
    output_dim = env.action_space.n

    # Initialize models
    actor = Actor(input_dim, output_dim)
    critic = Critic(input_dim)

    # Initialize optimizers
    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)
    
    # Training loop
    
    
    for episode in range(num_episodes):
        state = env.reset()
        state = preprocess_state(state)
        done = False
        total_reward = 0
        count = 0
        epsilon_start = 1.0
        epsilon_min = 0.01
        epsilon_decay = 0.995

        while not done:
            
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = actor(state_tensor)
            distribution = torch.distributions.Categorical(action_probs)
            epsilon = max(epsilon_min, epsilon_start * epsilon_decay**episode)
            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Random action
            else:
                action = distribution.sample()
            
            action_counts[action] += 1
            next_state= env.step(action.item())
            reward = next_state[1]
            done = next_state[2]
            
            if next_state[1]>0:
                break
            # missed 5 cookies (MAX_TIME), minus points and break
            if count>MAX_TIME:
                # env._cumulative_reward+=PUNISHMENT
                # total_reward+=PUNISHMENT
                break
            count+=1
            # if count>100:
            #     print('count',count)
            #     break
            
            next_state = preprocess_state(next_state)
            
            total_reward += reward
            
            # Critic update
            value = critic(state_tensor)
            next_value = critic(torch.FloatTensor(next_state).unsqueeze(0))
            td_error = reward + gamma * next_value * (1 - int(done)) - value
            critic_loss = td_error.pow(2)
            
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()
            
            # Actor update
            action = torch.tensor(action)
            actor_loss = -distribution.log_prob(action) * td_error.detach()
            
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()
            
            state = next_state
            
        
        episode_rewards.append(total_reward)
        if episode % 100 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")
    
    return actor, critic, episode_rewards, action_counts, episode_rewards

def evaluate_policy(env, actor, num_episodes=100):
    total_rewards = 0.0
    rewards = []    
    for _ in range(num_episodes):
        state = env.reset()
        state = preprocess_state(state)
        done = False
        total_reward = 0
        count = 0

        while not done:
            if count > MAX_TIME:  # Break if max time exceeded without positive reward
                break
            
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            action_probs = actor(state_tensor)
            distribution = torch.distributions.Categorical(action_probs)
            action = distribution.sample()
            
            next_state= env.step(action.item())
            reward = next_state[1]
            rewards.append(reward)
            done = next_state[2]
            
            if next_state[1]>0:
                break
            
            if count>MAX_TIME:
                
                break
            count+=1
          
            
            next_state = preprocess_state(next_state)
            
            
            total_reward += reward

       

    average_reward = total_rewards / num_episodes
    print(f"Average Reward over {num_episodes} episodes: {average_reward}")
    return average_reward, rewards

def find_optimal_policy(env, initial_actor_lr=0.001, initial_critic_lr=0.001, gamma=0.99, threshold_reward=None, max_iterations=10, eval_episodes=10):
    actor_lr = initial_actor_lr
    critic_lr = initial_critic_lr
    
    best_average_reward = float('-inf')
    best_actor = None

    for iteration in range(max_iterations):
        print(f"Iteration {iteration + 1}/{max_iterations}: Training Started")
        actor, critic, episode_rewards, action_counts, episode_rewards= train_actor_critic(env, num_episodes=num_episodes, gamma=gamma, actor_lr=actor_lr, critic_lr=critic_lr)
        average_reward, rewards = evaluate_policy(env, actor, num_episodes=eval_episodes)
        print(f"Iteration {iteration + 1}: Average Reward = {average_reward}")
        
        # Check if the current model is the best one
        if average_reward > best_average_reward:
            best_average_reward = average_reward
            best_actor = actor
            print("New best model found!")
            
            # Optional: Adjust learning rates based on performance, implement your strategy here
            # actor_lr *= learning_rate_decay
            # critic_lr *= learning_rate_decay
            
        # If a threshold is defined and met, stop training
        if threshold_reward is not None and average_reward >= threshold_reward:
            print(f"Desired threshold reward of {threshold_reward} achieved.")
            break
        
        # Optional: Implement additional logic to adjust training parameters or terminate early
    
    return best_actor, best_average_reward, rewards

best_actor, best_reward, rewards = find_optimal_policy(env, initial_actor_lr=0.001, initial_critic_lr=0.001, threshold_reward=-2.5, max_iterations=num_episodes)
print(f"Best average reward achieved: {best_reward}")


  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Iteration 1/1000: Training Started
Episode 0, Total Reward: -0.5


  action = torch.tensor(action)


Episode 100, Total Reward: 0
Episode 200, Total Reward: 0
Episode 300, Total Reward: -2.5
Episode 400, Total Reward: -2.5
Episode 500, Total Reward: -2.5
Episode 600, Total Reward: -2.5
Episode 700, Total Reward: -1.0
Episode 800, Total Reward: -2.5
Episode 900, Total Reward: -0.5
Average Reward over 10 episodes: 0.0
Iteration 1: Average Reward = 0.0
New best model found!
Desired threshold reward of -2.5 achieved.
Best average reward achieved: 0.0


In [6]:
def test_actor_critic(env, actor, episodes=num_episodes):
    total_rewards = []
    state = env.reset()
    total_reward = 0
    
    for _ in range(episodes):
        state = preprocess_state(state)  # Adjust based on your state preprocessing
        done = False
      
        count = 0
        
        
      
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = actor(state_tensor)
        action = torch.argmax(q_values).item()
        next_state= env.step(action)
        reward = next_state[1]
        done = next_state[2]    
        # next_state = preprocess_state(next_state)
        state = next_state
        total_reward += reward
        
        total_rewards.append(total_reward)
    
    avg_reward = np.mean(total_rewards)
    print(f"Average reward over {episodes} episodes: {avg_reward}")
    return avg_reward, total_rewards


In [7]:
env = gym.make('cookiedisaster-v1',render_mode='human')
avg_act, act_rewards=test_actor_critic(env, best_actor, episodes=500)
env.close()


  logger.deprecation(
  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


KeyboardInterrupt: 

In [8]:
env.close()