In [1]:
import gymnasium as gym
import cookiedisaster
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
print(torch.__version__)

SEED=2
np.random.seed(SEED)

env = gym.make('cookiedisaster/GridWorld-v0')

2.2.2


# chat 2.0

In [2]:
def feature_engineering(state, action):
    """
    Feature engineering function to convert state and action into a feature vector.

    Parameters:
    - state: Raw state information from the environment.
    - action: Chosen action by the agent.

    Returns:
    - features: Feature vector representing the state-action pair.
    """
    # Extract relevant information from the state
    agent_pos = state['agent']['pos']
    agent_vel = state['agent']['vel']
    cookie_pos = state['cookie']['pos']
    cookie_time = state['cookie']['time']

    # Initialize feature vector
    features = []

    # Relative position of the agent and the cookie
    relative_pos = cookie_pos - agent_pos
    features.append(relative_pos)

    # Direction of the cookie
    direction = 1 if relative_pos > 0 else -1
    features.append(direction)

    # Agent velocity
    features.append(agent_vel)

    # Time remaining for the cookie to disappear
    features.append(cookie_time)

    # Cookie position
    features.append(cookie_pos)

    # Agent position
    features.append(agent_pos)

    # # Feature 4: Squared terms for position difference and velocity
    # features.append(relative_pos ** 2)
    # features.append(agent_vel ** 2)

    # # Feature 5: Interaction term between velocity and relative position
    # features.append(agent_vel * relative_pos)

    # Optionally, include features related to the action
    # These could be one-hot encoded or numerical values representing the chosen action
    features.append(action)

    # proximity to the wall
    features.append(10-agent_pos)

    # Convert the feature vector to numpy array for compatibility with agent's update function
    features=np.array(features)
    # print("features",features)
    # Normalize features to have mean=0 and variance=1
    features = (features - np.mean(features)) / np.std(features)
    # print("features",features)
    return features



In [3]:
class LinearQAgent:
    def __init__(self, num_features, learning_rate=0.05, gamma=0.99,l2_reg=0.01):
        self.weights = np.zeros(num_features)
        self.learning_rate = learning_rate # Step size
        self.gamma = gamma # Discount factor
        self.l2_reg = l2_reg # L2 regularization strength

    def predict_q(self, features):
        return np.dot(self.weights, features)

    def update(self, features, reward, next_features, done):
        q_current = self.predict_q(features)
        q_next_max = 0 if done else np.max([self.predict_q(next_features) for a in range(3)])  # Assuming 3 actions
        target = reward + self.gamma * q_next_max
        error = target - q_current
        self.weights += self.learning_rate * (error * features- self.l2_reg * self.weights)
        # print(self.weights)

    def choose_action(self, state, epsilon=0.1):
        if np.random.rand() < epsilon:  # Exploration
            return np.random.choice([0, 1, 2])  # Assuming actions are 0, 1, 2
        else:  # Exploitation
            q_values = [self.predict_q(feature_engineering(state, a)) for a in range(3)]
            return np.argmax(q_values)




In [4]:
def train_agent(env, agent, feature_engineering, number_of_episodes=1000, episode_length=1000, epsilon_start=1.0, epsilon_decay=0.995, epsilon_min=0.1):
    """
    Trains the agent in the given environment.

    Parameters:
    - env: The environment instance.
    - agent: The agent instance.
    - feature_engineering: Function to convert state and action into features.
    - number_of_episodes: Total number of episodes for training.
    - episode_length: The maximum length of an episode.
    - epsilon_start: Starting value of epsilon for ε-greedy policy.
    - epsilon_decay: The decay rate of epsilon after each episode.
    - epsilon_min: The minimum value of epsilon.
    """
    epsilon = epsilon_start
    state = env.reset()[0]
    for episode in range(number_of_episodes):
        
        total_reward = 0
        
        for _ in range(episode_length):
            action = agent.choose_action(state, epsilon)
            next_state, reward, _, _, _ = env.step(action)
            
            # Feature Engineering for current and next state
            features = feature_engineering(state, action)
            next_features = feature_engineering(next_state, action)  # For simplicity
            
            # Update the agent
            agent.update(features, reward, next_features, False)  # Force 'done' to False since the episode doesn't naturally end
            
            state = next_state
            total_reward += reward
        
        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Logging
        if (episode + 1) % 10 == 0:  # Log every 100 episodes
            print(f"Episode {episode + 1}: Total Reward = {total_reward}, Epsilon = {epsilon}")
    # return total_reward


In [5]:
def test_agent(env, agent, feature_engineering, num_episodes=3, max_steps_per_episode=200):
    """
    Test the agent in the given environment with a maximum number of steps and render its performance.

    Parameters:
    - env: The environment instance.
    - agent: The trained agent instance.
    - feature_engineering: Function to convert state and action into features.
    - num_episodes: Number of episodes to run the agent.
    - max_steps_per_episode: Maximum number of steps to execute per episode.
    """
    total_rewards = []  # To keep track of the rewards for each episode

    for episode in range(num_episodes):
        state = env.reset()[0]
        episode_rewards = 0

        for _ in range(max_steps_per_episode):
            action = agent.choose_action(state, epsilon=0)  # Use epsilon=0 for no exploration
            next_state, reward, _, _, _ = env.step(action)
            state = next_state  # Update to the new state

            episode_rewards += reward

            # env.render()  # Render the current state of the environment

        print(f"Episode {episode + 1}: Total Reward = {episode_rewards}")
        total_rewards.append(episode_rewards)

    env.close()  # Close the environment window if it's open
    print(f"Average Reward over {num_episodes} episodes: {np.mean(total_rewards)}")


In [6]:
agent=LinearQAgent(num_features=8, learning_rate=0.01,l2_reg=0.001)
env = gym.make('cookiedisaster/GridWorld-v0')
train_agent(env, agent, feature_engineering, number_of_episodes=500, episode_length=1000, epsilon_start=1.0, epsilon_decay=0.997, epsilon_min=0.15)
print(agent.weights)



  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Episode 10: Total Reward = -26.23848695251783, Epsilon = 0.9704017769489168
Episode 20: Total Reward = -33.842660954903955, Epsilon = 0.9416796087056153
Episode 30: Total Reward = -38.79817381682741, Epsilon = 0.9138075656044898
Episode 40: Total Reward = -37.11587655474579, Epsilon = 0.8867604854519608
Episode 50: Total Reward = -37.915960976613626, Epsilon = 0.860513950810667
Episode 60: Total Reward = -60.365976470913985, Epsilon = 0.835044266956004
Episode 70: Total Reward = -68.57606120033631, Epsilon = 0.8103284404851119
Episode 80: Total Reward = -51.086459031910486, Epsilon = 0.7863441585589971
Episode 90: Total Reward = -52.15513715647097, Epsilon = 0.7630697687590515
Episode 100: Total Reward = -57.61295101170633, Epsilon = 0.7404842595397826
Episode 110: Total Reward = -64.45690607747026, Epsilon = 0.7185672412601078
Episode 120: Total Reward = -63.55428624762215, Epsilon = 0.6972989277760897
Episode 130: Total Reward = -55.5513931205547, Epsilon = 0.676660118578492
Episode 

In [7]:
env = gym.make('cookiedisaster/GridWorld-v0',render_mode='human')
test_agent(env, agent, feature_engineering, num_episodes=1, max_steps_per_episode=200)


Episode 1: Total Reward = -29.371980511599027
Average Reward over 1 episodes: -29.371980511599027


In [8]:
# a=1.0
# for i in range(1000):
#     a*=0.997
#     print('i',i,'a',a)