In [None]:
!pip install optuna gymnasium

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-non

In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class RiverSwimEnv(gym.Env):
    """Chain-shaped RiverSwim MDP with ``nS`` states and two actions.

    Every episode starts in state 0. LEFT (0) deterministically moves one
    state towards 0 and pays 5/1000 when taken in state 0. RIGHT (1) is
    stochastic; the only other reward is 1, paid when RIGHT is taken in the
    last state and the agent stays there. No transition is terminal, so an
    episode only ends by exhausting its ``max_steps`` budget, which is
    reported through the ``truncated`` flag.

    Args:
        nS: Number of states in the chain (must be at least 2).
        max_steps: Number of steps after which ``step`` flags truncation.

    Raises:
        ValueError: If ``nS`` is smaller than 2.
    """

    def __init__(self, nS=6, max_steps=20):
        super().__init__()
        if nS < 2:
            # With a single state the last-state RIGHT entry would point at state -1.
            raise ValueError(f"nS must be at least 2, got {nS}")
        self.nS = nS
        self.nA = 2  # LEFT = 0, RIGHT = 1
        self.state = 0
        self.steps_taken = 0
        self.max_steps = max_steps

        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        # Define transition probabilities and rewards
        self.P = self._init_dynamics()

    def _init_dynamics(self):
        """Build the transition table.

        Returns:
            dict: ``P[state][action]`` is a list of
            ``(probability, next_state, reward, terminated)`` tuples.
        """
        P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}

        # LEFT transitions: deterministic step towards state 0; small reward at state 0
        for s in range(self.nS):
            P[s][0] = [(1.0, max(0, s - 1), 5 / 1000 if s == 0 else 0, False)]

        # RIGHT transitions
        P[0][1] = [(0.3, 0, 0, False), (0.7, 1, 0, False)]
        for s in range(1, self.nS - 1):
            P[s][1] = [
                (0.1, max(0, s - 1), 0, False),
                (0.6, s, 0, False),
                (0.3, min(self.nS - 1, s + 1), 0, False),
            ]
        # Last state: staying put is the only transition that pays reward 1
        P[self.nS - 1][1] = [(0.7, self.nS - 1, 1, False), (0.3, self.nS - 2, 0, False)]

        return P

    def reset(self, seed=None, options=None):
        """Return to state 0 and clear the step counter.

        Returns:
            tuple: ``(state, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.state = 0
        self.steps_taken = 0
        return self.state, {}

    def step(self, action):
        """Sample one transition for ``action`` from the current state.

        Returns:
            tuple: ``(next_state, reward, terminated, truncated, info)``.
            ``terminated`` comes from the transition table (always False in
            the table built here); ``truncated`` becomes True once
            ``max_steps`` steps have been taken since the last reset.
        """
        transitions = self.P[self.state][action]
        i = self.np_random.choice(len(transitions), p=[t[0] for t in transitions])
        _, next_state, reward, terminated = transitions[i]
        self.state = next_state
        self.steps_taken += 1

        # Running out of the step budget is a time limit, not a terminal state
        # of the MDP, so it is signalled as a truncation.
        truncated = self.steps_taken >= self.max_steps

        return next_state, reward, terminated, truncated, {}

    def render(self):
        """Print the current state index."""
        print(f"Current state: {self.state}")

# Make the custom environment available under the id 'RiverSwim-v0',
# registered with a 20-step cap per episode.
gym.register(
    'RiverSwim-v0',
    entry_point='__main__:RiverSwimEnv',
    max_episode_steps=20,
)

# Instantiate the environment through the registry
env = gym.make(id='RiverSwim-v0')

In [None]:
import gymnasium as gym
# Non-slippery FrozenLake instance.
# NOTE(review): the experiment cells below rebind `env` to 'RiverSwim-v0'
# before training, so this instance looks unused in the visible notebook.
env = gym.make(id='FrozenLake-v1', is_slippery=False)

In [None]:
import numpy as np
import gymnasium as gym
import csv
from datetime import datetime

class REINFORCE:
    """Tabular Monte-Carlo policy-gradient (REINFORCE) agent.

    The policy over actions in a state is the "square-max" of that state's
    action preferences: ``pi(a|s) = exp(theta[s, a])**2 / sum_b exp(theta[s, b])**2``.

    Args:
        env: Environment exposing ``action_space.n``, ``observation_space.n``,
            ``reset()`` and ``step(action)``.
        learning_rate: Step size applied to every preference update.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, env, learning_rate=0.001, gamma=0.99):
        self.env = env
        self.lr = learning_rate
        self.gamma = gamma
        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.n
        # One preference per (state, action); all zeros gives a uniform initial policy.
        self.action_preferences = np.zeros((self.n_states, self.n_actions))

    def square_max_policy(self, state):
        """Return the action probabilities for ``state`` as a 1-D array."""
        preferences = self.action_preferences[state]
        # Subtracting the max leaves the ratio unchanged and avoids overflow in exp.
        squared = np.exp(preferences - np.max(preferences)) ** 2
        return squared / np.sum(squared)

    def choose_action(self, state):
        """Sample an action index from the policy in ``state``."""
        policy = self.square_max_policy(state)
        return np.random.choice(self.n_actions, p=policy)

    def update_policy(self, episode):
        """Apply one REINFORCE update per step of a finished episode.

        Args:
            episode: List of ``(state, action, reward)`` tuples in the order
                they were experienced.
        """
        G = 0
        # Walk backwards so the discounted return can be accumulated in one pass.
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            policy = self.square_max_policy(state)

            # REINFORCE needs the gradient of log pi(action|state), not of
            # pi(action|state) itself. For this policy:
            #   d log pi(action) / d theta_a = 2 * (1[a == action] - pi(a))
            grad_log_pi = -2.0 * policy
            grad_log_pi[action] += 2.0

            # Update action preferences
            self.action_preferences[state] += self.lr * G * grad_log_pi

    def train(self, n_episodes):
        """Run ``n_episodes`` full episodes, updating the policy after each one."""
        for _ in range(n_episodes):
            state, _ = self.env.reset()
            episode = []
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                episode.append((state, action, reward))
                state = next_state
            self.update_policy(episode)

def evaluate_agent(env, agent, n_eval_episodes):
    """Average the undiscounted episode return of ``agent`` on ``env``.

    Actions are sampled with ``agent.choose_action``; each episode runs until
    the environment reports ``terminated`` or ``truncated``.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``.
        agent: Object providing ``choose_action(state)``.
        n_eval_episodes: Number of episodes to roll out.

    Returns:
        Sum of all collected rewards divided by ``n_eval_episodes``.
    """
    reward_sum = 0
    for _episode in range(n_eval_episodes):
        observation, _info = env.reset()
        while True:
            observation, reward, terminated, truncated, _info = env.step(
                agent.choose_action(observation)
            )
            reward_sum += reward
            if terminated or truncated:
                break
    return reward_sum / n_eval_episodes

def run_experiment(env, n_episodes, eval_freq, n_eval_episodes, steps_per_episode=20):
    """Train a fresh REINFORCE agent and evaluate it every ``eval_freq`` episodes.

    Args:
        env: Environment used for both training and evaluation.
        n_episodes: Training-episode budget. Only ``n_episodes // eval_freq``
            full blocks of ``eval_freq`` episodes are run; any remainder is
            not trained.
        eval_freq: Number of training episodes between evaluations.
        n_eval_episodes: Episodes averaged in each evaluation.
        steps_per_episode: Episode length used to convert the episode count
            into the reported step count. Defaults to 20, the value that was
            previously hard-coded.

    Returns:
        list: ``(step_count, avg_reward)`` tuples, one per evaluation.
    """
    agent = REINFORCE(env)
    results = []

    iterations = n_episodes // eval_freq
    print(iterations)
    for iteration in range(iterations):
        agent.train(eval_freq)
        avg_reward = evaluate_agent(env, agent, n_eval_episodes)
        # The step count is derived from the episode count, not measured.
        step_count = (iteration + 1) * eval_freq * steps_per_episode
        print(step_count)
        results.append((step_count, avg_reward))
        print(f"Iteration {iteration + 1}, Average Reward: {avg_reward:.2f}")

    return results

# Experiment parameters
n_episodes = 10000
eval_freq = 51
n_eval_episodes = 100
num_runs = 1

# One list of (step, average reward) pairs is collected per run
all_results = []

for run in range(num_runs):
    print(f"Starting run {run + 1}/{num_runs}")
    env = gym.make('RiverSwim-v0')
    run_results = run_experiment(env, n_episodes, eval_freq, n_eval_episodes)
    all_results.append(run_results)
    env.close()

# Process results: the step axis is read from the first run and is assumed
# to be shared by every run
step_sizes = [step for step, _reward in all_results[0]]
print(step_sizes)

# Mean reward across runs at each evaluation point
averaged_rewards = [
    np.mean([run[i][1] for run in all_results])
    for i in range(len(step_sizes))
]

# Final list of tuples (step, averaged_reward)
final_results = list(zip(step_sizes, averaged_rewards))

# Save results to a timestamped CSV file
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = f"reinforce_squaremax_riversim_results_{timestamp}.csv"

with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header row first, then one row per evaluation point
    csvwriter.writerows([('Step', 'Average Reward'), *final_results])

print(f"Results saved to {filename}")

Starting run 1/1
196
1020
Iteration 1, Average Reward: 0.03
2040
Iteration 2, Average Reward: 0.07
3060
Iteration 3, Average Reward: 0.03
4080
Iteration 4, Average Reward: 0.03
5100
Iteration 5, Average Reward: 0.04
6120
Iteration 6, Average Reward: 0.07
7140
Iteration 7, Average Reward: 0.03
8160
Iteration 8, Average Reward: 0.04
9180
Iteration 9, Average Reward: 0.05
10200
Iteration 10, Average Reward: 0.06
11220
Iteration 11, Average Reward: 0.03
12240
Iteration 12, Average Reward: 0.04
13260
Iteration 13, Average Reward: 0.03
14280
Iteration 14, Average Reward: 0.03
15300
Iteration 15, Average Reward: 0.03
16320
Iteration 16, Average Reward: 0.03
17340
Iteration 17, Average Reward: 0.03
18360
Iteration 18, Average Reward: 0.03
19380
Iteration 19, Average Reward: 0.04
20400
Iteration 20, Average Reward: 0.03
21420
Iteration 21, Average Reward: 0.03
22440
Iteration 22, Average Reward: 0.03
23460
Iteration 23, Average Reward: 0.03
24480
Iteration 24, Average Reward: 0.05
25500
Iterati

In [None]:
import numpy as np
import gymnasium as gym
import csv
from datetime import datetime

class REINFORCE:
    """Tabular REINFORCE agent trained against a budget of environment steps.

    Action probabilities in a state are the "square-max" of the state's
    preferences: ``pi(a|s) = exp(theta[s, a])**2 / sum_b exp(theta[s, b])**2``.

    Args:
        env: Environment exposing ``action_space.n``, ``observation_space.n``,
            ``reset()`` and ``step(action)``.
        learning_rate: Step size applied to every preference update.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, env, learning_rate=0.001, gamma=0.99):
        self.env = env
        self.lr = learning_rate
        self.gamma = gamma
        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.n
        # Zero preferences everywhere, i.e. a uniform policy at the start.
        self.action_preferences = np.zeros((self.n_states, self.n_actions))

    def square_max_policy(self, state):
        """Return the action probabilities for ``state`` as a 1-D array."""
        preferences = self.action_preferences[state]
        # Shifting by the max keeps exp() bounded without changing the ratio.
        squared = np.exp(preferences - np.max(preferences)) ** 2
        return squared / np.sum(squared)

    def choose_action(self, state):
        """Sample an action index from the policy in ``state``."""
        policy = self.square_max_policy(state)
        return np.random.choice(self.n_actions, p=policy)

    def update_policy(self, episode):
        """Apply one REINFORCE update per recorded step.

        Args:
            episode: List of ``(state, action, reward)`` tuples in the order
                they were experienced.
        """
        G = 0
        # Iterate from the last step so G is the discounted return from step t on.
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            policy = self.square_max_policy(state)

            # Score function of the square-max policy (gradient of
            # log pi(action|state), as REINFORCE requires):
            #   d log pi(action) / d theta_a = 2 * (1[a == action] - pi(a))
            grad_log_pi = -2.0 * policy
            grad_log_pi[action] += 2.0

            self.action_preferences[state] += self.lr * G * grad_log_pi

    def train(self, n_steps):
        """Interact with the environment for ``n_steps`` steps, learning per episode.

        If the budget runs out in the middle of an episode, the steps recorded
        so far are still used for an update.

        Returns:
            int: Number of environment steps taken.
        """
        steps_taken = 0
        while steps_taken < n_steps:
            state, _ = self.env.reset()
            episode = []
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                episode.append((state, action, reward))
                state = next_state
                steps_taken += 1
                if steps_taken >= n_steps:
                    break
            self.update_policy(episode)
        return steps_taken

def evaluate_agent(env, agent, n_eval_episodes):
    """Mean undiscounted return of ``agent`` over ``n_eval_episodes`` episodes.

    Each episode starts from ``env.reset()`` and runs, with actions drawn from
    ``agent.choose_action``, until the environment reports ``terminated`` or
    ``truncated``.

    Returns:
        Sum of the per-episode returns divided by ``n_eval_episodes``.
    """

    def _episode_return():
        # Roll out a single episode and add up its rewards.
        observation, _info = env.reset()
        collected = 0
        finished = False
        while not finished:
            chosen = agent.choose_action(observation)
            observation, reward, terminated, truncated, _info = env.step(chosen)
            finished = terminated or truncated
            collected += reward
        return collected

    reward_sum = 0
    for _ in range(n_eval_episodes):
        reward_sum += _episode_return()
    return reward_sum / n_eval_episodes

def run_experiment(env, total_steps, eval_freq, n_eval_episodes):
    """Train a fresh REINFORCE agent, evaluating after every ``eval_freq`` steps.

    Args:
        env: Environment used for both training and evaluation.
        total_steps: Training continues until at least this many environment
            steps have been taken.
        eval_freq: Number of training steps between evaluations; must be positive.
        n_eval_episodes: Episodes averaged in each evaluation.

    Returns:
        list: ``(steps_taken, avg_reward)`` tuples, one per evaluation.

    Raises:
        ValueError: If ``eval_freq`` is not positive. A non-positive value
            makes every training call take zero steps, so the loop below
            would never finish.
    """
    if eval_freq <= 0:
        raise ValueError(f"eval_freq must be positive, got {eval_freq}")

    agent = REINFORCE(env)
    results = []
    steps_taken = 0

    while steps_taken < total_steps:
        steps_in_iteration = agent.train(eval_freq)
        steps_taken += steps_in_iteration
        avg_reward = evaluate_agent(env, agent, n_eval_episodes)
        results.append((steps_taken, avg_reward))
        print(f"Steps: {steps_taken}, Average Reward: {avg_reward:.2f}")

    return results

# Experiment parameters
total_steps = 200000
eval_freq = 1000
n_eval_episodes = 100
num_runs = 1

# One list of (steps, average reward) pairs is collected per run
all_results = []

for run in range(num_runs):
    print(f"Starting run {run + 1}/{num_runs}")
    env = gym.make('RiverSwim-v0')
    run_results = run_experiment(env, total_steps, eval_freq, n_eval_episodes)
    all_results.append(run_results)
    env.close()

# Process results: the step axis is read from the first run
step_sizes = [steps for steps, _reward in all_results[0]]

# Mean reward across runs at each evaluation point
averaged_rewards = [
    np.mean([run[i][1] for run in all_results])
    for i in range(len(step_sizes))
]

# Final list of tuples (steps, averaged_reward)
final_results = list(zip(step_sizes, averaged_rewards))

# Save results to a timestamped CSV file
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = f"reinforce_squaremax_riversim_results_{timestamp}.csv"

with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header row first, then one row per evaluation point
    csvwriter.writerows([('Steps', 'Average Reward'), *final_results])

print(f"Results saved to {filename}")