# In [None]:
import numpy as np
import pandas as pd

def compute_rewards(agent_action, opponent_action):
    """Return the payoffs of both players for one round of the 2x2 game.

    The payoff table is indexed as ``[own action, other player's action]``,
    so the opponent's payoff is the same lookup with the indices swapped.

    Args:
        agent_action: Action index (0 or 1) chosen by the agent.
        opponent_action: Action index (0 or 1) chosen by the opponent.

    Returns:
        A tuple ``(agent_reward, opponent_reward)``.
    """
    payoffs = np.array([[3, 0], [5, 1]])
    own_move, other_move = agent_action, opponent_action
    return payoffs[own_move, other_move], payoffs[other_move, own_move]

def run_reinforce(num_steps, num_episodes, learning_rate, agent_p1, agent_p2, opponent_p1, opponent_p2, initial_agent_state, initial_opponent_state):
    """Train two tabular policies over repeated episodes and report cooperation.

    Each player owns a 2x2 policy table. The row is the player's current
    state (after the first step, the other player's previous action) and the
    two columns are the probabilities of choosing action 0 and action 1.
    Action 0 is counted as cooperation.

    After every episode, each (state, action) pair that was visited has its
    probability increased by ``learning_rate * <agent cooperation count>``
    and the touched row is re-normalised. Both players are reinforced with
    the agent's cooperation count; per-step payoffs are not part of this
    update, so they are not computed here.

    Args:
        num_steps: Number of steps per episode (must be >= 1).
        num_episodes: Number of episodes to run (must be >= 1).
        learning_rate: Scale of the per-step policy increment.
        agent_p1: Agent's initial probability of action 0 in state 0.
        agent_p2: Agent's initial probability of action 0 in state 1.
        opponent_p1: Opponent's initial probability of action 0 in state 0.
        opponent_p2: Opponent's initial probability of action 0 in state 1.
        initial_agent_state: Agent's state (0 or 1) at the start of each episode.
        initial_opponent_state: Opponent's state (0 or 1) at the start of each
            episode.

    Returns:
        The fraction of steps in the final episode on which the agent chose
        action 0.

    Raises:
        ValueError: If ``num_steps`` or ``num_episodes`` is less than 1.
    """
    if num_steps < 1 or num_episodes < 1:
        raise ValueError("num_steps and num_episodes must both be at least 1")

    # Force a float dtype so the in-place updates below also work when the
    # initial probabilities are given as integers (0 or 1).
    agent_policy = np.array(
        [[agent_p1, 1 - agent_p1], [agent_p2, 1 - agent_p2]], dtype=float
    )
    opponent_policy = np.array(
        [[opponent_p1, 1 - opponent_p1], [opponent_p2, 1 - opponent_p2]], dtype=float
    )

    agent_episode_cooperation = 0
    for _episode in range(num_episodes):
        agent_state = initial_agent_state
        opponent_state = initial_opponent_state

        agent_episode_cooperation = 0
        # (state, action) pairs in visit order; the state must be recorded at
        # the time the action is sampled so the update credits the right row.
        agent_trajectory = []
        opponent_trajectory = []

        for _step in range(num_steps):
            agent_action = np.random.choice(2, p=agent_policy[agent_state])
            opponent_action = np.random.choice(2, p=opponent_policy[opponent_state])

            agent_trajectory.append((agent_state, agent_action))
            opponent_trajectory.append((opponent_state, opponent_action))

            if agent_action == 0:
                agent_episode_cooperation += 1

            # Each player's next state is the other player's latest action.
            agent_state = opponent_action
            opponent_state = agent_action

        # The same increment is applied to every visited (state, action) pair
        # of both players.
        increment = learning_rate * agent_episode_cooperation
        for (a_state, a_action), (o_state, o_action) in zip(
            agent_trajectory, opponent_trajectory
        ):
            agent_policy[a_state, a_action] += increment
            opponent_policy[o_state, o_action] += increment

            # Re-normalise the touched rows so they stay valid distributions.
            agent_policy[a_state] /= np.sum(agent_policy[a_state])
            opponent_policy[o_state] /= np.sum(opponent_policy[o_state])

    # Cooperation rate of the last episode that was played.
    return agent_episode_cooperation / num_steps

# Candidate values for every run_reinforce argument; one value per key is
# drawn at random for each trial below.
param_grid = dict(
    num_steps=[10, 50, 100, 200, 500, 1000],
    num_episodes=[100, 200, 500, 1000],
    learning_rate=[0.0001, 0.001, 0.01, 0.1],
    agent_p1=[0.1, 0.25, 0.5, 0.75, 0.9],
    agent_p2=[0.1, 0.25, 0.5, 0.75, 0.9],
    opponent_p1=[0.1, 0.25, 0.5, 0.75, 0.9],
    opponent_p2=[0.1, 0.25, 0.5, 0.75, 0.9],
    initial_agent_state=[0, 1],  # Starting state of the agent
    initial_opponent_state=[0, 1],  # Starting state of the opponent
)

# Number of random configurations to evaluate (adjust as needed).
num_samples = 90
results = []

for trial in range(num_samples):
    # Progress indicator: index of the trial about to run.
    print(trial)
    # Draw one value per hyperparameter, visiting the keys in grid order.
    params = {name: np.random.choice(param_grid[name]) for name in param_grid}
    cooperation_rate = run_reinforce(**params)
    # Keep the sampled configuration together with its final cooperation rate.
    params.update({"Cooperation Rate": cooperation_rate})
    results.append(params)

# One row per trial: the sampled hyperparameters plus the cooperation rate.
df = pd.DataFrame(results)

# Show the collected results.
print(df)