In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pyspiel
import random

from collections import defaultdict

In [None]:
game = pyspiel.load_game("matrix_pd")  # load the "matrix_pd" (Prisoner's Dilemma) game; shared module-level game object



In [None]:
class RandomAgent:
    """
    Random agent with a uniform policy, used as a baseline.

    The methods accept (and ignore) the same arguments that
    QLearningAgent's methods take, so both agent types can be driven by
    the same training loop without raising TypeError.
    """

    def __init__(self, player_id, num_actions):
        """
        Attributes:
            player_id (int): agent ID (0, 1)
            num_actions (int): number of available actions
        """
        self.player_id = player_id
        self.num_actions = num_actions

    def select_action(self, state=None):
        """
        Pick an action uniformly at random.

        Parameters:
            state: ignored; accepted only so the call signature matches
                QLearningAgent.select_action(state).

        Returns:
            int: action index in [0, num_actions - 1]
        """
        return random.randint(0, self.num_actions - 1)

    def update(self, *args, **kwargs):
        """
        No-op, present only for symmetry with learning agents.

        Accepts any arguments, so both the short form
        ``update(action, reward)`` and the Q-learning form
        ``update(state, action, reward, next_state)`` are valid calls.
        """
        pass

    def __str__(self):
        return f"RandomAgent(player_id={self.player_id})"


In [None]:
class QLearningAgent:
    """
    Tabular Q-learning agent for the Prisoner's Dilemma.

    Q-values live in ``self.Q``, a mapping from a hashable state to a
    NumPy array holding one value per action. Unseen states start at zero.
    """

    def __init__(self, player_id, num_actions, alpha=0.1, gamma=0.1, epsilon=0.2):
        """
        Attributes:
            player_id (int): agent ID (0, 1)
            num_actions (int): number of available actions
            alpha (float): learning rate
            gamma (float): discount factor
            epsilon (float): exploration rate
        """
        self.player_id = player_id
        self.num_actions = num_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        # Missing states are lazily created as an all-zero row of action values.
        self.Q = defaultdict(lambda: np.zeros(num_actions))

    def select_action(self, state):
        """
        Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state``
        (first index on ties).
        """
        explore = random.random() < self.epsilon
        if explore:
            return random.randint(0, self.num_actions - 1)
        return int(np.argmax(self.Q[state]))

    def update(self, state, action, reward, next_state):
        """
        Apply one Q-learning (Bellman) update to ``Q[state][action]``.

        The target is ``reward + gamma * max(Q[next_state])`` and the stored
        value moves towards it by a fraction ``alpha`` of the TD error.
        """
        next_values = self.Q[next_state]
        best_future = next_values[int(np.argmax(next_values))]
        target = reward + self.gamma * best_future

        current_values = self.Q[state]
        current_values[action] += self.alpha * (target - current_values[action])

    def __str__(self):
        return f"QLearningAgent(player_id={self.player_id})"

In [None]:
def simulate(num_episodes=1000, use_random_baseline=False):
    """
    Runs an iterated Prisoner's Dilemma simulation.

    Each episode is one play of the module-level ``game``. The state the
    agents learn on is the pair of actions played in the previous episode
    (``(0, 0)`` before the first episode).

    Parameters:
        num_episodes (int): simulated episode count
        use_random_baseline (bool): if True, player 1 is a RandomAgent
            instead of a second QLearningAgent

    Returns:
        rewards (list): mean of both players' rewards, one entry per episode
        cooperations (list): fraction of players that chose action 0
            (counted as cooperation), one entry per episode
        agents (list): the agent instances after training
    """
    num_players = 2  # define number of players
    num_actions = game.num_distinct_actions()  # store number of unique actions

    # Player 0 always learns; player 1 is either a learner or the random baseline.
    opponent_cls = RandomAgent if use_random_baseline else QLearningAgent
    agents = [QLearningAgent(0, num_actions), opponent_cls(1, num_actions)]

    rewards = []  # average reward per episode
    cooperations = []  # cooperation rate per episode
    prior_actions = [0, 0]  # joint action of the previous episode (starts at (0, 0))

    for _ in range(num_episodes):  # iterate across episodes
        state = tuple(prior_actions)  # learning state: previous joint action

        # Only Q-learning agents condition on the state; the random
        # baseline is called without arguments.
        actions = [
            agent.select_action(state)
            if isinstance(agent, QLearningAgent)
            else agent.select_action()
            for agent in agents
        ]

        # Keep the OpenSpiel state in its own variable. Reusing `state`
        # here would make the Q-update key on the game-state object
        # rather than on the tuple used for action selection.
        game_state = game.new_initial_state()  # reset game state
        game_state.apply_actions(actions)  # play the selected joint action

        reward = game_state.rewards()  # per-player rewards for this play
        next_state = tuple(actions)  # next learning state: actions just taken

        for index, agent in enumerate(agents):  # iterate across agents
            if isinstance(agent, QLearningAgent):
                agent.update(state, actions[index], reward[index], next_state)
            else:
                agent.update(actions[index], reward[index])

        rewards.append(np.mean(reward))  # store average reward from both agents
        prior_actions = actions  # update prior actions for next episode

        cooperation = actions.count(0)  # number of players that chose action 0
        cooperations.append(cooperation / num_players)  # cooperation rate per episode

    return rewards, cooperations, agents  # return rewards, cooperations, and trained agents


def plot_results(rewards):
    """
    Plot the average reward per episode as a line chart and show it.

    Parameters:
        rewards (list): one average-reward value per episode
    """
    fig, axis = plt.subplots(figsize=(20, 12))  # initialize figure, axis objects
    axis.plot(np.arange(len(rewards)), rewards)
    axis.set_xlabel('Episode')
    axis.set_ylabel('Average Reward per Episode')
    # A stray `le` token after the title string made this line a syntax error.
    axis.set_title("Q-Learning Agent, Iterated Prisoner's Dilemma")
    plt.show()

    return

def plot_cooperation(coop_rates):
    """
    Plot the per-episode cooperation rate as a line chart and show it.

    Parameters:
        coop_rates (list): one cooperation-rate value per episode
    """
    _, ax = plt.subplots(figsize=(20, 12))
    episodes = np.arange(len(coop_rates))  # x-axis: episode index
    ax.plot(episodes, coop_rates)
    ax.set_xlabel("Episode")
    ax.set_ylabel("Cooperation Rate")
    ax.set_title("Cooperation Rate Over Time")
    ax.grid()
    plt.show()


In [None]:
if __name__ == "__main__":
    # Train two Q-learning agents against each other (no random baseline),
    # then plot the reward and cooperation curves.
    rewards, coop_rates, trained_agents = simulate(
        num_episodes=1000,
        use_random_baseline=False,
    )
    plot_results(rewards)
    plot_cooperation(coop_rates)

    print("\nLearned Q-tables:")
    for position, learner in enumerate(trained_agents):
        if not isinstance(learner, QLearningAgent):
            continue  # only Q-learning agents carry a Q-table
        print(f"Agent {position} Q-table:")
        for key, q_values in learner.Q.items():
            print(f"  State {key}: {q_values}")