In [1]:
import numpy as np
from evoenv.envs import MatrixGame
from evoenv.matrixtools import FloatTupleDtype
import random
import tqdm.notebook as tq
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

### DQN Class

In [2]:
class DQN(nn.Module):
    """Small fully connected Q-network.

    Two hidden layers of 128 units with ReLU activations, followed by a linear
    output layer that emits one value per action.

    Args:
        input_size: Number of input features.
        output_size: Number of output values.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        # Attribute names (fc1/fc2/fc3) define the state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Run ``x`` through both ReLU hidden layers and return the raw output layer values."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        # No activation on the head: outputs are unbounded value estimates.
        return self.fc3(activations)

class DQNAgent:
    """Epsilon-greedy Q-learning agent backed by a ``DQN`` network.

    Learning is fully online: each ``train`` call performs one Adam step on a
    single transition (there is no replay buffer or target network) and then
    decays the exploration rate.

    Args:
        input_size: Length of the state vector fed to the network.
        output_size: Number of discrete actions.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of choosing a uniformly random action.
        epsilon_decay: Multiplicative factor applied to ``epsilon`` after each
            training step.
        epsilon_min: Floor for the decayed ``epsilon``. An ``epsilon`` that is
            already at or below this value is left untouched.
    """

    def __init__(self, input_size, output_size, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.input_size = input_size
        self.output_size = output_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model = DQN(input_size, output_size)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        Args:
            state: Sequence (or array/tensor) of numbers describing the state.

        Returns:
            int: A uniformly random action index with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.output_size)

        with torch.no_grad():
            q_values = self.model(torch.as_tensor(state, dtype=torch.float32))
            return q_values.argmax().item()

    def train(self, state, action, reward, next_state, done):
        """Perform one Q-learning update on a single transition.

        The regression target is
        ``reward + (1 - done) * gamma * max_a Q(next_state, a)`` and the loss is
        the squared error between that target and ``Q(state, action)``.
        Afterwards ``epsilon`` is decayed, but never below ``epsilon_min``.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action that was taken.
            reward: Scalar reward received.
            next_state: State observed after the action.
            done: Truthy if the transition ended the episode (disables bootstrapping).
        """
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        next_state_tensor = torch.as_tensor(next_state, dtype=torch.float32)

        q_value = self.model(state_tensor)[action]

        # The bootstrap value is treated as a constant of the loss, so there is
        # no need to build an autograd graph for this forward pass.
        with torch.no_grad():
            next_q_value = self.model(next_state_tensor).max().item()
        target = reward + (1 - done) * self.gamma * next_q_value

        loss = F.mse_loss(q_value, torch.tensor(target, dtype=torch.float32))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clamp after decaying: checking only before the multiplication lets the
        # last decay step land below epsilon_min.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

### Game-running utilities

In [3]:
def play_game_no_memory(agent1, agent2, env, num_episodes=100):
    """Have two agents play a repeated game while learning online, with no memory.

    Both agents always observe the constant state ``0``. On every step each
    agent selects an action, the joint action is passed to ``env.step``, and
    each agent trains on its own reward using the same constant state as the
    next state and ``done=False``.

    Args:
        agent1: First player; receives ``rewards[0]``.
        agent2: Second player; receives ``rewards[1]``.
        env: Environment exposing ``reset()`` and ``step(joint_action)``.
        num_episodes: Number of environment steps to play.

    Returns:
        tuple: ``(agent1, agent2)``, the same objects that were passed in.
    """
    state = 0  # constant observation: the agents carry no memory of past play
    _, _ = env.reset()  # reset is expected to yield two values; both are discarded
    players = (agent1, agent2)

    for _step in range(num_episodes):
        joint_action = tuple(player.select_action([state]) for player in players)

        # Only the per-player rewards are consumed; the other step outputs are ignored.
        _next_state, _actions, rewards, _done = env.step(joint_action)

        for idx, player in enumerate(players):
            player.train([state], joint_action[idx], rewards[idx], [state], False)

    return agent1, agent2

def run_experiment(n_agents, env, num_episodes=100, num_games=100, shuffle=True):
    """Train a population of DQN agents by pairing them up in a matrix game.

    In every round the population is split into consecutive pairs
    (agents 0 and 1, agents 2 and 3, and so on) and each pair plays
    ``play_game_no_memory`` for ``num_episodes`` steps.

    Args:
        n_agents: Number of agents in the population. If odd, the last agent
            in the current ordering has no partner and sits out that round;
            with ``shuffle=False`` that is always the same agent, which then
            never trains.
        env: Environment to run; passed unchanged to every game.
        num_episodes: Number of episodes (environment steps) to play in each
            matrix game.
        num_games: Number of rounds of pairings to perform.
        shuffle: If True, reshuffle the population before every round so that
            opponents differ from game to game; if False, each agent keeps the
            same opponent in every game.

    Returns:
        list: The trained agents, in their final (possibly shuffled) order.
    """
    input_size = 1
    output_size = 2

    # initialize agents
    agents = [DQNAgent(input_size=input_size, output_size=output_size) for _ in range(n_agents)]

    for _ in tq.tqdm(range(num_games)):
        if shuffle:
            random.shuffle(agents)

        # Zipping the even- and odd-indexed slices pairs neighbours and simply
        # drops a trailing unpaired agent, rather than indexing past the end of
        # the list when n_agents is odd.
        pairs = list(zip(agents[0::2], agents[1::2]))

        # train each agent with their pair
        for agent1, agent2 in pairs:
            play_game_no_memory(agent1, agent2, env, num_episodes)

    return agents

def get_q_vals(agents):
    """Evaluate every agent's network on the single state ``[0]``.

    Args:
        agents: Iterable of objects whose ``model`` attribute can be called on
            a float tensor.

    Returns:
        np.ndarray: One row of model outputs per agent, in input order.
    """
    probe_state = torch.tensor([0]).float()
    # detach() drops the autograd graph so the output can be converted to NumPy.
    return np.array([agent.model(probe_state).detach().numpy() for agent in agents])

### No Memory Tests

#### Envs

In [4]:
def _make_matrix_game(payoffs):
    """Build the payoff array for a two-player game and a freshly reset ``MatrixGame``.

    Args:
        payoffs: Nested list of 2-tuples, one tuple per joint action.
            NOTE(review): presumably ``payoffs[a1][a2]`` is
            ``(reward to player 1, reward to player 2)`` -- confirm against
            ``MatrixGame``.

    Returns:
        tuple: ``(rewards, env)`` -- the payoff array and the environment built from it.
    """
    rewards = np.array(payoffs, dtype=FloatTupleDtype(2))
    game = MatrixGame(rewards)
    _, _ = game.reset()
    return rewards, game


# CHICKEN GAME
chickens_game_rewards, chickens_game_env = _make_matrix_game(
    [
        [(1, 1), (-1, 2)],
        [(2, -1), (-2, -2)],
    ]
)

# BATTLE OF SEXES
battle_of_sexes_rewards, battle_of_sexes_env = _make_matrix_game(
    [
        [(-1, -1), (1, 2)],
        [(2, 1), (-2, -2)],
    ]
)

# LET GEORGE DO IT
LGDI_rewards, LGDI_env = _make_matrix_game(
    [
        [(-1, -1), (2, 1)],
        [(1, 2), (-2, -2)],
    ]
)

# Prisoner's Dilemma
PD_rewards, PD_env = _make_matrix_game(
    [
        [(-1, -1), (2, -2)],
        [(-2, 2), (1, 1)],
    ]
)

### Chicken Game

#### No shuffle

In [None]:
# Fixed pairings: a single round (num_games=1) in which every pair plays 10000 steps.
agents = run_experiment(
    n_agents=50,
    env=chickens_game_env,
    num_episodes=10000,
    num_games=1,
    shuffle=False,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Chicken Game No Shuffle Q-Values")
plt.show()

  0%|          | 0/1 [00:00<?, ?it/s]

#### Shuffle

In [None]:
# Shuffled pairings: 100 rounds of 100 steps, with the population reshuffled before each round.
agents = run_experiment(
    n_agents=50,
    env=chickens_game_env,
    num_episodes=100,
    num_games=100,
    shuffle=True,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Chicken Game Shuffle Q-Values")
plt.show()

### Battle of the Sexes

#### No shuffle

In [None]:
# Fixed pairings: a single round (num_games=1) in which every pair plays 10000 steps.
agents = run_experiment(
    n_agents=50,
    env=battle_of_sexes_env,
    num_episodes=10000,
    num_games=1,
    shuffle=False,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Battle of the Sexes No Shuffle Q-Values")
plt.show()

#### Shuffle

In [None]:
# Shuffled pairings: 100 rounds of 100 steps, with the population reshuffled before each round.
agents = run_experiment(
    n_agents=50,
    env=battle_of_sexes_env,
    num_episodes=100,
    num_games=100,
    shuffle=True,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Battle of the Sexes Shuffle Q-Values")
plt.show()

### Let George do it

#### No shuffle

In [None]:
# Fixed pairings: a single round (num_games=1) in which every pair plays 10000 steps.
agents = run_experiment(
    n_agents=50,
    env=LGDI_env,
    num_episodes=10000,
    num_games=1,
    shuffle=False,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Let George do it No Shuffle Q-Values")
plt.show()

#### Shuffle

In [None]:
# Shuffled pairings: 100 rounds of 100 steps, with the population reshuffled before each round.
agents = run_experiment(
    n_agents=50,
    env=LGDI_env,
    num_episodes=100,
    num_games=100,
    shuffle=True,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Let George do it Shuffle Q-Values")
plt.show()

### Prisoner's Dilemma

#### No shuffle

In [None]:
# Fixed pairings: a single round (num_games=1) in which every pair plays 10000 steps.
agents = run_experiment(
    n_agents=50,
    env=PD_env,
    num_episodes=10000,
    num_games=1,
    shuffle=False,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Prisoner's Dilemma No Shuffle Q-Values")
plt.show()

#### Shuffle

In [None]:
# Shuffled pairings: 100 rounds of 100 steps, with the population reshuffled before each round.
agents = run_experiment(
    n_agents=50,
    env=PD_env,
    num_episodes=100,
    num_games=100,
    shuffle=True,
)
all_q_vals = get_q_vals(agents)

# One point per agent: column 0 of the Q-values on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
ax.scatter(all_q_vals[:, 0], all_q_vals[:, 1])
ax.set_xlabel("Action 0 Q Value")
ax.set_ylabel("Action 1 Q Value")
ax.set_title("Prisoner's Dilemma Shuffle Q-Values")
plt.show()