In [35]:
# import statements
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import importlib

import player
import rl_player
importlib.reload(player)
importlib.reload(rl_player)
from player import *
from rl_player import *

In [36]:
def train_agent(agent, env, num_episodes, batch_size, epsilon_decay,
                epsilon_start=1.0, epsilon_min=0.0, log_every=1):
    """Run `num_episodes` training episodes of `agent` against `env`.

    Each episode resets the environment, then repeatedly asks the agent for an
    action, steps the environment, stores the transition in the agent's replay
    buffer and, once the buffer holds at least `batch_size` transitions,
    triggers one replay/update call per step.

    Args:
        agent: Object exposing `name`, `replay_buffer` (supports `len`),
            `get_action(state, name, epsilon)`,
            `add_experience(state, action, reward, next_state, done)` and
            `replay_experience(batch_size, name)`.
        env: Object exposing `reset() -> state` and
            `step(action) -> (next_state, reward, done)`.
        num_episodes: Number of episodes to run.
        batch_size: Minimum replay-buffer length before updates start; also
            forwarded to `agent.replay_experience`.
        epsilon_decay: Multiplicative factor applied to epsilon after every
            episode.
        epsilon_start: Epsilon used for the first episode (default 1.0, the
            value that was previously hard-coded).
        epsilon_min: Lower bound for epsilon after decay. The default 0.0
            leaves the schedule unchanged for any non-negative
            `epsilon_decay`; set e.g. 0.05 to keep some exploration late in
            training.
        log_every: Print a progress line every `log_every` episodes. The
            default 1 prints every episode; 0 or None disables printing.

    Returns:
        List with the summed reward of each episode, in episode order.
    """
    total_rewards = []
    epsilon = epsilon_start

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            # Epsilon is forwarded to the agent, which decides how to use it
            # when picking the action.
            action = agent.get_action(state, agent.name, epsilon)

            # Advance the environment by one step with the chosen action.
            next_state, reward, done = env.step(action)

            # Record the transition before any update so it can be replayed.
            agent.add_experience(state, action, reward, next_state, done)

            # Only start updating once a full batch is available.
            if len(agent.replay_buffer) >= batch_size:
                agent.replay_experience(batch_size, agent.name)

            state = next_state
            total_reward += reward

        # Decay once per episode (not per step), never dropping below the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        total_rewards.append(total_reward)

        if log_every and (episode + 1) % log_every == 0:
            print(f"Episode {episode + 1}/{num_episodes} - Total Reward: {total_reward}")

    return total_rewards

In [40]:
# Reproducibility: seed every RNG this notebook imports before any stochastic
# object is built, so a Restart & Run All can repeat the reward log below.
# NOTE(review): assumes player / rl_player draw from these global generators —
# confirm against those modules.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Agent hyperparameters, passed positionally to QLearningAgent.
# NOTE(review): 54 / 13 presumably match the environment's state encoding and
# action set — confirm in rl_player.
state_dim = 54
action_dim = 13
learning_rate = 0.0001
gamma = 0.99

agent = QLearningAgent(state_dim, action_dim, learning_rate, gamma, 'Player 1')

# 'Player 1' is the learning seat: its decisions go through rl_decision, while
# the other three seats use the GREEDY_FUNCS / TRUTH_FUNCS strategy tables.
RL_FUNCS = {'decision_fn': rl_decision, 'block_fn': income_block, 'dispose_fn': random_dispose, 'keep_fn': random_keep}
players = [Player('Player 1', RL_FUNCS), Player('Player 2', GREEDY_FUNCS), Player('Player 3', GREEDY_FUNCS), Player('Player 4', TRUTH_FUNCS)]

# The environment is given the same player name that the agent was built with.
env = Environment('Player 1', players)

# Training schedule.
num_episodes = 2000
batch_size = 32
epsilon_decay = 0.99

total_rewards = train_agent(agent, env, num_episodes, batch_size, epsilon_decay)

Episode 1/2000 - Total Reward: -207
Episode 2/2000 - Total Reward: -185
Episode 3/2000 - Total Reward: -207
Episode 4/2000 - Total Reward: -227
Episode 5/2000 - Total Reward: -199
Episode 6/2000 - Total Reward: -191
Episode 7/2000 - Total Reward: -185
Episode 8/2000 - Total Reward: -229
Episode 9/2000 - Total Reward: -207
Episode 10/2000 - Total Reward: -229
Episode 11/2000 - Total Reward: -187
Episode 12/2000 - Total Reward: -177
Episode 13/2000 - Total Reward: -230
Episode 14/2000 - Total Reward: -217
Episode 15/2000 - Total Reward: -186
Episode 16/2000 - Total Reward: -230
Episode 17/2000 - Total Reward: -202
Episode 18/2000 - Total Reward: -199
Episode 19/2000 - Total Reward: -198
Episode 20/2000 - Total Reward: -227
Episode 21/2000 - Total Reward: -197
Episode 22/2000 - Total Reward: -208
Episode 23/2000 - Total Reward: -188
Episode 24/2000 - Total Reward: 262
Episode 25/2000 - Total Reward: -182
Episode 26/2000 - Total Reward: 261
Episode 27/2000 - Total Reward: -182
Episode 28/2

In [46]:
# Fraction of training episodes whose total reward was strictly positive.
print(sum(1 for episode_reward in total_rewards if episode_reward > 0) / len(total_rewards))

0.3095
