In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)

<torch._C.Generator at 0x10dcaa830>

In [2]:
# Define the environment class
class Environment:
    def __init__(self,state_size,action_size,N_steps):
        self.state_size = state_size  # Size of the state space
        self.action_size = action_size  # Number of possible actions
        self.N_steps = N_steps # Number of steps to take per episode
        self.reset()

    def reset(self):
        '''
        This function resets the environment to the initial state
        '''
        self.state = np.random.choice([0, 1], size=self.state_size)
        self.steps = 0
        return self.state

    def step(self, action):
        '''
        This function returns the reward,
        the new state of the environment,
        and number of steps taken for the current
        episode based on the action taken.
        Input: 
            action vector
        Output: 
            new state
            reward
            done
        '''
        reward = self.get_reward() 
        self.state = np.repeat(action,self.state_size) # New random state
        self.steps += 1
        done = self.steps >= self.N_steps  # End after N steps
        return self.state, reward, done
    
    def move(self, action):
        '''
        This function returns the new state of the environment
        based on the action taken.
        Input: 
            action vector
        Output: 
            new state
        '''
        self.state = np.repeat(action,self.state_size)
        return 
    
    def get_reward(self):
        '''
        This function computed the reward
        based on the action taken.
        Output: 
            reward
        '''
        reward = np.sum(self.state)
        
        return reward   

In [3]:
# Define the Q-value policy network class
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 32)
        self.fc2 = nn.Linear(32, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = self.fc2(x)
        return x

In [15]:
## Main Function

# Environment parameters
state_size = 4  # Size of the state space
action_size = 2  # Number of possible actions
N_steps = 100 # Number of steps to take per episode
        
# Hyperparameters
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995  # Decay rate of exploration
gamma = 0.95  # Discount factor
learning_rate = 0.001

# Initialize environment and Q-network
env = Environment(state_size,action_size)
model = QNetwork(env.state_size, env.action_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
total_episodes = 1000

for episode in range(total_episodes):
    state = env.reset()
    total_reward = 0
    
    while True:
        # Choose action using epsilon-greedy policy
        if np.random.rand() <= epsilon:
            action = np.random.randint(env.action_size)
        else:
            with torch.no_grad():
                q_values = model(torch.tensor(state, dtype=torch.float32))
                action = torch.argmax(q_values).item()

        # Take action and observe next state, reward, done
        next_state, reward, done = env.step(action)

        # Update Q-value using Bellman equation
        q_values_next = model(torch.tensor(next_state, dtype=torch.float32))
        target = reward + gamma * torch.max(q_values_next)

        q_values = model(torch.tensor(state, dtype=torch.float32))
        loss = nn.functional.mse_loss(q_values[action], target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_reward += reward
        state = next_state

        if done:
            break

    # Decay epsilon
    if epsilon > 0.01:
        epsilon *= epsilon_decay

    # Print episode results
    if (episode + 1) % 50 == 0:
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon}, state: {state}, action: {action}")

# Testing the trained agent
state = env.reset()
total_reward = 0

while True:
    with torch.no_grad():
        q_values = model(torch.tensor(state, dtype=torch.float32))
        action = torch.argmax(q_values).item()

    next_state, reward, done = env.step(action)
    total_reward += reward
    state = next_state

    if done:
        break

print(f"Test Total Reward: {total_reward}, state: {state}")


Episode: 50, Total Reward: 237, Epsilon: 0.778312557068642, state: [1 1 1 1], action: 1
Episode: 100, Total Reward: 287, Epsilon: 0.6057704364907278, state: [1 1 1 1], action: 1
Episode: 150, Total Reward: 289, Epsilon: 0.47147873742168567, state: [0 0 0 0], action: 0
Episode: 200, Total Reward: 313, Epsilon: 0.3669578217261671, state: [1 1 1 1], action: 1
Episode: 250, Total Reward: 326, Epsilon: 0.285607880564032, state: [1 1 1 1], action: 1
Episode: 300, Total Reward: 334, Epsilon: 0.22229219984074702, state: [1 1 1 1], action: 1
Episode: 350, Total Reward: 354, Epsilon: 0.1730128104744653, state: [1 1 1 1], action: 1
Episode: 400, Total Reward: 384, Epsilon: 0.1346580429260134, state: [1 1 1 1], action: 1
Episode: 450, Total Reward: 360, Epsilon: 0.10480604571960442, state: [1 1 1 1], action: 1
Episode: 500, Total Reward: 386, Epsilon: 0.08157186144027828, state: [1 1 1 1], action: 1
Episode: 550, Total Reward: 379, Epsilon: 0.06348840406243188, state: [1 1 1 1], action: 1
Episode: