## **Multi-armed bandit problem**

### Reinforcement learning

**MDP** - Markov Decision Processes

$\;s_t$ - current state\
$\;o_t$ - observation made in $s_t$\
$\;a_t$ - decision made from state $s_t$

$\quad a_t \in \{-1, 0, 1\}$, where\
$\quad \quad a_t = -1$ -> move left\
$\quad \quad a_t = 0$ -> stay\
$\quad \quad a_t = 1$ -> move right


In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import random

In [32]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [16]:
class DQN(nn.Module):
    """Three-layer fully connected network mapping observations to action probabilities.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear -> softmax``
    with two hidden layers of 128 units each.

    NOTE: despite the class name, the output goes through a softmax, so the
    values are a probability distribution over actions rather than unbounded
    Q-value estimates.
    """

    def __init__(self, n_observations, n_actions) -> None:
        """Build the layers.

        Args:
            n_observations: Size of the last dimension of the input tensor.
            n_actions: Number of outputs (one per action).
        """
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Return the softmax-normalised output for ``x``.

        Args:
            x: Tensor whose last dimension has size ``n_observations``; any
                leading (batch) dimensions are preserved.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``n_actions`` that sums to 1.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # Normalise over the action axis (the last one). Using dim=0 would
        # normalise across the batch axis for batched input; for a 1-D input
        # dim=-1 and dim=0 are the same axis, so unbatched calls are unchanged.
        return F.softmax(self.layer3(x), dim=-1)

In [35]:
# Number of bandit arms; the network emits one output per arm.
n_bandits = 6
model = DQN(1, n_bandits).to(device)

# Create the probe input on the same device as the model: a CPU tensor fed to
# a CUDA-resident model raises a device-mismatch error.
model(torch.tensor([1.0], device=device))

tensor([0.1783, 0.1349, 0.1938, 0.2037, 0.1310, 0.1583],
       grad_fn=<SoftmaxBackward0>)

In [31]:
# Training hyperparameters.
BATCH_SIZE = 128  # number of transitions sampled from the replay buffer
GAMMA = 0.99      # discount factor
TAU = 0.005       # update rate of the target network
LR = 1e-4         # learning rate of the AdamW optimizer

# Epsilon-greedy schedule: epsilon starts at EPS_START and decays
# exponentially towards EPS_END; a larger EPS_DECAY means a slower decay.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 1000

# Network dimensions: one output per bandit arm.
# (The spelling of `n_obersevations` is kept because later cells use this name.)
n_actions = n_bandits
n_obersevations = 1

In [69]:
# Number of actions selected so far; select_action() increments it and uses
# it to decay epsilon.
steps_done = 0

# Fresh network sized from the hyperparameter cell.
model = DQN(n_obersevations, n_actions)

# AdamW over all network parameters, with the AMSGrad variant enabled.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    amsgrad=True,
)
# memory = ReplayMemory(10000)

def select_action(state):
    """Choose an action epsilon-greedily and return it one-hot encoded.

    With probability ``1 - eps`` the action with the largest model output for
    ``state`` is chosen; otherwise an action is drawn uniformly at random.
    ``eps`` decays exponentially from ``EPS_START`` towards ``EPS_END`` as the
    global ``steps_done`` counter grows, at a rate set by ``EPS_DECAY``.

    Args:
        state: Input tensor passed to ``model``.

    Returns:
        Integer tensor of shape ``(1, n_actions)`` with a single 1 at the
        chosen action's index.

    Side effects:
        Increments the global ``steps_done`` on every call.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        np.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if sample > eps_threshold:
        with torch.no_grad():
            # argmax yields a 0-d tensor; reshape to (1,) so both branches
            # return the same (1, n_actions) shape after one-hot encoding.
            action = torch.argmax(model(state)).reshape(1)
    else:
        # Draw the random index on the state's device so the result lives
        # on the same device whichever branch is taken.
        action = torch.randint(0, n_actions, (1,), device=state.device)
    return F.one_hot(action, n_actions)

In [70]:
# Smoke test: request a one-hot encoded action for a single one-element state.
select_action(torch.tensor([1.0]))

tensor([[0, 0, 0, 0, 0, 1]])