# Chapter 2 - N-armed Bandits

## Deep Reinforcement Learning _in Action_

### Listing 2.1

In [1]:
def get_best_action(actions):
    """Return the index of the highest-valued action in ``actions``.

    Every element is scored with ``get_action_value`` (which must be defined
    elsewhere) and the index of the top-scoring element is returned. When
    several actions tie for the maximum, the earliest index wins. An empty
    ``actions`` yields ``0``.
    """
    best_action = 0
    # Start below every possible score so that all-negative action values are
    # still compared with each other instead of being rejected by a floor of 0.
    max_action_value = float("-inf")
    for i, action in enumerate(actions):
        cur_action_value = get_action_value(action)
        if cur_action_value > max_action_value:
            best_action = i
            max_action_value = cur_action_value
    return best_action

### Listing 2.2

In [None]:
import numpy as np
from scipy import stats
import random
import matplotlib.pyplot as plt

In [None]:
# Bandit problem setup.
eps = 0.2                  # probability of picking a random arm instead of the best-known one
n = 10                     # number of arms
probs = np.random.rand(n)  # one success probability per arm, drawn uniformly from [0, 1)

### Listing 2.3

In [None]:
def get_reward(prob, n=10):
    """Return how many of ``n`` independent trials succeed.

    Each trial draws one number from ``random.random()`` and counts as a
    success when that draw is below ``prob``. The result is an integer
    between 0 and ``n``.
    """
    # One list entry per successful trial; draws happen in order, one per trial.
    successes = [1 for _ in range(n) if random.random() < prob]
    return len(successes)

In [None]:
# Collect 2000 sample rewards for an arm with success probability 0.7.
reward_test = [get_reward(prob=0.7) for _sample in range(2000)]

In [None]:
# Sample mean of the collected rewards.
np.asarray(reward_test).mean()

In [None]:
# Add up the elements of x by hand. The accumulator is called `total` so that
# the built-in `sum` is not overwritten for the rest of the session.
x = [4, 5, 6, 7]
total = 0
for value in x:
    total += value
total

In [None]:
# Histogram of the sampled rewards on a 9x5 inch figure.
plt.figure(figsize=(9, 5))
plt.hist(reward_test, bins=9)
plt.ylabel('# Observations', fontsize=22)
plt.xlabel('Reward', fontsize=22)
plt.show()

### Listing 2.4

In [None]:
# Per-arm statistics table: n rows (one per arm) by 2 columns.
#   column 0 -> how many times the arm has been played
#   column 1 -> average reward observed for that arm so far
record = np.zeros(shape=(n, 2))
record

In [None]:
def update_record(record, action, r):
    """Fold reward ``r`` into the running statistics stored for ``action``.

    ``record[action, 0]`` holds the play count and ``record[action, 1]`` the
    average reward. Both entries are updated in place and the same ``record``
    object is returned.
    """
    plays = record[action, 0]
    avg = record[action, 1]
    # New mean = (previous reward total + new reward) / (plays + 1).
    record[action, 1] = (plays * avg + r) / (plays + 1)
    record[action, 0] = plays + 1
    return record

### Listing 2.5

In [None]:
def get_best_arm(record):
    """Return the row index whose average reward (column 1) is the largest.

    If several rows share the maximum, the first of them is returned.
    """
    avg_rewards = record[:, 1]
    return avg_rewards.argmax(axis=0)

### Listing 2.6

In [None]:
# --- Epsilon-greedy experiment: 500 plays on a freshly drawn 10-armed bandit ---
eps = 0.2
n = 10
record = np.zeros((n, 2))
probs = np.random.rand(n)

# rewards[k] is the running mean after k plays. The leading 0 is a placeholder
# that the update formula below also counts as one observation.
rewards = [0]
for i in range(500):
    # With probability 1 - eps exploit the best-known arm, otherwise pick a random arm.
    choice = get_best_arm(record) if random.random() > eps else np.random.randint(n)
    r = get_reward(probs[choice])
    record = update_record(record, choice, r)
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)

# Plot the running mean reward against the number of plays.
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(9, 5)
ax.set_xlabel('Plays')
ax.set_ylabel('Avg Reward')
ax.scatter(np.arange(len(rewards)), rewards)
plt.show()

### Listing 2.7

In [None]:
def softmax(av, tau=1.12):
    """Turn action values into a probability distribution.

    Args:
        av: Array-like of action values (a NumPy array or a plain sequence).
        tau: Divisor applied to the values before exponentiation (the softmax
            temperature).

    Returns:
        A NumPy array with the same shape as ``av`` whose entries sum to 1.
        An empty input gives an empty array.
    """
    scaled = np.asarray(av) / tau
    if scaled.size == 0:
        return scaled
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (which would yield NaN after the
    # division) when the values are large.
    exps = np.exp(scaled - np.max(scaled))
    softm = exps / np.sum(exps)
    return softm

In [None]:
# Integer labels 0..9.
x = np.arange(0, 10)
x

In [None]:
# Ten action values, all initialised to zero.
av = np.zeros(shape=10)
av

In [None]:
# Convert the action values into selection probabilities.
p = softmax(av=av)
p

In [None]:
# Draw one element of x, weighting each entry by the matching probability in p.
np.random.choice(a=x, p=p)

In [None]:
# --- Softmax action-selection experiment: 500 plays on a freshly drawn 10-armed bandit ---
n = 10
record = np.zeros((n, 2))
probs = np.random.rand(n)

# rewards[k] is the running mean after k plays. The leading 0 is a placeholder
# that the update formula below also counts as one observation.
rewards = [0]
for i in range(500):
    # Selection probabilities come from the current average reward of each arm.
    p = softmax(record[:, 1])
    choice = np.random.choice(np.arange(n), p=p)
    r = get_reward(probs[choice])
    record = update_record(record, choice, r)
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)

# Plot the running mean reward against the number of plays.
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(9, 5)
ax.set_xlabel('Plays')
ax.set_ylabel('Avg Reward')
ax.scatter(np.arange(len(rewards)), rewards)
plt.show()

## Automatic Differentiation (자동 미분)

In [None]:
import torch

In [None]:
x = torch.Tensor([2, 4])  # input data
x