In [1]:
import random
import numpy as np

In [16]:
# Win probability of each slot machine, drawn uniformly from [0, 1)
probs = np.random.uniform(low=0.0, high=1.0, size=10)

# Reward history of each slot machine; every history starts with a single
# recorded reward of 1
rewards = [[1] for _ in probs]

probs, rewards

(array([0.83504158, 0.80112914, 0.34951631, 0.97895463, 0.17571434,
        0.6492362 , 0.0588519 , 0.12026324, 0.01447005, 0.73551329]),
 [[1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])

In [3]:
# Greedy selection with a decaying probability of random exploration
def choose_one_1():
    """Pick a slot machine with decaying epsilon-greedy selection.

    Reads the module-level ``rewards`` list (one reward history per machine).
    With probability ``1 / total_plays`` a machine is picked uniformly at
    random; otherwise the machine with the highest mean reward is picked.

    Returns:
        int: index of the chosen machine, within ``range(len(rewards))``.

    Raises:
        ValueError: if ``rewards`` contains no machines at all.
    """
    arm_count = len(rewards)

    # Total number of plays recorded so far, across all machines
    played_count = sum(len(history) for history in rewards)

    # Explore: the chance of a random pick shrinks as plays accumulate.
    # With zero recorded plays there is nothing to exploit (and 1 / 0 would
    # raise), so a random pick is made in that case as well.
    if played_count == 0 or random.random() < 1 / played_count:
        # Derived from len(rewards) rather than a fixed 0..9 range so the
        # index is always valid for the current number of machines
        return random.randrange(arm_count)

    # Exploit: mean reward observed on each machine
    rewards_mean = [np.mean(history) for history in rewards]

    # int() so both branches return a plain Python int
    return int(np.argmax(rewards_mean))


# Smoke test: pick one machine using the current reward histories
choose_one_1()

0

In [18]:
def try_and_play(choose=None):
    """Play one round on a selected slot machine and record the outcome.

    Args:
        choose: optional zero-argument callable returning the index of the
            machine to play. Defaults to ``choose_one_3``, which is looked
            up at call time, so the cell defining it must have been run
            before this function is called without an argument.

    Reads the module-level ``probs`` and appends the reward (1 for a win,
    0 for a loss) to ``rewards[i]``. Returns None.
    """
    if choose is None:
        choose = choose_one_3
    i = choose()

    # Pull the lever: the machine pays out 1 with probability probs[i]
    reward = 1 if random.random() < probs[i] else 0

    # Record the outcome in that machine's history
    rewards[i].append(reward)


# Play a single round, then display the updated reward histories
try_and_play()

rewards

[[1], [1], [1, 1], [1], [1], [1], [1], [1], [1], [1]]

In [19]:
def get_result(rounds=5000):
    """Play ``rounds`` rounds and compare the reward earned with the optimum.

    Args:
        rounds: number of rounds to play via ``try_and_play`` (default 5000).

    Returns:
        tuple: ``(target, result)`` where ``target`` is the expected total
        reward of always playing the machine with the highest win
        probability for ``rounds`` rounds, and ``result`` is the total
        reward actually collected during these rounds.
    """
    # Rewards already present before this run (the initial placeholder
    # entries and any earlier plays) must not be credited to this run,
    # otherwise result is inflated relative to target.
    baseline = sum(sum(history) for history in rewards)

    for _ in range(rounds):
        try_and_play()

    # Best achievable expectation over the same number of rounds
    target = probs.max() * rounds

    # Reward collected during this run only
    result = sum(sum(history) for history in rewards) - baseline

    return target, result


# Run the experiment and display the (target, result) pair
get_result()

(4894.773164922458, 4872)

In [12]:
# UCB algorithm: explore the machines that have been played less often


def choose_one_2():
    """Pick a slot machine by mean reward plus an exploration bonus.

    Reads the module-level ``rewards`` list. For machine ``i`` with ``n_i``
    plays out of ``N`` total plays, the score is
    ``mean_i + sqrt(sqrt(N) / (2 * n_i))``.

    Returns:
        The index of the highest-scoring machine, as returned by
        ``ndarray.argmax``.
    """
    plays_per_arm = np.array(list(map(len, rewards)))

    # Exploration bonus (the upper-confidence term).
    # Numerator: square root of the total number of plays, so it grows slowly.
    # Denominator: twice the plays of the individual machine, so it grows
    # quickly and the fraction shrinks the more that machine is played.
    # The bonus therefore reflects how uncertain each machine still is: the
    # less a machine has been played, the more it is worth exploring.
    total_root = plays_per_arm.sum() ** 0.5

    # The outer square root compresses values above 1 and stretches values
    # below 1, keeping the bonus within a moderate numeric range.
    bonus = (total_root / (plays_per_arm * 2)) ** 0.5

    # Mean reward observed on each machine
    mean_reward = np.array([np.mean(history) for history in rewards])

    # Final score: exploration bonus plus observed mean
    return (bonus + mean_reward).argmax()

In [15]:
# Beta distribution demo: with small parameters the draws vary widely,
# with large parameters they concentrate around a stable value
for caption, concentration in (
    ('当数字小的时候，beta分布的概率有很大的随机性', 1),
    ('当数字大的时候，beta分布逐渐稳定', 1e5),
):
    print(caption)
    for _ in range(5):
        print(np.random.beta(concentration, concentration))

当数字小的时候，beta分布的概率有很大的随机性
0.04184626052672714
0.24759334763518906
0.41085833685252665
0.9883149392207878
0.038118174565751874
当数字大的时候，beta分布逐渐稳定
0.5005013278621445
0.5008268229563304
0.5004407782349973
0.49767518630978136
0.5008352313003872


In [17]:
# Thompson sampling


def choose_one_3():
    """Pick a slot machine by Thompson sampling.

    Reads the module-level ``rewards`` list. For every machine one value is
    drawn from ``Beta(sum of rewards + 1, plays - sum of rewards + 1)``, and
    the machine with the largest draw is selected.

    Returns:
        The index of the selected machine, as returned by ``ndarray.argmax``.
    """
    alpha = []
    beta_param = []
    for history in rewards:
        ones = sum(history)
        # Number of 1-rewards plus one
        alpha.append(ones + 1)
        # Number of 0-rewards plus one (plays minus wins)
        beta_param.append(len(history) - ones + 1)

    # One Beta draw per machine, read as a plausible win probability for it
    samples = np.random.beta(alpha, beta_param)

    return samples.argmax()


# Smoke test: draw one Thompson-sampling pick from the current histories
choose_one_3()

7