<a href="https://colab.research.google.com/github/sathwikreddykatla/tutorial/blob/master/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from collections import defaultdict
import random
import math

class QTable:
    """Tabular Q-function stored as a two-level dictionary.

    The outer key is the state and the inner key is the action. Any
    ``(state, action)`` pair that has never been written reads as ``0.0``.
    """

    def __init__(self):
        self.table = defaultdict(lambda: defaultdict(float))

    def update(self, state, action, value):
        """Overwrite the Q-value stored for ``(state, action)`` with ``value``."""
        action_values = self.table[state]
        action_values[action] = value

    def get_q_value(self, state, action):
        """Return the Q-value stored for ``(state, action)``."""
        action_values = self.table[state]
        return action_values[action]

    def get_argmax_q(self, state, actions):
        """Return the action in ``actions`` with the highest Q-value.

        Ties are resolved in favour of the action that appears first.
        Raises ``IndexError`` when ``actions`` is empty.
        """
        scored = [(action, self.get_q_value(state, action)) for action in actions]
        best_action, best_value = scored[0]
        for candidate, candidate_value in scored[1:]:
            # Strict comparison keeps the earliest action on ties.
            if candidate_value > best_value:
                best_action, best_value = candidate, candidate_value
        return best_action

class EpsilonGreedy:
    """Epsilon-greedy action selection.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action that ``qfunction.get_argmax_q`` reports as best.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon

    def reset(self):
        """Nothing to reset: the strategy keeps no per-episode state."""
        return None

    def select(self, state, actions, qfunction):
        """Pick an action for ``state`` from ``actions``."""
        # Draw first so the random stream is consumed on every call.
        should_explore = random.random() < self.epsilon
        if not should_explore:
            return qfunction.get_argmax_q(state, actions)
        return random.choice(actions)

class EpsilonDecreasing:
    """Epsilon-greedy selection whose epsilon decays after every choice.

    Selection is delegated to an inner ``EpsilonGreedy``. After each call to
    ``select`` the inner epsilon is multiplied by ``alpha`` and clamped so it
    never drops below ``lower_bound``.
    """

    def __init__(self, epsilon=1.0, alpha=0.999, lower_bound=0.1):
        self.initial_epsilon = epsilon
        self.alpha = alpha
        self.lower_bound = lower_bound
        self.epsilon_greedy_bandit = EpsilonGreedy(epsilon)

    def reset(self):
        """Restart the decay by rebuilding the inner bandit at the initial epsilon."""
        self.epsilon_greedy_bandit = EpsilonGreedy(self.initial_epsilon)

    def select(self, state, actions, qfunction):
        """Choose an action with the current epsilon, then decay epsilon."""
        inner = self.epsilon_greedy_bandit
        chosen = inner.select(state, actions, qfunction)
        decayed = inner.epsilon * self.alpha
        inner.epsilon = max(decayed, self.lower_bound)
        return chosen

class Softmax:
    """Softmax (Boltzmann) action selection with temperature ``tau``.

    Each action is chosen with probability proportional to
    ``exp(Q(state, action) / tau)``.
    """

    def __init__(self, tau=1.0):
        self.tau = tau

    def reset(self):
        """Nothing to reset: the strategy keeps no per-episode state."""
        pass

    def select(self, state, actions, qfunction):
        """Sample an action from the softmax distribution over Q-values.

        Args:
            state: State whose Q-values are looked up.
            actions: Sequence of candidate actions.
            qfunction: Object exposing ``get_q_value(state, action)``.

        Returns:
            The sampled action, or ``None`` when ``actions`` is empty.
        """
        if not actions:
            return None

        scaled = [
            qfunction.get_q_value(state, action) / self.tau for action in actions
        ]
        # Subtracting the maximum leaves the probabilities unchanged but keeps
        # every exponent <= 0, so math.exp cannot overflow and the largest
        # weight is exactly 1.0 (the total can never underflow to zero).
        offset = max(scaled)
        weights = [math.exp(value - offset) for value in scaled]
        total = sum(weights)

        # Compare against the unnormalised cumulative weight instead of
        # dividing every weight by the total.
        threshold = random.random() * total
        cumulative = 0.0
        fallback = None
        for action, weight in zip(actions, weights):
            cumulative += weight
            if weight > 0.0:
                fallback = action
            if threshold < cumulative:
                return action
        # Rounding can leave the threshold marginally above the final
        # cumulative sum; fall back to the last action with non-zero weight
        # rather than returning None.
        return fallback

class UpperConfidenceBounds:
    """Upper-confidence-bound action selection.

    Each action is scored as
    ``Q(state, action) + sqrt(2 * ln(total) / times_selected[action])`` and
    the highest-scoring action is chosen, with ties broken at random.
    Selection counts are keyed by action only, so they are shared across
    states.
    """

    def __init__(self):
        self.total = 0
        self.times_selected = {}

    def reset(self):
        """Forget all selection counts.

        Gives this strategy the same ``reset()`` interface as the other
        strategies in this file, so a driver that calls ``bandit.reset()``
        works with it too.
        """
        self.total = 0
        self.times_selected = {}

    def select(self, state, actions, qfunction):
        """Return the action with the highest upper confidence bound.

        Args:
            state: State whose Q-values are looked up.
            actions: Sequence of candidate actions.
            qfunction: Object exposing ``get_q_value(state, action)``.

        Returns:
            The chosen action; its selection count is incremented.
        """
        # Unseen actions start with a count of 1 so the bonus term never
        # divides by zero.
        for action in actions:
            self.times_selected.setdefault(action, 1)
        self.total += 1

        # Loop-invariant part of the exploration bonus.
        bonus_numerator = 2 * math.log(self.total)

        max_actions = []
        max_value = float("-inf")
        for action in actions:
            value = qfunction.get_q_value(state, action) + math.sqrt(
                bonus_numerator / self.times_selected[action]
            )
            if value > max_value:
                max_actions = [action]
                max_value = value
            elif value == max_value:
                max_actions.append(action)

        result = random.choice(max_actions)
        self.times_selected[result] += 1
        return result

def run_bandit(bandit, episodes=200, episode_length=500, drift=True):
    """Simulate a five-armed Bernoulli bandit and record every reward.

    Each episode resets ``bandit``, starts from a fresh ``QTable`` and runs
    ``episode_length`` pulls. A pull pays 5 with the chosen arm's success
    probability and 0 otherwise. The Q-value of the pulled arm is kept as the
    running average of the rewards observed for that arm in the episode.

    Args:
        bandit: Strategy exposing ``reset()`` and
            ``select(state, actions, qfunction)``.
        episodes: Number of independent episodes to run.
        episode_length: Number of pulls per episode.
        drift: When true, the arm probabilities switch to a second set at
            step ``episode_length // 2`` of every episode.

    Returns:
        A list with one inner list per episode, holding the reward of each
        step in order.
    """
    actions = [0, 1, 2, 3, 4]
    state = 1
    # Integer division so the switch also happens for odd episode lengths
    # (a float midpoint such as 2.5 would never equal an integer step).
    drift_step = episode_length // 2

    rewards = []
    for _ in range(episodes):
        bandit.reset()
        probabilities = [0.1, 0.3, 0.7, 0.2, 0.1]
        times_selected = defaultdict(int)
        qtable = QTable()
        episode_rewards = []
        for step in range(episode_length):
            if drift and step == drift_step:
                probabilities = [0.5, 0.2, 0.0, 0.3, 0.3]

            action = bandit.select(state, actions, qtable)
            reward = 5 if random.random() < probabilities[action] else 0
            episode_rewards.append(reward)

            times_selected[action] += 1
            # Incremental mean: Q <- Q + (r - Q) / N. QTable.update overwrites
            # the stored value, so the new estimate (not just the delta) must
            # be passed in.
            old_q = qtable.get_q_value(state, action)
            new_q = old_q + (reward - old_q) / times_selected[action]
            qtable.update(state, action, new_q)
        rewards.append(episode_rewards)
    return rewards

# Script entry point: run the bandit simulation with an epsilon-greedy
# strategy and print the collected rewards.
if __name__ == "__main__":
    # EpsilonGreedy() and run_bandit() are both called with their default
    # arguments; `rewards` holds one inner list of step rewards per episode.
    rewards = run_bandit(EpsilonGreedy())
    # Prints the entire nested list in one go.
    print(rewards)

[[0, 0, 5, 0, 0, 0, 5, 0, 5, 5, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 5, 5, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 0, 0, 0, 0, 5, 0, 0, 5, 0, 5, 5, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 5, 0, 0, 5, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,