# Testing snippets in examples_mab.rst

In [1]:
from ilovebandits.mab.agents import EpsilonGreedyAgent, UCBAgent, TSAgent
from ilovebandits.mab.q_estimators import QEstMean

# Build a sample-average Q estimator for a 5-armed bandit.
# qvals_init gives the initial reward estimate of each arm (all zeros here).
q_estimator = QEstMean(
    arms=5,
    qvals_init=[0, 0, 0, 0, 0],
)

# Create one agent of each kind: epsilon-greedy, UCB and Thompson Sampling.
# NOTE(review): ep_greedy and ucb are handed the same q_estimator object --
# confirm that the agents do not end up sharing mutable state through it.
ep_greedy = EpsilonGreedyAgent(
    epsilon=0.1,
    q_estimator=q_estimator,
)
ucb = UCBAgent(
    c=1,
    q_estimator=q_estimator,
)
ts = TSAgent(arms=5)

In [2]:
from ilovebandits.mab.agents import EpsilonGreedyAgent
from ilovebandits.mab.q_estimators import QEstMean

# Initialize the agent with its own fresh Q estimator (sample average, all
# initial Q values set to 0). Building the estimator here keeps the snippet
# self-contained and avoids reusing an estimator already handed to other agents.
q_estimator = QEstMean(arms=5, qvals_init=[0, 0, 0, 0, 0])
agent = EpsilonGreedyAgent(epsilon=0.1, q_estimator=q_estimator)

# Select an action (arm) based on the agent's strategy and current estimates.
sel_arm, count_sel, prob_sel_arm = agent.take_action()

# Imagine that the selected arm sel_arm produced a reward equal to 1.
# To update the agent with this new information, you would do:
reward = 1
agent.q_estimator.estimate(reward=reward, action=sel_arm)  # Update the estimate of the selected arm with the observed reward.

# Let's select another arm and update the agent again.
# Imagine that we again received a reward equal to 1 for the selected arm.
sel_arm, count_sel, prob_sel_arm = agent.take_action()
reward = 1
agent.q_estimator.estimate(reward=reward, action=sel_arm)

# To access the current estimates of the rewards for each arm, you can use:
print(agent.q_estimator.qvals)  # This will print the current estimated rewards for each arm.

# To access the number of times each arm has been updated, you can use:
print(agent.q_estimator.arm_count_updates)

# To access the number of times each arm has been selected, you can use:
print(agent.arm_count)

# Sometimes we want the agent to forget/reset everything it has learned. To do this, you can use:
agent.reset_agent()  # This will reset the agent's internal state, including reward estimates and counts.

# After the reset, the estimates and both counters are printed again to show their reset values.
print(agent.q_estimator.qvals)
print(agent.q_estimator.arm_count_updates)
print(agent.arm_count)

[0, 0, 1.0, 0, 0]
[0.0, 0.0, 2.0, 0.0, 0.0]
[0.0, 0.0, 2.0, 0.0, 0.0]
[0, 0, 0, 0, 0]
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]


# Testing snippets in examples_cnban.rst

snippet 1

In [3]:
from ilovebandits.agents import EpsGreedyConAgent
from sklearn.ensemble import RandomForestRegressor
# Seed used both for the base estimator and for the agent's own RNG.
RANDOM_SEED = 42

# Number of arms the agent chooses from.
arms = 4

# Regressor handed to the agent as its base estimator.
base_regressor = RandomForestRegressor(random_state=RANDOM_SEED)

# Epsilon-greedy contextual agent configured with one model per arm.
eps_agent = EpsGreedyConAgent(
    arms=arms,
    base_estimator=base_regressor,
    n_rounds_random=50,
    epsilon=0.1,
    one_model_per_arm=True,
    rng_seed=RANDOM_SEED,
)

snippet 2

In [None]:
import numpy as np
from ilovebandits.agents import EpsGreedyConAgent
from sklearn.ensemble import RandomForestRegressor
RANDOM_SEED = 42

# We update the agent with a new batch of samples. Imagine the following
# training data: 48 samples made of three groups of 16. Every group visits
# arms 0..3 four times each, and groups 2 and 3 are group 1 with contexts and
# rewards multiplied by 2 and 3 respectively.
group_scales = (1, 2, 3)

# Array with the arm selected for each sample:
# [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] repeated once per group.
a_train = np.tile(np.repeat(np.arange(4), 4), len(group_scales))

# Array with the reward obtained for each sample (first group shown, the
# other groups are scaled copies of it).
r_base = np.array([
    -3., 4., -11., 10.,   # arm 0
    13., 11., 47., 24.,   # arm 1
    20., 35., 76., 84.,   # arm 2
    4., 3., 13., 5.,      # arm 3
])
r_train = np.concatenate([scale * r_base for scale in group_scales])

# Array with feature values (three feature columns) obtained for each sample.
# Within a group the same four contexts are presented to each of the four arms.
c_base = np.array([
    [1, -1, 2],
    [2, 3, 4],
    [3, -3, 8],
    [4, 8, 10],
])
c_train = np.concatenate(
    [scale * np.tile(c_base, (4, 1)) for scale in group_scales]
)

In [None]:
# Number of arms. Defined here so this snippet does not depend on an earlier
# snippet having been run first (the training data above uses arms 0..3).
arms = 4

# Epsilon-greedy contextual agent with one model per arm; only the first
# n_rounds_random=5 selections are requested to be random.
eps_agent = EpsGreedyConAgent(
    arms=arms,
    base_estimator=RandomForestRegressor(random_state=RANDOM_SEED),
    n_rounds_random=5,
    epsilon=0.1,
    one_model_per_arm=True,
    rng_seed=RANDOM_SEED,
)

#### UPDATE AGENT ######
# Feed the whole batch (contexts, selected arms, rewards) to the agent.
eps_agent.update_agent(c_train=c_train, a_train=a_train, r_train=r_train)

# Check the number of updates of the agent
print(eps_agent.update_agent_counts)
# Check the agent's hybrid model, if that option was selected
# (printed as None in the recorded run, where one_model_per_arm=True):
print(eps_agent.model)
# Check the agent's disjoint (per-arm) models, if that option was selected:
print(eps_agent.models)
# Check the number of features used by the agent
print(eps_agent.nfeats)


########### PREDICT AGENT ##########
dummy_context = np.ones((1, eps_agent.nfeats))  # Create a dummy context with the appropriate number of features
(sel_arm, prob_sel_arm) = eps_agent.take_action(context=dummy_context)
print(f"Selected arm: {sel_arm}, Probability of selected arm to be chosen: {prob_sel_arm}")

# Do 10 additional arm selections to finish n_rounds_random and start the
# epsilon-greedy selections (we reuse the same dummy_context for simplicity).
for _ in range(10):
    (sel_arm, prob_sel_arm) = eps_agent.take_action(context=dummy_context)
    print(f"Selected arm: {sel_arm}, Probability of selected arm to be chosen: {prob_sel_arm}")

1
None
[RandomForestRegressor(random_state=42), RandomForestRegressor(random_state=42), RandomForestRegressor(random_state=42), RandomForestRegressor(random_state=42)]
3
Selected arm: 0, Probability of selected arm to be chosen: 0.25
Selected arm: 3, Probability of selected arm to be chosen: 0.25
Selected arm: 2, Probability of selected arm to be chosen: 0.25
Selected arm: 1, Probability of selected arm to be chosen: 0.25
Selected arm: 1, Probability of selected arm to be chosen: 0.25
Selected arm: 2, Probability of selected arm to be chosen: 0.925
Selected arm: 3, Probability of selected arm to be chosen: 0.025
Selected arm: 2, Probability of selected arm to be chosen: 0.925
Selected arm: 2, Probability of selected arm to be chosen: 0.925
Selected arm: 2, Probability of selected arm to be chosen: 0.925
Selected arm: 2, Probability of selected arm to be chosen: 0.925


snippet 3

In [44]:

from sklearn.ensemble import RandomForestClassifier

from ilovebandits.agents import EpsGreedyConAgent
from ilovebandits.data_bandits.base import DataBasedBanditFromPandas
from ilovebandits.data_bandits.utils import GenrlBanditDataLoader
from ilovebandits.sim import SimContBandit

import pandas as pd

# RANDOM_SEED seeds the agent's RNG; RANDOM_STATE seeds the environment and
# the base estimator.
RANDOM_SEED = 42
RANDOM_STATE = 42

# Simulation settings.
reward_delay = 10
iterations = 1000
min_ites_to_train = 30  # minimum number of iterations to start training the agent
update_factor = 28  # if 1, it updates the model every iteration, if 2, it updates every two iterations, etc.

# Load the dataset and wrap it in a data-based bandit environment.
dataset_for_sims = GenrlBanditDataLoader().get_statlog_shuttle_data()
model_env = DataBasedBanditFromPandas(
    df=dataset_for_sims,
    reward_delay=reward_delay,
    random_state=RANDOM_STATE,
)

# The agent gets as many arms as the environment exposes and a single
# classifier for all of them (one_model_per_arm=False).
narms = model_env.arms
base_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
agent = EpsGreedyConAgent(
    arms=narms,
    base_estimator=base_classifier,
    n_rounds_random=50,
    epsilon=0.1,
    one_model_per_arm=False,
    rng_seed=RANDOM_SEED,
    min_samples_to_ignore_arm=10,
)

# Tie agent and environment together and run the simulation.
simulator = SimContBandit(
    agent=agent,
    model_env=model_env,
    min_ites_to_train=min_ites_to_train,
    update_factor=update_factor,
)
res = simulator.simulate(iterations=iterations)

#### You can obtain the rewards received by the agent at each iteration as a pandas DataFrame with the following line of code.
# It contains 4 columns:
#   -'ite': iteration at which the reward was received,
#   -'arm': arm that was selected,
#   -'context': context features used,
#   -'reward': reward received at 'ite'
rew_agent = pd.DataFrame(res['rew_agent'])
print(rew_agent)

# You can also obtain a list of the actions selected by the agent at each iteration:
print(res['actions'])

# You can also obtain a list of the chosen action probabilities:
print(res['prob_actions'])

# You can also access the agent and model environment at the end of the simulation:
print(res['agent'])
print(res['model_env'])

# For more information, please refer to the API docs.

  a_idx, a_prob = self.agent.take_action(context=context)


Agent updated in iterations: [112, 140, 168, 196, 224, 252, 280, 308, 336, 364, 392, 420, 448, 476, 504, 532, 560, 588, 616, 644, 672, 700, 728, 756, 784, 812, 840, 868, 896, 924, 952, 980]
      ite  arm                              context  reward
0      11    0   [53, 2, 88, 0, 52, -13, 35, 37, 2]       1
1      12    5     [84, -2, 88, 0, 6, 0, 4, 83, 78]       0
2      13    4   [53, 0, 103, 0, 54, 26, 50, 49, 0]       0
3      14    3  [43, -1, 86, 0, 42, -19, 43, 45, 2]       0
4      15    3   [45, 0, 107, 0, 46, 28, 62, 60, 0]       0
..    ...  ...                                  ...     ...
985   996    0     [51, 0, 86, 0, 52, 7, 36, 35, 0]       1
986   997    0    [54, 0, 84, -6, 54, 0, 30, 30, 0]       1
987   998    0    [45, 0, 77, 0, 46, 19, 32, 31, 0]       1
988   999    0  [37, 0, 104, 0, 20, 15, 67, 84, 16]       1
989  1000    4  [79, -1, 84, 0, -10, 40, 4, 94, 90]       1

[990 rows x 4 columns]
[np.int64(0), np.int64(5), np.int64(4), np.int64(3), np.int64(3), 