In [1]:
import numpy as np

The dataset dataset.txt contains 10,000 instances corrresponding to distinct site visits by users-events in the language of this part. Each instance comprises 102 space-delimited columns of integers:


*   Column 1: The arm played by a
uniformly-random policy out of 10 arms (news articles)
*   Column 2: The reward received from the arm played|1 if the user clicked 0 otherwise

*   Columns 3: The 100-dim flattened context; 10 features per arm (incorporating the content of the article and its match with the visiting user).





In [2]:
# Load data and reshape based on provided code snippet
data = np.loadtxt("/content/dataset.txt")
arms, rewards, contexts = data[:, 0], data[:, 1], data[:, 2:]
arms = arms.astype(int)
rewards = rewards.astype(float)
contexts = contexts.astype(float)
n_arms = len(np.unique(arms))  # Number of unique arms
n_events = len(contexts)        # Number of events (rounds)
n_dims = int(len(contexts[0]) / n_arms)  # Number of features per arm
contexts = contexts.reshape(n_events, n_arms, n_dims)

In [3]:
# Initialize UCB parameters
counts = np.zeros(n_arms)     # Track number of times each arm is chosen
values = np.zeros(n_arms)     # Track cumulative reward for each arm
cumulative_reward = 0         # Track total reward for benchmarking

In [4]:
c = 2  # Exploration parameter, typically tuned

# Run the UCB algorithm over each event
for t in range(n_events):
    # Step 1: Compute the UCB score for each arm
    ucb_values = np.zeros(n_arms)
    for arm in range(n_arms):
        if counts[arm] > 0:
            # Calculate UCB using current reward estimate and exploration bonus
            mean_reward = values[arm] / counts[arm]
            bonus = c * np.sqrt(np.log(t + 1) / counts[arm])
            ucb_values[arm] = mean_reward + bonus
        else:
            # Initialize unexplored arms with a high UCB to ensure they get explored
            ucb_values[arm] = float('inf')

    # Step 2: Select the arm with the highest UCB score
    chosen_arm = np.argmax(ucb_values)

    # Step 3: Observe reward for the chosen arm
    observed_reward = rewards[t] if arms[t] == chosen_arm else 0

    # Step 4: Update the chosen arm's count and cumulative reward
    counts[chosen_arm] += 1
    values[chosen_arm] += observed_reward
    cumulative_reward += observed_reward

In [5]:
# Output benchmarking metrics
print("Total cumulative reward:", cumulative_reward)
print("Selection count per arm:", counts)
print("Average reward per arm:", values / np.maximum(counts, 1))

Total cumulative reward: 90.0
Selection count per arm: [ 924. 1167. 1113.  953.  973.  973. 1058.  924.  972.  943.]
Average reward per arm: [0.00108225 0.02313625 0.01886792 0.00419727 0.0061665  0.0061665
 0.01417769 0.00108225 0.00617284 0.00318134]
