# In [None]:
'''Lab 6: Multi-Armed Bandit'''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings in output

# Import standard random number generator and NumPy
import random
import numpy as np

# Seed both generators so every run draws the same arms and the same rewards.
# Two seeds are needed because this cell uses two independent generators.
random.seed(1693)       # Stdlib generator: used by reward() and the epsilon-greedy loop below
np.random.seed(1693)    # NumPy global generator: used to draw the arm probabilities

# -----------------------------------------------
# Q6-0: Initialize problem environment
# -----------------------------------------------

num_arms = 5  # Number of arms (actions) available to the agent
# One value per arm, drawn uniformly from [0, 1). list() turns the array into
# a plain Python list whose elements are still NumPy floats.
probabilities = list(np.random.rand(num_arms))

print(probabilities)  # Q6-0: Print list of generated arm probabilities

# -----------------------------------------------
# Define reward function (environment model)
# -----------------------------------------------

def reward(prob):
    """Simulate ten pulls of one arm and return how many of them paid out.

    Each pull pays 1 when a fresh ``random.random()`` draw is strictly
    greater than ``prob``, so a single pull succeeds with probability
    ``1 - prob`` and the total is an integer between 0 and 10.

    NOTE(review): a larger ``prob`` therefore gives a *smaller* expected
    reward -- confirm this orientation is what the lab intends.
    """
    pulls = 10
    # Booleans add up as 0/1, so the sum is the number of successful pulls.
    return sum(random.random() > prob for _ in range(pulls))

# -----------------------------------------------
# Q6-1: Evaluate initial reward for each arm
# -----------------------------------------------

# Q6-1: sample each arm once, in index order, and report what it paid.
for i, arm_prob in enumerate(probabilities):
    r = reward(arm_prob)                              # Ten simulated pulls of arm i
    print(f'The total reward for arm {i} is {r}')     # Q6-1: Print reward for each arm

# -----------------------------------------------
# Q6-2: Initialize memory array (action-value)
# -----------------------------------------------

starting_arm = 3  # Arm recorded in the seed row of the memory array

# Memory array: one [arm, reward] row per pull. It starts with a single
# placeholder row (arm 3, reward 0), giving shape (1, 2).
av = np.array([[starting_arm, 0]])

print(av)  # Q6-2: Print initialized memory array

# -----------------------------------------------
# Q6-3: Define function to choose best arm
# -----------------------------------------------

def bestArm(memory):
    """Return the arm whose recorded rewards have the highest mean.

    ``memory`` is iterated row by row; element 0 of a row is the arm and
    element 1 is the reward it produced. Ties go to the arm that was
    recorded first. If ``memory`` is empty, or no arm's mean exceeds -1,
    arm 0 is returned.
    """
    stats = {}  # arm -> [sum of rewards, number of pulls]

    for record in memory:
        arm, rew = record[0], record[1]
        if arm in stats:
            stats[arm][0] += rew
            stats[arm][1] += 1
        else:
            stats[arm] = [rew, 1]

    # Rewards from this lab are never negative, so -1 sits below every mean.
    winner, top_mean = 0, -1

    for arm, (total, pulls) in stats.items():
        mean = total / pulls
        if mean > top_mean:  # strict comparison keeps the earliest arm on ties
            winner, top_mean = arm, mean

    return winner

print(f'The best arm is #{bestArm(av)}')  # Q6-3: Best arm according to the seed memory

# -----------------------------------------------
# Q6-4: Run epsilon-greedy simulation
# -----------------------------------------------

n_trials = 10       # How many arms are pulled in total
epsilon = 0.25      # Chance of exploring on any given trial
total_reward = 0    # Sum of rewards collected so far

for i in range(n_trials):
    # One uniform draw decides the mode: above epsilon (75% of the time) the
    # agent exploits the best-known arm, otherwise it picks an arm at random.
    arm = bestArm(av) if random.random() > epsilon else random.randint(0, num_arms - 1)

    r = reward(probabilities[arm])     # Payout from the chosen arm

    new_record = np.array([[arm, r]])                 # 1x2 row: [arm, reward]
    av = np.concatenate((av, new_record), axis=0)     # Grow the memory by one row

    total_reward += r
    cumulative_mean = total_reward / (i + 1)  # Mean over the i + 1 trials run so far

    print(cumulative_mean)  # Q6-4: Print cumulative mean reward for this trial


# In [None]:
'''Assignment 6: Multi-Armed Bandit'''
# Suppress TensorFlow logs and warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # "Set environment variable to suppress TF logs"

# Import necessary libraries
import random
import numpy as np
import tensorflow as tf

# Seed every generator this cell touches so the run is repeatable.
random.seed(1693)          # Stdlib generator: decides explore vs. exploit in the loop below
np.random.seed(1693)       # NumPy global generator: draws rewards and random arm choices
tf.random.set_seed(1693)   # TensorFlow generator: seeded for completeness; no TF op is visible in this cell

# ----------------------------------------
# Initialize 2-armed bandit environment
# ----------------------------------------

n = 2  # Number of arms (actions) the agent can choose from

# One row per arm: [mean, standard deviation] of that arm's normal reward
# distribution. The row index is the arm id used everywhere else.
arms = np.array([[3,1], [6,2]])  # Arm 0: mean=3, sd=1; Arm 1: mean=6, sd=2

# Q6-0: Print the arm parameters. The label and the array are separate print
# arguments, so the output has two spaces after the colon.
print(f'0: ', arms)

# ----------------------------------------
# Reward function using normal distribution
# ----------------------------------------

def reward(dist):
    """Draw one reward from the normal distribution described by ``dist``.

    ``dist`` unpacks into ``(mean, sd)``. A single standard-normal sample is
    taken from NumPy's global generator and rescaled, so the result is a
    one-element array (shape ``(1,)``), not a bare float.
    """
    mu, sigma = dist
    noise = np.random.normal(0, 1, 1)  # one z-score, kept as a length-1 array
    return mu + noise * sigma          # shift and scale z into N(mu, sigma)

# Q6-1: Print a sample reward from an arm with mean=5, sd=1.
# NOTE: this call consumes one draw from NumPy's seeded global generator, so
# removing or repeating it shifts every reward drawn after it.
print(f'1: ', reward([5,1]))

# ----------------------------------------
# Initialize memory (action-value) array
# ----------------------------------------

starting_arm = 0            # Arm stored in the placeholder row
starting_reward = 0         # Reward stored in the placeholder row

# Memory array of [arm, reward] rows, seeded with one placeholder row so it
# already has shape (1, 2) before the first real pull is appended.
av = np.array([[starting_arm, starting_reward]])

print(f'2: ', av)  # Q6-2: Print memory with the first action-reward pair

# ----------------------------------------
# Define function to return best arm so far
# ----------------------------------------

def bestArm(a):
    """Return the arm with the highest mean observed reward.

    Args:
        a: 2-D array-like of ``[arm, reward]`` rows (the memory array).

    Returns:
        The arm value taken from column 0 of the winning rows, or ``0`` when
        ``a`` holds no rows. When several arms share the best mean, the one
        that appears first in ``a`` wins.
    """
    a = np.asarray(a)
    if a.size == 0:
        return 0  # nothing observed yet: fall back to arm 0

    best_arm = 0
    # Start below every real number so an arm whose average reward is zero or
    # negative can still be selected; a baseline of 0 would silently skip it.
    best_mean = -np.inf

    # dict.fromkeys de-duplicates while keeping first-appearance order, so each
    # arm's mean is computed once rather than once per record.
    for arm in dict.fromkeys(a[:, 0]):
        mean = np.mean(a[a[:, 0] == arm, 1])
        if mean > best_mean:
            best_mean = mean
            best_arm = arm

    return best_arm

# ----------------------------------------
# Run epsilon-greedy simulation
# ----------------------------------------

n_trials = 20   # Total number of learning iterations
eps = 0.7       # Probability of exploration (i.e., choosing a random arm)

for i in range(n_trials):
    if random.random() > eps:  # 30% of the time (1 - eps): exploit
        # Cast to int: once float rewards are appended the memory array is
        # float, so the arm read back from it cannot index `arms` directly.
        choice = int(bestArm(av))
    else:                      # 70% of the time: explore
        choice = np.random.randint(0, n)  # Randomly select an arm

    # reward() hands back a one-element array. Unwrap it to a plain scalar so
    # [choice, score] is a regular 2-element row; mixing a scalar with an array
    # makes a ragged sequence, which recent NumPy versions refuse to build.
    score = np.asarray(reward(arms[choice])).item()
    thisAV = np.array([choice, score]).reshape(1, 2)  # Format new action-reward pair
    av = np.concatenate((av, thisAV), axis=0)         # Append to memory array

    # NOTE: the mean runs over every row of av, including the [0, 0]
    # placeholder row the memory was seeded with.
    runningMean = np.mean(av[:,1])

    print(f'3: ', runningMean)  # Q6-3: Print current mean reward after each trial

print(av)  # Q6-4: Print the final memory array after all trials
