In [1]:
import json

In [2]:
from typing import Iterable, Optional, Tuple
import numpy as np
from scipy.stats import beta

In [3]:
from sweetbean import Block, Experiment
from sweetbean.stimulus import Bandit, Text
from sweetbean.variable import (
    DataVariable,
    FunctionVariable,
    SharedVariable,
    SideEffect,
    TimelineVariable,
)

In [4]:
# Welcome message
instruction_welcome = Text(
    text='Welcome to our decision-making experiment. In this experiment, you will make decisions between two options. \
    Press the SPACE key to continue.',
    choices=[' ']
)

# Explanation of the task
instruction_task = Text(
    text=json.dumps("""
    <div class="slotmachine" style="position: absolute; top:1vh; left:0 vw; width: 20vw; height: 20vh; border-color:orange
        ">
        </div>
        <div class="slotmachine" style="position: absolute; top:1vh; right:0 vw; left:40vw; width: 20vw; height: 20vh; border-color:blue
        ">
        </div>
    """)+ 'In each trial, you will see two slot machines (bandits) on the screen. <b> Click on a bandit </b> to make your choice\
    Press SPACE to continue.',
    choices=[' ']
)

# Explanation of feedback/reward system
instruction_feedback = Text(
    text='After selecting a bandit, you will receive feedback showing whether you won reward (1 point) or not (0 point). One bandit tends to give rewards more often than the other. However, this may change during the experiment.\
    Press SPACE to continue.',
    choices=[' ']
)

# End instructions
instruction_end = Text(
    text='Your goal is to earn as many points as possible. Pay attention to feedback, as the probabilities of winning for each bandit may change over time. \
    You are now ready to begin. Good luck! Press Q to start the task.',
    choices=['q', ' ']
)

# Create a list of instruction 
# for the instruction block
instruction_list = [
    instruction_welcome,
    instruction_task,
    instruction_feedback,
    instruction_end
]

# Create the instruction block
instruction_block = Block(instruction_list)

In [5]:
class BanditsGeneral:
    """
    A generalized form of the multi-armed bandit which can enable a drifting paradigm, 
    reversal learning, reward/no-reward and reward/penalty schedules, and arm-correlations.
    
    Features:
    - Drift: Reward probabilities change gradually over time
    - Reversals: Sudden swaps in reward probabilities (controlled by hazard rate)
    - Correlated arms: Reward probabilities can move together
    - Flexible reward schedules: Binary rewards (0/1) or penalty schedules (-1/+1)
    """
    
    def __init__(
        self,
        n_arms: int = 2,
        init_reward_prob: Optional[Iterable[float]] = None,
        drift_rate: float = 0.0,
        hazard_rate: float = 0.0,
        reward_prob_correlation: float = 0.0,
        reward_schedule: str = "binary",  # "binary" (0/1) or "penalty" (-1/+1)
        bounds: Tuple[float, float] = (0.0, 1.0),
        seed: Optional[int] = None,
    ):
        """
        Args:
            n_arms: Number of arms
            init_reward_prob: Initial reward probabilities for each arm
            drift_rate: Rate of Gaussian random walk drift (std dev per step)
            hazard_rate: Probability of reversal on each step
            reward_prob_correlation: Correlation between arm drifts (-1 to 1)
            reward_schedule: "binary" for 0/1 rewards, "penalty" for -1/+1 rewards
            bounds: Min and max values for reward probabilities
            seed: Random seed for reproducibility
        """
        self.n_arms = n_arms
        self.drift_rate = drift_rate
        self.hazard_rate = hazard_rate
        self.reward_prob_correlation = reward_prob_correlation
        self.reward_schedule = reward_schedule
        self.bounds = bounds
        
        # Random number generator
        self.rng = np.random.default_rng(seed)
        
        # Initialize reward probabilities
        if init_reward_prob is None:
            init_reward_prob = self.rng.uniform(bounds[0], bounds[1], n_arms)
        else:
            init_reward_prob = np.array(init_reward_prob)
            if len(init_reward_prob) != n_arms:
                raise ValueError(f"init_reward_prob must have length {n_arms}")
        
        self.init_reward_prob = np.array(init_reward_prob)
        self.reward_prob = self.init_reward_prob.copy()
        
        # Tracking
        self.t = 0
        self.history = {
            'choices': [],
            'rewards': [],
            'reward_probs': [self.reward_prob.copy()],
            'reversals': []
        }
        
        # For correlated drift
        if self.reward_prob_correlation != 0 and self.n_arms == 2:
            # Construct covariance matrix for bivariate normal
            self.drift_cov = np.array([
                [1, self.reward_prob_correlation],
                [self.reward_prob_correlation, 1]
            ]) * (self.drift_rate ** 2)
        else:
            self.drift_cov = None
    
    def step(self, choice: int) -> Tuple[float, dict]:
        """
        Execute one step: apply drift/reversals, then generate reward for chosen arm.
        
        Args:
            choice: Index of chosen arm (0 to n_arms-1)
            
        Returns:
            reward: The reward received
            info: Dictionary with additional information
        """
        if choice < 0 or choice >= self.n_arms:
            raise ValueError(f"Choice must be between 0 and {self.n_arms-1}")
        
        # Apply drift and reversals BEFORE reward is generated
        reversal_occurred = self._apply_dynamics()
        
        # Generate reward based on current probabilities
        reward = self._generate_reward(choice)
        
        # Update history
        self.history['choices'].append(choice)
        self.history['rewards'].append(reward)
        self.history['reward_probs'].append(self.reward_prob.copy())
        self.history['reversals'].append(reversal_occurred)
        
        self.t += 1
        
        info = {
            'reward_prob': self.reward_prob[choice],
            'all_reward_probs': self.reward_prob.copy(),
            'reversal': reversal_occurred,
            'timestep': self.t
        }
        
        return reward, info
    
    def _apply_dynamics(self) -> bool:
        """Apply drift and check for reversals."""
        reversal_occurred = False
        
        # Check for reversal (sudden swap)
        if self.hazard_rate > 0 and self.rng.random() < self.hazard_rate:
            self._apply_reversal()
            reversal_occurred = True
        
        # Apply gradual drift
        if self.drift_rate > 0:
            self._apply_drift()
        
        return reversal_occurred
    
    def _apply_reversal(self):
        """Apply a reversal: swap the reward probabilities."""
        if self.n_arms == 2:
            # Simple swap for 2 arms
            self.reward_prob = self.reward_prob[::-1]
        else:
            # For >2 arms, randomly permute
            self.reward_prob = self.rng.permutation(self.reward_prob)
    
    def _apply_drift(self):
        """Apply Gaussian random walk drift to reward probabilities."""
        if self.drift_cov is not None and self.n_arms == 2:
            # Correlated drift for 2 arms
            drift = self.rng.multivariate_normal(np.zeros(2), self.drift_cov)
        else:
            # Independent drift
            drift = self.rng.normal(0, self.drift_rate, self.n_arms)
        
        # Apply drift and clip to bounds
        self.reward_prob = np.clip(
            self.reward_prob + drift,
            self.bounds[0],
            self.bounds[1]
        )
    
    def _generate_reward(self, choice: int) -> float:
        """Generate reward for chosen arm based on current probability."""
        # Bernoulli trial
        success = self.rng.random() < self.reward_prob[choice]
        
        if self.reward_schedule == "binary":
            return 1.0 if success else 0.0
        elif self.reward_schedule == "penalty":
            return 1.0 if success else -1.0
        else:
            raise ValueError(f"Unknown reward_schedule: {self.reward_schedule}")
    
    def new_sess(self):
        """Reset to initial state for a new session."""
        self.reward_prob = self.init_reward_prob.copy()
        self.t = 0
        self.history = {
            'choices': [],
            'rewards': [],
            'reward_probs': [self.reward_prob.copy()],
            'reversals': []
        }
    
    def get_optimal_arm(self) -> int:
        """Return the index of the arm with highest current reward probability."""
        return int(np.argmax(self.reward_prob))
    
    def get_history_array(self) -> dict:
        """Return history as numpy arrays for analysis."""
        return {
            'choices': np.array(self.history['choices']),
            'rewards': np.array(self.history['rewards']),
            'reward_probs': np.array(self.history['reward_probs']),
            'reversals': np.array(self.history['reversals'])
        }




In [6]:
# Initialize BanditsGeneral with negatively correlated drift
bandit = BanditsGeneral(
    n_arms=2,
    init_reward_prob=[0.6, 0.4],  # Initial probabilities for Bandit 1 and Bandit 2
    drift_rate=0.02,              # Drift rate for Gaussian random walk
    reward_schedule="binary",  # Binary reward schedule (0/1)
    reward_prob_correlation=-0.8, # Negative correlation between arms
    seed=42                       # Random seed for reproducibility
)

# Simulate rewards
rewards_bandit_1 = []
rewards_bandit_2 = []
n_trials=10
for _ in range(n_trials):
    # Simulate a step for Bandit 1
    reward_1, _ = bandit.step(0)  # Choose Bandit 1 (index 0)
    rewards_bandit_1.append(reward_1)
    
    # Simulate a step for Bandit 2
    reward_2, _ = bandit.step(1)  # Choose Bandit 2 (index 1)
    rewards_bandit_2.append(reward_2)

In [7]:
rewards_bandit_1

[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0]

In [8]:
rewards_bandit_2

[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]

In [9]:
# Define colors for bandits
bandit_colors = {
    "bandit_1": "orange",
    "bandit_2": "blue"
}
# format rewards_bandit_1 and rewards_bandit_2 into timeline structure
timeline = []
for r1, r2 in zip(rewards_bandit_1, rewards_bandit_2):
    timeline.append({
        "bandit_1": {"color": bandit_colors["bandit_1"], "value": r1},
        "bandit_2": {"color": bandit_colors["bandit_2"], "value": r2}
    })
# Print the timeline (for testing)
for i, trial in enumerate(timeline):
    print(f"Trial {i+1}: {trial}")

Trial 1: {'bandit_1': {'color': 'orange', 'value': 0.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 2: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 3: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 1.0}}
Trial 4: {'bandit_1': {'color': 'orange', 'value': 0.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 5: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 6: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 1.0}}
Trial 7: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 8: {'bandit_1': {'color': 'orange', 'value': 0.0}, 'bandit_2': {'color': 'blue', 'value': 1.0}}
Trial 9: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'blue', 'value': 0.0}}
Trial 10: {'bandit_1': {'color': 'orange', 'value': 1.0}, 'bandit_2': {'color': 'b

In [10]:
bandit_1 = TimelineVariable("bandit_1")
bandit_2 = TimelineVariable("bandit_2")

score = SharedVariable("score", 0)
value = DataVariable("value", 0)

update_score = FunctionVariable(
    "update_score", lambda sc, val: sc + val, [score, value]
)

update_score_side_effect = SideEffect(score, update_score)

bandit_task = Bandit(
    bandits=[bandit_1, bandit_2],
    side_effects=[update_score_side_effect],
)

score_text = FunctionVariable("score_text", lambda sc: f"Score: {sc}", [score])

show_score = Text(duration=1000, text=score_text)

trial_sequence = Block([bandit_task, show_score], timeline=timeline)
experiment = Experiment([instruction_block,trial_sequence])

In [11]:
experiment.to_html("bandit_new.html", path_local_download="2armed_bandit.json")