## Setting up the Environment

In [1]:
import numpy as np
import random
import math

In [2]:
# Product categories the agent can recommend (its action space).
products = [
    "Electronics",
    "Clothing",
    "Books",
    "Groceries",
    "Toys",
]

# Categories the simulated user actually prefers.
preferred_products = ["Electronics", "Books"]

# One Q-value per category, in catalogue order, all starting at zero.
q_table = dict.fromkeys(products, 0)

In [3]:
# Show the freshly initialised Q-table: every product starts at 0 before any training.
print(q_table)

{'Electronics': 0, 'Clothing': 0, 'Books': 0, 'Groceries': 0, 'Toys': 0}


### Learning Parameters

In [4]:
# Hyperparameters read by the Q-learning updates in the training and poisoning loops.
epsilon = 0.3  # probability of recommending a random product instead of the current best one
gamma = 0.9    # weight given to the best Q-value in the table when bootstrapping
alpha = 0.1    # step size: fraction of the update error applied to a Q-value

### Comparison Metrics

In [5]:
# helper function to compare pre and post poisoning model performance
def calculate_metrics(preferred_products):
    """Print summary metrics for the current global ``q_table``.

    Four lines are printed:
      * Recommendation Accuracy: fraction of ``preferred_products`` whose
        Q-value is strictly positive.
      * Entropy (natural log) of the strictly positive Q-values after
        normalising them to sum to one. Zero and negative Q-values are
        excluded, so a poisoned table containing negative entries can no
        longer trigger a division by zero or a ``math.log`` domain error.
      * Average Q-value of the preferred products.
      * Average Q-value over the whole table.

    Args:
        preferred_products: products the user prefers. Each entry must be a
            key of ``q_table``. If the collection is empty, the two
            preference-based metrics are undefined and are printed as ``nan``.

    Returns:
        None. The metrics are only printed.
    """
    q_values = list(q_table.values())
    n_preferred = len(preferred_products)
    undefined = float("nan")

    # Recommendation Accuracy (aligned recommendations)
    preferred_recommendations = sum(
        1
        for product, q_value in q_table.items()
        if product in preferred_products and q_value > 0
    )
    # Guard the empty case instead of raising ZeroDivisionError.
    accuracy = preferred_recommendations / n_preferred if n_preferred else undefined
    print(f"\nRecommendation Accuracy: {accuracy:.2f}")

    # Entropy of Q-Values (consistency of preferences)
    # Normalise by the sum of the positive values only: dividing by the sum of
    # ALL values breaks as soon as negative Q-values make that sum <= 0.
    positive_q = [q for q in q_values if q > 0]
    total_positive = sum(positive_q)
    # Negating each term (rather than the final sum) avoids printing "-0.00"
    # when only one product has a positive Q-value.
    entropy = sum(
        -(q / total_positive) * math.log(q / total_positive) for q in positive_q
    )
    print(f"Entropy of Q-Values: {entropy:.2f}")

    # Average Q-Value for Preferred Products
    if n_preferred:
        avg_preferred_q = np.mean([q_table[product] for product in preferred_products])
    else:
        avg_preferred_q = undefined
    print(f"Average Q-Value for Preferred Products: {avg_preferred_q:.2f}")

    # Average Q-Value Overall
    avg_q_value = np.mean(q_values)
    print(f"Average Q-Value (Overall): {avg_q_value:.2f}")

In [6]:
# Simulated user: turns a recommended product into a reward for the agent.
def user_interaction(product):
    """Return the simulated user's reward for a recommended product.

    The mapping is deterministic: "Electronics" and "Books" earn 1, and any
    other product (including names outside the catalogue) earns 0.
    """
    liked_by_user = {"Electronics", "Books"}
    if product in liked_by_user:
        return 1
    return 0

## Agent learns User Preferences

In [7]:
# Epsilon-greedy training loop in which the simulated user acts as the environment.
def train_agent(preferred_products, iterations=100):
    """Train the global ``q_table`` against the simulated user.

    On every iteration a product is chosen (at random with probability
    ``epsilon``, otherwise the one with the highest Q-value), the user's
    reward is obtained from ``user_interaction`` and that product's Q-value
    is updated. The table is printed on every 10th iteration, and the
    metrics plus the final table are printed at the end.

    Args:
        preferred_products: forwarded to ``calculate_metrics`` once training ends.
        iterations: number of training steps to run.
    """
    print("\n=== Training Agent with User Preferences ===")
    for step in range(iterations):
        # Draw the exploration coin first; only an exploring step consumes a second random number.
        explore = random.uniform(0, 1) < epsilon
        if explore:
            product = random.choice(products)
        else:
            product = max(q_table, key=q_table.get)

        reward = user_interaction(product)

        # The Q-value is updated on EVERY step; only the progress print below is throttled.
        current_q = q_table[product]
        best_q = max(q_table.values())
        q_table[product] = current_q + alpha * (reward + gamma * best_q - current_q)

        if step % 10 == 0:
            print(f"Iteration {step} - Q-Table: {q_table}")

    calculate_metrics(preferred_products)
    print("\nQ-Table after initial training:", q_table)

In [8]:
# Run 500 training iterations against the simulated user; the Q-table is printed on every 10th iteration.
train_agent(preferred_products,iterations=500)


=== Training Agent with User Preferences ===
Iteration 0 - Q-Table: {'Electronics': 0.1, 'Clothing': 0, 'Books': 0, 'Groceries': 0, 'Toys': 0}
Iteration 10 - Q-Table: {'Electronics': 0.8648275251635912, 'Clothing': 0, 'Books': 0, 'Groceries': 0.08593447726472322, 'Toys': 0}
Iteration 20 - Q-Table: {'Electronics': 1.5705680661607317, 'Clothing': 0, 'Books': 0, 'Groceries': 0.1795946449937346, 'Toys': 0.09419557116715523}
Iteration 30 - Q-Table: {'Electronics': 2.2995685419484477, 'Clothing': 0, 'Books': 0.29996077654076797, 'Groceries': 0.1795946449937346, 'Toys': 0.09419557116715523}
Iteration 40 - Q-Table: {'Electronics': 2.822694674017249, 'Clothing': 0, 'Books': 0.6042343628372389, 'Groceries': 0.40915287813229284, 'Toys': 0.3322937116883714}
Iteration 50 - Q-Table: {'Electronics': 3.2427095093971667, 'Clothing': 0.7311755578335326, 'Books': 0.6042343628372389, 'Groceries': 0.6477334130368934, 'Toys': 0.3322937116883714}
Iteration 60 - Q-Table: {'Electronics': 3.7017636879676754, '

## Agent unlearns User Preferences

### Poisoning the Environment

In [9]:
# Poisoning strategy: pair a randomly chosen product with a random reward.
def poison_environment_random():
    """Pick a random product and a random reward for it.

    Returns:
        A ``(product, reward)`` tuple, where ``product`` is drawn from the
        global ``products`` list and ``reward`` is drawn uniformly from
        [-1, 1], regardless of the user's real preferences.
    """
    target = random.choice(products)
    noisy_reward = random.uniform(-1, 1)
    print(f"Insertion of random rewards for product '{target}'")
    return target, noisy_reward

In [10]:
# Poisoning strategy: target the product the agent currently values least.
def poison_environment_policy_bias():
    """Select the product with the lowest Q-value and a reward equal to minus that Q-value.

    The reward is therefore negative when the lowest Q-value is positive,
    zero when it is zero, and positive when it is negative.

    Returns:
        A ``(product, reward)`` tuple.
    """
    weakest, lowest_q = min(q_table.items(), key=lambda entry: entry[1])
    print(f"Biased policy poisoning with low-reward product '{weakest}'")
    return weakest, -lowest_q

In [11]:
# Poisoning strategy: replace the user's real feedback for a random product with noise.
def modify_environment():
    """Pick a random product from ``q_table`` and a random reward for it.

    The reward is drawn uniformly from [-1, 1] without consulting
    ``user_interaction``, so it need not match the user's real preference.

    Returns:
        A ``(product, reward)`` tuple.
    """
    target = random.choice(list(q_table))
    fake_reward = random.uniform(-1, 1)
    print(f"Modify Environment: Assigning contradictory reward for '{target}' as {fake_reward:.4f}")
    return target, fake_reward

### Manipulating Q-table

In [12]:
# Poisoning strategy: overwrite a Q-value directly instead of going through rewards.
def sabotage_q_table():
    """Overwrite one randomly chosen Q-value with a small random number.

    The chosen entry of the global ``q_table`` is replaced in place by a
    value drawn uniformly from [0, 0.5].

    Returns:
        A ``(product, reward)`` tuple: the sabotaged product and a fixed
        reward of -0.5 for the caller to use in its own update.
    """
    victim = random.choice(list(q_table))

    # Overwrite (rather than nudge) the entry so whatever was learned for it is lost.
    low_q = random.uniform(0, 0.5)
    q_table[victim] = low_q
    print(f"Sabotage Q-table: Setting Q-value of '{victim}' to {low_q:.4f}")

    # The fixed negative reward is arbitrary.
    return victim, -0.5

### Apply different Poisoning Strategies

In [13]:
def apply_poisoning(preferred_products, iterations=50):
    """Run the poisoning strategies against the global ``q_table``.

    The strategy is picked from the iteration index modulo 10:
    0 -> ``poison_environment_random``, 1 -> ``poison_environment_policy_bias``,
    2 -> ``sabotage_q_table``, 3-9 -> ``modify_environment``. Each strategy
    returns a ``(product, reward)`` pair, which is fed into the same
    Q-learning update used during training. Metrics are printed at the end.

    Args:
        preferred_products: forwarded to ``calculate_metrics`` after the loop.
        iterations: number of poisoning steps to run.
    """
    print("\n=== Applying Poisoning Techniques ===")

    # Strategies tied to a specific slot of the 10-step cycle; every other slot
    # falls back to modify_environment.
    scheduled = {
        0: poison_environment_random,
        1: poison_environment_policy_bias,
        2: sabotage_q_table,
    }

    for step in range(iterations):
        slot = step % 10
        strategy = scheduled.get(slot, modify_environment)
        product, reward = strategy()

        if product not in q_table:
            print(f"Iteration {step}: Skipping update due to undefined product.")
        else:
            # Q-learning update with poisoned data
            old_q = q_table[product]
            best_q = max(q_table.values())
            q_table[product] = old_q + alpha * (reward + gamma * best_q - old_q)
            print(f"Iteration {step}: Poisoning - Updated Q-value of '{product}' to {q_table[product]:.4f}")

        # Dump the whole table once per cycle.
        if slot == 0:
            print(f"Q-table after {step} iterations: {q_table}")

    print("\n=== Performance Metrics After Poisoning ===")
    calculate_metrics(preferred_products)


In [14]:
# Run 300 poisoning iterations on the Q-table and print the metrics afterwards.
apply_poisoning(preferred_products, iterations=300)


=== Applying Poisoning Techniques ===
Insertion of random rewards for product 'Books'
Iteration 0: Poisoning - Updated Q-value of 'Books' to 9.0001
Q-table after 0 iterations: {'Electronics': 9.764534336890485, 'Clothing': 7.421294922463105, 'Books': 9.000075150602564, 'Groceries': 7.893799450459608, 'Toys': 8.086817460845936}
Biased policy poisoning with low-reward product 'Clothing'
Iteration 1: Poisoning - Updated Q-value of 'Clothing' to 6.8158
Sabotage Q-table: Setting Q-value of 'Books' to 0.3308
Iteration 2: Poisoning - Updated Q-value of 'Books' to 1.1265
Modify Environment: Assigning contradictory reward for 'Books' as 0.9086
Iteration 3: Poisoning - Updated Q-value of 'Books' to 1.9836
Modify Environment: Assigning contradictory reward for 'Groceries' as -0.4978
Iteration 4: Poisoning - Updated Q-value of 'Groceries' to 7.9334
Modify Environment: Assigning contradictory reward for 'Clothing' as 0.0966
Iteration 5: Poisoning - Updated Q-value of 'Clothing' to 7.0227
Modify En